4 * Copyright (C) 1991, 1992 Linus Torvalds
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
15 /* Removed a lot of unnecessary code and simplified things now that
16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20 * hash table, use SLAB cache for buffer heads. -DaveM
23 /* Added 32k buffer block sizes - these are required for older ARM systems.
27 /* Thread it... -DaveM */
29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
31 #include <linux/config.h>
32 #include <linux/sched.h>
34 #include <linux/malloc.h>
35 #include <linux/locks.h>
36 #include <linux/errno.h>
37 #include <linux/swap.h>
38 #include <linux/smp_lock.h>
39 #include <linux/vmalloc.h>
40 #include <linux/blkdev.h>
41 #include <linux/sysrq.h>
42 #include <linux/file.h>
43 #include <linux/init.h>
44 #include <linux/quotaops.h>
45 #include <linux/iobuf.h>
46 #include <linux/highmem.h>
48 #include <asm/uaccess.h>
50 #include <asm/bitops.h>
51 #include <asm/mmu_context.h>
54 static char buffersize_index[65] =
55 {-1,  0,  1, -1,  2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
56   4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
57   5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
58  -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
59   6};
61 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
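/*
 * Note (added by the editor, not in the original source): BUFSIZE_INDEX()
 * maps a legal buffer size to its slot in buffersize_index[] above, e.g.
 * BUFSIZE_INDEX(512) == 0, BUFSIZE_INDEX(1024) == 1, BUFSIZE_INDEX(4096) == 3
 * and BUFSIZE_INDEX(32768) == 6; sizes that are not supported map to -1.
 */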
62 #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
63 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
64 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
65 number of unused buffer heads */
67 /* Anti-deadlock ordering:
68  *	lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
69  */
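/*
 * Illustrative sketch (added, not part of the original code): when two of
 * these locks are needed at once they must be nested in the order above,
 * as insert_into_queues() and try_to_free_buffers() below do:
 */
#if 0
	spin_lock(&lru_list_lock);
	write_lock(&hash_table_lock);
	/* ... touch the hash chains and the lru lists ... */
	write_unlock(&hash_table_lock);
	spin_unlock(&lru_list_lock);
#endif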
74 static unsigned int bh_hash_mask;
75 static unsigned int bh_hash_shift;
76 static struct buffer_head **hash_table;
77 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;

79 static struct buffer_head *lru_list[NR_LIST];
80 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
81 static int nr_buffers_type[NR_LIST];
82 static unsigned long size_buffers_type[NR_LIST];

84 static struct buffer_head * unused_list;
85 static int nr_unused_buffer_heads;
86 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
87 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
89 struct bh_free_head {
90 	struct buffer_head *list;
91 	spinlock_t lock;
92 };
93 static struct bh_free_head free_list[NR_SIZES];

95 kmem_cache_t *bh_cachep;

97 static int grow_buffers(int size);
98 static void __refile_buffer(struct buffer_head *);

100 /* This is used by some architectures to estimate available memory. */
101 atomic_t buffermem_pages = ATOMIC_INIT(0);
103 /* Here is the parameter block for the bdflush process. If you add or
104 * remove any of the parameters, make sure to update kernel/sysctl.c.
109 /* The dummy values in this structure are left in there for compatibility
110 * with old programs that play with the /proc entries.
112 union bdflush_param {
113 	struct {
114 		int nfract;	/* Percentage of buffer cache dirty to
115 				   activate bdflush */
116 		int ndirty;	/* Maximum number of dirty blocks to write out per
117 				   wake-cycle */
118 		int nrefill;	/* Number of clean buffers to try to obtain
119 				   each time we call refill */
120 		int nref_dirt;	/* Dirty buffer threshold for activating bdflush
121 				   when trying to refill buffers. */
122 		int interval;	/* jiffies delay between kupdate flushes */
123 		int age_buffer;	/* Time for normal buffer to age before we flush it */
124 		int age_super;	/* Time for superblock to age before we flush it */
125 		int dummy2;	/* unused */
126 		int dummy3;	/* unused */
127 	} b_un;
128 	unsigned int data[N_PARAM];
129 } bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}};
131 /* These are the min and max parameter values that we will allow to be assigned */
132 int bdflush_min[N_PARAM] = {  0,  10,    5,   25,  0,   1*HZ,   1*HZ, 1, 1};
133 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 6000*HZ, 2047, 5};
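/*
 * Illustrative sketch (added, not part of the original code): these
 * parameters are exposed through the bdflush(2) interface implemented by
 * sys_bdflush() below; parameter i is read with func 2*i+2 and written
 * with func 2*i+3, clamped to bdflush_min[i]..bdflush_max[i].  Assuming a
 * userspace bdflush() syscall wrapper, tuning nfract (data[0]) would look
 * roughly like this:
 */
#if 0
	long nfract;
	bdflush(2, (long)&nfract);	/* read bdf_prm.data[0] */
	bdflush(3, 60);			/* set bdf_prm.data[0] to 60% */
#endif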
136 * Rewrote the wait-routines to use the "new" wait-queue functionality,
137 * and getting rid of the cli-sti pairs. The wait-queue routines still
138 * need cli-sti, but now it's just a couple of 386 instructions or so.
140  * Note that the real wait_on_buffer() is an inline function that checks
141  * if 'b_wait' is set before calling this, so that the queues aren't set
142  * up unnecessarily.
144 void __wait_on_buffer(struct buffer_head * bh)
145 {
146 	struct task_struct *tsk = current;
147 	DECLARE_WAITQUEUE(wait, tsk);

149 	atomic_inc(&bh->b_count);
150 	add_wait_queue(&bh->b_wait, &wait);
151 	do {
152 		run_task_queue(&tq_disk);
153 		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
154 		if (!buffer_locked(bh))
155 			break;
156 		schedule();
157 	} while (buffer_locked(bh));
158 	tsk->state = TASK_RUNNING;
159 	remove_wait_queue(&bh->b_wait, &wait);
160 	atomic_dec(&bh->b_count);
161 }
163 /* Call sync_buffers with wait!=0 to ensure that the call does not
164 * return until all buffer writes have completed. Sync() may return
165 * before the writes have finished; fsync() may not.
168 /* Godamity-damn. Some buffers (bitmaps for filesystems)
169 * spontaneously dirty themselves without ever brelse being called.
170 * We will ultimately want to put these in a separate list, but for
171 * now we search all of the lists for dirty buffers.
173 static int sync_buffers(kdev_t dev
, int wait
)
175 int i
, retry
, pass
= 0, err
= 0;
176 struct buffer_head
* bh
, *next
;
178 /* One pass for no-wait, three for wait:
179 * 0) write out all dirty, unlocked buffers;
180 * 1) write out all dirty buffers, waiting if locked;
181 * 2) wait for completion by waiting for all buffers to unlock.
186 /* We search all lists as a failsafe mechanism, not because we expect
187 * there to be dirty buffers on any of the other lists.
190 spin_lock(&lru_list_lock
);
191 bh
= lru_list
[BUF_DIRTY
];
195 for (i
= nr_buffers_type
[BUF_DIRTY
]*2 ; i
-- > 0 ; bh
= next
) {
196 next
= bh
->b_next_free
;
198 if (!lru_list
[BUF_DIRTY
])
200 if (dev
&& bh
->b_dev
!= dev
)
202 if (buffer_locked(bh
)) {
203 /* Buffer is locked; skip it unless wait is
204 * requested AND pass > 0.
206 if (!wait
|| !pass
) {
210 atomic_inc(&bh
->b_count
);
211 spin_unlock(&lru_list_lock
);
213 atomic_dec(&bh
->b_count
);
217 /* If an unlocked buffer is not uptodate, there has
218 * been an IO error. Skip it.
220 if (wait
&& buffer_req(bh
) && !buffer_locked(bh
) &&
221 !buffer_dirty(bh
) && !buffer_uptodate(bh
)) {
226 /* Don't write clean buffers. Don't write ANY buffers
229 if (!buffer_dirty(bh
) || pass
>= 2)
232 atomic_inc(&bh
->b_count
);
233 spin_unlock(&lru_list_lock
);
234 ll_rw_block(WRITE
, 1, &bh
);
235 atomic_dec(&bh
->b_count
);
241 bh
= lru_list
[BUF_LOCKED
];
243 spin_unlock(&lru_list_lock
);
246 for (i
= nr_buffers_type
[BUF_LOCKED
]*2 ; i
-- > 0 ; bh
= next
) {
247 next
= bh
->b_next_free
;
249 if (!lru_list
[BUF_LOCKED
])
251 if (dev
&& bh
->b_dev
!= dev
)
253 if (buffer_locked(bh
)) {
254 /* Buffer is locked; skip it unless wait is
255 * requested AND pass > 0.
257 if (!wait
|| !pass
) {
261 atomic_inc(&bh
->b_count
);
262 spin_unlock(&lru_list_lock
);
264 spin_lock(&lru_list_lock
);
265 atomic_dec(&bh
->b_count
);
269 spin_unlock(&lru_list_lock
);
271 /* If we are waiting for the sync to succeed, and if any dirty
272 * blocks were written, then repeat; on the second pass, only
273 * wait for buffers being written (do not pass to write any
274 * more buffers on the second pass).
276 } while (wait
&& retry
&& ++pass
<=2);
280 void sync_dev(kdev_t dev
)
285 	/* sync all the dirty buffers out to disk only _after_ all the
286 	   high level layers have finished generating dirty buffer data
287 	   (or we'll return with some buffers still dirty on the block device,
288 	   breaking the semantics of this call) */
289 sync_buffers(dev
, 0);
291 * FIXME(eric) we need to sync the physical devices here.
292 * This is because some (scsi) controllers have huge amounts of
293 * cache onboard (hundreds of Mb), and we need to instruct
294 * them to commit all of the dirty memory to disk, and we should
295 * not return until this has happened.
297 * This would need to get implemented by going through the assorted
298 * layers so that each block major number can be synced, and this
299 * would call down into the upper and mid-layer scsi.
303 int fsync_dev(kdev_t dev
)
305 sync_buffers(dev
, 0);
313 return sync_buffers(dev
, 1);
316 asmlinkage
long sys_sync(void)
323 * filp may be NULL if called via the msync of a vma.
326 int file_fsync(struct file
*filp
, struct dentry
*dentry
)
328 struct inode
* inode
= dentry
->d_inode
;
329 struct super_block
* sb
;
334 /* sync the inode to buffers */
335 write_inode_now(inode
);
337 /* sync the superblock to buffers */
340 if (sb
->s_op
&& sb
->s_op
->write_super
)
341 sb
->s_op
->write_super(sb
);
343 /* .. finally sync the buffers to disk */
345 ret
= sync_buffers(dev
, 1);
350 asmlinkage
long sys_fsync(unsigned int fd
)
353 struct dentry
* dentry
;
354 struct inode
* inode
;
362 dentry
= file
->f_dentry
;
366 inode
= dentry
->d_inode
;
371 if (!file
->f_op
|| !file
->f_op
->fsync
)
374 /* We need to protect against concurrent writers.. */
376 err
= file
->f_op
->fsync(file
, dentry
);
385 asmlinkage
long sys_fdatasync(unsigned int fd
)
388 struct dentry
* dentry
;
389 struct inode
* inode
;
397 dentry
= file
->f_dentry
;
401 inode
= dentry
->d_inode
;
406 if (!file
->f_op
|| !file
->f_op
->fsync
)
409 /* this needs further work, at the moment it is identical to fsync() */
411 err
= file
->f_op
->fsync(file
, dentry
);
420 /* After several hours of tedious analysis, the following hash
421 * function won. Do not mess with it... -DaveM
423 #define _hashfn(dev,block) \
424 ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
425 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
426 #define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
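/*
 * Note (added by the editor, not in the original source): _hashfn() mixes
 * the device and block numbers with shifts derived from bh_hash_shift
 * (log2 of the number of hash buckets, computed in buffer_init() below),
 * and hash() masks the result into the table with bh_hash_mask.
 */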
428 static __inline__
void __hash_link(struct buffer_head
*bh
, struct buffer_head
**head
)
430 if ((bh
->b_next
= *head
) != NULL
)
431 bh
->b_next
->b_pprev
= &bh
->b_next
;
436 static __inline__
void __hash_unlink(struct buffer_head
*bh
)
440 bh
->b_next
->b_pprev
= bh
->b_pprev
;
441 *(bh
->b_pprev
) = bh
->b_next
;
446 static void __insert_into_lru_list(struct buffer_head
* bh
, int blist
)
448 struct buffer_head
**bhp
= &lru_list
[blist
];
452 bh
->b_prev_free
= bh
;
454 bh
->b_next_free
= *bhp
;
455 bh
->b_prev_free
= (*bhp
)->b_prev_free
;
456 (*bhp
)->b_prev_free
->b_next_free
= bh
;
457 (*bhp
)->b_prev_free
= bh
;
458 nr_buffers_type
[blist
]++;
459 size_buffers_type
[blist
] += bh
->b_size
;
462 static void __remove_from_lru_list(struct buffer_head
* bh
, int blist
)
464 if (bh
->b_prev_free
|| bh
->b_next_free
) {
465 bh
->b_prev_free
->b_next_free
= bh
->b_next_free
;
466 bh
->b_next_free
->b_prev_free
= bh
->b_prev_free
;
467 if (lru_list
[blist
] == bh
)
468 lru_list
[blist
] = bh
->b_next_free
;
469 if (lru_list
[blist
] == bh
)
470 lru_list
[blist
] = NULL
;
471 bh
->b_next_free
= bh
->b_prev_free
= NULL
;
472 nr_buffers_type
[blist
]--;
473 size_buffers_type
[blist
] -= bh
->b_size
;
477 static void __remove_from_free_list(struct buffer_head
* bh
, int index
)
479 if(bh
->b_next_free
== bh
)
480 free_list
[index
].list
= NULL
;
482 bh
->b_prev_free
->b_next_free
= bh
->b_next_free
;
483 bh
->b_next_free
->b_prev_free
= bh
->b_prev_free
;
484 if (free_list
[index
].list
== bh
)
485 free_list
[index
].list
= bh
->b_next_free
;
487 bh
->b_next_free
= bh
->b_prev_free
= NULL
;
490 /* must be called with both the hash_table_lock and the lru_list_lock
492 static void __remove_from_queues(struct buffer_head
*bh
)
495 __remove_from_lru_list(bh
, bh
->b_list
);
498 static void insert_into_queues(struct buffer_head
*bh
)
500 struct buffer_head
**head
= &hash(bh
->b_dev
, bh
->b_blocknr
);
502 spin_lock(&lru_list_lock
);
503 write_lock(&hash_table_lock
);
504 __hash_link(bh
, head
);
505 __insert_into_lru_list(bh
, bh
->b_list
);
506 write_unlock(&hash_table_lock
);
507 spin_unlock(&lru_list_lock
);
510 /* This function must only run if there are no other
511 * references _anywhere_ to this buffer head.
513 static void put_last_free(struct buffer_head
* bh
)
515 struct bh_free_head
*head
= &free_list
[BUFSIZE_INDEX(bh
->b_size
)];
516 struct buffer_head
**bhp
= &head
->list
;
520 spin_lock(&head
->lock
);
524 bh
->b_prev_free
= bh
;
526 bh
->b_next_free
= *bhp
;
527 bh
->b_prev_free
= (*bhp
)->b_prev_free
;
528 (*bhp
)->b_prev_free
->b_next_free
= bh
;
529 (*bhp
)->b_prev_free
= bh
;
530 spin_unlock(&head
->lock
);
534 * Why like this, I hear you say... The reason is race-conditions.
535 * As we don't lock buffers (unless we are reading them, that is),
536 * something might happen to it while we sleep (ie a read-error
537 * will force it bad). This shouldn't really happen currently, but
540 struct buffer_head
* get_hash_table(kdev_t dev
, int block
, int size
)
542 struct buffer_head
**head
= &hash(dev
, block
);
543 struct buffer_head
*bh
;
545 read_lock(&hash_table_lock
);
546 for(bh
= *head
; bh
; bh
= bh
->b_next
)
547 if (bh
->b_blocknr
== block
&&
548 bh
->b_size
== size
&&
552 atomic_inc(&bh
->b_count
);
553 read_unlock(&hash_table_lock
);
558 unsigned int get_hardblocksize(kdev_t dev
)
561 * Get the hard sector size for the given device. If we don't know
562 * what it is, return 0.
564 if (hardsect_size
[MAJOR(dev
)] != NULL
) {
565 int blksize
= hardsect_size
[MAJOR(dev
)][MINOR(dev
)];
571 * We don't know what the hardware sector size for this device is.
572 * Return 0 indicating that we don't know.
577 /* If invalidate_buffers() will trash dirty buffers, it means some kind
578    of fs corruption is going on. Trashing dirty data always implies losing
579    information that was supposed to be just stored on the physical layer
580    by the user.

582    Thus invalidate_buffers in general usage is not allowed to trash dirty
583    buffers. For example ioctl(BLKFLSBUF) expects dirty data to be preserved.

585    NOTE: in the case where the user removed a removable-media disk while
586    there was still dirty data not synced to disk (due to a bug in the device
587    driver or to an error of the user), by not destroying the dirty buffers
588    we could generate corruption also on the next media inserted. Thus a
589    parameter is necessary to handle this case in the safest way possible
590    (trying not to corrupt the newly inserted disk with data belonging to
591    the old, now corrupted, disk). Also for the ramdisk the natural thing
592    to do in order to release the ramdisk memory is to destroy dirty buffers.

594    These are two special cases. Normal usage implies that the device driver
595    issues a sync on the device (without waiting for I/O completion) and
596    then an invalidate_buffers call that doesn't trash dirty buffers. */
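/*
 * Illustrative sketch (added, not part of the original code): the normal
 * pattern described above, as a block driver would use it on media change,
 * assuming 'dev' is the kdev_t in question and the invalidate_buffers()
 * wrapper from linux/fs.h:
 */
#if 0
	sync_buffers(dev, 0);		/* start writeback, don't wait for it */
	invalidate_buffers(dev);	/* drop clean buffers, preserve dirty ones */
#endif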
597 void __invalidate_buffers(kdev_t dev
, int destroy_dirty_buffers
)
600 struct buffer_head
* bh
, * bh_next
;
604 spin_lock(&lru_list_lock
);
605 for(nlist
= 0; nlist
< NR_LIST
; nlist
++) {
606 bh
= lru_list
[nlist
];
609 for (i
= nr_buffers_type
[nlist
]; i
> 0 ; bh
= bh_next
, i
--) {
610 bh_next
= bh
->b_next_free
;
611 if (bh
->b_dev
!= dev
)
613 if (buffer_locked(bh
)) {
614 atomic_inc(&bh
->b_count
);
615 spin_unlock(&lru_list_lock
);
618 spin_lock(&lru_list_lock
);
619 atomic_dec(&bh
->b_count
);
622 write_lock(&hash_table_lock
);
623 if (!atomic_read(&bh
->b_count
) &&
624 (destroy_dirty_buffers
|| !buffer_dirty(bh
))) {
625 __remove_from_queues(bh
);
628 write_unlock(&hash_table_lock
);
634 spin_unlock(&lru_list_lock
);
639 void set_blocksize(kdev_t dev
, int size
)
641 extern int *blksize_size
[];
643 struct buffer_head
* bh
, * bh_next
;
645 if (!blksize_size
[MAJOR(dev
)])
648 /* Size must be a power of two, and between 512 and PAGE_SIZE */
649 if (size
> PAGE_SIZE
|| size
< 512 || (size
& (size
-1)))
650 panic("Invalid blocksize passed to set_blocksize");
652 if (blksize_size
[MAJOR(dev
)][MINOR(dev
)] == 0 && size
== BLOCK_SIZE
) {
653 blksize_size
[MAJOR(dev
)][MINOR(dev
)] = size
;
656 if (blksize_size
[MAJOR(dev
)][MINOR(dev
)] == size
)
658 sync_buffers(dev
, 2);
659 blksize_size
[MAJOR(dev
)][MINOR(dev
)] = size
;
663 spin_lock(&lru_list_lock
);
664 for(nlist
= 0; nlist
< NR_LIST
; nlist
++) {
665 bh
= lru_list
[nlist
];
668 for (i
= nr_buffers_type
[nlist
]; i
> 0 ; bh
= bh_next
, i
--) {
669 bh_next
= bh
->b_next_free
;
670 if (bh
->b_dev
!= dev
|| bh
->b_size
== size
)
672 if (buffer_locked(bh
)) {
673 atomic_inc(&bh
->b_count
);
674 spin_unlock(&lru_list_lock
);
677 spin_lock(&lru_list_lock
);
678 atomic_dec(&bh
->b_count
);
681 write_lock(&hash_table_lock
);
682 if (!atomic_read(&bh
->b_count
)) {
683 if (buffer_dirty(bh
))
685 "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
686 kdevname(dev
), bh
->b_blocknr
, bh
->b_size
);
687 __remove_from_queues(bh
);
690 if (atomic_set_buffer_clean(bh
))
692 clear_bit(BH_Uptodate
, &bh
->b_state
);
695 "b_count %d, dev %s, block %lu, from %p\n",
696 atomic_read(&bh
->b_count
), bdevname(bh
->b_dev
),
697 bh
->b_blocknr
, __builtin_return_address(0));
699 write_unlock(&hash_table_lock
);
705 spin_unlock(&lru_list_lock
);
711 * We used to try various strange things. Let's not.
713 static void refill_freelist(int size
)
715 if (!grow_buffers(size
)) {
717 current
->policy
|= SCHED_YIELD
;
722 void init_buffer(struct buffer_head
*bh
, bh_end_io_t
*handler
, void *dev_id
)
724 bh
->b_list
= BUF_CLEAN
;
725 bh
->b_end_io
= handler
;
726 bh
->b_dev_id
= dev_id
;
729 static void end_buffer_io_sync(struct buffer_head
*bh
, int uptodate
)
731 mark_buffer_uptodate(bh
, uptodate
);
735 static void end_buffer_io_bad(struct buffer_head
*bh
, int uptodate
)
737 mark_buffer_uptodate(bh
, uptodate
);
742 static void end_buffer_io_async(struct buffer_head
* bh
, int uptodate
)
744 static spinlock_t page_uptodate_lock
= SPIN_LOCK_UNLOCKED
;
746 struct buffer_head
*tmp
;
749 mark_buffer_uptodate(bh
, uptodate
);
751 /* This is a temporary buffer used for page I/O. */
758 * Be _very_ careful from here on. Bad things can happen if
759 * two buffer heads end IO at almost the same time and both
760 * decide that the page is now completely done.
762 * Async buffer_heads are here only as labels for IO, and get
763 * thrown away once the IO for this page is complete. IO is
764 * deemed complete once all buffers have been visited
765 * (b_count==0) and are now unlocked. We must make sure that
766 * only the _last_ buffer that decrements its count is the one
767 	 * that unlocks the page..
769 spin_lock_irqsave(&page_uptodate_lock
, flags
);
771 atomic_dec(&bh
->b_count
);
772 tmp
= bh
->b_this_page
;
774 if (tmp
->b_end_io
== end_buffer_io_async
&& buffer_locked(tmp
))
776 tmp
= tmp
->b_this_page
;
779 /* OK, the async IO on this page is complete. */
780 spin_unlock_irqrestore(&page_uptodate_lock
, flags
);
783 * if none of the buffers had errors then we can set the
786 if (!PageError(page
))
787 SetPageUptodate(page
);
790 * Run the hooks that have to be done when a page I/O has completed.
792 if (PageTestandClearDecrAfter(page
))
793 atomic_dec(&nr_async_pages
);
800 spin_unlock_irqrestore(&page_uptodate_lock
, flags
);
805 * Ok, this is getblk, and it isn't very clear, again to hinder
806 * race-conditions. Most of the code is seldom used, (ie repeating),
807 * so it should be much more efficient than it looks.
809 * The algorithm is changed: hopefully better, and an elusive bug removed.
811 * 14.02.92: changed it to sync dirty buffers a bit: better performance
812 * when the filesystem starts to get full of dirty blocks (I hope).
814 struct buffer_head
* getblk(kdev_t dev
, int block
, int size
)
816 struct buffer_head
* bh
;
820 bh
= get_hash_table(dev
, block
, size
);
824 isize
= BUFSIZE_INDEX(size
);
825 spin_lock(&free_list
[isize
].lock
);
826 bh
= free_list
[isize
].list
;
828 __remove_from_free_list(bh
, isize
);
829 atomic_set(&bh
->b_count
, 1);
831 spin_unlock(&free_list
[isize
].lock
);
834 * OK, FINALLY we know that this buffer is the only one of
835 * its kind, we hold a reference (b_count>0), it is unlocked,
839 init_buffer(bh
, end_buffer_io_sync
, NULL
);
841 bh
->b_blocknr
= block
;
842 bh
->b_state
= 1 << BH_Mapped
;
844 /* Insert the buffer into the regular lists */
845 insert_into_queues(bh
);
852 * If we block while refilling the free list, somebody may
853 * create the buffer first ... search the hashes again.
855 refill_freelist(size
);
859 /* -1 -> no need to flush
860     0 -> async flush
861     1 -> sync flush (wait for I/O completion) */
862 static int balance_dirty_state(kdev_t dev
)
864 unsigned long dirty
, tot
, hard_dirty_limit
, soft_dirty_limit
;
866 dirty
= size_buffers_type
[BUF_DIRTY
] >> PAGE_SHIFT
;
867 tot
= nr_free_buffer_pages();
868 tot
-= size_buffers_type
[BUF_PROTECTED
] >> PAGE_SHIFT
;
871 soft_dirty_limit
= tot
* bdf_prm
.b_un
.nfract
;
872 hard_dirty_limit
= soft_dirty_limit
* 2;
874 if (dirty
> soft_dirty_limit
) {
875 if (dirty
> hard_dirty_limit
)
883 * if a new dirty buffer is created we need to balance bdflush.
885 * in the future we might want to make bdflush aware of different
886  * pressures on different devices - thus the (currently unused) 'dev' parameter.
889 void balance_dirty(kdev_t dev
)
891 int state
= balance_dirty_state(dev
);
895 wakeup_bdflush(state
);
898 static __inline__
void __mark_dirty(struct buffer_head
*bh
, int flag
)
900 bh
->b_flushtime
= jiffies
+ (flag
? bdf_prm
.b_un
.age_super
: bdf_prm
.b_un
.age_buffer
);
904 /* atomic version, the user must call balance_dirty() by hand
905    as soon as it becomes possible to block */
906 void __mark_buffer_dirty(struct buffer_head *bh, int flag)
907 {
908 	if (!atomic_set_buffer_dirty(bh))
909 		__mark_dirty(bh, flag);
910 }

912 void mark_buffer_dirty(struct buffer_head *bh, int flag)
913 {
914 	__mark_buffer_dirty(bh, flag);
915 	balance_dirty(bh->b_dev);
916 }
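/*
 * Illustrative sketch (added, not part of the original code): a caller
 * that dirties a buffer while it cannot block uses the atomic variant and
 * defers the balancing, as the comment above describes.  'some_lock' is a
 * hypothetical lock standing in for whatever the caller holds:
 */
#if 0
	spin_lock(&some_lock);
	__mark_buffer_dirty(bh, 0);	/* may not block here */
	spin_unlock(&some_lock);
	balance_dirty(bh->b_dev);	/* now blocking is allowed */
#endif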
919 * A buffer may need to be moved from one buffer list to another
920 * (e.g. in case it is not shared any more). Handle this.
922 static void __refile_buffer(struct buffer_head
*bh
)
924 int dispose
= BUF_CLEAN
;
925 if (buffer_locked(bh
))
926 dispose
= BUF_LOCKED
;
927 if (buffer_dirty(bh
))
929 if (buffer_protected(bh
))
930 dispose
= BUF_PROTECTED
;
931 if (dispose
!= bh
->b_list
) {
932 __remove_from_lru_list(bh
, bh
->b_list
);
933 bh
->b_list
= dispose
;
934 __insert_into_lru_list(bh
, dispose
);
938 void refile_buffer(struct buffer_head
*bh
)
940 spin_lock(&lru_list_lock
);
942 spin_unlock(&lru_list_lock
);
946 * Release a buffer head
948 void __brelse(struct buffer_head
* buf
)
950 if (atomic_read(&buf
->b_count
)) {
951 atomic_dec(&buf
->b_count
);
954 printk("VFS: brelse: Trying to free free buffer\n");
958 * bforget() is like brelse(), except it puts the buffer on the
959 * free list if it can.. We can NOT free the buffer if:
960 * - there are other users of it
961 * - it is locked and thus can have active IO
963 void __bforget(struct buffer_head
* buf
)
965 /* grab the lru lock here to block bdflush. */
966 spin_lock(&lru_list_lock
);
967 write_lock(&hash_table_lock
);
968 if (!atomic_dec_and_test(&buf
->b_count
) || buffer_locked(buf
))
971 write_unlock(&hash_table_lock
);
972 __remove_from_lru_list(buf
, buf
->b_list
);
973 spin_unlock(&lru_list_lock
);
978 write_unlock(&hash_table_lock
);
979 spin_unlock(&lru_list_lock
);
983 * bread() reads a specified block and returns the buffer that contains
984 * it. It returns NULL if the block was unreadable.
986 struct buffer_head
* bread(kdev_t dev
, int block
, int size
)
988 struct buffer_head
* bh
;
990 bh
= getblk(dev
, block
, size
);
991 if (buffer_uptodate(bh
))
993 ll_rw_block(READ
, 1, &bh
);
995 if (buffer_uptodate(bh
))
1002 * Ok, breada can be used as bread, but additionally to mark other
1003  * blocks for reading as well. End the argument list with a negative
1004  * number.
1009 struct buffer_head
* breada(kdev_t dev
, int block
, int bufsize
,
1010 unsigned int pos
, unsigned int filesize
)
1012 struct buffer_head
* bhlist
[NBUF
];
1013 unsigned int blocks
;
1014 struct buffer_head
* bh
;
1018 if (pos
>= filesize
)
1024 bh
= getblk(dev
, block
, bufsize
);
1025 index
= BUFSIZE_INDEX(bh
->b_size
);
1027 if (buffer_uptodate(bh
))
1029 else ll_rw_block(READ
, 1, &bh
);
1031 blocks
= (filesize
- pos
) >> (9+index
);
1033 if (blocks
< (read_ahead
[MAJOR(dev
)] >> index
))
1034 blocks
= read_ahead
[MAJOR(dev
)] >> index
;
1038 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */
1042 for(i
=1; i
<blocks
; i
++) {
1043 bh
= getblk(dev
,block
+i
,bufsize
);
1044 if (buffer_uptodate(bh
)) {
1048 else bhlist
[j
++] = bh
;
1051 /* Request the read for these buffers, and then release them. */
1053 ll_rw_block(READA
, (j
-1), bhlist
+1);
1057 /* Wait for this buffer, and then continue on. */
1060 if (buffer_uptodate(bh
))
1067 * Note: the caller should wake up the buffer_wait list if needed.
1069 static __inline__
void __put_unused_buffer_head(struct buffer_head
* bh
)
1071 if (nr_unused_buffer_heads
>= MAX_UNUSED_BUFFERS
) {
1072 kmem_cache_free(bh_cachep
, bh
);
1075 init_waitqueue_head(&bh
->b_wait
);
1076 nr_unused_buffer_heads
++;
1077 bh
->b_next_free
= unused_list
;
1078 bh
->b_this_page
= NULL
;
1084 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1085 * no-buffer-head deadlock. Return NULL on failure; waiting for
1086 * buffer heads is now handled in create_buffers().
1088 static struct buffer_head
* get_unused_buffer_head(int async
)
1090 struct buffer_head
* bh
;
1092 spin_lock(&unused_list_lock
);
1093 if (nr_unused_buffer_heads
> NR_RESERVED
) {
1095 unused_list
= bh
->b_next_free
;
1096 nr_unused_buffer_heads
--;
1097 spin_unlock(&unused_list_lock
);
1100 spin_unlock(&unused_list_lock
);
1102 /* This is critical. We can't swap out pages to get
1103 * more buffer heads, because the swap-out may need
1104 * more buffer-heads itself. Thus SLAB_BUFFER.
1106 if((bh
= kmem_cache_alloc(bh_cachep
, SLAB_BUFFER
)) != NULL
) {
1107 memset(bh
, 0, sizeof(*bh
));
1108 init_waitqueue_head(&bh
->b_wait
);
1113 * If we need an async buffer, use the reserved buffer heads.
1116 spin_lock(&unused_list_lock
);
1119 unused_list
= bh
->b_next_free
;
1120 nr_unused_buffer_heads
--;
1121 spin_unlock(&unused_list_lock
);
1124 spin_unlock(&unused_list_lock
);
1128 * (Pending further analysis ...)
1129 * Ordinary (non-async) requests can use a different memory priority
1130 	 * to free up pages. Any swapping thus generated will use async
1131 	 * buffer heads.
1134 (bh
= kmem_cache_alloc(bh_cachep
, SLAB_KERNEL
)) != NULL
) {
1135 memset(bh
, 0, sizeof(*bh
));
1136 init_waitqueue_head(&bh
->b_wait
);
1144 void set_bh_page (struct buffer_head
*bh
, struct page
*page
, unsigned long offset
)
1147 if (offset
>= PAGE_SIZE
)
1149 if (PageHighMem(page
))
1151 * This catches illegal uses and preserves the offset:
1153 bh
->b_data
= (char *)(0 + offset
);
1155 bh
->b_data
= (char *)(page_address(page
) + offset
);
1159 * Create the appropriate buffers when given a page for data area and
1160 * the size of each buffer.. Use the bh->b_this_page linked list to
1161  * follow the buffers created.  Return NULL if unable to create more
1162  * buffers.
1163 * The async flag is used to differentiate async IO (paging, swapping)
1164 * from ordinary buffer allocations, and only async requests are allowed
1165 * to sleep waiting for buffer heads.
1167 static struct buffer_head
* create_buffers(struct page
* page
, unsigned long size
, int async
)
1169 struct buffer_head
*bh
, *head
;
1175 while ((offset
-= size
) >= 0) {
1176 bh
= get_unused_buffer_head(async
);
1180 bh
->b_dev
= B_FREE
; /* Flag as unused */
1181 bh
->b_this_page
= head
;
1185 bh
->b_next_free
= NULL
;
1187 atomic_set(&bh
->b_count
, 0);
1190 set_bh_page(bh
, page
, offset
);
1192 bh
->b_list
= BUF_CLEAN
;
1193 bh
->b_end_io
= end_buffer_io_bad
;
1197 * In case anything failed, we just free everything we got.
1201 spin_lock(&unused_list_lock
);
1204 head
= head
->b_this_page
;
1205 __put_unused_buffer_head(bh
);
1207 spin_unlock(&unused_list_lock
);
1209 /* Wake up any waiters ... */
1210 wake_up(&buffer_wait
);
1214 * Return failure for non-async IO requests. Async IO requests
1215 * are not allowed to fail, so we have to wait until buffer heads
1216 * become available. But we don't want tasks sleeping with
1217 * partially complete buffers, so all were released above.
1222 /* We're _really_ low on memory. Now we just
1223 * wait for old buffer heads to become free due to
1224 * finishing IO. Since this is an async request and
1225 * the reserve list is empty, we're sure there are
1226 * async buffer heads in use.
1228 run_task_queue(&tq_disk
);
1231 * Set our state for sleeping, then check again for buffer heads.
1232 * This ensures we won't miss a wake_up from an interrupt.
1234 wait_event(buffer_wait
, nr_unused_buffer_heads
>= MAX_BUF_PER_PAGE
);
1238 static int create_page_buffers(int rw
, struct page
*page
, kdev_t dev
, int b
[], int size
)
1240 struct buffer_head
*head
, *bh
, *tail
;
1243 if (!PageLocked(page
))
1246 * Allocate async buffer heads pointing to this page, just for I/O.
1247 * They don't show up in the buffer hash table, but they *are*
1248 * registered in page->buffers.
1250 head
= create_buffers(page
, size
, 1);
1256 for (bh
= head
; bh
; bh
= bh
->b_this_page
) {
1260 init_buffer(bh
, end_buffer_io_async
, NULL
);
1262 bh
->b_blocknr
= block
;
1264 set_bit(BH_Mapped
, &bh
->b_state
);
1266 tail
->b_this_page
= head
;
1267 page_cache_get(page
);
1268 page
->buffers
= head
;
1272 static void unmap_buffer(struct buffer_head
* bh
)
1274 if (buffer_mapped(bh
)) {
1275 mark_buffer_clean(bh
);
1277 clear_bit(BH_Uptodate
, &bh
->b_state
);
1278 clear_bit(BH_Mapped
, &bh
->b_state
);
1279 clear_bit(BH_Req
, &bh
->b_state
);
1280 clear_bit(BH_New
, &bh
->b_state
);
1285 * We don't have to release all buffers here, but
1286 * we have to be sure that no dirty buffer is left
1287 * and no IO is going on (no buffer is locked), because
1288  * we have truncated the file and are going to free the
1289  * blocks on-disk..
1290  */
1291 int block_flushpage(struct page
*page
, unsigned long offset
)
1293 struct buffer_head
*head
, *bh
, *next
;
1294 unsigned int curr_off
= 0;
1296 if (!PageLocked(page
))
1301 head
= page
->buffers
;
1304 unsigned int next_off
= curr_off
+ bh
->b_size
;
1305 next
= bh
->b_this_page
;
1308 * is this block fully flushed?
1310 if (offset
<= curr_off
)
1312 curr_off
= next_off
;
1314 } while (bh
!= head
);
1317 	 * subtle. We release buffer-heads only if this is
1318 	 * the 'final' flushpage. We have invalidated the get_block
1319 	 * cached value unconditionally, so real IO is not
1320 	 * possible anymore.
1321 	 *
1322 	 * If the free doesn't work out, the buffers can be
1323 	 * left around - they just turn into anonymous buffers
1324 	 * instead.
1325 	 */
1327 if (!try_to_free_buffers(page
)) {
1328 atomic_inc(&buffermem_pages
);
1336 static void create_empty_buffers(struct page
*page
, struct inode
*inode
, unsigned long blocksize
)
1338 struct buffer_head
*bh
, *head
, *tail
;
1340 head
= create_buffers(page
, blocksize
, 1);
1346 bh
->b_dev
= inode
->i_dev
;
1348 bh
->b_end_io
= end_buffer_io_bad
;
1350 bh
= bh
->b_this_page
;
1352 tail
->b_this_page
= head
;
1353 page
->buffers
= head
;
1354 page_cache_get(page
);
1357 static void unmap_underlying_metadata(struct buffer_head
* bh
)
1359 struct buffer_head
*old_bh
;
1361 old_bh
= get_hash_table(bh
->b_dev
, bh
->b_blocknr
, bh
->b_size
);
1363 unmap_buffer(old_bh
);
1364 /* Here we could run brelse or bforget. We use
1365 bforget because it will try to put the buffer
1372 * block_write_full_page() is SMP-safe - currently it's still
1373 * being called with the kernel lock held, but the code is ready.
1375 static int __block_write_full_page(struct inode
*inode
, struct page
*page
, get_block_t
*get_block
)
1377 int err
, i
, need_balance_dirty
= 0;
1378 unsigned long block
;
1379 struct buffer_head
*bh
, *head
;
1381 if (!PageLocked(page
))
1385 create_empty_buffers(page
, inode
, inode
->i_sb
->s_blocksize
);
1386 head
= page
->buffers
;
1388 block
= page
->index
<< (PAGE_CACHE_SHIFT
- inode
->i_sb
->s_blocksize_bits
);
1394 * If the buffer isn't up-to-date, we can't be sure
1395 * that the buffer has been initialized with the proper
1396 * block number information etc..
1398 * Leave it to the low-level FS to make all those
1399 * decisions (block #0 may actually be a valid block)
1401 bh
->b_end_io
= end_buffer_io_sync
;
1402 if (!buffer_mapped(bh
)) {
1403 err
= get_block(inode
, block
, bh
, 1);
1407 unmap_underlying_metadata(bh
);
1409 set_bit(BH_Uptodate
, &bh
->b_state
);
1410 if (!atomic_set_buffer_dirty(bh
)) {
1411 __mark_dirty(bh
, 0);
1412 need_balance_dirty
= 1;
1415 bh
= bh
->b_this_page
;
1417 } while (bh
!= head
);
1419 if (need_balance_dirty
)
1420 balance_dirty(bh
->b_dev
);
1422 SetPageUptodate(page
);
1425 ClearPageUptodate(page
);
1429 static int __block_prepare_write(struct inode
*inode
, struct page
*page
,
1430 unsigned from
, unsigned to
, get_block_t
*get_block
)
1432 unsigned block_start
, block_end
;
1433 unsigned long block
;
1435 unsigned blocksize
, bbits
;
1436 struct buffer_head
*bh
, *head
, *wait
[2], **wait_bh
=wait
;
1437 char *kaddr
= (char *)kmap(page
);
1439 blocksize
= inode
->i_sb
->s_blocksize
;
1441 create_empty_buffers(page
, inode
, blocksize
);
1442 head
= page
->buffers
;
1444 bbits
= inode
->i_sb
->s_blocksize_bits
;
1445 block
= page
->index
<< (PAGE_CACHE_SHIFT
- bbits
);
1447 for(bh
= head
, block_start
= 0; bh
!= head
|| !block_start
;
1448 block
++, block_start
=block_end
, bh
= bh
->b_this_page
) {
1451 block_end
= block_start
+blocksize
;
1452 if (block_end
<= from
)
1454 if (block_start
>= to
)
1456 bh
->b_end_io
= end_buffer_io_sync
;
1457 if (!buffer_mapped(bh
)) {
1458 err
= get_block(inode
, block
, bh
, 1);
1461 if (buffer_new(bh
)) {
1462 unmap_underlying_metadata(bh
);
1464 memset(kaddr
+to
, 0, block_end
-to
);
1465 if (block_start
< from
)
1466 memset(kaddr
+block_start
, 0, from
-block_start
);
1470 if (!buffer_uptodate(bh
) &&
1471 (block_start
< from
|| block_end
> to
)) {
1472 ll_rw_block(READ
, 1, &bh
);
1477 * If we issued read requests - let them complete.
1479 while(wait_bh
> wait
) {
1480 wait_on_buffer(*--wait_bh
);
1482 if (!buffer_uptodate(*wait_bh
))
1490 static int __block_commit_write(struct inode
*inode
, struct page
*page
,
1491 unsigned from
, unsigned to
)
1493 unsigned block_start
, block_end
;
1494 int partial
= 0, need_balance_dirty
= 0;
1496 struct buffer_head
*bh
, *head
;
1498 blocksize
= inode
->i_sb
->s_blocksize
;
1500 for(bh
= head
= page
->buffers
, block_start
= 0;
1501 bh
!= head
|| !block_start
;
1502 block_start
=block_end
, bh
= bh
->b_this_page
) {
1503 block_end
= block_start
+ blocksize
;
1504 if (block_end
<= from
|| block_start
>= to
) {
1505 if (!buffer_uptodate(bh
))
1508 set_bit(BH_Uptodate
, &bh
->b_state
);
1509 if (!atomic_set_buffer_dirty(bh
)) {
1510 __mark_dirty(bh
, 0);
1511 need_balance_dirty
= 1;
1516 if (need_balance_dirty
)
1517 balance_dirty(bh
->b_dev
);
1519 	 * If this is a partial write that happened to make all buffers
1520 	 * uptodate then we can optimize away a bogus readpage() for
1521 	 * the next read(). Here we 'discover' whether the page went
1522 	 * uptodate as a result of this (potentially partial) write.
1525 SetPageUptodate(page
);
1530 * Generic "read page" function for block devices that have the normal
1531 * get_block functionality. This is most of the block device filesystems.
1532 * Reads the page asynchronously --- the unlock_buffer() and
1533 * mark_buffer_uptodate() functions propagate buffer state into the
1534 * page struct once IO has completed.
1536 int block_read_full_page(struct page
*page
, get_block_t
*get_block
)
1538 struct inode
*inode
= (struct inode
*)page
->mapping
->host
;
1539 unsigned long iblock
, lblock
;
1540 struct buffer_head
*bh
, *head
, *arr
[MAX_BUF_PER_PAGE
];
1541 unsigned int blocksize
, blocks
;
1542 unsigned long kaddr
= 0;
1545 if (!PageLocked(page
))
1547 blocksize
= inode
->i_sb
->s_blocksize
;
1549 create_empty_buffers(page
, inode
, blocksize
);
1550 head
= page
->buffers
;
1552 blocks
= PAGE_CACHE_SIZE
>> inode
->i_sb
->s_blocksize_bits
;
1553 iblock
= page
->index
<< (PAGE_CACHE_SHIFT
- inode
->i_sb
->s_blocksize_bits
);
1554 lblock
= (inode
->i_size
+blocksize
-1) >> inode
->i_sb
->s_blocksize_bits
;
1560 if (buffer_uptodate(bh
))
1563 if (!buffer_mapped(bh
)) {
1564 if (iblock
< lblock
)
1565 get_block(inode
, iblock
, bh
, 0);
1566 if (!buffer_mapped(bh
)) {
1569 memset((char *)(kaddr
+ i
*blocksize
), 0, blocksize
);
1570 set_bit(BH_Uptodate
, &bh
->b_state
);
1575 init_buffer(bh
, end_buffer_io_async
, NULL
);
1576 atomic_inc(&bh
->b_count
);
1579 } while (i
++, iblock
++, (bh
= bh
->b_this_page
) != head
);
1582 if (Page_Uptodate(page
))
1584 ll_rw_block(READ
, nr
, arr
);
1587 * all buffers are uptodate - we can set the page
1590 SetPageUptodate(page
);
1599  * For moronic filesystems that do not allow holes in files.
1600 * We may have to extend the file.
1603 int cont_prepare_write(struct page
*page
, unsigned offset
, unsigned to
, get_block_t
*get_block
, unsigned long *bytes
)
1605 struct address_space
*mapping
= page
->mapping
;
1606 struct inode
*inode
= (struct inode
*)mapping
->host
;
1607 struct page
*new_page
;
1608 unsigned long pgpos
;
1611 unsigned blocksize
= inode
->i_sb
->s_blocksize
;
1614 while(page
->index
> (pgpos
= *bytes
>>PAGE_CACHE_SHIFT
)) {
1616 new_page
= grab_cache_page(mapping
, pgpos
);
1619 /* we might sleep */
1620 if (*bytes
>>PAGE_CACHE_SHIFT
!= pgpos
) {
1621 UnlockPage(new_page
);
1622 page_cache_release(new_page
);
1625 zerofrom
= *bytes
& ~PAGE_CACHE_MASK
;
1626 if (zerofrom
& (blocksize
-1)) {
1627 *bytes
|= (blocksize
-1);
1630 status
= __block_prepare_write(inode
, new_page
, zerofrom
,
1631 PAGE_CACHE_SIZE
, get_block
);
1634 kaddr
= (char*)page_address(page
);
1635 memset(kaddr
+zerofrom
, 0, PAGE_CACHE_SIZE
-zerofrom
);
1636 __block_commit_write(inode
, new_page
, zerofrom
, to
);
1638 UnlockPage(new_page
);
1639 page_cache_release(new_page
);
1642 if (page
->index
< pgpos
) {
1643 /* completely inside the area */
1646 /* page covers the boundary, find the boundary offset */
1647 zerofrom
= *bytes
& ~PAGE_CACHE_MASK
;
1649 /* if we will expand the thing last block will be filled */
1650 if (to
> zerofrom
&& (zerofrom
& (blocksize
-1))) {
1651 *bytes
|= (blocksize
-1);
1655 /* starting below the boundary? Nothing to zero out */
1656 if (offset
<= zerofrom
)
1659 status
= __block_prepare_write(inode
, page
, zerofrom
, to
, get_block
);
1662 kaddr
= (char*)page_address(page
);
1663 if (zerofrom
< offset
) {
1664 memset(kaddr
+zerofrom
, 0, offset
-zerofrom
);
1665 __block_commit_write(inode
, page
, zerofrom
, offset
);
1669 ClearPageUptodate(page
);
1674 ClearPageUptodate(new_page
);
1676 UnlockPage(new_page
);
1677 page_cache_release(new_page
);
1682 int block_prepare_write(struct page
*page
, unsigned from
, unsigned to
,
1683 get_block_t
*get_block
)
1685 struct inode
*inode
= (struct inode
*)page
->mapping
->host
;
1686 int err
= __block_prepare_write(inode
, page
, from
, to
, get_block
);
1688 ClearPageUptodate(page
);
1694 int generic_commit_write(struct file
*file
, struct page
*page
,
1695 unsigned from
, unsigned to
)
1697 struct inode
*inode
= (struct inode
*)page
->mapping
->host
;
1698 loff_t pos
= ((loff_t
)page
->index
<< PAGE_CACHE_SHIFT
) + to
;
1699 __block_commit_write(inode
,page
,from
,to
);
1701 if (pos
> inode
->i_size
)
1702 inode
->i_size
= pos
;
1706 int block_write_full_page(struct page
*page
, get_block_t
*get_block
)
1708 struct inode
*inode
= (struct inode
*)page
->mapping
->host
;
1709 unsigned long end_index
= inode
->i_size
>> PAGE_CACHE_SHIFT
;
1714 if (page
->index
< end_index
)
1715 return __block_write_full_page(inode
, page
, get_block
);
1717 /* things got complicated... */
1718 offset
= inode
->i_size
& (PAGE_CACHE_SIZE
-1);
1719 /* OK, are we completely out? */
1720 if (page
->index
>= end_index
+1 || !offset
)
1722 /* Sigh... will have to work, then... */
1723 err
= __block_prepare_write(inode
, page
, 0, offset
, get_block
);
1725 memset((char *)page_address(page
)+offset
, 0, PAGE_CACHE_SIZE
-offset
);
1726 __block_commit_write(inode
,page
,0,offset
);
1731 ClearPageUptodate(page
);
1735 int generic_block_bmap(struct address_space
*mapping
, long block
, get_block_t
*get_block
)
1737 struct buffer_head tmp
;
1738 struct inode
*inode
= (struct inode
*)mapping
->host
;
1741 get_block(inode
, block
, &tmp
, 0);
1742 return tmp
.b_blocknr
;
1746 * IO completion routine for a buffer_head being used for kiobuf IO: we
1747 * can't dispatch the kiobuf callback until io_count reaches 0.
1750 static void end_buffer_io_kiobuf(struct buffer_head
*bh
, int uptodate
)
1752 struct kiobuf
*kiobuf
;
1754 mark_buffer_uptodate(bh
, uptodate
);
1756 kiobuf
= bh
->b_kiobuf
;
1758 end_kio_request(kiobuf
, uptodate
);
1763 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
1764 * for them to complete. Clean up the buffer_heads afterwards.
1767 static int do_kio(int rw
, int nr
, struct buffer_head
*bh
[], int size
)
1771 struct buffer_head
*tmp
;
1773 struct task_struct
*tsk
= current
;
1774 DECLARE_WAITQUEUE(wait
, tsk
);
1778 ll_rw_block(rw
, nr
, bh
);
1781 spin_lock(&unused_list_lock
);
1783 for (i
= nr
; --i
>= 0; ) {
1786 if (buffer_locked(tmp
)) {
1787 spin_unlock(&unused_list_lock
);
1788 wait_on_buffer(tmp
);
1789 spin_lock(&unused_list_lock
);
1792 if (!buffer_uptodate(tmp
)) {
1793 /* We are traversing bh'es in reverse order so
1794 clearing iosize on error calculates the
1795 amount of IO before the first error. */
1798 __put_unused_buffer_head(tmp
);
1801 spin_unlock(&unused_list_lock
);
1807 * Start I/O on a physical range of kernel memory, defined by a vector
1808 * of kiobuf structs (much like a user-space iovec list).
1810 * The kiobuf must already be locked for IO. IO is submitted
1811 * asynchronously: you need to check page->locked, page->uptodate, and
1812 * maybe wait on page->wait.
1814 * It is up to the caller to make sure that there are enough blocks
1815 * passed in to completely map the iobufs to disk.
1818 int brw_kiovec(int rw
, int nr
, struct kiobuf
*iovec
[],
1819 kdev_t dev
, unsigned long b
[], int size
)
1829 unsigned long blocknr
;
1830 struct kiobuf
* iobuf
= NULL
;
1832 struct buffer_head
*tmp
, *bh
[KIO_MAX_SECTORS
];
1838 * First, do some alignment and validity checks
1840 for (i
= 0; i
< nr
; i
++) {
1842 if ((iobuf
->offset
& (size
-1)) ||
1843 (iobuf
->length
& (size
-1)))
1845 if (!iobuf
->nr_pages
)
1846 panic("brw_kiovec: iobuf not initialised");
1850 * OK to walk down the iovec doing page IO on each page we find.
1852 bufind
= bhind
= transferred
= err
= 0;
1853 for (i
= 0; i
< nr
; i
++) {
1855 offset
= iobuf
->offset
;
1856 length
= iobuf
->length
;
1859 for (pageind
= 0; pageind
< iobuf
->nr_pages
; pageind
++) {
1860 map
= iobuf
->maplist
[pageind
];
1866 while (length
> 0) {
1867 blocknr
= b
[bufind
++];
1868 tmp
= get_unused_buffer_head(0);
1874 tmp
->b_dev
= B_FREE
;
1876 set_bh_page(tmp
, map
, offset
);
1877 tmp
->b_this_page
= tmp
;
1879 init_buffer(tmp
, end_buffer_io_kiobuf
, NULL
);
1881 tmp
->b_blocknr
= blocknr
;
1882 tmp
->b_state
= 1 << BH_Mapped
;
1883 tmp
->b_kiobuf
= iobuf
;
1886 set_bit(BH_Uptodate
, &tmp
->b_state
);
1887 set_bit(BH_Dirty
, &tmp
->b_state
);
1894 atomic_inc(&iobuf
->io_count
);
1897 * Start the IO if we have got too much
1899 if (bhind
>= KIO_MAX_SECTORS
) {
1900 err
= do_kio(rw
, bhind
, bh
, size
);
1908 if (offset
>= PAGE_SIZE
) {
1912 } /* End of block loop */
1913 } /* End of page loop */
1914 } /* End of iovec loop */
1916 /* Is there any IO still left to submit? */
1918 err
= do_kio(rw
, bhind
, bh
, size
);
1931 /* We got an error allocating the bh'es. Just free the current
1932 buffer_heads and exit. */
1933 spin_lock(&unused_list_lock
);
1934 for (i
= bhind
; --i
>= 0; ) {
1935 __put_unused_buffer_head(bh
[bhind
]);
1937 spin_unlock(&unused_list_lock
);
1942 * Start I/O on a page.
1943 * This function expects the page to be locked and may return
1944 * before I/O is complete. You then have to check page->locked,
1945 * page->uptodate, and maybe wait on page->wait.
1947 * brw_page() is SMP-safe, although it's being called with the
1948 * kernel lock held - but the code is ready.
1950 * FIXME: we need a swapper_inode->get_block function to remove
1951 * some of the bmap kludges and interface ugliness here.
1953 int brw_page(int rw
, struct page
*page
, kdev_t dev
, int b
[], int size
)
1955 struct buffer_head
*head
, *bh
, *arr
[MAX_BUF_PER_PAGE
];
1956 int nr
, fresh
/* temporary debugging flag */, block
;
1958 if (!PageLocked(page
))
1959 panic("brw_page: page not locked for I/O");
1960 // ClearPageError(page);
1962 * We pretty much rely on the page lock for this, because
1963 * create_page_buffers() might sleep.
1966 if (!page
->buffers
) {
1967 create_page_buffers(rw
, page
, dev
, b
, size
);
1973 head
= page
->buffers
;
1979 if (fresh
&& (atomic_read(&bh
->b_count
) != 0))
1984 if (!buffer_uptodate(bh
)) {
1986 atomic_inc(&bh
->b_count
);
1988 } else { /* WRITE */
1989 if (!bh
->b_blocknr
) {
1992 bh
->b_blocknr
= block
;
1997 set_bit(BH_Uptodate
, &bh
->b_state
);
1998 set_bit(BH_Dirty
, &bh
->b_state
);
2000 atomic_inc(&bh
->b_count
);
2002 bh
= bh
->b_this_page
;
2003 } while (bh
!= head
);
2004 if ((rw
== READ
) && nr
) {
2005 if (Page_Uptodate(page
))
2007 ll_rw_block(rw
, nr
, arr
);
2009 if (!nr
&& rw
== READ
) {
2010 SetPageUptodate(page
);
2013 if (nr
&& (rw
== WRITE
))
2014 ll_rw_block(rw
, nr
, arr
);
2019 int block_symlink(struct inode
*inode
, const char *symname
, int len
)
2021 struct address_space
*mapping
= inode
->i_mapping
;
2022 struct page
*page
= grab_cache_page(mapping
, 0);
2028 err
= mapping
->a_ops
->prepare_write(NULL
, page
, 0, len
-1);
2031 kaddr
= (char*)page_address(page
);
2032 memcpy(kaddr
, symname
, len
-1);
2033 mapping
->a_ops
->commit_write(NULL
, page
, 0, len
-1);
2035 * Notice that we are _not_ going to block here - end of page is
2036 * unmapped, so this will only try to map the rest of page, see
2037 * that it is unmapped (typically even will not look into inode -
2038 * ->i_size will be enough for everything) and zero it out.
2039 * OTOH it's obviously correct and should make the page up-to-date.
2041 err
= mapping
->a_ops
->readpage(NULL
, page
);
2043 page_cache_release(page
);
2046 mark_inode_dirty(inode
);
2050 page_cache_release(page
);
2056 * Try to increase the number of buffers available: the size argument
2057 * is used to determine what kind of buffers we want.
2059 static int grow_buffers(int size
)
2062 struct buffer_head
*bh
, *tmp
;
2063 struct buffer_head
* insert_point
;
2066 if ((size
& 511) || (size
> PAGE_SIZE
)) {
2067 printk("VFS: grow_buffers: size = %d\n",size
);
2071 page
= alloc_page(GFP_BUFFER
);
2074 bh
= create_buffers(page
, size
, 0);
2076 goto no_buffer_head
;
2078 isize
= BUFSIZE_INDEX(size
);
2080 spin_lock(&free_list
[isize
].lock
);
2081 insert_point
= free_list
[isize
].list
;
2085 tmp
->b_next_free
= insert_point
->b_next_free
;
2086 tmp
->b_prev_free
= insert_point
;
2087 insert_point
->b_next_free
->b_prev_free
= tmp
;
2088 insert_point
->b_next_free
= tmp
;
2090 tmp
->b_prev_free
= tmp
;
2091 tmp
->b_next_free
= tmp
;
2094 if (tmp
->b_this_page
)
2095 tmp
= tmp
->b_this_page
;
2099 tmp
->b_this_page
= bh
;
2100 free_list
[isize
].list
= bh
;
2101 spin_unlock(&free_list
[isize
].lock
);
2104 lru_cache_add(page
);
2105 atomic_inc(&buffermem_pages
);
2109 page_cache_release(page
);
2115 * Sync all the buffers on one page..
2117 * If we have old buffers that are locked, we'll
2118 * wait on them, but we won't wait on the new ones
2119 * we're writing out now.
2121 * This all is required so that we can free up memory
2124 static void sync_page_buffers(struct buffer_head
*bh
)
2126 struct buffer_head
* tmp
;
2130 struct buffer_head
*p
= tmp
;
2131 tmp
= tmp
->b_this_page
;
2132 if (buffer_dirty(p
) && !buffer_locked(p
))
2133 ll_rw_block(WRITE
, 1, &p
);
2134 } while (tmp
!= bh
);
2138 * Can the buffer be thrown out?
2140 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
2141 #define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
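/*
 * Note (added by the editor, not in the original source): buffer_busy()
 * deliberately uses a bitwise OR - the result is non-zero (i.e. "busy")
 * if either the reference count is non-zero or any of the
 * BUFFER_BUSY_BITS state bits is set, so no short-circuit is needed.
 */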
2144 * try_to_free_buffers() checks if all the buffers on this particular page
2145  * are unused, and frees the page if so.
2147 * Wake up bdflush() if this fails - if we're running low on memory due
2148 * to dirty buffers, we need to flush them out as quickly as possible.
2150 * NOTE: There are quite a number of ways that threads of control can
2151 * obtain a reference to a buffer head within a page. So we must
2152 * lock out all of these paths to cleanly toss the page.
2154 int try_to_free_buffers(struct page
* page
)
2156 struct buffer_head
* tmp
, * bh
= page
->buffers
;
2157 int index
= BUFSIZE_INDEX(bh
->b_size
);
2159 spin_lock(&lru_list_lock
);
2160 write_lock(&hash_table_lock
);
2161 spin_lock(&free_list
[index
].lock
);
2164 struct buffer_head
*p
= tmp
;
2166 tmp
= tmp
->b_this_page
;
2168 goto busy_buffer_page
;
2169 } while (tmp
!= bh
);
2171 spin_lock(&unused_list_lock
);
2174 struct buffer_head
* p
= tmp
;
2175 tmp
= tmp
->b_this_page
;
2177 /* The buffer can be either on the regular
2178 * queues or on the free list..
2180 if (p
->b_dev
!= B_FREE
)
2181 __remove_from_queues(p
);
2183 __remove_from_free_list(p
, index
);
2184 __put_unused_buffer_head(p
);
2185 } while (tmp
!= bh
);
2186 spin_unlock(&unused_list_lock
);
2188 /* Wake up anyone waiting for buffer heads */
2189 wake_up(&buffer_wait
);
2191 /* And free the page */
2192 page
->buffers
= NULL
;
2193 page_cache_release(page
);
2194 spin_unlock(&free_list
[index
].lock
);
2195 write_unlock(&hash_table_lock
);
2196 spin_unlock(&lru_list_lock
);
2200 /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2201 spin_unlock(&free_list
[index
].lock
);
2202 write_unlock(&hash_table_lock
);
2203 spin_unlock(&lru_list_lock
);
2204 sync_page_buffers(bh
);
2208 /* ================== Debugging =================== */
2210 void show_buffers(void)
2213 struct buffer_head
* bh
;
2214 int found
= 0, locked
= 0, dirty
= 0, used
= 0, lastused
= 0;
2217 static char *buf_types
[NR_LIST
] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
2220 printk("Buffer memory: %6dkB\n",
2221 atomic_read(&buffermem_pages
) << (PAGE_SHIFT
-10));
2223 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2224 if (!spin_trylock(&lru_list_lock
))
2226 for(nlist
= 0; nlist
< NR_LIST
; nlist
++) {
2227 found
= locked
= dirty
= used
= lastused
= protected = 0;
2228 bh
= lru_list
[nlist
];
2233 if (buffer_locked(bh
))
2235 if (buffer_protected(bh
))
2237 if (buffer_dirty(bh
))
2239 if (atomic_read(&bh
->b_count
))
2240 used
++, lastused
= found
;
2241 bh
= bh
->b_next_free
;
2242 } while (bh
!= lru_list
[nlist
]);
2244 int tmp
= nr_buffers_type
[nlist
];
2246 printk("%9s: BUG -> found %d, reported %d\n",
2247 buf_types
[nlist
], found
, tmp
);
2249 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2250 "%d locked, %d protected, %d dirty\n",
2251 buf_types
[nlist
], found
, size_buffers_type
[nlist
]>>10,
2252 used
, lastused
, locked
, protected, dirty
);
2254 spin_unlock(&lru_list_lock
);
2258 /* ===================== Init ======================= */
2261 * allocate the hash table and init the free list
2262 * Use gfp() for the hash table to decrease TLB misses, use
2263 * SLAB cache for buffer heads.
2265 void __init
buffer_init(unsigned long mempages
)
2268 unsigned int nr_hash
;
2270 	/* The buffer cache hash table is less important these days,
2271 	 * trim it a bit.
2272 	 */
2275 mempages
*= sizeof(struct buffer_head
*);
2277 for (order
= 0; (1 << order
) < mempages
; order
++)
2280 /* try to allocate something until we get it or we're asking
2281 for something that is really too small */
2286 nr_hash
= (PAGE_SIZE
<< order
) / sizeof(struct buffer_head
*);
2287 bh_hash_mask
= (nr_hash
- 1);
2291 while((tmp
>>= 1UL) != 0UL)
2294 hash_table
= (struct buffer_head
**)
2295 __get_free_pages(GFP_ATOMIC
, order
);
2296 } while (hash_table
== NULL
&& --order
> 0);
2297 printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2298 nr_hash
, order
, (PAGE_SIZE
<< order
));
2301 panic("Failed to allocate buffer hash table\n");
2303 /* Setup hash chains. */
2304 for(i
= 0; i
< nr_hash
; i
++)
2305 hash_table
[i
] = NULL
;
2307 /* Setup free lists. */
2308 for(i
= 0; i
< NR_SIZES
; i
++) {
2309 free_list
[i
].list
= NULL
;
2310 free_list
[i
].lock
= SPIN_LOCK_UNLOCKED
;
2313 /* Setup lru lists. */
2314 for(i
= 0; i
< NR_LIST
; i
++)
2317 bh_cachep
= kmem_cache_create("buffer_head",
2318 sizeof(struct buffer_head
),
2320 SLAB_HWCACHE_ALIGN
, NULL
, NULL
);
2322 panic("Cannot create buffer head SLAB cache\n");
2326 /* ====================== bdflush support =================== */
2328 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2329 * response to dirty buffers. Once this process is activated, we write back
2330 * a limited number of buffers to the disks and then go back to sleep again.
2332 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done
);
2333 struct task_struct
*bdflush_tsk
= 0;
2335 void wakeup_bdflush(int block
)
2337 DECLARE_WAITQUEUE(wait
, current
);
2339 if (current
== bdflush_tsk
)
2343 wake_up_process(bdflush_tsk
);
2347 	/* kflushd can wake us up before we have a chance to
2348 go to sleep so we must be smart in handling
2349 this wakeup event from kflushd to avoid deadlocking in SMP
2350 (we are not holding any lock anymore in these two paths). */
2351 __set_current_state(TASK_UNINTERRUPTIBLE
);
2352 add_wait_queue(&bdflush_done
, &wait
);
2354 wake_up_process(bdflush_tsk
);
2357 remove_wait_queue(&bdflush_done
, &wait
);
2358 __set_current_state(TASK_RUNNING
);
2361 /* This is the _only_ function that deals with flushing async writes
2363 NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
2364    as all dirty buffers live _only_ in the DIRTY lru list.
2365    As we never browse the LOCKED and CLEAN lru lists they are in fact
2366    completely useless. */
2367 static int flush_dirty_buffers(int check_flushtime
)
2369 struct buffer_head
* bh
, *next
;
2373 spin_lock(&lru_list_lock
);
2374 bh
= lru_list
[BUF_DIRTY
];
2377 for (i
= nr_buffers_type
[BUF_DIRTY
]; i
-- > 0; bh
= next
) {
2378 next
= bh
->b_next_free
;
2380 if (!buffer_dirty(bh
)) {
2381 __refile_buffer(bh
);
2384 if (buffer_locked(bh
))
2387 if (check_flushtime
) {
2388 /* The dirty lru list is chronologically ordered so
2389 if the current bh is not yet timed out,
2390 then also all the following bhs
2391 will be too young. */
2392 if (time_before(jiffies
, bh
->b_flushtime
))
2395 if (++flushed
> bdf_prm
.b_un
.ndirty
)
2399 /* OK, now we are committed to write it out. */
2400 atomic_inc(&bh
->b_count
);
2401 spin_unlock(&lru_list_lock
);
2402 ll_rw_block(WRITE
, 1, &bh
);
2403 atomic_dec(&bh
->b_count
);
2405 if (current
->need_resched
)
2410 spin_unlock(&lru_list_lock
);
2416 * Here we attempt to write back old buffers. We also try to flush inodes
2417 * and supers as well, since this function is essentially "update", and
2418 * otherwise there would be no way of ensuring that these quantities ever
2419 * get written back. Ideally, we would have a timestamp on the inodes
2420 * and superblocks so that we could write back only the old ones as well
2423 static int sync_old_buffers(void)
2430 flush_dirty_buffers(1);
2431 /* must really sync all the active I/O request to disk here */
2432 run_task_queue(&tq_disk
);
2436 int block_sync_page(struct page
*page
)
2438 run_task_queue(&tq_disk
);
2442 /* This is the interface to bdflush. As we get more sophisticated, we can
2443 * pass tuning parameters to this "process", to adjust how it behaves.
2444 * We would want to verify each parameter, however, to make sure that it
2447 asmlinkage
long sys_bdflush(int func
, long data
)
2449 if (!capable(CAP_SYS_ADMIN
))
2453 /* do_exit directly and let kupdate to do its work alone. */
2455 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2456 a syscall that doesn't care about the current mm context. */
2458 struct mm_struct
*user_mm
;
2461 	 * bdflush will spend all of its time in kernel-space,
2462 * without touching user-space, so we can switch it into
2463 * 'lazy TLB mode' to reduce the cost of context-switches
2464 * to and from bdflush.
2466 user_mm
= start_lazy_tlb();
2467 error
= sync_old_buffers();
2468 end_lazy_tlb(user_mm
);
2473 /* Basically func 1 means read param 1, 2 means write param 1, etc */
2475 int i
= (func
-2) >> 1;
2476 if (i
>= 0 && i
< N_PARAM
) {
2477 if ((func
& 1) == 0)
2478 return put_user(bdf_prm
.data
[i
], (int*)data
);
2480 if (data
>= bdflush_min
[i
] && data
<= bdflush_max
[i
]) {
2481 bdf_prm
.data
[i
] = data
;
2488 /* Having func 0 used to launch the actual bdflush and then never
2489 * return (unless explicitly killed). We return zero here to
2490 * remain semi-compatible with present update(8) programs.
2496 * This is the actual bdflush daemon itself. It used to be started from
2497 * the syscall above, but now we launch it ourselves internally with
2498 * kernel_thread(...) directly after the first thread in init/main.c
2500 int bdflush(void * unused
)
2502 struct task_struct
*tsk
= current
;
2505 * We have a bare-bones task_struct, and really should fill
2506 * in a few more things so "top" and /proc/2/{exe,root,cwd}
2507 * display semi-sane things. Not real crucial though...
2512 strcpy(tsk
->comm
, "kflushd");
2515 /* avoid getting signals */
2516 spin_lock_irq(&tsk
->sigmask_lock
);
2518 sigfillset(&tsk
->blocked
);
2519 recalc_sigpending(tsk
);
2520 spin_unlock_irq(&tsk
->sigmask_lock
);
2523 CHECK_EMERGENCY_SYNC
2525 flushed
= flush_dirty_buffers(0);
2527 			/* If wakeup_bdflush wakes us up
2528 			   after our bdflush_done wakeup, then
2529 			   we must make sure not to sleep
2530 			   in schedule_timeout, otherwise
2531 			   wakeup_bdflush may wait for our
2532 			   bdflush_done wakeup that would never arrive
2533 			   (as we would be sleeping) and so it would
2534 			   deadlock in SMP. */
2535 __set_current_state(TASK_INTERRUPTIBLE
);
2536 wake_up(&bdflush_done
);
2538 * If there are still a lot of dirty buffers around,
2539 * skip the sleep and flush some more. Otherwise, we
2540 * go to sleep waiting a wakeup.
2542 if (!flushed
|| balance_dirty_state(NODEV
) < 0)
2544 /* Remember to mark us as running otherwise
2545 the next schedule will block. */
2546 __set_current_state(TASK_RUNNING
);
2551  * This is the kernel update daemon. It used to live in userspace
2552  * but since it needs to run safely we want it to be unkillable by mistake.
2553  * You don't need to change your userspace configuration since
2554  * the userspace `update` will do_exit(0) at the first sys_bdflush().
2556 int kupdate(void * unused
)
2558 struct task_struct
* tsk
= current
;
2563 strcpy(tsk
->comm
, "kupdate");
2565 /* sigstop and sigcont will stop and wakeup kupdate */
2566 spin_lock_irq(&tsk
->sigmask_lock
);
2567 sigfillset(&tsk
->blocked
);
2568 	siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
2569 recalc_sigpending(tsk
);
2570 spin_unlock_irq(&tsk
->sigmask_lock
);
2573 /* update interval */
2574 interval
= bdf_prm
.b_un
.interval
;
2576 tsk
->state
= TASK_INTERRUPTIBLE
;
2577 schedule_timeout(interval
);
2580 tsk
->state
= TASK_STOPPED
;
2581 schedule(); /* wait for SIGCONT */
2583 /* check for sigstop */
2584 if (signal_pending(tsk
)) {
2586 spin_lock_irq(&tsk
->sigmask_lock
);
2587 if (sigismember(&tsk
->signal
, SIGSTOP
)) {
2588 sigdelset(&tsk
->signal
, SIGSTOP
);
2591 recalc_sigpending(tsk
);
2592 spin_unlock_irq(&tsk
->sigmask_lock
);
2597 printk("kupdate() activated...\n");
2603 static int __init
bdflush_init(void)
2605 kernel_thread(bdflush
, NULL
, CLONE_FS
| CLONE_FILES
| CLONE_SIGHAND
);
2606 kernel_thread(kupdate
, NULL
, CLONE_FS
| CLONE_FILES
| CLONE_SIGHAND
);
2610 module_init(bdflush_init
)