fs/buffer.c

   1 /*
   2  *  linux/fs/buffer.c
   3  *
   4  *  Copyright (C) 1991, 1992  Linus Torvalds
   5  */
   6
   7 /*
   8  *  'buffer.c' implements the buffer-cache functions. Race-conditions have
   9  * been avoided by NEVER letting an interrupt change a buffer (except for the
  10  * data, of course), but instead letting the caller do it.
  11  */
  12
  13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
  14
  15 /* Removed a lot of unnecessary code and simplified things now that
  16  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  17  */
  18
  19 /* Speed up hash, lru, and free list operations.  Use gfp() for allocating
  20  * hash table, use SLAB cache for buffer heads. -DaveM
  21  */
  22
  23 /* Added 32k buffer block sizes - these are required older ARM systems.
  24  * - RMK
  25  */
  26
  27 /* Thread it... -DaveM */
  28
  29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
  30
  31 #include <linux/config.h>
  32 #include <linux/sched.h>
  33 #include <linux/fs.h>
  34 #include <linux/malloc.h>
  35 #include <linux/locks.h>
  36 #include <linux/errno.h>
  37 #include <linux/swap.h>
  38 #include <linux/smp_lock.h>
  39 #include <linux/vmalloc.h>
  40 #include <linux/blkdev.h>
  41 #include <linux/sysrq.h>
  42 #include <linux/file.h>
  43 #include <linux/init.h>
  44 #include <linux/quotaops.h>
  45 #include <linux/iobuf.h>
  46 #include <linux/highmem.h>
  47
  48 #include <asm/uaccess.h>
  49 #include <asm/io.h>
  50 #include <asm/bitops.h>
  51 #include <asm/mmu_context.h>
  52
  53 #define NR_SIZES 7
  54 static char buffersize_index[65] =
  55 {-1,  0,  1, -1,  2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
  56   4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  57   5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  58  -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  59   6};
  60
  61 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
  62 #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
  63 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
  64 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
  65                                              number of unused buffer heads */
  66
  67 /* Anti-deadlock ordering:
  68  *      lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
  69  */
  70
  71 /*
  72  * Hash table gook..
  73  */
  74 static unsigned int bh_hash_mask;
  75 static unsigned int bh_hash_shift;
  76 static struct buffer_head **hash_table;
  77 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
  78
  79 static struct buffer_head *lru_list[NR_LIST];
  80 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
  81 static int nr_buffers_type[NR_LIST];
  82 static unsigned long size_buffers_type[NR_LIST];
  83
  84 static struct buffer_head * unused_list;
  85 static int nr_unused_buffer_heads;
  86 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
  87 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
  88
  89 struct bh_free_head {
  90         struct buffer_head *list;
  91         spinlock_t lock;
  92 };
  93 static struct bh_free_head free_list[NR_SIZES];
  94
  95 static int grow_buffers(int size);
  96 static void __refile_buffer(struct buffer_head *);
  97
  98 /* This is used by some architectures to estimate available memory. */
  99 atomic_t buffermem_pages = ATOMIC_INIT(0);
 100
 101 /* Here is the parameter block for the bdflush process. If you add or
 102  * remove any of the parameters, make sure to update kernel/sysctl.c.
 103  */
 104
 105 #define N_PARAM 9
 106
 107 /* The dummy values in this structure are left in there for compatibility
 108  * with old programs that play with the /proc entries.
 109  */
 110 union bdflush_param {
 111         struct {
 112                 int nfract;  /* Percentage of buffer cache dirty to
 113                                 activate bdflush */
 114                 int ndirty;  /* Maximum number of dirty blocks to write out per
 115                                 wake-cycle */
 116                 int nrefill; /* Number of clean buffers to try to obtain
 117                                 each time we call refill */
 118                 int nref_dirt; /* Dirty buffer threshold for activating bdflush
 119                                   when trying to refill buffers. */
 120                 int interval; /* jiffies delay between kupdate flushes */
 121                 int age_buffer;  /* Time for normal buffer to age before we flush it */
 122                 int age_super;  /* Time for superblock to age before we flush it */
 123                 int dummy2;    /* unused */
 124                 int dummy3;    /* unused */
 125         } b_un;
 126         unsigned int data[N_PARAM];
 127 } bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}};
 128
 129 /* These are the min and max parameter values that we will allow to be assigned */
 130 int bdflush_min[N_PARAM] = {  0,  10,    5,   25,  0,   1*HZ,   1*HZ, 1, 1};
 131 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 6000*HZ, 2047, 5};
 132
 133 /*
 134  * Rewrote the wait-routines to use the "new" wait-queue functionality,
 135  * and getting rid of the cli-sti pairs. The wait-queue routines still
 136  * need cli-sti, but now it's just a couple of 386 instructions or so.
 137  *
 138  * Note that the real wait_on_buffer() is an inline function that checks
 139  * if 'b_wait' is set before calling this, so that the queues aren't set
 140  * up unnecessarily.
 141  */
 142 void __wait_on_buffer(struct buffer_head * bh)
 143 {
 144         struct task_struct *tsk = current;
 145         DECLARE_WAITQUEUE(wait, tsk);
 146
 147         atomic_inc(&bh->b_count);
 148         add_wait_queue(&bh->b_wait, &wait);
 149         do {
 150                 run_task_queue(&tq_disk);
 151                 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 152                 if (!buffer_locked(bh))
 153                         break;
 154                 schedule();
 155         } while (buffer_locked(bh));
 156         tsk->state = TASK_RUNNING;
 157         remove_wait_queue(&bh->b_wait, &wait);
 158         atomic_dec(&bh->b_count);
 159 }
 160
 161 /* Call sync_buffers with wait!=0 to ensure that the call does not
 162  * return until all buffer writes have completed.  Sync() may return
 163  * before the writes have finished; fsync() may not.
 164  */
 165
 166 /* Godamity-damn.  Some buffers (bitmaps for filesystems)
 167  * spontaneously dirty themselves without ever brelse being called.
 168  * We will ultimately want to put these in a separate list, but for
 169  * now we search all of the lists for dirty buffers.
 170  */
 171 static int sync_buffers(kdev_t dev, int wait)
 172 {
 173         int i, retry, pass = 0, err = 0;
 174         struct buffer_head * bh, *next;
 175
 176         /* One pass for no-wait, three for wait:
 177          * 0) write out all dirty, unlocked buffers;
 178          * 1) write out all dirty buffers, waiting if locked;
 179          * 2) wait for completion by waiting for all buffers to unlock.
 180          */
 181         do {
 182                 retry = 0;
 183
 184                 /* We search all lists as a failsafe mechanism, not because we expect
 185                  * there to be dirty buffers on any of the other lists.
 186                  */
 187 repeat:
 188                 spin_lock(&lru_list_lock);
 189                 bh = lru_list[BUF_DIRTY];
 190                 if (!bh)
 191                         goto repeat2;
 192
 193                 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
 194                         next = bh->b_next_free;
 195
 196                         if (!lru_list[BUF_DIRTY])
 197                                 break;
 198                         if (dev && bh->b_dev != dev)
 199                                 continue;
 200                         if (buffer_locked(bh)) {
 201                                 /* Buffer is locked; skip it unless wait is
 202                                  * requested AND pass > 0.
 203                                  */
 204                                 if (!wait || !pass) {
 205                                         retry = 1;
 206                                         continue;
 207                                 }
 208                                 atomic_inc(&bh->b_count);
 209                                 spin_unlock(&lru_list_lock);
 210                                 wait_on_buffer (bh);
 211                                 atomic_dec(&bh->b_count);
 212                                 goto repeat;
 213                         }
 214
 215                         /* If an unlocked buffer is not uptodate, there has
 216                          * been an IO error. Skip it.
 217                          */
 218                         if (wait && buffer_req(bh) && !buffer_locked(bh) &&
 219                             !buffer_dirty(bh) && !buffer_uptodate(bh)) {
 220                                 err = -EIO;
 221                                 continue;
 222                         }
 223
 224                         /* Don't write clean buffers.  Don't write ANY buffers
 225                          * on the third pass.
 226                          */
 227                         if (!buffer_dirty(bh) || pass >= 2)
 228                                 continue;
 229
 230                         atomic_inc(&bh->b_count);
 231                         spin_unlock(&lru_list_lock);
 232                         ll_rw_block(WRITE, 1, &bh);
 233                         atomic_dec(&bh->b_count);
 234                         retry = 1;
 235                         goto repeat;
 236                 }
 237
 238     repeat2:
 239                 bh = lru_list[BUF_LOCKED];
 240                 if (!bh) {
 241                         spin_unlock(&lru_list_lock);
 242                         break;
 243                 }
 244                 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
 245                         next = bh->b_next_free;
 246
 247                         if (!lru_list[BUF_LOCKED])
 248                                 break;
 249                         if (dev && bh->b_dev != dev)
 250                                 continue;
 251                         if (buffer_locked(bh)) {
 252                                 /* Buffer is locked; skip it unless wait is
 253                                  * requested AND pass > 0.
 254                                  */
 255                                 if (!wait || !pass) {
 256                                         retry = 1;
 257                                         continue;
 258                                 }
 259                                 atomic_inc(&bh->b_count);
 260                                 spin_unlock(&lru_list_lock);
 261                                 wait_on_buffer (bh);
 262                                 spin_lock(&lru_list_lock);
 263                                 atomic_dec(&bh->b_count);
 264                                 goto repeat2;
 265                         }
 266                 }
 267                 spin_unlock(&lru_list_lock);
 268
 269                 /* If we are waiting for the sync to succeed, and if any dirty
 270                  * blocks were written, then repeat; on the second pass, only
 271                  * wait for buffers being written (do not pass to write any
 272                  * more buffers on the second pass).
 273                  */
 274         } while (wait && retry && ++pass<=2);
 275         return err;
 276 }
 277
 278 void sync_dev(kdev_t dev)
 279 {
 280         sync_supers(dev);
 281         sync_inodes(dev);
 282         DQUOT_SYNC(dev);
 283         /* sync all the dirty buffers out to disk only _after_ all the
 284            high level layers finished generated buffer dirty data
 285            (or we'll return with some buffer still dirty on the blockdevice
 286            so breaking the semantics of this call) */
 287         sync_buffers(dev, 0);
 288         /*
 289          * FIXME(eric) we need to sync the physical devices here.
 290          * This is because some (scsi) controllers have huge amounts of
 291          * cache onboard (hundreds of Mb), and we need to instruct
 292          * them to commit all of the dirty memory to disk, and we should
 293          * not return until this has happened.
 294          *
 295          * This would need to get implemented by going through the assorted
 296          * layers so that each block major number can be synced, and this
 297          * would call down into the upper and mid-layer scsi.
 298          */
 299 }
 300
 301 int fsync_dev(kdev_t dev)
 302 {
 303         sync_buffers(dev, 0);
 304
 305         lock_kernel();
 306         sync_supers(dev);
 307         sync_inodes(dev);
 308         DQUOT_SYNC(dev);
 309         unlock_kernel();
 310
 311         return sync_buffers(dev, 1);
 312 }
 313
 314 asmlinkage long sys_sync(void)
 315 {
 316         fsync_dev(0);
 317         return 0;
 318 }
 319
 320 /*
 321  *      filp may be NULL if called via the msync of a vma.
 322  */
 323
 324 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
 325 {
 326         struct inode * inode = dentry->d_inode;
 327         struct super_block * sb;
 328         kdev_t dev;
 329         int ret;
 330
 331         lock_kernel();
 332         /* sync the inode to buffers */
 333         write_inode_now(inode, 0);
 334
 335         /* sync the superblock to buffers */
 336         sb = inode->i_sb;
 337         wait_on_super(sb);
 338         if (sb->s_op && sb->s_op->write_super)
 339                 sb->s_op->write_super(sb);
 340
 341         /* .. finally sync the buffers to disk */
 342         dev = inode->i_dev;
 343         ret = sync_buffers(dev, 1);
 344         unlock_kernel();
 345         return ret;
 346 }
 347
 348 asmlinkage long sys_fsync(unsigned int fd)
 349 {
 350         struct file * file;
 351         struct dentry * dentry;
 352         struct inode * inode;
 353         int err;
 354
 355         err = -EBADF;
 356         file = fget(fd);
 357         if (!file)
 358                 goto out;
 359
 360         dentry = file->f_dentry;
 361         inode = dentry->d_inode;
 362
 363         err = -EINVAL;
 364         if (!file->f_op || !file->f_op->fsync)
 365                 goto out_putf;
 366
 367         /* We need to protect against concurrent writers.. */
 368         down(&inode->i_sem);
 369         err = file->f_op->fsync(file, dentry, 0);
 370         up(&inode->i_sem);
 371
 372 out_putf:
 373         fput(file);
 374 out:
 375         return err;
 376 }
 377
 378 asmlinkage long sys_fdatasync(unsigned int fd)
 379 {
 380         struct file * file;
 381         struct dentry * dentry;
 382         struct inode * inode;
 383         int err;
 384
 385         err = -EBADF;
 386         file = fget(fd);
 387         if (!file)
 388                 goto out;
 389
 390         dentry = file->f_dentry;
 391         inode = dentry->d_inode;
 392
 393         err = -EINVAL;
 394         if (!file->f_op || !file->f_op->fsync)
 395                 goto out_putf;
 396
 397         down(&inode->i_sem);
 398         err = file->f_op->fsync(file, dentry, 1);
 399         up(&inode->i_sem);
 400
 401 out_putf:
 402         fput(file);
 403 out:
 404         return err;
 405 }
 406
 407 /* After several hours of tedious analysis, the following hash
 408  * function won.  Do not mess with it... -DaveM
 409  */
 410 #define _hashfn(dev,block)      \
 411         ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
 412          (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
 413 #define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
 414
 415 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
 416 {
 417         if ((bh->b_next = *head) != NULL)
 418                 bh->b_next->b_pprev = &bh->b_next;
 419         *head = bh;
 420         bh->b_pprev = head;
 421 }
 422
 423 static __inline__ void __hash_unlink(struct buffer_head *bh)
 424 {
 425         if (bh->b_pprev) {
 426                 if (bh->b_next)
 427                         bh->b_next->b_pprev = bh->b_pprev;
 428                 *(bh->b_pprev) = bh->b_next;
 429                 bh->b_pprev = NULL;
 430         }
 431 }
 432
 433 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
 434 {
 435         struct buffer_head **bhp = &lru_list[blist];
 436
 437         if(!*bhp) {
 438                 *bhp = bh;
 439                 bh->b_prev_free = bh;
 440         }
 441         bh->b_next_free = *bhp;
 442         bh->b_prev_free = (*bhp)->b_prev_free;
 443         (*bhp)->b_prev_free->b_next_free = bh;
 444         (*bhp)->b_prev_free = bh;
 445         nr_buffers_type[blist]++;
 446         size_buffers_type[blist] += bh->b_size;
 447 }
 448
 449 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
 450 {
 451         if (bh->b_prev_free || bh->b_next_free) {
 452                 bh->b_prev_free->b_next_free = bh->b_next_free;
 453                 bh->b_next_free->b_prev_free = bh->b_prev_free;
 454                 if (lru_list[blist] == bh)
 455                         lru_list[blist] = bh->b_next_free;
 456                 if (lru_list[blist] == bh)
 457                         lru_list[blist] = NULL;
 458                 bh->b_next_free = bh->b_prev_free = NULL;
 459                 nr_buffers_type[blist]--;
 460                 size_buffers_type[blist] -= bh->b_size;
 461         }
 462 }
 463
 464 static void __remove_from_free_list(struct buffer_head * bh, int index)
 465 {
 466         if(bh->b_next_free == bh)
 467                  free_list[index].list = NULL;
 468         else {
 469                 bh->b_prev_free->b_next_free = bh->b_next_free;
 470                 bh->b_next_free->b_prev_free = bh->b_prev_free;
 471                 if (free_list[index].list == bh)
 472                          free_list[index].list = bh->b_next_free;
 473         }
 474         bh->b_next_free = bh->b_prev_free = NULL;
 475 }
 476
 477 /* must be called with both the hash_table_lock and the lru_list_lock
 478    held */
 479 static void __remove_from_queues(struct buffer_head *bh)
 480 {
 481         __hash_unlink(bh);
 482         __remove_from_lru_list(bh, bh->b_list);
 483 }
 484
 485 static void __insert_into_queues(struct buffer_head *bh)
 486 {
 487         struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
 488
 489         __hash_link(bh, head);
 490         __insert_into_lru_list(bh, bh->b_list);
 491 }
 492
 493 /* This function must only run if there are no other
 494  * references _anywhere_ to this buffer head.
 495  */
 496 static void put_last_free(struct buffer_head * bh)
 497 {
 498         struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
 499         struct buffer_head **bhp = &head->list;
 500
 501         bh->b_state = 0;
 502
 503         spin_lock(&head->lock);
 504         bh->b_dev = B_FREE;
 505         if(!*bhp) {
 506                 *bhp = bh;
 507                 bh->b_prev_free = bh;
 508         }
 509         bh->b_next_free = *bhp;
 510         bh->b_prev_free = (*bhp)->b_prev_free;
 511         (*bhp)->b_prev_free->b_next_free = bh;
 512         (*bhp)->b_prev_free = bh;
 513         spin_unlock(&head->lock);
 514 }
 515
 516 /*
 517  * Why like this, I hear you say... The reason is race-conditions.
 518  * As we don't lock buffers (unless we are reading them, that is),
 519  * something might happen to it while we sleep (ie a read-error
 520  * will force it bad). This shouldn't really happen currently, but
 521  * the code is ready.
 522  */
 523 static inline struct buffer_head * __get_hash_table(kdev_t dev, int block, int size)
 524 {
 525         struct buffer_head *bh = hash(dev, block);
 526
 527         for (; bh; bh = bh->b_next)
 528                 if (bh->b_blocknr == block      &&
 529                     bh->b_size    == size       &&
 530                     bh->b_dev     == dev)
 531                         break;
 532         if (bh)
 533                 atomic_inc(&bh->b_count);
 534
 535         return bh;
 536 }
 537
 538 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
 539 {
 540         struct buffer_head *bh;
 541
 542         read_lock(&hash_table_lock);
 543         bh = __get_hash_table(dev, block, size);
 544         read_unlock(&hash_table_lock);
 545
 546         return bh;
 547 }
 548
 549 unsigned int get_hardblocksize(kdev_t dev)
 550 {
 551         /*
 552          * Get the hard sector size for the given device.  If we don't know
 553          * what it is, return 0.
 554          */
 555         if (hardsect_size[MAJOR(dev)] != NULL) {
 556                 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
 557                 if (blksize != 0)
 558                         return blksize;
 559         }
 560
 561         /*
 562          * We don't know what the hardware sector size for this device is.
 563          * Return 0 indicating that we don't know.
 564          */
 565         return 0;
 566 }
 567
 568 /* If invalidate_buffers() will trash dirty buffers, it means some kind
 569    of fs corruption is going on. Trashing dirty data always imply losing
 570    information that was supposed to be just stored on the physical layer
 571    by the user.
 572
 573    Thus invalidate_buffers in general usage is not allwowed to trash dirty
 574    buffers. For example ioctl(FLSBLKBUF) expects dirty data to be preserved.
 575
 576    NOTE: In the case where the user removed a removable-media-disk even if
 577    there's still dirty data not synced on disk (due a bug in the device driver
 578    or due an error of the user), by not destroying the dirty buffers we could
 579    generate corruption also on the next media inserted, thus a parameter is
 580    necessary to handle this case in the most safe way possible (trying
 581    to not corrupt also the new disk inserted with the data belonging to
 582    the old now corrupted disk). Also for the ramdisk the natural thing
 583    to do in order to release the ramdisk memory is to destroy dirty buffers.
 584
 585    These are two special cases. Normal usage imply the device driver
 586    to issue a sync on the device (without waiting I/O completation) and
 587    then an invalidate_buffers call that doesn't trashes dirty buffers. */
 588 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
 589 {
 590         int i, nlist, slept;
 591         struct buffer_head * bh, * bh_next;
 592
 593  retry:
 594         slept = 0;
 595         spin_lock(&lru_list_lock);
 596         for(nlist = 0; nlist < NR_LIST; nlist++) {
 597                 bh = lru_list[nlist];
 598                 if (!bh)
 599                         continue;
 600                 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
 601                         bh_next = bh->b_next_free;
 602                         if (bh->b_dev != dev)
 603                                 continue;
 604                         if (buffer_locked(bh)) {
 605                                 atomic_inc(&bh->b_count);
 606                                 spin_unlock(&lru_list_lock);
 607                                 wait_on_buffer(bh);
 608                                 slept = 1;
 609                                 spin_lock(&lru_list_lock);
 610                                 atomic_dec(&bh->b_count);
 611                         }
 612
 613                         write_lock(&hash_table_lock);
 614                         if (!atomic_read(&bh->b_count) &&
 615                             (destroy_dirty_buffers || !buffer_dirty(bh))) {
 616                                 __remove_from_queues(bh);
 617                                 put_last_free(bh);
 618                         }
 619                         write_unlock(&hash_table_lock);
 620                         if (slept)
 621                                 goto out;
 622                 }
 623         }
 624 out:
 625         spin_unlock(&lru_list_lock);
 626         if (slept)
 627                 goto retry;
 628 }
 629
 630 void set_blocksize(kdev_t dev, int size)
 631 {
 632         extern int *blksize_size[];
 633         int i, nlist, slept;
 634         struct buffer_head * bh, * bh_next;
 635
 636         if (!blksize_size[MAJOR(dev)])
 637                 return;
 638
 639         /* Size must be a power of two, and between 512 and PAGE_SIZE */
 640         if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
 641                 panic("Invalid blocksize passed to set_blocksize");
 642
 643         if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
 644                 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
 645                 return;
 646         }
 647         if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
 648                 return;
 649         sync_buffers(dev, 2);
 650         blksize_size[MAJOR(dev)][MINOR(dev)] = size;
 651
 652  retry:
 653         slept = 0;
 654         spin_lock(&lru_list_lock);
 655         for(nlist = 0; nlist < NR_LIST; nlist++) {
 656                 bh = lru_list[nlist];
 657                 if (!bh)
 658                         continue;
 659                 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
 660                         bh_next = bh->b_next_free;
 661                         if (bh->b_dev != dev || bh->b_size == size)
 662                                 continue;
 663                         if (buffer_locked(bh)) {
 664                                 atomic_inc(&bh->b_count);
 665                                 spin_unlock(&lru_list_lock);
 666                                 wait_on_buffer(bh);
 667                                 slept = 1;
 668                                 spin_lock(&lru_list_lock);
 669                                 atomic_dec(&bh->b_count);
 670                         }
 671
 672                         write_lock(&hash_table_lock);
 673                         if (!atomic_read(&bh->b_count)) {
 674                                 if (buffer_dirty(bh))
 675                                         printk(KERN_WARNING
 676                                                "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
 677                                                kdevname(dev), bh->b_blocknr, bh->b_size);
 678                                 __remove_from_queues(bh);
 679                                 put_last_free(bh);
 680                         } else {
 681                                 if (atomic_set_buffer_clean(bh))
 682                                         __refile_buffer(bh);
 683                                 clear_bit(BH_Uptodate, &bh->b_state);
 684                                 printk(KERN_WARNING
 685                                        "set_blocksize: "
 686                                        "b_count %d, dev %s, block %lu, from %p\n",
 687                                        atomic_read(&bh->b_count), bdevname(bh->b_dev),
 688                                        bh->b_blocknr, __builtin_return_address(0));
 689                         }
 690                         write_unlock(&hash_table_lock);
 691                         if (slept)
 692                                 goto out;
 693                 }
 694         }
 695  out:
 696         spin_unlock(&lru_list_lock);
 697         if (slept)
 698                 goto retry;
 699 }
 700
 701 /*
 702  * We used to try various strange things. Let's not.
 703  */
 704 static void refill_freelist(int size)
 705 {
 706         if (!grow_buffers(size)) {
 707                 wakeup_bdflush(1);
 708                 current->policy |= SCHED_YIELD;
 709                 schedule();
 710         }
 711 }
 712
 713 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
 714 {
 715         bh->b_list = BUF_CLEAN;
 716         bh->b_end_io = handler;
 717         bh->b_private = private;
 718 }
 719
 720 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 721 {
 722         mark_buffer_uptodate(bh, uptodate);
 723         unlock_buffer(bh);
 724 }
 725
 726 static void end_buffer_io_bad(struct buffer_head *bh, int uptodate)
 727 {
 728         mark_buffer_uptodate(bh, uptodate);
 729         unlock_buffer(bh);
 730         BUG();
 731 }
 732
 733 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
 734 {
 735         static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
 736         unsigned long flags;
 737         struct buffer_head *tmp;
 738         struct page *page;
 739
 740         mark_buffer_uptodate(bh, uptodate);
 741
 742         /* This is a temporary buffer used for page I/O. */
 743         page = bh->b_page;
 744
 745         if (!uptodate)
 746                 SetPageError(page);
 747
 748         /*
 749          * Be _very_ careful from here on. Bad things can happen if
 750          * two buffer heads end IO at almost the same time and both
 751          * decide that the page is now completely done.
 752          *
 753          * Async buffer_heads are here only as labels for IO, and get
 754          * thrown away once the IO for this page is complete.  IO is
 755          * deemed complete once all buffers have been visited
 756          * (b_count==0) and are now unlocked. We must make sure that
 757          * only the _last_ buffer that decrements its count is the one
 758          * that unlock the page..
 759          */
 760         spin_lock_irqsave(&page_uptodate_lock, flags);
 761         unlock_buffer(bh);
 762         atomic_dec(&bh->b_count);
 763         tmp = bh->b_this_page;
 764         while (tmp != bh) {
 765                 if (tmp->b_end_io == end_buffer_io_async && buffer_locked(tmp))
 766                         goto still_busy;
 767                 tmp = tmp->b_this_page;
 768         }
 769
 770         /* OK, the async IO on this page is complete. */
 771         spin_unlock_irqrestore(&page_uptodate_lock, flags);
 772
 773         /*
 774          * if none of the buffers had errors then we can set the
 775          * page uptodate:
 776          */
 777         if (!PageError(page))
 778                 SetPageUptodate(page);
 779
 780         /*
 781          * Run the hooks that have to be done when a page I/O has completed.
 782          */
 783         if (PageTestandClearDecrAfter(page))
 784                 atomic_dec(&nr_async_pages);
 785
 786         UnlockPage(page);
 787
 788         return;
 789
 790 still_busy:
 791         spin_unlock_irqrestore(&page_uptodate_lock, flags);
 792         return;
 793 }
 794
 795 /*
 796  * Ok, this is getblk, and it isn't very clear, again to hinder
 797  * race-conditions. Most of the code is seldom used, (ie repeating),
 798  * so it should be much more efficient than it looks.
 799  *
 800  * The algorithm is changed: hopefully better, and an elusive bug removed.
 801  *
 802  * 14.02.92: changed it to sync dirty buffers a bit: better performance
 803  * when the filesystem starts to get full of dirty blocks (I hope).
 804  */
 805 struct buffer_head * getblk(kdev_t dev, int block, int size)
 806 {
 807         struct buffer_head * bh;
 808         int isize;
 809
 810 repeat:
 811         spin_lock(&lru_list_lock);
 812         write_lock(&hash_table_lock);
 813         bh = __get_hash_table(dev, block, size);
 814         if (bh)
 815                 goto out;
 816
 817         isize = BUFSIZE_INDEX(size);
 818         spin_lock(&free_list[isize].lock);
 819         bh = free_list[isize].list;
 820         if (bh) {
 821                 __remove_from_free_list(bh, isize);
 822                 atomic_set(&bh->b_count, 1);
 823         }
 824         spin_unlock(&free_list[isize].lock);
 825
 826         /*
 827          * OK, FINALLY we know that this buffer is the only one of
 828          * its kind, we hold a reference (b_count>0), it is unlocked,
 829          * and it is clean.
 830          */
 831         if (bh) {
 832                 init_buffer(bh, end_buffer_io_sync, NULL);
 833                 bh->b_dev = dev;
 834                 bh->b_blocknr = block;
 835                 bh->b_state = 1 << BH_Mapped;
 836
 837                 /* Insert the buffer into the regular lists */
 838                 __insert_into_queues(bh);
 839         out:
 840                 write_unlock(&hash_table_lock);
 841                 spin_unlock(&lru_list_lock);
 842                 return bh;
 843         }
 844
 845         /*
 846          * If we block while refilling the free list, somebody may
 847          * create the buffer first ... search the hashes again.
 848          */
 849         write_unlock(&hash_table_lock);
 850         spin_unlock(&lru_list_lock);
 851         refill_freelist(size);
 852         goto repeat;
 853 }
 854
 855 /* -1 -> no need to flush
 856     0 -> async flush
 857     1 -> sync flush (wait for I/O completation) */
 858 static int balance_dirty_state(kdev_t dev)
 859 {
 860         unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
 861
 862         dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
 863         tot = nr_free_buffer_pages();
 864         tot -= size_buffers_type[BUF_PROTECTED] >> PAGE_SHIFT;
 865
 866         dirty *= 200;
 867         soft_dirty_limit = tot * bdf_prm.b_un.nfract;
 868         hard_dirty_limit = soft_dirty_limit * 2;
 869
 870         if (dirty > soft_dirty_limit) {
 871                 if (dirty > hard_dirty_limit)
 872                         return 1;
 873                 return 0;
 874         }
 875         return -1;
 876 }
 877
 878 /*
 879  * if a new dirty buffer is created we need to balance bdflush.
 880  *
 881  * in the future we might want to make bdflush aware of different
 882  * pressures on different devices - thus the (currently unused)
 883  * 'dev' parameter.
 884  */
 885 void balance_dirty(kdev_t dev)
 886 {
 887         int state = balance_dirty_state(dev);
 888
 889         if (state < 0)
 890                 return;
 891         wakeup_bdflush(state);
 892 }
 893
 894 static __inline__ void __mark_dirty(struct buffer_head *bh, int flag)
 895 {
 896         bh->b_flushtime = jiffies + (flag ? bdf_prm.b_un.age_super : bdf_prm.b_un.age_buffer);
 897         refile_buffer(bh);
 898 }
 899
 900 /* atomic version, the user must call balance_dirty() by hand
 901    as soon as it become possible to block */
 902 void __mark_buffer_dirty(struct buffer_head *bh, int flag)
 903 {
 904         if (!atomic_set_buffer_dirty(bh))
 905                 __mark_dirty(bh, flag);
 906 }
 907
 908 void mark_buffer_dirty(struct buffer_head *bh, int flag)
 909 {
 910         __mark_buffer_dirty(bh, flag);
 911         balance_dirty(bh->b_dev);
 912 }
 913
 914 /*
 915  * A buffer may need to be moved from one buffer list to another
 916  * (e.g. in case it is not shared any more). Handle this.
 917  */
 918 static void __refile_buffer(struct buffer_head *bh)
 919 {
 920         int dispose = BUF_CLEAN;
 921         if (buffer_locked(bh))
 922                 dispose = BUF_LOCKED;
 923         if (buffer_dirty(bh))
 924                 dispose = BUF_DIRTY;
 925         if (buffer_protected(bh))
 926                 dispose = BUF_PROTECTED;
 927         if (dispose != bh->b_list) {
 928                 __remove_from_lru_list(bh, bh->b_list);
 929                 bh->b_list = dispose;
 930                 __insert_into_lru_list(bh, dispose);
 931         }
 932 }
 933
 934 void refile_buffer(struct buffer_head *bh)
 935 {
 936         spin_lock(&lru_list_lock);
 937         __refile_buffer(bh);
 938         spin_unlock(&lru_list_lock);
 939 }
 940
 941 /*
 942  * Release a buffer head
 943  */
 944 void __brelse(struct buffer_head * buf)
 945 {
 946         if (atomic_read(&buf->b_count)) {
 947                 atomic_dec(&buf->b_count);
 948                 return;
 949         }
 950         printk("VFS: brelse: Trying to free free buffer\n");
 951 }
 952
 953 /*
 954  * bforget() is like brelse(), except it puts the buffer on the
 955  * free list if it can.. We can NOT free the buffer if:
 956  *  - there are other users of it
 957  *  - it is locked and thus can have active IO
 958  */
 959 void __bforget(struct buffer_head * buf)
 960 {
 961         /* grab the lru lock here to block bdflush. */
 962         spin_lock(&lru_list_lock);
 963         write_lock(&hash_table_lock);
 964         if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf))
 965                 goto in_use;
 966         __hash_unlink(buf);
 967         write_unlock(&hash_table_lock);
 968         __remove_from_lru_list(buf, buf->b_list);
 969         spin_unlock(&lru_list_lock);
 970         put_last_free(buf);
 971         return;
 972
 973  in_use:
 974         write_unlock(&hash_table_lock);
 975         spin_unlock(&lru_list_lock);
 976 }
 977
 978 /*
 979  * bread() reads a specified block and returns the buffer that contains
 980  * it. It returns NULL if the block was unreadable.
 981  */
 982 struct buffer_head * bread(kdev_t dev, int block, int size)
 983 {
 984         struct buffer_head * bh;
 985
 986         bh = getblk(dev, block, size);
 987         if (buffer_uptodate(bh))
 988                 return bh;
 989         ll_rw_block(READ, 1, &bh);
 990         wait_on_buffer(bh);
 991         if (buffer_uptodate(bh))
 992                 return bh;
 993         brelse(bh);
 994         return NULL;
 995 }
 996
 997 /*
 998  * Ok, breada can be used as bread, but additionally to mark other
 999  * blocks for reading as well. End the argument list with a negative
1000  * number.
1001  */
1002
1003 #define NBUF 16
1004
1005 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
1006         unsigned int pos, unsigned int filesize)
1007 {
1008         struct buffer_head * bhlist[NBUF];
1009         unsigned int blocks;
1010         struct buffer_head * bh;
1011         int index;
1012         int i, j;
1013
1014         if (pos >= filesize)
1015                 return NULL;
1016
1017         if (block < 0)
1018                 return NULL;
1019
1020         bh = getblk(dev, block, bufsize);
1021         index = BUFSIZE_INDEX(bh->b_size);
1022
1023         if (buffer_uptodate(bh))
1024                 return(bh);
1025         else ll_rw_block(READ, 1, &bh);
1026
1027         blocks = (filesize - pos) >> (9+index);
1028
1029         if (blocks < (read_ahead[MAJOR(dev)] >> index))
1030                 blocks = read_ahead[MAJOR(dev)] >> index;
1031         if (blocks > NBUF)
1032                 blocks = NBUF;
1033
1034 /*      if (blocks) printk("breada (new) %d blocks\n",blocks); */
1035
1036         bhlist[0] = bh;
1037         j = 1;
1038         for(i=1; i<blocks; i++) {
1039                 bh = getblk(dev,block+i,bufsize);
1040                 if (buffer_uptodate(bh)) {
1041                         brelse(bh);
1042                         break;
1043                 }
1044                 else bhlist[j++] = bh;
1045         }
1046
1047         /* Request the read for these buffers, and then release them. */
1048         if (j>1)
1049                 ll_rw_block(READA, (j-1), bhlist+1);
1050         for(i=1; i<j; i++)
1051                 brelse(bhlist[i]);
1052
1053         /* Wait for this buffer, and then continue on. */
1054         bh = bhlist[0];
1055         wait_on_buffer(bh);
1056         if (buffer_uptodate(bh))
1057                 return bh;
1058         brelse(bh);
1059         return NULL;
1060 }
1061
1062 /*
1063  * Note: the caller should wake up the buffer_wait list if needed.
1064  */
1065 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
1066 {
1067         if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1068                 kmem_cache_free(bh_cachep, bh);
1069         } else {
1070                 bh->b_blocknr = -1;
1071                 init_waitqueue_head(&bh->b_wait);
1072                 nr_unused_buffer_heads++;
1073                 bh->b_next_free = unused_list;
1074                 bh->b_this_page = NULL;
1075                 unused_list = bh;
1076         }
1077 }
1078
1079 /*
1080  * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1081  * no-buffer-head deadlock.  Return NULL on failure; waiting for
1082  * buffer heads is now handled in create_buffers().
1083  */
1084 static struct buffer_head * get_unused_buffer_head(int async)
1085 {
1086         struct buffer_head * bh;
1087
1088         spin_lock(&unused_list_lock);
1089         if (nr_unused_buffer_heads > NR_RESERVED) {
1090                 bh = unused_list;
1091                 unused_list = bh->b_next_free;
1092                 nr_unused_buffer_heads--;
1093                 spin_unlock(&unused_list_lock);
1094                 return bh;
1095         }
1096         spin_unlock(&unused_list_lock);
1097
1098         /* This is critical.  We can't swap out pages to get
1099          * more buffer heads, because the swap-out may need
1100          * more buffer-heads itself.  Thus SLAB_BUFFER.
1101          */
1102         if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
1103                 memset(bh, 0, sizeof(*bh));
1104                 init_waitqueue_head(&bh->b_wait);
1105                 return bh;
1106         }
1107
1108         /*
1109          * If we need an async buffer, use the reserved buffer heads.
1110          */
1111         if (async) {
1112                 spin_lock(&unused_list_lock);
1113                 if (unused_list) {
1114                         bh = unused_list;
1115                         unused_list = bh->b_next_free;
1116                         nr_unused_buffer_heads--;
1117                         spin_unlock(&unused_list_lock);
1118                         return bh;
1119                 }
1120                 spin_unlock(&unused_list_lock);
1121         }
1122 #if 0
1123         /*
1124          * (Pending further analysis ...)
1125          * Ordinary (non-async) requests can use a different memory priority
1126          * to free up pages. Any swapping thus generated will use async
1127          * buffer heads.
1128          */
1129         if(!async &&
1130            (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1131                 memset(bh, 0, sizeof(*bh));
1132                 init_waitqueue_head(&bh->b_wait);
1133                 return bh;
1134         }
1135 #endif
1136
1137         return NULL;
1138 }
1139
1140 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1141 {
1142         bh->b_page = page;
1143         if (offset >= PAGE_SIZE)
1144                 BUG();
1145         if (PageHighMem(page))
1146                 /*
1147                  * This catches illegal uses and preserves the offset:
1148                  */
1149                 bh->b_data = (char *)(0 + offset);
1150         else
1151                 bh->b_data = (char *)(page_address(page) + offset);
1152 }
1153
1154 /*
1155  * Create the appropriate buffers when given a page for data area and
1156  * the size of each buffer.. Use the bh->b_this_page linked list to
1157  * follow the buffers created.  Return NULL if unable to create more
1158  * buffers.
1159  * The async flag is used to differentiate async IO (paging, swapping)
1160  * from ordinary buffer allocations, and only async requests are allowed
1161  * to sleep waiting for buffer heads.
1162  */
1163 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1164 {
1165         struct buffer_head *bh, *head;
1166         long offset;
1167
1168 try_again:
1169         head = NULL;
1170         offset = PAGE_SIZE;
1171         while ((offset -= size) >= 0) {
1172                 bh = get_unused_buffer_head(async);
1173                 if (!bh)
1174                         goto no_grow;
1175
1176                 bh->b_dev = B_FREE;  /* Flag as unused */
1177                 bh->b_this_page = head;
1178                 head = bh;
1179
1180                 bh->b_state = 0;
1181                 bh->b_next_free = NULL;
1182                 bh->b_pprev = NULL;
1183                 atomic_set(&bh->b_count, 0);
1184                 bh->b_size = size;
1185
1186                 set_bh_page(bh, page, offset);
1187
1188                 bh->b_list = BUF_CLEAN;
1189                 bh->b_end_io = end_buffer_io_bad;
1190         }
1191         return head;
1192 /*
1193  * In case anything failed, we just free everything we got.
1194  */
1195 no_grow:
1196         if (head) {
1197                 spin_lock(&unused_list_lock);
1198                 do {
1199                         bh = head;
1200                         head = head->b_this_page;
1201                         __put_unused_buffer_head(bh);
1202                 } while (head);
1203                 spin_unlock(&unused_list_lock);
1204
1205                 /* Wake up any waiters ... */
1206                 wake_up(&buffer_wait);
1207         }
1208
1209         /*
1210          * Return failure for non-async IO requests.  Async IO requests
1211          * are not allowed to fail, so we have to wait until buffer heads
1212          * become available.  But we don't want tasks sleeping with
1213          * partially complete buffers, so all were released above.
1214          */
1215         if (!async)
1216                 return NULL;
1217
1218         /* We're _really_ low on memory. Now we just
1219          * wait for old buffer heads to become free due to
1220          * finishing IO.  Since this is an async request and
1221          * the reserve list is empty, we're sure there are
1222          * async buffer heads in use.
1223          */
1224         run_task_queue(&tq_disk);
1225
1226         /*
1227          * Set our state for sleeping, then check again for buffer heads.
1228          * This ensures we won't miss a wake_up from an interrupt.
1229          */
1230         wait_event(buffer_wait, nr_unused_buffer_heads >= MAX_BUF_PER_PAGE);
1231         goto try_again;
1232 }
1233
1234 static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size)
1235 {
1236         struct buffer_head *head, *bh, *tail;
1237         int block;
1238
1239         if (!PageLocked(page))
1240                 BUG();
1241         /*
1242          * Allocate async buffer heads pointing to this page, just for I/O.
1243          * They don't show up in the buffer hash table, but they *are*
1244          * registered in page->buffers.
1245          */
1246         head = create_buffers(page, size, 1);
1247         if (page->buffers)
1248                 BUG();
1249         if (!head)
1250                 BUG();
1251         tail = head;
1252         for (bh = head; bh; bh = bh->b_this_page) {
1253                 block = *(b++);
1254
1255                 tail = bh;
1256                 init_buffer(bh, end_buffer_io_async, NULL);
1257                 bh->b_dev = dev;
1258                 bh->b_blocknr = block;
1259
1260                 set_bit(BH_Mapped, &bh->b_state);
1261         }
1262         tail->b_this_page = head;
1263         page_cache_get(page);
1264         page->buffers = head;
1265         return 0;
1266 }
1267
1268 static void unmap_buffer(struct buffer_head * bh)
1269 {
1270         if (buffer_mapped(bh)) {
1271                 mark_buffer_clean(bh);
1272                 wait_on_buffer(bh);
1273                 clear_bit(BH_Uptodate, &bh->b_state);
1274                 clear_bit(BH_Mapped, &bh->b_state);
1275                 clear_bit(BH_Req, &bh->b_state);
1276                 clear_bit(BH_New, &bh->b_state);
1277         }
1278 }
1279
1280 /*
1281  * We don't have to release all buffers here, but
1282  * we have to be sure that no dirty buffer is left
1283  * and no IO is going on (no buffer is locked), because
1284  * we have truncated the file and are going to free the
1285  * blocks on-disk..
1286  */
1287 int block_flushpage(struct page *page, unsigned long offset)
1288 {
1289         struct buffer_head *head, *bh, *next;
1290         unsigned int curr_off = 0;
1291
1292         if (!PageLocked(page))
1293                 BUG();
1294         if (!page->buffers)
1295                 return 1;
1296
1297         head = page->buffers;
1298         bh = head;
1299         do {
1300                 unsigned int next_off = curr_off + bh->b_size;
1301                 next = bh->b_this_page;
1302
1303                 /*
1304                  * is this block fully flushed?
1305                  */
1306                 if (offset <= curr_off)
1307                         unmap_buffer(bh);
1308                 curr_off = next_off;
1309                 bh = next;
1310         } while (bh != head);
1311
1312         /*
1313          * subtle. We release buffer-heads only if this is
1314          * the 'final' flushpage. We have invalidated the get_block
1315          * cached value unconditionally, so real IO is not
1316          * possible anymore.
1317          *
1318          * If the free doesn't work out, the buffers can be
1319          * left around - they just turn into anonymous buffers
1320          * instead.
1321          */
1322         if (!offset) {
1323                 if (!try_to_free_buffers(page, 0)) {
1324                         atomic_inc(&buffermem_pages);
1325                         return 0;
1326                 }
1327         }
1328
1329         return 1;
1330 }
1331
1332 static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
1333 {
1334         struct buffer_head *bh, *head, *tail;
1335
1336         head = create_buffers(page, blocksize, 1);
1337         if (page->buffers)
1338                 BUG();
1339
1340         bh = head;
1341         do {
1342                 bh->b_dev = inode->i_dev;
1343                 bh->b_blocknr = 0;
1344                 bh->b_end_io = end_buffer_io_bad;
1345                 tail = bh;
1346                 bh = bh->b_this_page;
1347         } while (bh);
1348         tail->b_this_page = head;
1349         page->buffers = head;
1350         page_cache_get(page);
1351 }
1352
1353 /*
1354  * We are taking a block for data and we don't want any output from any
1355  * buffer-cache aliases starting from return from that function and
1356  * until the moment when something will explicitly mark the buffer
1357  * dirty (hopefully that will not happen until we will free that block ;-)
1358  * We don't even need to mark it not-uptodate - nobody can expect
1359  * anything from a newly allocated buffer anyway. We used to used
1360  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1361  * don't want to mark the alias unmapped, for example - it would confuse
1362  * anyone who might pick it with bread() afterwards...
1363  */
1364
1365 static void unmap_underlying_metadata(struct buffer_head * bh)
1366 {
1367         struct buffer_head *old_bh;
1368
1369         old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1370         if (old_bh) {
1371                 mark_buffer_clean(old_bh);
1372                 wait_on_buffer(old_bh);
1373                 clear_bit(BH_Req, &old_bh->b_state);
1374                 /* Here we could run brelse or bforget. We use
1375                    bforget because it will try to put the buffer
1376                    in the freelist. */
1377                 __bforget(old_bh);
1378         }
1379 }
1380
1381 /*
1382  * block_write_full_page() is SMP-safe - currently it's still
1383  * being called with the kernel lock held, but the code is ready.
1384  */
1385 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1386 {
1387         int err, i, need_balance_dirty = 0;
1388         unsigned long block;
1389         struct buffer_head *bh, *head;
1390
1391         if (!PageLocked(page))
1392                 BUG();
1393
1394         if (!page->buffers)
1395                 create_empty_buffers(page, inode, inode->i_sb->s_blocksize);
1396         head = page->buffers;
1397
1398         block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1399
1400         bh = head;
1401         i = 0;
1402         do {
1403                 /*
1404                  * If the buffer isn't up-to-date, we can't be sure
1405                  * that the buffer has been initialized with the proper
1406                  * block number information etc..
1407                  *
1408                  * Leave it to the low-level FS to make all those
1409                  * decisions (block #0 may actually be a valid block)
1410                  */
1411                 bh->b_end_io = end_buffer_io_sync;
1412                 if (!buffer_mapped(bh)) {
1413                         err = get_block(inode, block, bh, 1);
1414                         if (err)
1415                                 goto out;
1416                         if (buffer_new(bh))
1417                                 unmap_underlying_metadata(bh);
1418                 }
1419                 set_bit(BH_Uptodate, &bh->b_state);
1420                 if (!atomic_set_buffer_dirty(bh)) {
1421                         __mark_dirty(bh, 0);
1422                         need_balance_dirty = 1;
1423                 }
1424
1425                 bh = bh->b_this_page;
1426                 block++;
1427         } while (bh != head);
1428
1429         if (need_balance_dirty)
1430                 balance_dirty(bh->b_dev);
1431
1432         SetPageUptodate(page);
1433         return 0;
1434 out:
1435         ClearPageUptodate(page);
1436         return err;
1437 }
1438
1439 static int __block_prepare_write(struct inode *inode, struct page *page,
1440                 unsigned from, unsigned to, get_block_t *get_block)
1441 {
1442         unsigned block_start, block_end;
1443         unsigned long block;
1444         int err = 0;
1445         unsigned blocksize, bbits;
1446         struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1447         char *kaddr = (char *)kmap(page);
1448
1449         blocksize = inode->i_sb->s_blocksize;
1450         if (!page->buffers)
1451                 create_empty_buffers(page, inode, blocksize);
1452         head = page->buffers;
1453
1454         bbits = inode->i_sb->s_blocksize_bits;
1455         block = page->index << (PAGE_CACHE_SHIFT - bbits);
1456
1457         for(bh = head, block_start = 0; bh != head || !block_start;
1458             block++, block_start=block_end, bh = bh->b_this_page) {
1459                 if (!bh)
1460                         BUG();
1461                 block_end = block_start+blocksize;
1462                 if (block_end <= from)
1463                         continue;
1464                 if (block_start >= to)
1465                         break;
1466                 bh->b_end_io = end_buffer_io_sync;
1467                 if (!buffer_mapped(bh)) {
1468                         err = get_block(inode, block, bh, 1);
1469                         if (err)
1470                                 goto out;
1471                         if (buffer_new(bh)) {
1472                                 unmap_underlying_metadata(bh);
1473                                 if (block_end > to)
1474                                         memset(kaddr+to, 0, block_end-to);
1475                                 if (block_start < from)
1476                                         memset(kaddr+block_start, 0, from-block_start);
1477                                 continue;
1478                         }
1479                 }
1480                 if (!buffer_uptodate(bh) &&
1481                      (block_start < from || block_end > to)) {
1482                         ll_rw_block(READ, 1, &bh);
1483                         *wait_bh++=bh;
1484                 }
1485         }
1486         /*
1487          * If we issued read requests - let them complete.
1488          */
1489         while(wait_bh > wait) {
1490                 wait_on_buffer(*--wait_bh);
1491                 err = -EIO;
1492                 if (!buffer_uptodate(*wait_bh))
1493                         goto out;
1494         }
1495         return 0;
1496 out:
1497         return err;
1498 }
1499
1500 static int __block_commit_write(struct inode *inode, struct page *page,
1501                 unsigned from, unsigned to)
1502 {
1503         unsigned block_start, block_end;
1504         int partial = 0, need_balance_dirty = 0;
1505         unsigned blocksize;
1506         struct buffer_head *bh, *head;
1507
1508         blocksize = inode->i_sb->s_blocksize;
1509
1510         for(bh = head = page->buffers, block_start = 0;
1511             bh != head || !block_start;
1512             block_start=block_end, bh = bh->b_this_page) {
1513                 block_end = block_start + blocksize;
1514                 if (block_end <= from || block_start >= to) {
1515                         if (!buffer_uptodate(bh))
1516                                 partial = 1;
1517                 } else {
1518                         set_bit(BH_Uptodate, &bh->b_state);
1519                         if (!atomic_set_buffer_dirty(bh)) {
1520                                 __mark_dirty(bh, 0);
1521                                 need_balance_dirty = 1;
1522                         }
1523                 }
1524         }
1525
1526         if (need_balance_dirty)
1527                 balance_dirty(bh->b_dev);
1528         /*
1529          * is this a partial write that happened to make all buffers
1530          * uptodate then we can optimize away a bogus readpage() for
1531          * the next read(). Here we 'discover' wether the page went
1532          * uptodate as a result of this (potentially partial) write.
1533          */
1534         if (!partial)
1535                 SetPageUptodate(page);
1536         return 0;
1537 }
1538
1539 /*
1540  * Generic "read page" function for block devices that have the normal
1541  * get_block functionality. This is most of the block device filesystems.
1542  * Reads the page asynchronously --- the unlock_buffer() and
1543  * mark_buffer_uptodate() functions propagate buffer state into the
1544  * page struct once IO has completed.
1545  */
1546 int block_read_full_page(struct page *page, get_block_t *get_block)
1547 {
1548         struct inode *inode = (struct inode*)page->mapping->host;
1549         unsigned long iblock, lblock;
1550         struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1551         unsigned int blocksize, blocks;
1552         unsigned long kaddr = 0;
1553         int nr, i;
1554
1555         if (!PageLocked(page))
1556                 PAGE_BUG(page);
1557         blocksize = inode->i_sb->s_blocksize;
1558         if (!page->buffers)
1559                 create_empty_buffers(page, inode, blocksize);
1560         head = page->buffers;
1561
1562         blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits;
1563         iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1564         lblock = (inode->i_size+blocksize-1) >> inode->i_sb->s_blocksize_bits;
1565         bh = head;
1566         nr = 0;
1567         i = 0;
1568
1569         do {
1570                 if (buffer_uptodate(bh))
1571                         continue;
1572
1573                 if (!buffer_mapped(bh)) {
1574                         if (iblock < lblock)
1575                                 get_block(inode, iblock, bh, 0);
1576                         if (!buffer_mapped(bh)) {
1577                                 if (!kaddr)
1578                                         kaddr = kmap(page);
1579                                 memset((char *)(kaddr + i*blocksize), 0, blocksize);
1580                                 set_bit(BH_Uptodate, &bh->b_state);
1581                                 continue;
1582                         }
1583                 }
1584
1585                 init_buffer(bh, end_buffer_io_async, NULL);
1586                 atomic_inc(&bh->b_count);
1587                 arr[nr] = bh;
1588                 nr++;
1589         } while (i++, iblock++, (bh = bh->b_this_page) != head);
1590
1591         if (nr) {
1592                 if (Page_Uptodate(page))
1593                         BUG();
1594                 ll_rw_block(READ, nr, arr);
1595         } else {
1596                 /*
1597                  * all buffers are uptodate - we can set the page
1598                  * uptodate as well.
1599                  */
1600                 SetPageUptodate(page);
1601                 UnlockPage(page);
1602         }
1603         if (kaddr)
1604                 kunmap(page);
1605         return 0;
1606 }
1607
1608 /*
1609  * For moronic filesystems that do not allow holes in file.
1610  * We may have to extend the file.
1611  */
1612
1613 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1614 {
1615         struct address_space *mapping = page->mapping;
1616         struct inode *inode = (struct inode*)mapping->host;
1617         struct page *new_page;
1618         unsigned long pgpos;
1619         long status;
1620         unsigned zerofrom;
1621         unsigned blocksize = inode->i_sb->s_blocksize;
1622         char *kaddr;
1623
1624         while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1625                 status = -ENOMEM;
1626                 new_page = grab_cache_page(mapping, pgpos);
1627                 if (!new_page)
1628                         goto out;
1629                 /* we might sleep */
1630                 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1631                         UnlockPage(new_page);
1632                         page_cache_release(new_page);
1633                         continue;
1634                 }
1635                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1636                 if (zerofrom & (blocksize-1)) {
1637                         *bytes |= (blocksize-1);
1638                         (*bytes)++;
1639                 }
1640                 status = __block_prepare_write(inode, new_page, zerofrom,
1641                                                 PAGE_CACHE_SIZE, get_block);
1642                 if (status)
1643                         goto out_unmap;
1644                 kaddr = (char*)page_address(new_page);
1645                 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1646                 __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
1647                 kunmap(new_page);
1648                 UnlockPage(new_page);
1649                 page_cache_release(new_page);
1650         }
1651
1652         if (page->index < pgpos) {
1653                 /* completely inside the area */
1654                 zerofrom = offset;
1655         } else {
1656                 /* page covers the boundary, find the boundary offset */
1657                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1658
1659                 /* if we will expand the thing last block will be filled */
1660                 if (to > zerofrom && (zerofrom & (blocksize-1))) {
1661                         *bytes |= (blocksize-1);
1662                         (*bytes)++;
1663                 }
1664
1665                 /* starting below the boundary? Nothing to zero out */
1666                 if (offset <= zerofrom)
1667                         zerofrom = offset;
1668         }
1669         status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1670         if (status)
1671                 goto out1;
1672         kaddr = (char*)page_address(page);
1673         if (zerofrom < offset) {
1674                 memset(kaddr+zerofrom, 0, offset-zerofrom);
1675                 __block_commit_write(inode, page, zerofrom, offset);
1676         }
1677         return 0;
1678 out1:
1679         ClearPageUptodate(page);
1680         kunmap(page);
1681         return status;
1682
1683 out_unmap:
1684         ClearPageUptodate(new_page);
1685         kunmap(new_page);
1686         UnlockPage(new_page);
1687         page_cache_release(new_page);
1688 out:
1689         return status;
1690 }
1691
1692 int block_prepare_write(struct page *page, unsigned from, unsigned to,
1693                         get_block_t *get_block)
1694 {
1695         struct inode *inode = (struct inode*)page->mapping->host;
1696         int err = __block_prepare_write(inode, page, from, to, get_block);
1697         if (err) {
1698                 ClearPageUptodate(page);
1699                 kunmap(page);
1700         }
1701         return err;
1702 }
1703
1704 int generic_commit_write(struct file *file, struct page *page,
1705                 unsigned from, unsigned to)
1706 {
1707         struct inode *inode = (struct inode*)page->mapping->host;
1708         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1709         __block_commit_write(inode,page,from,to);
1710         kunmap(page);
1711         if (pos > inode->i_size)
1712                 inode->i_size = pos;
1713         return 0;
1714 }
1715
1716 int block_write_full_page(struct page *page, get_block_t *get_block)
1717 {
1718         struct inode *inode = (struct inode*)page->mapping->host;
1719         unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1720         unsigned offset;
1721         int err;
1722
1723         /* easy case */
1724         if (page->index < end_index)
1725                 return __block_write_full_page(inode, page, get_block);
1726
1727         /* things got complicated... */
1728         offset = inode->i_size & (PAGE_CACHE_SIZE-1);
1729         /* OK, are we completely out? */
1730         if (page->index >= end_index+1 || !offset)
1731                 return -EIO;
1732         /* Sigh... will have to work, then... */
1733         err = __block_prepare_write(inode, page, 0, offset, get_block);
1734         if (!err) {
1735                 memset((char *)page_address(page)+offset, 0, PAGE_CACHE_SIZE-offset);
1736                 __block_commit_write(inode,page,0,offset);
1737 done:
1738                 kunmap(page);
1739                 return err;
1740         }
1741         ClearPageUptodate(page);
1742         goto done;
1743 }
1744
1745 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
1746 {
1747         struct buffer_head tmp;
1748         struct inode *inode = (struct inode*)mapping->host;
1749         tmp.b_state = 0;
1750         tmp.b_blocknr = 0;
1751         get_block(inode, block, &tmp, 0);
1752         return tmp.b_blocknr;
1753 }
1754
1755 /*
1756  * IO completion routine for a buffer_head being used for kiobuf IO: we
1757  * can't dispatch the kiobuf callback until io_count reaches 0.
1758  */
1759
1760 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
1761 {
1762         struct kiobuf *kiobuf;
1763
1764         mark_buffer_uptodate(bh, uptodate);
1765
1766         kiobuf = bh->b_private;
1767         unlock_buffer(bh);
1768         end_kio_request(kiobuf, uptodate);
1769 }
1770
1771
1772 /*
1773  * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
1774  * for them to complete.  Clean up the buffer_heads afterwards.
1775  */
1776
1777 static int do_kio(int rw, int nr, struct buffer_head *bh[], int size)
1778 {
1779         int iosize;
1780         int i;
1781         struct buffer_head *tmp;
1782
1783         if (rw == WRITE)
1784                 rw = WRITERAW;
1785         ll_rw_block(rw, nr, bh);
1786
1787         iosize = 0;
1788         spin_lock(&unused_list_lock);
1789
1790         for (i = nr; --i >= 0; ) {
1791                 iosize += size;
1792                 tmp = bh[i];
1793                 if (buffer_locked(tmp)) {
1794                         spin_unlock(&unused_list_lock);
1795                         wait_on_buffer(tmp);
1796                         spin_lock(&unused_list_lock);
1797                 }
1798
1799                 if (!buffer_uptodate(tmp)) {
1800                         /* We are traversing bh'es in reverse order so
1801                            clearing iosize on error calculates the
1802                            amount of IO before the first error. */
1803                         iosize = 0;
1804                 }
1805                 __put_unused_buffer_head(tmp);
1806         }
1807
1808         spin_unlock(&unused_list_lock);
1809
1810         return iosize;
1811 }
1812
1813 /*
1814  * Start I/O on a physical range of kernel memory, defined by a vector
1815  * of kiobuf structs (much like a user-space iovec list).
1816  *
1817  * The kiobuf must already be locked for IO.  IO is submitted
1818  * asynchronously: you need to check page->locked, page->uptodate, and
1819  * maybe wait on page->wait.
1820  *
1821  * It is up to the caller to make sure that there are enough blocks
1822  * passed in to completely map the iobufs to disk.
1823  */
1824
1825 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
1826                kdev_t dev, unsigned long b[], int size)
1827 {
1828         int             err;
1829         int             length;
1830         int             transferred;
1831         int             i;
1832         int             bufind;
1833         int             pageind;
1834         int             bhind;
1835         int             offset;
1836         unsigned long   blocknr;
1837         struct kiobuf * iobuf = NULL;
1838         struct page *   map;
1839         struct buffer_head *tmp, *bh[KIO_MAX_SECTORS];
1840
1841         if (!nr)
1842                 return 0;
1843
1844         /*
1845          * First, do some alignment and validity checks
1846          */
1847         for (i = 0; i < nr; i++) {
1848                 iobuf = iovec[i];
1849                 if ((iobuf->offset & (size-1)) ||
1850                     (iobuf->length & (size-1)))
1851                         return -EINVAL;
1852                 if (!iobuf->nr_pages)
1853                         panic("brw_kiovec: iobuf not initialised");
1854         }
1855
1856         /*
1857          * OK to walk down the iovec doing page IO on each page we find.
1858          */
1859         bufind = bhind = transferred = err = 0;
1860         for (i = 0; i < nr; i++) {
1861                 iobuf = iovec[i];
1862                 offset = iobuf->offset;
1863                 length = iobuf->length;
1864                 iobuf->errno = 0;
1865
1866                 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
1867                         map  = iobuf->maplist[pageind];
1868                         if (!map) {
1869                                 err = -EFAULT;
1870                                 goto error;
1871                         }
1872
1873                         while (length > 0) {
1874                                 blocknr = b[bufind++];
1875                                 tmp = get_unused_buffer_head(0);
1876                                 if (!tmp) {
1877                                         err = -ENOMEM;
1878                                         goto error;
1879                                 }
1880
1881                                 tmp->b_dev = B_FREE;
1882                                 tmp->b_size = size;
1883                                 set_bh_page(tmp, map, offset);
1884                                 tmp->b_this_page = tmp;
1885
1886                                 init_buffer(tmp, end_buffer_io_kiobuf, iobuf);
1887                                 tmp->b_dev = dev;
1888                                 tmp->b_blocknr = blocknr;
1889                                 tmp->b_state = 1 << BH_Mapped;
1890
1891                                 if (rw == WRITE) {
1892                                         set_bit(BH_Uptodate, &tmp->b_state);
1893                                         set_bit(BH_Dirty, &tmp->b_state);
1894                                 }
1895
1896                                 bh[bhind++] = tmp;
1897                                 length -= size;
1898                                 offset += size;
1899
1900                                 atomic_inc(&iobuf->io_count);
1901
1902                                 /*
1903                                  * Start the IO if we have got too much
1904                                  */
1905                                 if (bhind >= KIO_MAX_SECTORS) {
1906                                         err = do_kio(rw, bhind, bh, size);
1907                                         if (err >= 0)
1908                                                 transferred += err;
1909                                         else
1910                                                 goto finished;
1911                                         bhind = 0;
1912                                 }
1913
1914                                 if (offset >= PAGE_SIZE) {
1915                                         offset = 0;
1916                                         break;
1917                                 }
1918                         } /* End of block loop */
1919                 } /* End of page loop */
1920         } /* End of iovec loop */
1921
1922         /* Is there any IO still left to submit? */
1923         if (bhind) {
1924                 err = do_kio(rw, bhind, bh, size);
1925                 if (err >= 0)
1926                         transferred += err;
1927                 else
1928                         goto finished;
1929         }
1930
1931  finished:
1932         if (transferred)
1933                 return transferred;
1934         return err;
1935
1936  error:
1937         /* We got an error allocating the bh'es.  Just free the current
1938            buffer_heads and exit. */
1939         spin_lock(&unused_list_lock);
1940         for (i = bhind; --i >= 0; ) {
1941                 __put_unused_buffer_head(bh[bhind]);
1942         }
1943         spin_unlock(&unused_list_lock);
1944         goto finished;
1945 }
1946
1947 /*
1948  * Start I/O on a page.
1949  * This function expects the page to be locked and may return
1950  * before I/O is complete. You then have to check page->locked,
1951  * page->uptodate, and maybe wait on page->wait.
1952  *
1953  * brw_page() is SMP-safe, although it's being called with the
1954  * kernel lock held - but the code is ready.
1955  *
1956  * FIXME: we need a swapper_inode->get_block function to remove
1957  *        some of the bmap kludges and interface ugliness here.
1958  */
1959 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
1960 {
1961         struct buffer_head *head, *bh, *arr[MAX_BUF_PER_PAGE];
1962         int nr, fresh /* temporary debugging flag */, block;
1963
1964         if (!PageLocked(page))
1965                 panic("brw_page: page not locked for I/O");
1966 //      ClearPageError(page);
1967         /*
1968          * We pretty much rely on the page lock for this, because
1969          * create_page_buffers() might sleep.
1970          */
1971         fresh = 0;
1972         if (!page->buffers) {
1973                 create_page_buffers(rw, page, dev, b, size);
1974                 fresh = 1;
1975         }
1976         if (!page->buffers)
1977                 BUG();
1978
1979         head = page->buffers;
1980         bh = head;
1981         nr = 0;
1982         do {
1983                 block = *(b++);
1984
1985                 if (fresh && (atomic_read(&bh->b_count) != 0))
1986                         BUG();
1987                 if (rw == READ) {
1988                         if (!fresh)
1989                                 BUG();
1990                         if (!buffer_uptodate(bh)) {
1991                                 arr[nr++] = bh;
1992                                 atomic_inc(&bh->b_count);
1993                         }
1994                 } else { /* WRITE */
1995                         if (!bh->b_blocknr) {
1996                                 if (!block)
1997                                         BUG();
1998                                 bh->b_blocknr = block;
1999                         } else {
2000                                 if (!block)
2001                                         BUG();
2002                         }
2003                         set_bit(BH_Uptodate, &bh->b_state);
2004                         set_bit(BH_Dirty, &bh->b_state);
2005                         arr[nr++] = bh;
2006                         atomic_inc(&bh->b_count);
2007                 }
2008                 bh = bh->b_this_page;
2009         } while (bh != head);
2010         if ((rw == READ) && nr) {
2011                 if (Page_Uptodate(page))
2012                         BUG();
2013                 ll_rw_block(rw, nr, arr);
2014         } else {
2015                 if (!nr && rw == READ) {
2016                         SetPageUptodate(page);
2017                         UnlockPage(page);
2018                 }
2019                 if (nr && (rw == WRITE))
2020                         ll_rw_block(rw, nr, arr);
2021         }
2022         return 0;
2023 }
2024
2025 int block_symlink(struct inode *inode, const char *symname, int len)
2026 {
2027         struct address_space *mapping = inode->i_mapping;
2028         struct page *page = grab_cache_page(mapping, 0);
2029         int err = -ENOMEM;
2030         char *kaddr;
2031
2032         if (!page)
2033                 goto fail;
2034         err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2035         if (err)
2036                 goto fail_map;
2037         kaddr = (char*)page_address(page);
2038         memcpy(kaddr, symname, len-1);
2039         mapping->a_ops->commit_write(NULL, page, 0, len-1);
2040         /*
2041          * Notice that we are _not_ going to block here - end of page is
2042          * unmapped, so this will only try to map the rest of page, see
2043          * that it is unmapped (typically even will not look into inode -
2044          * ->i_size will be enough for everything) and zero it out.
2045          * OTOH it's obviously correct and should make the page up-to-date.
2046          */
2047         err = mapping->a_ops->readpage(NULL, page);
2048         wait_on_page(page);
2049         page_cache_release(page);
2050         if (err < 0)
2051                 goto fail;
2052         mark_inode_dirty(inode);
2053         return 0;
2054 fail_map:
2055         UnlockPage(page);
2056         page_cache_release(page);
2057 fail:
2058         return err;
2059 }
2060
2061 /*
2062  * Try to increase the number of buffers available: the size argument
2063  * is used to determine what kind of buffers we want.
2064  */
2065 static int grow_buffers(int size)
2066 {
2067         struct page * page;
2068         struct buffer_head *bh, *tmp;
2069         struct buffer_head * insert_point;
2070         int isize;
2071
2072         if ((size & 511) || (size > PAGE_SIZE)) {
2073                 printk("VFS: grow_buffers: size = %d\n",size);
2074                 return 0;
2075         }
2076
2077         page = alloc_page(GFP_BUFFER);
2078         if (!page)
2079                 goto out;
2080         bh = create_buffers(page, size, 0);
2081         if (!bh)
2082                 goto no_buffer_head;
2083
2084         isize = BUFSIZE_INDEX(size);
2085
2086         spin_lock(&free_list[isize].lock);
2087         insert_point = free_list[isize].list;
2088         tmp = bh;
2089         while (1) {
2090                 if (insert_point) {
2091                         tmp->b_next_free = insert_point->b_next_free;
2092                         tmp->b_prev_free = insert_point;
2093                         insert_point->b_next_free->b_prev_free = tmp;
2094                         insert_point->b_next_free = tmp;
2095                 } else {
2096                         tmp->b_prev_free = tmp;
2097                         tmp->b_next_free = tmp;
2098                 }
2099                 insert_point = tmp;
2100                 if (tmp->b_this_page)
2101                         tmp = tmp->b_this_page;
2102                 else
2103                         break;
2104         }
2105         tmp->b_this_page = bh;
2106         free_list[isize].list = bh;
2107         spin_unlock(&free_list[isize].lock);
2108
2109         page->buffers = bh;
2110         page->flags &= ~(1 << PG_referenced);
2111         lru_cache_add(page);
2112         atomic_inc(&buffermem_pages);
2113         return 1;
2114
2115 no_buffer_head:
2116         page_cache_release(page);
2117 out:
2118         return 0;
2119 }
2120
2121 /*
2122  * Sync all the buffers on one page..
2123  *
2124  * If we have old buffers that are locked, we'll
2125  * wait on them, but we won't wait on the new ones
2126  * we're writing out now.
2127  *
2128  * This all is required so that we can free up memory
2129  * later.
2130  *
2131  * Wait:
2132  *      0 - no wait (this does not get called - see try_to_free_buffers below)
2133  *      1 - start IO for dirty buffers
2134  *      2 - wait for completion of locked buffers
2135  */
2136 static void sync_page_buffers(struct buffer_head *bh, int wait)
2137 {
2138         struct buffer_head * tmp = bh;
2139
2140         do {
2141                 struct buffer_head *p = tmp;
2142                 tmp = tmp->b_this_page;
2143                 if (buffer_locked(p)) {
2144                         if (wait > 1)
2145                                 __wait_on_buffer(p);
2146                 } else if (buffer_dirty(p))
2147                         ll_rw_block(WRITE, 1, &p);
2148         } while (tmp != bh);
2149 }
2150
2151 /*
2152  * Can the buffer be thrown out?
2153  */
2154 #define BUFFER_BUSY_BITS        ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
2155 #define buffer_busy(bh)         (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2156
2157 /*
2158  * try_to_free_buffers() checks if all the buffers on this particular page
2159  * are unused, and free's the page if so.
2160  *
2161  * Wake up bdflush() if this fails - if we're running low on memory due
2162  * to dirty buffers, we need to flush them out as quickly as possible.
2163  *
2164  * NOTE: There are quite a number of ways that threads of control can
2165  *       obtain a reference to a buffer head within a page.  So we must
2166  *       lock out all of these paths to cleanly toss the page.
2167  */
2168 int try_to_free_buffers(struct page * page, int wait)
2169 {
2170         struct buffer_head * tmp, * bh = page->buffers;
2171         int index = BUFSIZE_INDEX(bh->b_size);
2172
2173         spin_lock(&lru_list_lock);
2174         write_lock(&hash_table_lock);
2175         spin_lock(&free_list[index].lock);
2176         tmp = bh;
2177         do {
2178                 struct buffer_head *p = tmp;
2179
2180                 tmp = tmp->b_this_page;
2181                 if (buffer_busy(p))
2182                         goto busy_buffer_page;
2183         } while (tmp != bh);
2184
2185         spin_lock(&unused_list_lock);
2186         tmp = bh;
2187         do {
2188                 struct buffer_head * p = tmp;
2189                 tmp = tmp->b_this_page;
2190
2191                 /* The buffer can be either on the regular
2192                  * queues or on the free list..
2193                  */
2194                 if (p->b_dev != B_FREE)
2195                         __remove_from_queues(p);
2196                 else
2197                         __remove_from_free_list(p, index);
2198                 __put_unused_buffer_head(p);
2199         } while (tmp != bh);
2200         spin_unlock(&unused_list_lock);
2201
2202         /* Wake up anyone waiting for buffer heads */
2203         wake_up(&buffer_wait);
2204
2205         /* And free the page */
2206         page->buffers = NULL;
2207         page_cache_release(page);
2208         spin_unlock(&free_list[index].lock);
2209         write_unlock(&hash_table_lock);
2210         spin_unlock(&lru_list_lock);
2211         return 1;
2212
2213 busy_buffer_page:
2214         /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2215         spin_unlock(&free_list[index].lock);
2216         write_unlock(&hash_table_lock);
2217         spin_unlock(&lru_list_lock);
2218         if (wait)
2219                 sync_page_buffers(bh, wait);
2220         return 0;
2221 }
2222
2223 /* ================== Debugging =================== */
2224
2225 void show_buffers(void)
2226 {
2227 #ifdef CONFIG_SMP
2228         struct buffer_head * bh;
2229         int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2230         int protected = 0;
2231         int nlist;
2232         static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
2233 #endif
2234
2235         printk("Buffer memory:   %6dkB\n",
2236                         atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2237
2238 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2239         if (!spin_trylock(&lru_list_lock))
2240                 return;
2241         for(nlist = 0; nlist < NR_LIST; nlist++) {
2242                 found = locked = dirty = used = lastused = protected = 0;
2243                 bh = lru_list[nlist];
2244                 if(!bh) continue;
2245
2246                 do {
2247                         found++;
2248                         if (buffer_locked(bh))
2249                                 locked++;
2250                         if (buffer_protected(bh))
2251                                 protected++;
2252                         if (buffer_dirty(bh))
2253                                 dirty++;
2254                         if (atomic_read(&bh->b_count))
2255                                 used++, lastused = found;
2256                         bh = bh->b_next_free;
2257                 } while (bh != lru_list[nlist]);
2258                 {
2259                         int tmp = nr_buffers_type[nlist];
2260                         if (found != tmp)
2261                                 printk("%9s: BUG -> found %d, reported %d\n",
2262                                        buf_types[nlist], found, tmp);
2263                 }
2264                 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2265                        "%d locked, %d protected, %d dirty\n",
2266                        buf_types[nlist], found, size_buffers_type[nlist]>>10,
2267                        used, lastused, locked, protected, dirty);
2268         }
2269         spin_unlock(&lru_list_lock);
2270 #endif
2271 }
2272
2273 /* ===================== Init ======================= */
2274
2275 /*
2276  * allocate the hash table and init the free list
2277  * Use gfp() for the hash table to decrease TLB misses, use
2278  * SLAB cache for buffer heads.
2279  */
2280 void __init buffer_init(unsigned long mempages)
2281 {
2282         int order, i;
2283         unsigned int nr_hash;
2284
2285         /* The buffer cache hash table is less important these days,
2286          * trim it a bit.
2287          */
2288         mempages >>= 14;
2289
2290         mempages *= sizeof(struct buffer_head *);
2291
2292         for (order = 0; (1 << order) < mempages; order++)
2293                 ;
2294
2295         /* try to allocate something until we get it or we're asking
2296            for something that is really too small */
2297
2298         do {
2299                 unsigned long tmp;
2300
2301                 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2302                 bh_hash_mask = (nr_hash - 1);
2303
2304                 tmp = nr_hash;
2305                 bh_hash_shift = 0;
2306                 while((tmp >>= 1UL) != 0UL)
2307                         bh_hash_shift++;
2308
2309                 hash_table = (struct buffer_head **)
2310                     __get_free_pages(GFP_ATOMIC, order);
2311         } while (hash_table == NULL && --order > 0);
2312         printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2313                nr_hash, order, (PAGE_SIZE << order));
2314
2315         if (!hash_table)
2316                 panic("Failed to allocate buffer hash table\n");
2317
2318         /* Setup hash chains. */
2319         for(i = 0; i < nr_hash; i++)
2320                 hash_table[i] = NULL;
2321
2322         /* Setup free lists. */
2323         for(i = 0; i < NR_SIZES; i++) {
2324                 free_list[i].list = NULL;
2325                 free_list[i].lock = SPIN_LOCK_UNLOCKED;
2326         }
2327
2328         /* Setup lru lists. */
2329         for(i = 0; i < NR_LIST; i++)
2330                 lru_list[i] = NULL;
2331
2332 }
2333
2334
2335 /* ====================== bdflush support =================== */
2336
2337 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2338  * response to dirty buffers.  Once this process is activated, we write back
2339  * a limited number of buffers to the disks and then go back to sleep again.
2340  */
2341 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
2342 struct task_struct *bdflush_tsk = 0;
2343
2344 void wakeup_bdflush(int block)
2345 {
2346         DECLARE_WAITQUEUE(wait, current);
2347
2348         if (current == bdflush_tsk)
2349                 return;
2350
2351         if (!block) {
2352                 wake_up_process(bdflush_tsk);
2353                 return;
2354         }
2355
2356         /* kflushd can wakeup us before we have a chance to
2357            go to sleep so we must be smart in handling
2358            this wakeup event from kflushd to avoid deadlocking in SMP
2359            (we are not holding any lock anymore in these two paths). */
2360         __set_current_state(TASK_UNINTERRUPTIBLE);
2361         add_wait_queue(&bdflush_done, &wait);
2362
2363         wake_up_process(bdflush_tsk);
2364         schedule();
2365
2366         remove_wait_queue(&bdflush_done, &wait);
2367         __set_current_state(TASK_RUNNING);
2368 }
2369
2370 /* This is the _only_ function that deals with flushing async writes
2371    to disk.
2372    NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
2373    as all dirty buffers lives _only_ in the DIRTY lru list.
2374    As we never browse the LOCKED and CLEAN lru lists they are infact
2375    completly useless. */
2376 static int flush_dirty_buffers(int check_flushtime)
2377 {
2378         struct buffer_head * bh, *next;
2379         int flushed = 0, i;
2380
2381  restart:
2382         spin_lock(&lru_list_lock);
2383         bh = lru_list[BUF_DIRTY];
2384         if (!bh)
2385                 goto out_unlock;
2386         for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next) {
2387                 next = bh->b_next_free;
2388
2389                 if (!buffer_dirty(bh)) {
2390                         __refile_buffer(bh);
2391                         continue;
2392                 }
2393                 if (buffer_locked(bh))
2394                         continue;
2395
2396                 if (check_flushtime) {
2397                         /* The dirty lru list is chronologically ordered so
2398                            if the current bh is not yet timed out,
2399                            then also all the following bhs
2400                            will be too young. */
2401                         if (time_before(jiffies, bh->b_flushtime))
2402                                 goto out_unlock;
2403                 } else {
2404                         if (++flushed > bdf_prm.b_un.ndirty)
2405                                 goto out_unlock;
2406                 }
2407
2408                 /* OK, now we are committed to write it out. */
2409                 atomic_inc(&bh->b_count);
2410                 spin_unlock(&lru_list_lock);
2411                 ll_rw_block(WRITE, 1, &bh);
2412                 atomic_dec(&bh->b_count);
2413
2414                 if (current->need_resched)
2415                         schedule();
2416                 goto restart;
2417         }
2418  out_unlock:
2419         spin_unlock(&lru_list_lock);
2420
2421         return flushed;
2422 }
2423
2424 /*
2425  * Here we attempt to write back old buffers.  We also try to flush inodes
2426  * and supers as well, since this function is essentially "update", and
2427  * otherwise there would be no way of ensuring that these quantities ever
2428  * get written back.  Ideally, we would have a timestamp on the inodes
2429  * and superblocks so that we could write back only the old ones as well
2430  */
2431
2432 static int sync_old_buffers(void)
2433 {
2434         lock_kernel();
2435         sync_supers(0);
2436         sync_inodes(0);
2437         unlock_kernel();
2438
2439         flush_dirty_buffers(1);
2440         /* must really sync all the active I/O request to disk here */
2441         run_task_queue(&tq_disk);
2442         return 0;
2443 }
2444
2445 int block_sync_page(struct page *page)
2446 {
2447         run_task_queue(&tq_disk);
2448         return 0;
2449 }
2450
2451 /* This is the interface to bdflush.  As we get more sophisticated, we can
2452  * pass tuning parameters to this "process", to adjust how it behaves.
2453  * We would want to verify each parameter, however, to make sure that it
2454  * is reasonable. */
2455
2456 asmlinkage long sys_bdflush(int func, long data)
2457 {
2458         if (!capable(CAP_SYS_ADMIN))
2459                 return -EPERM;
2460
2461         if (func == 1) {
2462                 /* do_exit directly and let kupdate to do its work alone. */
2463                 do_exit(0);
2464 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2465          a syscall that doesn't care about the current mm context. */
2466                 int error;
2467                 struct mm_struct *user_mm;
2468
2469                 /*
2470                  * bdflush will spend all of it's time in kernel-space,
2471                  * without touching user-space, so we can switch it into
2472                  * 'lazy TLB mode' to reduce the cost of context-switches
2473                  * to and from bdflush.
2474                  */
2475                 user_mm = start_lazy_tlb();
2476                 error = sync_old_buffers();
2477                 end_lazy_tlb(user_mm);
2478                 return error;
2479 #endif
2480         }
2481
2482         /* Basically func 1 means read param 1, 2 means write param 1, etc */
2483         if (func >= 2) {
2484                 int i = (func-2) >> 1;
2485                 if (i >= 0 && i < N_PARAM) {
2486                         if ((func & 1) == 0)
2487                                 return put_user(bdf_prm.data[i], (int*)data);
2488
2489                         if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
2490                                 bdf_prm.data[i] = data;
2491                                 return 0;
2492                         }
2493                 }
2494                 return -EINVAL;
2495         }
2496
2497         /* Having func 0 used to launch the actual bdflush and then never
2498          * return (unless explicitly killed). We return zero here to
2499          * remain semi-compatible with present update(8) programs.
2500          */
2501         return 0;
2502 }
2503
2504 /*
2505  * This is the actual bdflush daemon itself. It used to be started from
2506  * the syscall above, but now we launch it ourselves internally with
2507  * kernel_thread(...)  directly after the first thread in init/main.c
2508  */
2509 int bdflush(void *sem)
2510 {
2511         struct task_struct *tsk = current;
2512         int flushed;
2513         /*
2514          *      We have a bare-bones task_struct, and really should fill
2515          *      in a few more things so "top" and /proc/2/{exe,root,cwd}
2516          *      display semi-sane things. Not real crucial though...
2517          */
2518
2519         tsk->session = 1;
2520         tsk->pgrp = 1;
2521         strcpy(tsk->comm, "kflushd");
2522         bdflush_tsk = tsk;
2523
2524         /* avoid getting signals */
2525         spin_lock_irq(&tsk->sigmask_lock);
2526         flush_signals(tsk);
2527         sigfillset(&tsk->blocked);
2528         recalc_sigpending(tsk);
2529         spin_unlock_irq(&tsk->sigmask_lock);
2530
2531         up((struct semaphore *)sem);
2532
2533         for (;;) {
2534                 CHECK_EMERGENCY_SYNC
2535
2536                 flushed = flush_dirty_buffers(0);
2537
2538                 /* If wakeup_bdflush will wakeup us
2539                    after our bdflush_done wakeup, then
2540                    we must make sure to not sleep
2541                    in schedule_timeout otherwise
2542                    wakeup_bdflush may wait for our
2543                    bdflush_done wakeup that would never arrive
2544                    (as we would be sleeping) and so it would
2545                    deadlock in SMP. */
2546                 __set_current_state(TASK_INTERRUPTIBLE);
2547                 wake_up(&bdflush_done);
2548                 /*
2549                  * If there are still a lot of dirty buffers around,
2550                  * skip the sleep and flush some more. Otherwise, we
2551                  * go to sleep waiting a wakeup.
2552                  */
2553                 if (!flushed || balance_dirty_state(NODEV) < 0)
2554                         schedule();
2555                 /* Remember to mark us as running otherwise
2556                    the next schedule will block. */
2557                 __set_current_state(TASK_RUNNING);
2558         }
2559 }
2560
2561 /*
2562  * This is the kernel update daemon. It was used to live in userspace
2563  * but since it's need to run safely we want it unkillable by mistake.
2564  * You don't need to change your userspace configuration since
2565  * the userspace `update` will do_exit(0) at the first sys_bdflush().
2566  */
2567 int kupdate(void *sem)
2568 {
2569         struct task_struct * tsk = current;
2570         int interval;
2571
2572         tsk->session = 1;
2573         tsk->pgrp = 1;
2574         strcpy(tsk->comm, "kupdate");
2575
2576         /* sigstop and sigcont will stop and wakeup kupdate */
2577         spin_lock_irq(&tsk->sigmask_lock);
2578         sigfillset(&tsk->blocked);
2579         siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
2580         recalc_sigpending(tsk);
2581         spin_unlock_irq(&tsk->sigmask_lock);
2582
2583         up((struct semaphore *)sem);
2584
2585         for (;;) {
2586                 /* update interval */
2587                 interval = bdf_prm.b_un.interval;
2588                 if (interval) {
2589                         tsk->state = TASK_INTERRUPTIBLE;
2590                         schedule_timeout(interval);
2591                 } else {
2592                 stop_kupdate:
2593                         tsk->state = TASK_STOPPED;
2594                         schedule(); /* wait for SIGCONT */
2595                 }
2596                 /* check for sigstop */
2597                 if (signal_pending(tsk)) {
2598                         int stopped = 0;
2599                         spin_lock_irq(&tsk->sigmask_lock);
2600                         if (sigismember(&tsk->signal, SIGSTOP)) {
2601                                 sigdelset(&tsk->signal, SIGSTOP);
2602                                 stopped = 1;
2603                         }
2604                         recalc_sigpending(tsk);
2605                         spin_unlock_irq(&tsk->sigmask_lock);
2606                         if (stopped)
2607                                 goto stop_kupdate;
2608                 }
2609 #ifdef DEBUG
2610                 printk("kupdate() activated...\n");
2611 #endif
2612                 sync_old_buffers();
2613         }
2614 }
2615
2616 static int __init bdflush_init(void)
2617 {
2618         DECLARE_MUTEX_LOCKED(sem);
2619         kernel_thread(bdflush, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
2620         down(&sem);
2621         kernel_thread(kupdate, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
2622         down(&sem);
2623         return 0;
2624 }
2625
2626 module_init(bdflush_init)
2627