fs/buffer.c

   1 /*
   2  *  linux/fs/buffer.c
   3  *
   4  *  Copyright (C) 1991, 1992  Linus Torvalds
   5  */
   6
   7 /*
   8  *  'buffer.c' implements the buffer-cache functions. Race-conditions have
   9  * been avoided by NEVER letting an interrupt change a buffer (except for the
  10  * data, of course), but instead letting the caller do it.
  11  */
  12
  13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
  14
  15 /* Removed a lot of unnecessary code and simplified things now that
  16  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  17  */
  18
  19 /* Speed up hash, lru, and free list operations.  Use gfp() for allocating
  20  * hash table, use SLAB cache for buffer heads. -DaveM
  21  */
  22
  23 /* Added 32k buffer block sizes - these are required older ARM systems.
  24  * - RMK
  25  */
  26
  27 /* Thread it... -DaveM */
  28
  29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
  30
  31 #include <linux/config.h>
  32 #include <linux/sched.h>
  33 #include <linux/fs.h>
  34 #include <linux/malloc.h>
  35 #include <linux/locks.h>
  36 #include <linux/errno.h>
  37 #include <linux/swap.h>
  38 #include <linux/smp_lock.h>
  39 #include <linux/vmalloc.h>
  40 #include <linux/blkdev.h>
  41 #include <linux/sysrq.h>
  42 #include <linux/file.h>
  43 #include <linux/init.h>
  44 #include <linux/quotaops.h>
  45 #include <linux/iobuf.h>
  46 #include <linux/highmem.h>
  47
  48 #include <asm/uaccess.h>
  49 #include <asm/io.h>
  50 #include <asm/bitops.h>
  51 #include <asm/mmu_context.h>
  52
  53 #define NR_SIZES 7
  54 static char buffersize_index[65] =
  55 {-1,  0,  1, -1,  2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
  56   4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  57   5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  58  -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  59   6};
  60
  61 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
  62 #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
  63 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
  64 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
  65                                              number of unused buffer heads */
  66
  67 /* Anti-deadlock ordering:
  68  *      lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
  69  */
  70
  71 /*
  72  * Hash table gook..
  73  */
  74 static unsigned int bh_hash_mask;
  75 static unsigned int bh_hash_shift;
  76 static struct buffer_head **hash_table;
  77 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
  78
  79 static struct buffer_head *lru_list[NR_LIST];
  80 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
  81 static int nr_buffers_type[NR_LIST];
  82 static unsigned long size_buffers_type[NR_LIST];
  83
  84 static struct buffer_head * unused_list;
  85 static int nr_unused_buffer_heads;
  86 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
  87 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
  88
  89 struct bh_free_head {
  90         struct buffer_head *list;
  91         spinlock_t lock;
  92 };
  93 static struct bh_free_head free_list[NR_SIZES];
  94
  95 static int grow_buffers(int size);
  96 static void __refile_buffer(struct buffer_head *);
  97
  98 /* This is used by some architectures to estimate available memory. */
  99 atomic_t buffermem_pages = ATOMIC_INIT(0);
 100
 101 /* Here is the parameter block for the bdflush process. If you add or
 102  * remove any of the parameters, make sure to update kernel/sysctl.c.
 103  */
 104
 105 #define N_PARAM 9
 106
 107 /* The dummy values in this structure are left in there for compatibility
 108  * with old programs that play with the /proc entries.
 109  */
 110 union bdflush_param {
 111         struct {
 112                 int nfract;  /* Percentage of buffer cache dirty to
 113                                 activate bdflush */
 114                 int ndirty;  /* Maximum number of dirty blocks to write out per
 115                                 wake-cycle */
 116                 int nrefill; /* Number of clean buffers to try to obtain
 117                                 each time we call refill */
 118                 int nref_dirt; /* Dirty buffer threshold for activating bdflush
 119                                   when trying to refill buffers. */
 120                 int interval; /* jiffies delay between kupdate flushes */
 121                 int age_buffer;  /* Time for normal buffer to age before we flush it */
 122                 int dummy1;    /* unused, was age_super */
 123                 int dummy2;    /* unused */
 124                 int dummy3;    /* unused */
 125         } b_un;
 126         unsigned int data[N_PARAM];
 127 } bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}};
 128
 129 /* These are the min and max parameter values that we will allow to be assigned */
 130 int bdflush_min[N_PARAM] = {  0,  10,    5,   25,  0,   1*HZ,   1*HZ, 1, 1};
 131 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 6000*HZ, 2047, 5};
 132
 133 /*
 134  * Rewrote the wait-routines to use the "new" wait-queue functionality,
 135  * and getting rid of the cli-sti pairs. The wait-queue routines still
 136  * need cli-sti, but now it's just a couple of 386 instructions or so.
 137  *
 138  * Note that the real wait_on_buffer() is an inline function that checks
 139  * if 'b_wait' is set before calling this, so that the queues aren't set
 140  * up unnecessarily.
 141  */
 142 void __wait_on_buffer(struct buffer_head * bh)
 143 {
 144         struct task_struct *tsk = current;
 145         DECLARE_WAITQUEUE(wait, tsk);
 146
 147         atomic_inc(&bh->b_count);
 148         add_wait_queue(&bh->b_wait, &wait);
 149         do {
 150                 run_task_queue(&tq_disk);
 151                 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 152                 if (!buffer_locked(bh))
 153                         break;
 154                 schedule();
 155         } while (buffer_locked(bh));
 156         tsk->state = TASK_RUNNING;
 157         remove_wait_queue(&bh->b_wait, &wait);
 158         atomic_dec(&bh->b_count);
 159 }
 160
 161 /* Call sync_buffers with wait!=0 to ensure that the call does not
 162  * return until all buffer writes have completed.  Sync() may return
 163  * before the writes have finished; fsync() may not.
 164  */
 165
 166 /* Godamity-damn.  Some buffers (bitmaps for filesystems)
 167  * spontaneously dirty themselves without ever brelse being called.
 168  * We will ultimately want to put these in a separate list, but for
 169  * now we search all of the lists for dirty buffers.
 170  */
 171 static int sync_buffers(kdev_t dev, int wait)
 172 {
 173         int i, retry, pass = 0, err = 0;
 174         struct buffer_head * bh, *next;
 175
 176         /* One pass for no-wait, three for wait:
 177          * 0) write out all dirty, unlocked buffers;
 178          * 1) write out all dirty buffers, waiting if locked;
 179          * 2) wait for completion by waiting for all buffers to unlock.
 180          */
 181         do {
 182                 retry = 0;
 183
 184                 /* We search all lists as a failsafe mechanism, not because we expect
 185                  * there to be dirty buffers on any of the other lists.
 186                  */
 187 repeat:
 188                 spin_lock(&lru_list_lock);
 189                 bh = lru_list[BUF_DIRTY];
 190                 if (!bh)
 191                         goto repeat2;
 192
 193                 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
 194                         next = bh->b_next_free;
 195
 196                         if (!lru_list[BUF_DIRTY])
 197                                 break;
 198                         if (dev && bh->b_dev != dev)
 199                                 continue;
 200                         if (buffer_locked(bh)) {
 201                                 /* Buffer is locked; skip it unless wait is
 202                                  * requested AND pass > 0.
 203                                  */
 204                                 if (!wait || !pass) {
 205                                         retry = 1;
 206                                         continue;
 207                                 }
 208                                 atomic_inc(&bh->b_count);
 209                                 spin_unlock(&lru_list_lock);
 210                                 wait_on_buffer (bh);
 211                                 atomic_dec(&bh->b_count);
 212                                 goto repeat;
 213                         }
 214
 215                         /* If an unlocked buffer is not uptodate, there has
 216                          * been an IO error. Skip it.
 217                          */
 218                         if (wait && buffer_req(bh) && !buffer_locked(bh) &&
 219                             !buffer_dirty(bh) && !buffer_uptodate(bh)) {
 220                                 err = -EIO;
 221                                 continue;
 222                         }
 223
 224                         /* Don't write clean buffers.  Don't write ANY buffers
 225                          * on the third pass.
 226                          */
 227                         if (!buffer_dirty(bh) || pass >= 2)
 228                                 continue;
 229
 230                         atomic_inc(&bh->b_count);
 231                         spin_unlock(&lru_list_lock);
 232                         ll_rw_block(WRITE, 1, &bh);
 233                         atomic_dec(&bh->b_count);
 234                         retry = 1;
 235                         goto repeat;
 236                 }
 237
 238     repeat2:
 239                 bh = lru_list[BUF_LOCKED];
 240                 if (!bh) {
 241                         spin_unlock(&lru_list_lock);
 242                         break;
 243                 }
 244                 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
 245                         next = bh->b_next_free;
 246
 247                         if (!lru_list[BUF_LOCKED])
 248                                 break;
 249                         if (dev && bh->b_dev != dev)
 250                                 continue;
 251                         if (buffer_locked(bh)) {
 252                                 /* Buffer is locked; skip it unless wait is
 253                                  * requested AND pass > 0.
 254                                  */
 255                                 if (!wait || !pass) {
 256                                         retry = 1;
 257                                         continue;
 258                                 }
 259                                 atomic_inc(&bh->b_count);
 260                                 spin_unlock(&lru_list_lock);
 261                                 wait_on_buffer (bh);
 262                                 spin_lock(&lru_list_lock);
 263                                 atomic_dec(&bh->b_count);
 264                                 goto repeat2;
 265                         }
 266                 }
 267                 spin_unlock(&lru_list_lock);
 268
 269                 /* If we are waiting for the sync to succeed, and if any dirty
 270                  * blocks were written, then repeat; on the second pass, only
 271                  * wait for buffers being written (do not pass to write any
 272                  * more buffers on the second pass).
 273                  */
 274         } while (wait && retry && ++pass<=2);
 275         return err;
 276 }
 277
 278 void sync_dev(kdev_t dev)
 279 {
 280         sync_supers(dev);
 281         sync_inodes(dev);
 282         DQUOT_SYNC(dev);
 283         /* sync all the dirty buffers out to disk only _after_ all the
 284            high level layers finished generated buffer dirty data
 285            (or we'll return with some buffer still dirty on the blockdevice
 286            so breaking the semantics of this call) */
 287         sync_buffers(dev, 0);
 288         /*
 289          * FIXME(eric) we need to sync the physical devices here.
 290          * This is because some (scsi) controllers have huge amounts of
 291          * cache onboard (hundreds of Mb), and we need to instruct
 292          * them to commit all of the dirty memory to disk, and we should
 293          * not return until this has happened.
 294          *
 295          * This would need to get implemented by going through the assorted
 296          * layers so that each block major number can be synced, and this
 297          * would call down into the upper and mid-layer scsi.
 298          */
 299 }
 300
 301 int fsync_dev(kdev_t dev)
 302 {
 303         sync_buffers(dev, 0);
 304
 305         lock_kernel();
 306         sync_supers(dev);
 307         sync_inodes(dev);
 308         DQUOT_SYNC(dev);
 309         unlock_kernel();
 310
 311         return sync_buffers(dev, 1);
 312 }
 313
 314 asmlinkage long sys_sync(void)
 315 {
 316         fsync_dev(0);
 317         return 0;
 318 }
 319
 320 /*
 321  *      filp may be NULL if called via the msync of a vma.
 322  */
 323
 324 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
 325 {
 326         struct inode * inode = dentry->d_inode;
 327         struct super_block * sb;
 328         kdev_t dev;
 329         int ret;
 330
 331         lock_kernel();
 332         /* sync the inode to buffers */
 333         write_inode_now(inode, 0);
 334
 335         /* sync the superblock to buffers */
 336         sb = inode->i_sb;
 337         wait_on_super(sb);
 338         if (sb->s_op && sb->s_op->write_super)
 339                 sb->s_op->write_super(sb);
 340
 341         /* .. finally sync the buffers to disk */
 342         dev = inode->i_dev;
 343         ret = sync_buffers(dev, 1);
 344         unlock_kernel();
 345         return ret;
 346 }
 347
 348 asmlinkage long sys_fsync(unsigned int fd)
 349 {
 350         struct file * file;
 351         struct dentry * dentry;
 352         struct inode * inode;
 353         int err;
 354
 355         err = -EBADF;
 356         file = fget(fd);
 357         if (!file)
 358                 goto out;
 359
 360         dentry = file->f_dentry;
 361         inode = dentry->d_inode;
 362
 363         err = -EINVAL;
 364         if (!file->f_op || !file->f_op->fsync)
 365                 goto out_putf;
 366
 367         /* We need to protect against concurrent writers.. */
 368         down(&inode->i_sem);
 369         err = file->f_op->fsync(file, dentry, 0);
 370         up(&inode->i_sem);
 371
 372 out_putf:
 373         fput(file);
 374 out:
 375         return err;
 376 }
 377
 378 asmlinkage long sys_fdatasync(unsigned int fd)
 379 {
 380         struct file * file;
 381         struct dentry * dentry;
 382         struct inode * inode;
 383         int err;
 384
 385         err = -EBADF;
 386         file = fget(fd);
 387         if (!file)
 388                 goto out;
 389
 390         dentry = file->f_dentry;
 391         inode = dentry->d_inode;
 392
 393         err = -EINVAL;
 394         if (!file->f_op || !file->f_op->fsync)
 395                 goto out_putf;
 396
 397         down(&inode->i_sem);
 398         err = file->f_op->fsync(file, dentry, 1);
 399         up(&inode->i_sem);
 400
 401 out_putf:
 402         fput(file);
 403 out:
 404         return err;
 405 }
 406
 407 /* After several hours of tedious analysis, the following hash
 408  * function won.  Do not mess with it... -DaveM
 409  */
 410 #define _hashfn(dev,block)      \
 411         ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
 412          (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
 413 #define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
 414
 415 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
 416 {
 417         if ((bh->b_next = *head) != NULL)
 418                 bh->b_next->b_pprev = &bh->b_next;
 419         *head = bh;
 420         bh->b_pprev = head;
 421 }
 422
 423 static __inline__ void __hash_unlink(struct buffer_head *bh)
 424 {
 425         if (bh->b_pprev) {
 426                 if (bh->b_next)
 427                         bh->b_next->b_pprev = bh->b_pprev;
 428                 *(bh->b_pprev) = bh->b_next;
 429                 bh->b_pprev = NULL;
 430         }
 431 }
 432
 433 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
 434 {
 435         struct buffer_head **bhp = &lru_list[blist];
 436
 437         if(!*bhp) {
 438                 *bhp = bh;
 439                 bh->b_prev_free = bh;
 440         }
 441         bh->b_next_free = *bhp;
 442         bh->b_prev_free = (*bhp)->b_prev_free;
 443         (*bhp)->b_prev_free->b_next_free = bh;
 444         (*bhp)->b_prev_free = bh;
 445         nr_buffers_type[blist]++;
 446         size_buffers_type[blist] += bh->b_size;
 447 }
 448
 449 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
 450 {
 451         if (bh->b_prev_free || bh->b_next_free) {
 452                 bh->b_prev_free->b_next_free = bh->b_next_free;
 453                 bh->b_next_free->b_prev_free = bh->b_prev_free;
 454                 if (lru_list[blist] == bh)
 455                         lru_list[blist] = bh->b_next_free;
 456                 if (lru_list[blist] == bh)
 457                         lru_list[blist] = NULL;
 458                 bh->b_next_free = bh->b_prev_free = NULL;
 459                 nr_buffers_type[blist]--;
 460                 size_buffers_type[blist] -= bh->b_size;
 461         }
 462 }
 463
 464 static void __remove_from_free_list(struct buffer_head * bh, int index)
 465 {
 466         if(bh->b_next_free == bh)
 467                  free_list[index].list = NULL;
 468         else {
 469                 bh->b_prev_free->b_next_free = bh->b_next_free;
 470                 bh->b_next_free->b_prev_free = bh->b_prev_free;
 471                 if (free_list[index].list == bh)
 472                          free_list[index].list = bh->b_next_free;
 473         }
 474         bh->b_next_free = bh->b_prev_free = NULL;
 475 }
 476
 477 /* must be called with both the hash_table_lock and the lru_list_lock
 478    held */
 479 static void __remove_from_queues(struct buffer_head *bh)
 480 {
 481         __hash_unlink(bh);
 482         __remove_from_lru_list(bh, bh->b_list);
 483 }
 484
 485 static void __insert_into_queues(struct buffer_head *bh)
 486 {
 487         struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
 488
 489         __hash_link(bh, head);
 490         __insert_into_lru_list(bh, bh->b_list);
 491 }
 492
 493 /* This function must only run if there are no other
 494  * references _anywhere_ to this buffer head.
 495  */
 496 static void put_last_free(struct buffer_head * bh)
 497 {
 498         struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
 499         struct buffer_head **bhp = &head->list;
 500
 501         bh->b_state = 0;
 502
 503         spin_lock(&head->lock);
 504         bh->b_dev = B_FREE;
 505         if(!*bhp) {
 506                 *bhp = bh;
 507                 bh->b_prev_free = bh;
 508         }
 509         bh->b_next_free = *bhp;
 510         bh->b_prev_free = (*bhp)->b_prev_free;
 511         (*bhp)->b_prev_free->b_next_free = bh;
 512         (*bhp)->b_prev_free = bh;
 513         spin_unlock(&head->lock);
 514 }
 515
 516 /*
 517  * Why like this, I hear you say... The reason is race-conditions.
 518  * As we don't lock buffers (unless we are reading them, that is),
 519  * something might happen to it while we sleep (ie a read-error
 520  * will force it bad). This shouldn't really happen currently, but
 521  * the code is ready.
 522  */
 523 static inline struct buffer_head * __get_hash_table(kdev_t dev, int block, int size)
 524 {
 525         struct buffer_head *bh = hash(dev, block);
 526
 527         for (; bh; bh = bh->b_next)
 528                 if (bh->b_blocknr == block      &&
 529                     bh->b_size    == size       &&
 530                     bh->b_dev     == dev)
 531                         break;
 532         if (bh)
 533                 atomic_inc(&bh->b_count);
 534
 535         return bh;
 536 }
 537
 538 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
 539 {
 540         struct buffer_head *bh;
 541
 542         read_lock(&hash_table_lock);
 543         bh = __get_hash_table(dev, block, size);
 544         read_unlock(&hash_table_lock);
 545
 546         return bh;
 547 }
 548
 549 unsigned int get_hardblocksize(kdev_t dev)
 550 {
 551         /*
 552          * Get the hard sector size for the given device.  If we don't know
 553          * what it is, return 0.
 554          */
 555         if (hardsect_size[MAJOR(dev)] != NULL) {
 556                 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
 557                 if (blksize != 0)
 558                         return blksize;
 559         }
 560
 561         /*
 562          * We don't know what the hardware sector size for this device is.
 563          * Return 0 indicating that we don't know.
 564          */
 565         return 0;
 566 }
 567
 568 /* If invalidate_buffers() will trash dirty buffers, it means some kind
 569    of fs corruption is going on. Trashing dirty data always imply losing
 570    information that was supposed to be just stored on the physical layer
 571    by the user.
 572
 573    Thus invalidate_buffers in general usage is not allwowed to trash dirty
 574    buffers. For example ioctl(FLSBLKBUF) expects dirty data to be preserved.
 575
 576    NOTE: In the case where the user removed a removable-media-disk even if
 577    there's still dirty data not synced on disk (due a bug in the device driver
 578    or due an error of the user), by not destroying the dirty buffers we could
 579    generate corruption also on the next media inserted, thus a parameter is
 580    necessary to handle this case in the most safe way possible (trying
 581    to not corrupt also the new disk inserted with the data belonging to
 582    the old now corrupted disk). Also for the ramdisk the natural thing
 583    to do in order to release the ramdisk memory is to destroy dirty buffers.
 584
 585    These are two special cases. Normal usage imply the device driver
 586    to issue a sync on the device (without waiting I/O completation) and
 587    then an invalidate_buffers call that doesn't trashes dirty buffers. */
 588 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
 589 {
 590         int i, nlist, slept;
 591         struct buffer_head * bh, * bh_next;
 592
 593  retry:
 594         slept = 0;
 595         spin_lock(&lru_list_lock);
 596         for(nlist = 0; nlist < NR_LIST; nlist++) {
 597                 bh = lru_list[nlist];
 598                 if (!bh)
 599                         continue;
 600                 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
 601                         bh_next = bh->b_next_free;
 602                         if (bh->b_dev != dev)
 603                                 continue;
 604                         if (buffer_locked(bh)) {
 605                                 atomic_inc(&bh->b_count);
 606                                 spin_unlock(&lru_list_lock);
 607                                 wait_on_buffer(bh);
 608                                 slept = 1;
 609                                 spin_lock(&lru_list_lock);
 610                                 atomic_dec(&bh->b_count);
 611                         }
 612
 613                         write_lock(&hash_table_lock);
 614                         if (!atomic_read(&bh->b_count) &&
 615                             (destroy_dirty_buffers || !buffer_dirty(bh))) {
 616                                 __remove_from_queues(bh);
 617                                 put_last_free(bh);
 618                         }
 619                         write_unlock(&hash_table_lock);
 620                         if (slept)
 621                                 goto out;
 622                 }
 623         }
 624 out:
 625         spin_unlock(&lru_list_lock);
 626         if (slept)
 627                 goto retry;
 628 }
 629
 630 void set_blocksize(kdev_t dev, int size)
 631 {
 632         extern int *blksize_size[];
 633         int i, nlist, slept;
 634         struct buffer_head * bh, * bh_next;
 635
 636         if (!blksize_size[MAJOR(dev)])
 637                 return;
 638
 639         /* Size must be a power of two, and between 512 and PAGE_SIZE */
 640         if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
 641                 panic("Invalid blocksize passed to set_blocksize");
 642
 643         if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
 644                 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
 645                 return;
 646         }
 647         if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
 648                 return;
 649         sync_buffers(dev, 2);
 650         blksize_size[MAJOR(dev)][MINOR(dev)] = size;
 651
 652  retry:
 653         slept = 0;
 654         spin_lock(&lru_list_lock);
 655         for(nlist = 0; nlist < NR_LIST; nlist++) {
 656                 bh = lru_list[nlist];
 657                 if (!bh)
 658                         continue;
 659                 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
 660                         bh_next = bh->b_next_free;
 661                         if (bh->b_dev != dev || bh->b_size == size)
 662                                 continue;
 663                         if (buffer_locked(bh)) {
 664                                 atomic_inc(&bh->b_count);
 665                                 spin_unlock(&lru_list_lock);
 666                                 wait_on_buffer(bh);
 667                                 slept = 1;
 668                                 spin_lock(&lru_list_lock);
 669                                 atomic_dec(&bh->b_count);
 670                         }
 671
 672                         write_lock(&hash_table_lock);
 673                         if (!atomic_read(&bh->b_count)) {
 674                                 if (buffer_dirty(bh))
 675                                         printk(KERN_WARNING
 676                                                "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
 677                                                kdevname(dev), bh->b_blocknr, bh->b_size);
 678                                 __remove_from_queues(bh);
 679                                 put_last_free(bh);
 680                         } else {
 681                                 if (atomic_set_buffer_clean(bh))
 682                                         __refile_buffer(bh);
 683                                 clear_bit(BH_Uptodate, &bh->b_state);
 684                                 printk(KERN_WARNING
 685                                        "set_blocksize: "
 686                                        "b_count %d, dev %s, block %lu, from %p\n",
 687                                        atomic_read(&bh->b_count), bdevname(bh->b_dev),
 688                                        bh->b_blocknr, __builtin_return_address(0));
 689                         }
 690                         write_unlock(&hash_table_lock);
 691                         if (slept)
 692                                 goto out;
 693                 }
 694         }
 695  out:
 696         spin_unlock(&lru_list_lock);
 697         if (slept)
 698                 goto retry;
 699 }
 700
 701 /*
 702  * We used to try various strange things. Let's not.
 703  */
 704 static void refill_freelist(int size)
 705 {
 706         if (!grow_buffers(size)) {
 707                 wakeup_bdflush(1);
 708                 current->policy |= SCHED_YIELD;
 709                 schedule();
 710         }
 711 }
 712
 713 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
 714 {
 715         bh->b_list = BUF_CLEAN;
 716         bh->b_end_io = handler;
 717         bh->b_private = private;
 718 }
 719
 720 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 721 {
 722         mark_buffer_uptodate(bh, uptodate);
 723         unlock_buffer(bh);
 724 }
 725
 726 static void end_buffer_io_bad(struct buffer_head *bh, int uptodate)
 727 {
 728         mark_buffer_uptodate(bh, uptodate);
 729         unlock_buffer(bh);
 730         BUG();
 731 }
 732
 733 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
 734 {
 735         static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
 736         unsigned long flags;
 737         struct buffer_head *tmp;
 738         struct page *page;
 739
 740         mark_buffer_uptodate(bh, uptodate);
 741
 742         /* This is a temporary buffer used for page I/O. */
 743         page = bh->b_page;
 744
 745         if (!uptodate)
 746                 SetPageError(page);
 747
 748         /*
 749          * Be _very_ careful from here on. Bad things can happen if
 750          * two buffer heads end IO at almost the same time and both
 751          * decide that the page is now completely done.
 752          *
 753          * Async buffer_heads are here only as labels for IO, and get
 754          * thrown away once the IO for this page is complete.  IO is
 755          * deemed complete once all buffers have been visited
 756          * (b_count==0) and are now unlocked. We must make sure that
 757          * only the _last_ buffer that decrements its count is the one
 758          * that unlock the page..
 759          */
 760         spin_lock_irqsave(&page_uptodate_lock, flags);
 761         unlock_buffer(bh);
 762         atomic_dec(&bh->b_count);
 763         tmp = bh->b_this_page;
 764         while (tmp != bh) {
 765                 if (tmp->b_end_io == end_buffer_io_async && buffer_locked(tmp))
 766                         goto still_busy;
 767                 tmp = tmp->b_this_page;
 768         }
 769
 770         /* OK, the async IO on this page is complete. */
 771         spin_unlock_irqrestore(&page_uptodate_lock, flags);
 772
 773         /*
 774          * if none of the buffers had errors then we can set the
 775          * page uptodate:
 776          */
 777         if (!PageError(page))
 778                 SetPageUptodate(page);
 779
 780         /*
 781          * Run the hooks that have to be done when a page I/O has completed.
 782          */
 783         if (PageTestandClearDecrAfter(page))
 784                 atomic_dec(&nr_async_pages);
 785
 786         UnlockPage(page);
 787
 788         return;
 789
 790 still_busy:
 791         spin_unlock_irqrestore(&page_uptodate_lock, flags);
 792         return;
 793 }
 794
 795 /*
 796  * Ok, this is getblk, and it isn't very clear, again to hinder
 797  * race-conditions. Most of the code is seldom used, (ie repeating),
 798  * so it should be much more efficient than it looks.
 799  *
 800  * The algorithm is changed: hopefully better, and an elusive bug removed.
 801  *
 802  * 14.02.92: changed it to sync dirty buffers a bit: better performance
 803  * when the filesystem starts to get full of dirty blocks (I hope).
 804  */
 805 struct buffer_head * getblk(kdev_t dev, int block, int size)
 806 {
 807         struct buffer_head * bh;
 808         int isize;
 809
 810 repeat:
 811         spin_lock(&lru_list_lock);
 812         write_lock(&hash_table_lock);
 813         bh = __get_hash_table(dev, block, size);
 814         if (bh)
 815                 goto out;
 816
 817         isize = BUFSIZE_INDEX(size);
 818         spin_lock(&free_list[isize].lock);
 819         bh = free_list[isize].list;
 820         if (bh) {
 821                 __remove_from_free_list(bh, isize);
 822                 atomic_set(&bh->b_count, 1);
 823         }
 824         spin_unlock(&free_list[isize].lock);
 825
 826         /*
 827          * OK, FINALLY we know that this buffer is the only one of
 828          * its kind, we hold a reference (b_count>0), it is unlocked,
 829          * and it is clean.
 830          */
 831         if (bh) {
 832                 init_buffer(bh, end_buffer_io_sync, NULL);
 833                 bh->b_dev = dev;
 834                 bh->b_blocknr = block;
 835                 bh->b_state = 1 << BH_Mapped;
 836
 837                 /* Insert the buffer into the regular lists */
 838                 __insert_into_queues(bh);
 839         out:
 840                 write_unlock(&hash_table_lock);
 841                 spin_unlock(&lru_list_lock);
 842                 touch_buffer(bh);
 843                 return bh;
 844         }
 845
 846         /*
 847          * If we block while refilling the free list, somebody may
 848          * create the buffer first ... search the hashes again.
 849          */
 850         write_unlock(&hash_table_lock);
 851         spin_unlock(&lru_list_lock);
 852         refill_freelist(size);
 853         goto repeat;
 854 }
 855
 856 /* -1 -> no need to flush
 857     0 -> async flush
 858     1 -> sync flush (wait for I/O completation) */
 859 static int balance_dirty_state(kdev_t dev)
 860 {
 861         unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
 862
 863         dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
 864         tot = nr_free_buffer_pages();
 865         tot -= size_buffers_type[BUF_PROTECTED] >> PAGE_SHIFT;
 866
 867         dirty *= 200;
 868         soft_dirty_limit = tot * bdf_prm.b_un.nfract;
 869         hard_dirty_limit = soft_dirty_limit * 2;
 870
 871         if (dirty > soft_dirty_limit) {
 872                 if (dirty > hard_dirty_limit)
 873                         return 1;
 874                 return 0;
 875         }
 876         return -1;
 877 }
 878
 879 /*
 880  * if a new dirty buffer is created we need to balance bdflush.
 881  *
 882  * in the future we might want to make bdflush aware of different
 883  * pressures on different devices - thus the (currently unused)
 884  * 'dev' parameter.
 885  */
 886 void balance_dirty(kdev_t dev)
 887 {
 888         int state = balance_dirty_state(dev);
 889
 890         if (state < 0)
 891                 return;
 892         wakeup_bdflush(state);
 893 }
 894
 895 static __inline__ void __mark_dirty(struct buffer_head *bh)
 896 {
 897         bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
 898         refile_buffer(bh);
 899 }
 900
 901 /* atomic version, the user must call balance_dirty() by hand
 902    as soon as it become possible to block */
 903 void __mark_buffer_dirty(struct buffer_head *bh)
 904 {
 905         if (!atomic_set_buffer_dirty(bh))
 906                 __mark_dirty(bh);
 907 }
 908
 909 void mark_buffer_dirty(struct buffer_head *bh)
 910 {
 911         __mark_buffer_dirty(bh);
 912         balance_dirty(bh->b_dev);
 913 }
 914
 915 /*
 916  * A buffer may need to be moved from one buffer list to another
 917  * (e.g. in case it is not shared any more). Handle this.
 918  */
 919 static void __refile_buffer(struct buffer_head *bh)
 920 {
 921         int dispose = BUF_CLEAN;
 922         if (buffer_locked(bh))
 923                 dispose = BUF_LOCKED;
 924         if (buffer_dirty(bh))
 925                 dispose = BUF_DIRTY;
 926         if (buffer_protected(bh))
 927                 dispose = BUF_PROTECTED;
 928         if (dispose != bh->b_list) {
 929                 __remove_from_lru_list(bh, bh->b_list);
 930                 bh->b_list = dispose;
 931                 __insert_into_lru_list(bh, dispose);
 932         }
 933 }
 934
 935 void refile_buffer(struct buffer_head *bh)
 936 {
 937         spin_lock(&lru_list_lock);
 938         __refile_buffer(bh);
 939         spin_unlock(&lru_list_lock);
 940 }
 941
 942 /*
 943  * Release a buffer head
 944  */
 945 void __brelse(struct buffer_head * buf)
 946 {
 947         if (atomic_read(&buf->b_count)) {
 948                 atomic_dec(&buf->b_count);
 949                 return;
 950         }
 951         printk("VFS: brelse: Trying to free free buffer\n");
 952 }
 953
 954 /*
 955  * bforget() is like brelse(), except it puts the buffer on the
 956  * free list if it can.. We can NOT free the buffer if:
 957  *  - there are other users of it
 958  *  - it is locked and thus can have active IO
 959  */
 960 void __bforget(struct buffer_head * buf)
 961 {
 962         /* grab the lru lock here to block bdflush. */
 963         spin_lock(&lru_list_lock);
 964         write_lock(&hash_table_lock);
 965         if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf))
 966                 goto in_use;
 967         __hash_unlink(buf);
 968         write_unlock(&hash_table_lock);
 969         __remove_from_lru_list(buf, buf->b_list);
 970         spin_unlock(&lru_list_lock);
 971         put_last_free(buf);
 972         return;
 973
 974  in_use:
 975         write_unlock(&hash_table_lock);
 976         spin_unlock(&lru_list_lock);
 977 }
 978
 979 /*
 980  * bread() reads a specified block and returns the buffer that contains
 981  * it. It returns NULL if the block was unreadable.
 982  */
 983 struct buffer_head * bread(kdev_t dev, int block, int size)
 984 {
 985         struct buffer_head * bh;
 986
 987         bh = getblk(dev, block, size);
 988         if (buffer_uptodate(bh))
 989                 return bh;
 990         ll_rw_block(READ, 1, &bh);
 991         wait_on_buffer(bh);
 992         if (buffer_uptodate(bh))
 993                 return bh;
 994         brelse(bh);
 995         return NULL;
 996 }
 997
 998 /*
 999  * Ok, breada can be used as bread, but additionally to mark other
1000  * blocks for reading as well. End the argument list with a negative
1001  * number.
1002  */
1003
1004 #define NBUF 16
1005
1006 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
1007         unsigned int pos, unsigned int filesize)
1008 {
1009         struct buffer_head * bhlist[NBUF];
1010         unsigned int blocks;
1011         struct buffer_head * bh;
1012         int index;
1013         int i, j;
1014
1015         if (pos >= filesize)
1016                 return NULL;
1017
1018         if (block < 0)
1019                 return NULL;
1020
1021         bh = getblk(dev, block, bufsize);
1022         index = BUFSIZE_INDEX(bh->b_size);
1023
1024         if (buffer_uptodate(bh))
1025                 return(bh);
1026         else ll_rw_block(READ, 1, &bh);
1027
1028         blocks = (filesize - pos) >> (9+index);
1029
1030         if (blocks < (read_ahead[MAJOR(dev)] >> index))
1031                 blocks = read_ahead[MAJOR(dev)] >> index;
1032         if (blocks > NBUF)
1033                 blocks = NBUF;
1034
1035 /*      if (blocks) printk("breada (new) %d blocks\n",blocks); */
1036
1037         bhlist[0] = bh;
1038         j = 1;
1039         for(i=1; i<blocks; i++) {
1040                 bh = getblk(dev,block+i,bufsize);
1041                 if (buffer_uptodate(bh)) {
1042                         brelse(bh);
1043                         break;
1044                 }
1045                 else bhlist[j++] = bh;
1046         }
1047
1048         /* Request the read for these buffers, and then release them. */
1049         if (j>1)
1050                 ll_rw_block(READA, (j-1), bhlist+1);
1051         for(i=1; i<j; i++)
1052                 brelse(bhlist[i]);
1053
1054         /* Wait for this buffer, and then continue on. */
1055         bh = bhlist[0];
1056         wait_on_buffer(bh);
1057         if (buffer_uptodate(bh))
1058                 return bh;
1059         brelse(bh);
1060         return NULL;
1061 }
1062
1063 /*
1064  * Note: the caller should wake up the buffer_wait list if needed.
1065  */
1066 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
1067 {
1068         if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1069                 kmem_cache_free(bh_cachep, bh);
1070         } else {
1071                 bh->b_blocknr = -1;
1072                 init_waitqueue_head(&bh->b_wait);
1073                 nr_unused_buffer_heads++;
1074                 bh->b_next_free = unused_list;
1075                 bh->b_this_page = NULL;
1076                 unused_list = bh;
1077         }
1078 }
1079
1080 /*
1081  * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1082  * no-buffer-head deadlock.  Return NULL on failure; waiting for
1083  * buffer heads is now handled in create_buffers().
1084  */
1085 static struct buffer_head * get_unused_buffer_head(int async)
1086 {
1087         struct buffer_head * bh;
1088
1089         spin_lock(&unused_list_lock);
1090         if (nr_unused_buffer_heads > NR_RESERVED) {
1091                 bh = unused_list;
1092                 unused_list = bh->b_next_free;
1093                 nr_unused_buffer_heads--;
1094                 spin_unlock(&unused_list_lock);
1095                 return bh;
1096         }
1097         spin_unlock(&unused_list_lock);
1098
1099         /* This is critical.  We can't swap out pages to get
1100          * more buffer heads, because the swap-out may need
1101          * more buffer-heads itself.  Thus SLAB_BUFFER.
1102          */
1103         if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
1104                 memset(bh, 0, sizeof(*bh));
1105                 init_waitqueue_head(&bh->b_wait);
1106                 return bh;
1107         }
1108
1109         /*
1110          * If we need an async buffer, use the reserved buffer heads.
1111          */
1112         if (async) {
1113                 spin_lock(&unused_list_lock);
1114                 if (unused_list) {
1115                         bh = unused_list;
1116                         unused_list = bh->b_next_free;
1117                         nr_unused_buffer_heads--;
1118                         spin_unlock(&unused_list_lock);
1119                         return bh;
1120                 }
1121                 spin_unlock(&unused_list_lock);
1122         }
1123 #if 0
1124         /*
1125          * (Pending further analysis ...)
1126          * Ordinary (non-async) requests can use a different memory priority
1127          * to free up pages. Any swapping thus generated will use async
1128          * buffer heads.
1129          */
1130         if(!async &&
1131            (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1132                 memset(bh, 0, sizeof(*bh));
1133                 init_waitqueue_head(&bh->b_wait);
1134                 return bh;
1135         }
1136 #endif
1137
1138         return NULL;
1139 }
1140
1141 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1142 {
1143         bh->b_page = page;
1144         if (offset >= PAGE_SIZE)
1145                 BUG();
1146         if (PageHighMem(page))
1147                 /*
1148                  * This catches illegal uses and preserves the offset:
1149                  */
1150                 bh->b_data = (char *)(0 + offset);
1151         else
1152                 bh->b_data = page_address(page) + offset;
1153 }
1154
1155 /*
1156  * Create the appropriate buffers when given a page for data area and
1157  * the size of each buffer.. Use the bh->b_this_page linked list to
1158  * follow the buffers created.  Return NULL if unable to create more
1159  * buffers.
1160  * The async flag is used to differentiate async IO (paging, swapping)
1161  * from ordinary buffer allocations, and only async requests are allowed
1162  * to sleep waiting for buffer heads.
1163  */
1164 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1165 {
1166         struct buffer_head *bh, *head;
1167         long offset;
1168
1169 try_again:
1170         head = NULL;
1171         offset = PAGE_SIZE;
1172         while ((offset -= size) >= 0) {
1173                 bh = get_unused_buffer_head(async);
1174                 if (!bh)
1175                         goto no_grow;
1176
1177                 bh->b_dev = B_FREE;  /* Flag as unused */
1178                 bh->b_this_page = head;
1179                 head = bh;
1180
1181                 bh->b_state = 0;
1182                 bh->b_next_free = NULL;
1183                 bh->b_pprev = NULL;
1184                 atomic_set(&bh->b_count, 0);
1185                 bh->b_size = size;
1186
1187                 set_bh_page(bh, page, offset);
1188
1189                 bh->b_list = BUF_CLEAN;
1190                 bh->b_end_io = end_buffer_io_bad;
1191         }
1192         return head;
1193 /*
1194  * In case anything failed, we just free everything we got.
1195  */
1196 no_grow:
1197         if (head) {
1198                 spin_lock(&unused_list_lock);
1199                 do {
1200                         bh = head;
1201                         head = head->b_this_page;
1202                         __put_unused_buffer_head(bh);
1203                 } while (head);
1204                 spin_unlock(&unused_list_lock);
1205
1206                 /* Wake up any waiters ... */
1207                 wake_up(&buffer_wait);
1208         }
1209
1210         /*
1211          * Return failure for non-async IO requests.  Async IO requests
1212          * are not allowed to fail, so we have to wait until buffer heads
1213          * become available.  But we don't want tasks sleeping with
1214          * partially complete buffers, so all were released above.
1215          */
1216         if (!async)
1217                 return NULL;
1218
1219         /* We're _really_ low on memory. Now we just
1220          * wait for old buffer heads to become free due to
1221          * finishing IO.  Since this is an async request and
1222          * the reserve list is empty, we're sure there are
1223          * async buffer heads in use.
1224          */
1225         run_task_queue(&tq_disk);
1226
1227         /*
1228          * Set our state for sleeping, then check again for buffer heads.
1229          * This ensures we won't miss a wake_up from an interrupt.
1230          */
1231         wait_event(buffer_wait, nr_unused_buffer_heads >= MAX_BUF_PER_PAGE);
1232         goto try_again;
1233 }
1234
1235 static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size)
1236 {
1237         struct buffer_head *head, *bh, *tail;
1238         int block;
1239
1240         if (!PageLocked(page))
1241                 BUG();
1242         /*
1243          * Allocate async buffer heads pointing to this page, just for I/O.
1244          * They don't show up in the buffer hash table, but they *are*
1245          * registered in page->buffers.
1246          */
1247         head = create_buffers(page, size, 1);
1248         if (page->buffers)
1249                 BUG();
1250         if (!head)
1251                 BUG();
1252         tail = head;
1253         for (bh = head; bh; bh = bh->b_this_page) {
1254                 block = *(b++);
1255
1256                 tail = bh;
1257                 init_buffer(bh, end_buffer_io_async, NULL);
1258                 bh->b_dev = dev;
1259                 bh->b_blocknr = block;
1260
1261                 set_bit(BH_Mapped, &bh->b_state);
1262         }
1263         tail->b_this_page = head;
1264         page_cache_get(page);
1265         page->buffers = head;
1266         return 0;
1267 }
1268
1269 static void unmap_buffer(struct buffer_head * bh)
1270 {
1271         if (buffer_mapped(bh)) {
1272                 mark_buffer_clean(bh);
1273                 wait_on_buffer(bh);
1274                 clear_bit(BH_Uptodate, &bh->b_state);
1275                 clear_bit(BH_Mapped, &bh->b_state);
1276                 clear_bit(BH_Req, &bh->b_state);
1277                 clear_bit(BH_New, &bh->b_state);
1278         }
1279 }
1280
1281 /*
1282  * We don't have to release all buffers here, but
1283  * we have to be sure that no dirty buffer is left
1284  * and no IO is going on (no buffer is locked), because
1285  * we have truncated the file and are going to free the
1286  * blocks on-disk..
1287  */
1288 int block_flushpage(struct page *page, unsigned long offset)
1289 {
1290         struct buffer_head *head, *bh, *next;
1291         unsigned int curr_off = 0;
1292
1293         if (!PageLocked(page))
1294                 BUG();
1295         if (!page->buffers)
1296                 return 1;
1297
1298         head = page->buffers;
1299         bh = head;
1300         do {
1301                 unsigned int next_off = curr_off + bh->b_size;
1302                 next = bh->b_this_page;
1303
1304                 /*
1305                  * is this block fully flushed?
1306                  */
1307                 if (offset <= curr_off)
1308                         unmap_buffer(bh);
1309                 curr_off = next_off;
1310                 bh = next;
1311         } while (bh != head);
1312
1313         /*
1314          * subtle. We release buffer-heads only if this is
1315          * the 'final' flushpage. We have invalidated the get_block
1316          * cached value unconditionally, so real IO is not
1317          * possible anymore.
1318          *
1319          * If the free doesn't work out, the buffers can be
1320          * left around - they just turn into anonymous buffers
1321          * instead.
1322          */
1323         if (!offset) {
1324                 if (!try_to_free_buffers(page, 0)) {
1325                         atomic_inc(&buffermem_pages);
1326                         return 0;
1327                 }
1328         }
1329
1330         return 1;
1331 }
1332
1333 static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
1334 {
1335         struct buffer_head *bh, *head, *tail;
1336
1337         head = create_buffers(page, blocksize, 1);
1338         if (page->buffers)
1339                 BUG();
1340
1341         bh = head;
1342         do {
1343                 bh->b_dev = inode->i_dev;
1344                 bh->b_blocknr = 0;
1345                 bh->b_end_io = end_buffer_io_bad;
1346                 tail = bh;
1347                 bh = bh->b_this_page;
1348         } while (bh);
1349         tail->b_this_page = head;
1350         page->buffers = head;
1351         page_cache_get(page);
1352 }
1353
1354 /*
1355  * We are taking a block for data and we don't want any output from any
1356  * buffer-cache aliases starting from return from that function and
1357  * until the moment when something will explicitly mark the buffer
1358  * dirty (hopefully that will not happen until we will free that block ;-)
1359  * We don't even need to mark it not-uptodate - nobody can expect
1360  * anything from a newly allocated buffer anyway. We used to used
1361  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1362  * don't want to mark the alias unmapped, for example - it would confuse
1363  * anyone who might pick it with bread() afterwards...
1364  */
1365
1366 static void unmap_underlying_metadata(struct buffer_head * bh)
1367 {
1368         struct buffer_head *old_bh;
1369
1370         old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1371         if (old_bh) {
1372                 mark_buffer_clean(old_bh);
1373                 wait_on_buffer(old_bh);
1374                 clear_bit(BH_Req, &old_bh->b_state);
1375                 /* Here we could run brelse or bforget. We use
1376                    bforget because it will try to put the buffer
1377                    in the freelist. */
1378                 __bforget(old_bh);
1379         }
1380 }
1381
1382 /*
1383  * block_write_full_page() is SMP-safe - currently it's still
1384  * being called with the kernel lock held, but the code is ready.
1385  */
1386 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1387 {
1388         int err, i, need_balance_dirty = 0;
1389         unsigned long block;
1390         struct buffer_head *bh, *head;
1391
1392         if (!PageLocked(page))
1393                 BUG();
1394
1395         if (!page->buffers)
1396                 create_empty_buffers(page, inode, inode->i_sb->s_blocksize);
1397         head = page->buffers;
1398
1399         block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1400
1401         bh = head;
1402         i = 0;
1403         do {
1404                 /*
1405                  * If the buffer isn't up-to-date, we can't be sure
1406                  * that the buffer has been initialized with the proper
1407                  * block number information etc..
1408                  *
1409                  * Leave it to the low-level FS to make all those
1410                  * decisions (block #0 may actually be a valid block)
1411                  */
1412                 bh->b_end_io = end_buffer_io_sync;
1413                 if (!buffer_mapped(bh)) {
1414                         err = get_block(inode, block, bh, 1);
1415                         if (err)
1416                                 goto out;
1417                         if (buffer_new(bh))
1418                                 unmap_underlying_metadata(bh);
1419                 }
1420                 set_bit(BH_Uptodate, &bh->b_state);
1421                 if (!atomic_set_buffer_dirty(bh)) {
1422                         __mark_dirty(bh);
1423                         need_balance_dirty = 1;
1424                 }
1425
1426                 bh = bh->b_this_page;
1427                 block++;
1428         } while (bh != head);
1429
1430         if (need_balance_dirty)
1431                 balance_dirty(bh->b_dev);
1432
1433         SetPageUptodate(page);
1434         return 0;
1435 out:
1436         ClearPageUptodate(page);
1437         return err;
1438 }
1439
1440 static int __block_prepare_write(struct inode *inode, struct page *page,
1441                 unsigned from, unsigned to, get_block_t *get_block)
1442 {
1443         unsigned block_start, block_end;
1444         unsigned long block;
1445         int err = 0;
1446         unsigned blocksize, bbits;
1447         struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1448         char *kaddr = (char *)kmap(page);
1449
1450         blocksize = inode->i_sb->s_blocksize;
1451         if (!page->buffers)
1452                 create_empty_buffers(page, inode, blocksize);
1453         head = page->buffers;
1454
1455         bbits = inode->i_sb->s_blocksize_bits;
1456         block = page->index << (PAGE_CACHE_SHIFT - bbits);
1457
1458         for(bh = head, block_start = 0; bh != head || !block_start;
1459             block++, block_start=block_end, bh = bh->b_this_page) {
1460                 if (!bh)
1461                         BUG();
1462                 block_end = block_start+blocksize;
1463                 if (block_end <= from)
1464                         continue;
1465                 if (block_start >= to)
1466                         break;
1467                 bh->b_end_io = end_buffer_io_sync;
1468                 if (!buffer_mapped(bh)) {
1469                         err = get_block(inode, block, bh, 1);
1470                         if (err)
1471                                 goto out;
1472                         if (buffer_new(bh)) {
1473                                 unmap_underlying_metadata(bh);
1474                                 if (block_end > to)
1475                                         memset(kaddr+to, 0, block_end-to);
1476                                 if (block_start < from)
1477                                         memset(kaddr+block_start, 0, from-block_start);
1478                                 if (block_end > to || block_start < from)
1479                                         flush_dcache_page(page);
1480                                 continue;
1481                         }
1482                 }
1483                 if (!buffer_uptodate(bh) &&
1484                      (block_start < from || block_end > to)) {
1485                         ll_rw_block(READ, 1, &bh);
1486                         *wait_bh++=bh;
1487                 }
1488         }
1489         /*
1490          * If we issued read requests - let them complete.
1491          */
1492         while(wait_bh > wait) {
1493                 wait_on_buffer(*--wait_bh);
1494                 err = -EIO;
1495                 if (!buffer_uptodate(*wait_bh))
1496                         goto out;
1497         }
1498         return 0;
1499 out:
1500         return err;
1501 }
1502
1503 static int __block_commit_write(struct inode *inode, struct page *page,
1504                 unsigned from, unsigned to)
1505 {
1506         unsigned block_start, block_end;
1507         int partial = 0, need_balance_dirty = 0;
1508         unsigned blocksize;
1509         struct buffer_head *bh, *head;
1510
1511         blocksize = inode->i_sb->s_blocksize;
1512
1513         for(bh = head = page->buffers, block_start = 0;
1514             bh != head || !block_start;
1515             block_start=block_end, bh = bh->b_this_page) {
1516                 block_end = block_start + blocksize;
1517                 if (block_end <= from || block_start >= to) {
1518                         if (!buffer_uptodate(bh))
1519                                 partial = 1;
1520                 } else {
1521                         set_bit(BH_Uptodate, &bh->b_state);
1522                         if (!atomic_set_buffer_dirty(bh)) {
1523                                 __mark_dirty(bh);
1524                                 need_balance_dirty = 1;
1525                         }
1526                 }
1527         }
1528
1529         if (need_balance_dirty)
1530                 balance_dirty(bh->b_dev);
1531         /*
1532          * is this a partial write that happened to make all buffers
1533          * uptodate then we can optimize away a bogus readpage() for
1534          * the next read(). Here we 'discover' wether the page went
1535          * uptodate as a result of this (potentially partial) write.
1536          */
1537         if (!partial)
1538                 SetPageUptodate(page);
1539         return 0;
1540 }
1541
1542 /*
1543  * Generic "read page" function for block devices that have the normal
1544  * get_block functionality. This is most of the block device filesystems.
1545  * Reads the page asynchronously --- the unlock_buffer() and
1546  * mark_buffer_uptodate() functions propagate buffer state into the
1547  * page struct once IO has completed.
1548  */
1549 int block_read_full_page(struct page *page, get_block_t *get_block)
1550 {
1551         struct inode *inode = (struct inode*)page->mapping->host;
1552         unsigned long iblock, lblock;
1553         struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1554         unsigned int blocksize, blocks;
1555         unsigned long kaddr = 0;
1556         int nr, i;
1557
1558         if (!PageLocked(page))
1559                 PAGE_BUG(page);
1560         blocksize = inode->i_sb->s_blocksize;
1561         if (!page->buffers)
1562                 create_empty_buffers(page, inode, blocksize);
1563         head = page->buffers;
1564
1565         blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits;
1566         iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1567         lblock = (inode->i_size+blocksize-1) >> inode->i_sb->s_blocksize_bits;
1568         bh = head;
1569         nr = 0;
1570         i = 0;
1571
1572         do {
1573                 if (buffer_uptodate(bh))
1574                         continue;
1575
1576                 if (!buffer_mapped(bh)) {
1577                         if (iblock < lblock)
1578                                 get_block(inode, iblock, bh, 0);
1579                         if (!buffer_mapped(bh)) {
1580                                 if (!kaddr)
1581                                         kaddr = kmap(page);
1582                                 memset((char *)(kaddr + i*blocksize), 0, blocksize);
1583                                 flush_dcache_page(page);
1584                                 set_bit(BH_Uptodate, &bh->b_state);
1585                                 continue;
1586                         }
1587                 }
1588
1589                 init_buffer(bh, end_buffer_io_async, NULL);
1590                 atomic_inc(&bh->b_count);
1591                 arr[nr] = bh;
1592                 nr++;
1593         } while (i++, iblock++, (bh = bh->b_this_page) != head);
1594
1595         if (nr) {
1596                 if (Page_Uptodate(page))
1597                         BUG();
1598                 ll_rw_block(READ, nr, arr);
1599         } else {
1600                 /*
1601                  * all buffers are uptodate - we can set the page
1602                  * uptodate as well.
1603                  */
1604                 SetPageUptodate(page);
1605                 UnlockPage(page);
1606         }
1607         if (kaddr)
1608                 kunmap(page);
1609         return 0;
1610 }
1611
1612 /*
1613  * For moronic filesystems that do not allow holes in file.
1614  * We may have to extend the file.
1615  */
1616
1617 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1618 {
1619         struct address_space *mapping = page->mapping;
1620         struct inode *inode = (struct inode*)mapping->host;
1621         struct page *new_page;
1622         unsigned long pgpos;
1623         long status;
1624         unsigned zerofrom;
1625         unsigned blocksize = inode->i_sb->s_blocksize;
1626         char *kaddr;
1627
1628         while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1629                 status = -ENOMEM;
1630                 new_page = grab_cache_page(mapping, pgpos);
1631                 if (!new_page)
1632                         goto out;
1633                 /* we might sleep */
1634                 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1635                         UnlockPage(new_page);
1636                         page_cache_release(new_page);
1637                         continue;
1638                 }
1639                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1640                 if (zerofrom & (blocksize-1)) {
1641                         *bytes |= (blocksize-1);
1642                         (*bytes)++;
1643                 }
1644                 status = __block_prepare_write(inode, new_page, zerofrom,
1645                                                 PAGE_CACHE_SIZE, get_block);
1646                 if (status)
1647                         goto out_unmap;
1648                 kaddr = page_address(new_page);
1649                 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1650                 flush_dcache_page(new_page);
1651                 __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
1652                 kunmap(new_page);
1653                 UnlockPage(new_page);
1654                 page_cache_release(new_page);
1655         }
1656
1657         if (page->index < pgpos) {
1658                 /* completely inside the area */
1659                 zerofrom = offset;
1660         } else {
1661                 /* page covers the boundary, find the boundary offset */
1662                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1663
1664                 /* if we will expand the thing last block will be filled */
1665                 if (to > zerofrom && (zerofrom & (blocksize-1))) {
1666                         *bytes |= (blocksize-1);
1667                         (*bytes)++;
1668                 }
1669
1670                 /* starting below the boundary? Nothing to zero out */
1671                 if (offset <= zerofrom)
1672                         zerofrom = offset;
1673         }
1674         status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1675         if (status)
1676                 goto out1;
1677         kaddr = page_address(page);
1678         if (zerofrom < offset) {
1679                 memset(kaddr+zerofrom, 0, offset-zerofrom);
1680                 flush_dcache_page(page);
1681                 __block_commit_write(inode, page, zerofrom, offset);
1682         }
1683         return 0;
1684 out1:
1685         ClearPageUptodate(page);
1686         kunmap(page);
1687         return status;
1688
1689 out_unmap:
1690         ClearPageUptodate(new_page);
1691         kunmap(new_page);
1692         UnlockPage(new_page);
1693         page_cache_release(new_page);
1694 out:
1695         return status;
1696 }
1697
1698 int block_prepare_write(struct page *page, unsigned from, unsigned to,
1699                         get_block_t *get_block)
1700 {
1701         struct inode *inode = (struct inode*)page->mapping->host;
1702         int err = __block_prepare_write(inode, page, from, to, get_block);
1703         if (err) {
1704                 ClearPageUptodate(page);
1705                 kunmap(page);
1706         }
1707         return err;
1708 }
1709
1710 int generic_commit_write(struct file *file, struct page *page,
1711                 unsigned from, unsigned to)
1712 {
1713         struct inode *inode = (struct inode*)page->mapping->host;
1714         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1715         __block_commit_write(inode,page,from,to);
1716         kunmap(page);
1717         if (pos > inode->i_size) {
1718                 inode->i_size = pos;
1719                 mark_inode_dirty(inode);
1720         }
1721         return 0;
1722 }
1723
1724 int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block)
1725 {
1726         unsigned long index = from >> PAGE_CACHE_SHIFT;
1727         unsigned offset = from & (PAGE_CACHE_SIZE-1);
1728         unsigned blocksize, iblock, length, pos;
1729         struct inode *inode = (struct inode *)mapping->host;
1730         struct page *page;
1731         struct buffer_head *bh;
1732         int err;
1733
1734         blocksize = inode->i_sb->s_blocksize;
1735         length = offset & (blocksize - 1);
1736
1737         /* Block boundary? Nothing to do */
1738         if (!length)
1739                 return 0;
1740
1741         length = blocksize - length;
1742         iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1743
1744         page = grab_cache_page(mapping, index);
1745         err = PTR_ERR(page);
1746         if (IS_ERR(page))
1747                 goto out;
1748
1749         if (!page->buffers)
1750                 create_empty_buffers(page, inode, blocksize);
1751
1752         /* Find the buffer that contains "offset" */
1753         bh = page->buffers;
1754         pos = blocksize;
1755         while (offset >= pos) {
1756                 bh = bh->b_this_page;
1757                 iblock++;
1758                 pos += blocksize;
1759         }
1760
1761         if (!buffer_uptodate(bh)) {
1762                 err = 0;
1763                 if (!buffer_mapped(bh)) {
1764                         get_block(inode, iblock, bh, 0);
1765                         if (!buffer_mapped(bh))
1766                                 goto unlock;
1767                 }
1768                 err = -EIO;
1769                 bh->b_end_io = end_buffer_io_sync;
1770                 ll_rw_block(READ, 1, &bh);
1771                 wait_on_buffer(bh);
1772                 if (!buffer_uptodate(bh))
1773                         goto unlock;
1774         }
1775
1776         memset((char *) kmap(page) + offset, 0, length);
1777         flush_dcache_page(page);
1778         kunmap(page);
1779
1780         mark_buffer_dirty(bh);
1781         err = 0;
1782
1783 unlock:
1784         UnlockPage(page);
1785         page_cache_release(page);
1786 out:
1787         return err;
1788 }
1789
1790 int block_write_full_page(struct page *page, get_block_t *get_block)
1791 {
1792         struct inode *inode = (struct inode*)page->mapping->host;
1793         unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1794         unsigned offset;
1795         int err;
1796
1797         /* easy case */
1798         if (page->index < end_index)
1799                 return __block_write_full_page(inode, page, get_block);
1800
1801         /* things got complicated... */
1802         offset = inode->i_size & (PAGE_CACHE_SIZE-1);
1803         /* OK, are we completely out? */
1804         if (page->index >= end_index+1 || !offset)
1805                 return -EIO;
1806         /* Sigh... will have to work, then... */
1807         err = __block_prepare_write(inode, page, 0, offset, get_block);
1808         if (!err) {
1809                 memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset);
1810                 flush_dcache_page(page);
1811                 __block_commit_write(inode,page,0,offset);
1812 done:
1813                 kunmap(page);
1814                 return err;
1815         }
1816         ClearPageUptodate(page);
1817         goto done;
1818 }
1819
1820 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
1821 {
1822         struct buffer_head tmp;
1823         struct inode *inode = (struct inode*)mapping->host;
1824         tmp.b_state = 0;
1825         tmp.b_blocknr = 0;
1826         get_block(inode, block, &tmp, 0);
1827         return tmp.b_blocknr;
1828 }
1829
1830 /*
1831  * IO completion routine for a buffer_head being used for kiobuf IO: we
1832  * can't dispatch the kiobuf callback until io_count reaches 0.
1833  */
1834
1835 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
1836 {
1837         struct kiobuf *kiobuf;
1838
1839         mark_buffer_uptodate(bh, uptodate);
1840
1841         kiobuf = bh->b_private;
1842         unlock_buffer(bh);
1843         end_kio_request(kiobuf, uptodate);
1844 }
1845
1846
1847 /*
1848  * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
1849  * for them to complete.  Clean up the buffer_heads afterwards.
1850  */
1851
1852 static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size)
1853 {
1854         int iosize;
1855         int i;
1856         struct buffer_head *tmp;
1857
1858
1859         iosize = 0;
1860         spin_lock(&unused_list_lock);
1861
1862         for (i = nr; --i >= 0; ) {
1863                 iosize += size;
1864                 tmp = bh[i];
1865                 if (buffer_locked(tmp)) {
1866                         spin_unlock(&unused_list_lock);
1867                         wait_on_buffer(tmp);
1868                         spin_lock(&unused_list_lock);
1869                 }
1870
1871                 if (!buffer_uptodate(tmp)) {
1872                         /* We are traversing bh'es in reverse order so
1873                            clearing iosize on error calculates the
1874                            amount of IO before the first error. */
1875                         iosize = 0;
1876                 }
1877                 __put_unused_buffer_head(tmp);
1878         }
1879
1880         spin_unlock(&unused_list_lock);
1881
1882         return iosize;
1883 }
1884
1885 /*
1886  * Start I/O on a physical range of kernel memory, defined by a vector
1887  * of kiobuf structs (much like a user-space iovec list).
1888  *
1889  * The kiobuf must already be locked for IO.  IO is submitted
1890  * asynchronously: you need to check page->locked, page->uptodate, and
1891  * maybe wait on page->wait.
1892  *
1893  * It is up to the caller to make sure that there are enough blocks
1894  * passed in to completely map the iobufs to disk.
1895  */
1896
1897 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
1898                kdev_t dev, unsigned long b[], int size)
1899 {
1900         int             err;
1901         int             length;
1902         int             transferred;
1903         int             i;
1904         int             bufind;
1905         int             pageind;
1906         int             bhind;
1907         int             offset;
1908         int             sectors = size>>9;
1909         unsigned long   blocknr;
1910         struct kiobuf * iobuf = NULL;
1911         struct page *   map;
1912         struct buffer_head *tmp, *bh[KIO_MAX_SECTORS];
1913
1914         if (!nr)
1915                 return 0;
1916
1917         /*
1918          * First, do some alignment and validity checks
1919          */
1920         for (i = 0; i < nr; i++) {
1921                 iobuf = iovec[i];
1922                 if ((iobuf->offset & (size-1)) ||
1923                     (iobuf->length & (size-1)))
1924                         return -EINVAL;
1925                 if (!iobuf->nr_pages)
1926                         panic("brw_kiovec: iobuf not initialised");
1927         }
1928
1929         /*
1930          * OK to walk down the iovec doing page IO on each page we find.
1931          */
1932         bufind = bhind = transferred = err = 0;
1933         for (i = 0; i < nr; i++) {
1934                 iobuf = iovec[i];
1935                 offset = iobuf->offset;
1936                 length = iobuf->length;
1937                 iobuf->errno = 0;
1938
1939                 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
1940                         map  = iobuf->maplist[pageind];
1941                         if (!map) {
1942                                 err = -EFAULT;
1943                                 goto error;
1944                         }
1945
1946                         while (length > 0) {
1947                                 blocknr = b[bufind++];
1948                                 tmp = get_unused_buffer_head(0);
1949                                 if (!tmp) {
1950                                         err = -ENOMEM;
1951                                         goto error;
1952                                 }
1953
1954                                 tmp->b_dev = B_FREE;
1955                                 tmp->b_size = size;
1956                                 set_bh_page(tmp, map, offset);
1957                                 tmp->b_this_page = tmp;
1958
1959                                 init_buffer(tmp, end_buffer_io_kiobuf, iobuf);
1960                                 tmp->b_rdev = tmp->b_dev = dev;
1961                                 tmp->b_blocknr = blocknr;
1962                                 tmp->b_rsector = blocknr*sectors;
1963                                 tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req);
1964
1965                                 if (rw == WRITE) {
1966                                         set_bit(BH_Uptodate, &tmp->b_state);
1967                                         set_bit(BH_Dirty, &tmp->b_state);
1968                                 }
1969
1970                                 bh[bhind++] = tmp;
1971                                 length -= size;
1972                                 offset += size;
1973
1974                                 atomic_inc(&iobuf->io_count);
1975
1976                                 generic_make_request(rw, tmp);
1977                                 /*
1978                                  * Wait for IO if we have got too much
1979                                  */
1980                                 if (bhind >= KIO_MAX_SECTORS) {
1981                                         err = wait_kio(rw, bhind, bh, size);
1982                                         if (err >= 0)
1983                                                 transferred += err;
1984                                         else
1985                                                 goto finished;
1986                                         bhind = 0;
1987                                 }
1988
1989                                 if (offset >= PAGE_SIZE) {
1990                                         offset = 0;
1991                                         break;
1992                                 }
1993                         } /* End of block loop */
1994                 } /* End of page loop */
1995         } /* End of iovec loop */
1996
1997         /* Is there any IO still left to submit? */
1998         if (bhind) {
1999                 err = wait_kio(rw, bhind, bh, size);
2000                 if (err >= 0)
2001                         transferred += err;
2002                 else
2003                         goto finished;
2004         }
2005
2006  finished:
2007         if (transferred)
2008                 return transferred;
2009         return err;
2010
2011  error:
2012         /* We got an error allocating the bh'es.  Just free the current
2013            buffer_heads and exit. */
2014         spin_lock(&unused_list_lock);
2015         for (i = bhind; --i >= 0; ) {
2016                 __put_unused_buffer_head(bh[bhind]);
2017         }
2018         spin_unlock(&unused_list_lock);
2019         goto finished;
2020 }
2021
2022 /*
2023  * Start I/O on a page.
2024  * This function expects the page to be locked and may return
2025  * before I/O is complete. You then have to check page->locked,
2026  * page->uptodate, and maybe wait on page->wait.
2027  *
2028  * brw_page() is SMP-safe, although it's being called with the
2029  * kernel lock held - but the code is ready.
2030  *
2031  * FIXME: we need a swapper_inode->get_block function to remove
2032  *        some of the bmap kludges and interface ugliness here.
2033  */
2034 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
2035 {
2036         struct buffer_head *head, *bh, *arr[MAX_BUF_PER_PAGE];
2037         int nr, fresh /* temporary debugging flag */, block;
2038
2039         if (!PageLocked(page))
2040                 panic("brw_page: page not locked for I/O");
2041 //      ClearPageError(page);
2042         /*
2043          * We pretty much rely on the page lock for this, because
2044          * create_page_buffers() might sleep.
2045          */
2046         fresh = 0;
2047         if (!page->buffers) {
2048                 create_page_buffers(rw, page, dev, b, size);
2049                 fresh = 1;
2050         }
2051         if (!page->buffers)
2052                 BUG();
2053
2054         head = page->buffers;
2055         bh = head;
2056         nr = 0;
2057         do {
2058                 block = *(b++);
2059
2060                 if (fresh && (atomic_read(&bh->b_count) != 0))
2061                         BUG();
2062                 if (rw == READ) {
2063                         if (!fresh)
2064                                 BUG();
2065                         if (!buffer_uptodate(bh)) {
2066                                 arr[nr++] = bh;
2067                                 atomic_inc(&bh->b_count);
2068                         }
2069                 } else { /* WRITE */
2070                         if (!bh->b_blocknr) {
2071                                 if (!block)
2072                                         BUG();
2073                                 bh->b_blocknr = block;
2074                         } else {
2075                                 if (!block)
2076                                         BUG();
2077                         }
2078                         set_bit(BH_Uptodate, &bh->b_state);
2079                         set_bit(BH_Dirty, &bh->b_state);
2080                         arr[nr++] = bh;
2081                         atomic_inc(&bh->b_count);
2082                 }
2083                 bh = bh->b_this_page;
2084         } while (bh != head);
2085         if ((rw == READ) && nr) {
2086                 if (Page_Uptodate(page))
2087                         BUG();
2088                 ll_rw_block(rw, nr, arr);
2089         } else {
2090                 if (!nr && rw == READ) {
2091                         SetPageUptodate(page);
2092                         UnlockPage(page);
2093                 }
2094                 if (nr && (rw == WRITE))
2095                         ll_rw_block(rw, nr, arr);
2096         }
2097         return 0;
2098 }
2099
2100 int block_symlink(struct inode *inode, const char *symname, int len)
2101 {
2102         struct address_space *mapping = inode->i_mapping;
2103         struct page *page = grab_cache_page(mapping, 0);
2104         int err = -ENOMEM;
2105         char *kaddr;
2106
2107         if (!page)
2108                 goto fail;
2109         err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2110         if (err)
2111                 goto fail_map;
2112         kaddr = page_address(page);
2113         memcpy(kaddr, symname, len-1);
2114         mapping->a_ops->commit_write(NULL, page, 0, len-1);
2115         /*
2116          * Notice that we are _not_ going to block here - end of page is
2117          * unmapped, so this will only try to map the rest of page, see
2118          * that it is unmapped (typically even will not look into inode -
2119          * ->i_size will be enough for everything) and zero it out.
2120          * OTOH it's obviously correct and should make the page up-to-date.
2121          */
2122         err = mapping->a_ops->readpage(NULL, page);
2123         wait_on_page(page);
2124         page_cache_release(page);
2125         if (err < 0)
2126                 goto fail;
2127         mark_inode_dirty(inode);
2128         return 0;
2129 fail_map:
2130         UnlockPage(page);
2131         page_cache_release(page);
2132 fail:
2133         return err;
2134 }
2135
2136 /*
2137  * Try to increase the number of buffers available: the size argument
2138  * is used to determine what kind of buffers we want.
2139  */
2140 static int grow_buffers(int size)
2141 {
2142         struct page * page;
2143         struct buffer_head *bh, *tmp;
2144         struct buffer_head * insert_point;
2145         int isize;
2146
2147         if ((size & 511) || (size > PAGE_SIZE)) {
2148                 printk("VFS: grow_buffers: size = %d\n",size);
2149                 return 0;
2150         }
2151
2152         page = alloc_page(GFP_BUFFER);
2153         if (!page)
2154                 goto out;
2155         bh = create_buffers(page, size, 0);
2156         if (!bh)
2157                 goto no_buffer_head;
2158
2159         isize = BUFSIZE_INDEX(size);
2160
2161         spin_lock(&free_list[isize].lock);
2162         insert_point = free_list[isize].list;
2163         tmp = bh;
2164         while (1) {
2165                 if (insert_point) {
2166                         tmp->b_next_free = insert_point->b_next_free;
2167                         tmp->b_prev_free = insert_point;
2168                         insert_point->b_next_free->b_prev_free = tmp;
2169                         insert_point->b_next_free = tmp;
2170                 } else {
2171                         tmp->b_prev_free = tmp;
2172                         tmp->b_next_free = tmp;
2173                 }
2174                 insert_point = tmp;
2175                 if (tmp->b_this_page)
2176                         tmp = tmp->b_this_page;
2177                 else
2178                         break;
2179         }
2180         tmp->b_this_page = bh;
2181         free_list[isize].list = bh;
2182         spin_unlock(&free_list[isize].lock);
2183
2184         page->buffers = bh;
2185         page->flags &= ~(1 << PG_referenced);
2186         lru_cache_add(page);
2187         atomic_inc(&buffermem_pages);
2188         return 1;
2189
2190 no_buffer_head:
2191         page_cache_release(page);
2192 out:
2193         return 0;
2194 }
2195
2196 /*
2197  * Sync all the buffers on one page..
2198  *
2199  * If we have old buffers that are locked, we'll
2200  * wait on them, but we won't wait on the new ones
2201  * we're writing out now.
2202  *
2203  * This all is required so that we can free up memory
2204  * later.
2205  *
2206  * Wait:
2207  *      0 - no wait (this does not get called - see try_to_free_buffers below)
2208  *      1 - start IO for dirty buffers
2209  *      2 - wait for completion of locked buffers
2210  */
2211 static void sync_page_buffers(struct buffer_head *bh, int wait)
2212 {
2213         struct buffer_head * tmp = bh;
2214
2215         do {
2216                 struct buffer_head *p = tmp;
2217                 tmp = tmp->b_this_page;
2218                 if (buffer_locked(p)) {
2219                         if (wait > 1)
2220                                 __wait_on_buffer(p);
2221                 } else if (buffer_dirty(p))
2222                         ll_rw_block(WRITE, 1, &p);
2223         } while (tmp != bh);
2224 }
2225
2226 /*
2227  * Can the buffer be thrown out?
2228  */
2229 #define BUFFER_BUSY_BITS        ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
2230 #define buffer_busy(bh)         (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2231
2232 /*
2233  * try_to_free_buffers() checks if all the buffers on this particular page
2234  * are unused, and free's the page if so.
2235  *
2236  * Wake up bdflush() if this fails - if we're running low on memory due
2237  * to dirty buffers, we need to flush them out as quickly as possible.
2238  *
2239  * NOTE: There are quite a number of ways that threads of control can
2240  *       obtain a reference to a buffer head within a page.  So we must
2241  *       lock out all of these paths to cleanly toss the page.
2242  */
2243 int try_to_free_buffers(struct page * page, int wait)
2244 {
2245         struct buffer_head * tmp, * bh = page->buffers;
2246         int index = BUFSIZE_INDEX(bh->b_size);
2247
2248         spin_lock(&lru_list_lock);
2249         write_lock(&hash_table_lock);
2250         spin_lock(&free_list[index].lock);
2251         tmp = bh;
2252         do {
2253                 struct buffer_head *p = tmp;
2254
2255                 tmp = tmp->b_this_page;
2256                 if (buffer_busy(p))
2257                         goto busy_buffer_page;
2258         } while (tmp != bh);
2259
2260         spin_lock(&unused_list_lock);
2261         tmp = bh;
2262         do {
2263                 struct buffer_head * p = tmp;
2264                 tmp = tmp->b_this_page;
2265
2266                 /* The buffer can be either on the regular
2267                  * queues or on the free list..
2268                  */
2269                 if (p->b_dev != B_FREE)
2270                         __remove_from_queues(p);
2271                 else
2272                         __remove_from_free_list(p, index);
2273                 __put_unused_buffer_head(p);
2274         } while (tmp != bh);
2275         spin_unlock(&unused_list_lock);
2276
2277         /* Wake up anyone waiting for buffer heads */
2278         wake_up(&buffer_wait);
2279
2280         /* And free the page */
2281         page->buffers = NULL;
2282         page_cache_release(page);
2283         spin_unlock(&free_list[index].lock);
2284         write_unlock(&hash_table_lock);
2285         spin_unlock(&lru_list_lock);
2286         return 1;
2287
2288 busy_buffer_page:
2289         /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2290         spin_unlock(&free_list[index].lock);
2291         write_unlock(&hash_table_lock);
2292         spin_unlock(&lru_list_lock);
2293         if (wait)
2294                 sync_page_buffers(bh, wait);
2295         return 0;
2296 }
2297
2298 /* ================== Debugging =================== */
2299
2300 void show_buffers(void)
2301 {
2302 #ifdef CONFIG_SMP
2303         struct buffer_head * bh;
2304         int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2305         int protected = 0;
2306         int nlist;
2307         static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
2308 #endif
2309
2310         printk("Buffer memory:   %6dkB\n",
2311                         atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2312
2313 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2314         if (!spin_trylock(&lru_list_lock))
2315                 return;
2316         for(nlist = 0; nlist < NR_LIST; nlist++) {
2317                 found = locked = dirty = used = lastused = protected = 0;
2318                 bh = lru_list[nlist];
2319                 if(!bh) continue;
2320
2321                 do {
2322                         found++;
2323                         if (buffer_locked(bh))
2324                                 locked++;
2325                         if (buffer_protected(bh))
2326                                 protected++;
2327                         if (buffer_dirty(bh))
2328                                 dirty++;
2329                         if (atomic_read(&bh->b_count))
2330                                 used++, lastused = found;
2331                         bh = bh->b_next_free;
2332                 } while (bh != lru_list[nlist]);
2333                 {
2334                         int tmp = nr_buffers_type[nlist];
2335                         if (found != tmp)
2336                                 printk("%9s: BUG -> found %d, reported %d\n",
2337                                        buf_types[nlist], found, tmp);
2338                 }
2339                 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2340                        "%d locked, %d protected, %d dirty\n",
2341                        buf_types[nlist], found, size_buffers_type[nlist]>>10,
2342                        used, lastused, locked, protected, dirty);
2343         }
2344         spin_unlock(&lru_list_lock);
2345 #endif
2346 }
2347
2348 /* ===================== Init ======================= */
2349
2350 /*
2351  * allocate the hash table and init the free list
2352  * Use gfp() for the hash table to decrease TLB misses, use
2353  * SLAB cache for buffer heads.
2354  */
2355 void __init buffer_init(unsigned long mempages)
2356 {
2357         int order, i;
2358         unsigned int nr_hash;
2359
2360         /* The buffer cache hash table is less important these days,
2361          * trim it a bit.
2362          */
2363         mempages >>= 14;
2364
2365         mempages *= sizeof(struct buffer_head *);
2366
2367         for (order = 0; (1 << order) < mempages; order++)
2368                 ;
2369
2370         /* try to allocate something until we get it or we're asking
2371            for something that is really too small */
2372
2373         do {
2374                 unsigned long tmp;
2375
2376                 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2377                 bh_hash_mask = (nr_hash - 1);
2378
2379                 tmp = nr_hash;
2380                 bh_hash_shift = 0;
2381                 while((tmp >>= 1UL) != 0UL)
2382                         bh_hash_shift++;
2383
2384                 hash_table = (struct buffer_head **)
2385                     __get_free_pages(GFP_ATOMIC, order);
2386         } while (hash_table == NULL && --order > 0);
2387         printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2388                nr_hash, order, (PAGE_SIZE << order));
2389
2390         if (!hash_table)
2391                 panic("Failed to allocate buffer hash table\n");
2392
2393         /* Setup hash chains. */
2394         for(i = 0; i < nr_hash; i++)
2395                 hash_table[i] = NULL;
2396
2397         /* Setup free lists. */
2398         for(i = 0; i < NR_SIZES; i++) {
2399                 free_list[i].list = NULL;
2400                 free_list[i].lock = SPIN_LOCK_UNLOCKED;
2401         }
2402
2403         /* Setup lru lists. */
2404         for(i = 0; i < NR_LIST; i++)
2405                 lru_list[i] = NULL;
2406
2407 }
2408
2409
2410 /* ====================== bdflush support =================== */
2411
2412 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2413  * response to dirty buffers.  Once this process is activated, we write back
2414  * a limited number of buffers to the disks and then go back to sleep again.
2415  */
2416 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
2417 struct task_struct *bdflush_tsk = 0;
2418
2419 void wakeup_bdflush(int block)
2420 {
2421         DECLARE_WAITQUEUE(wait, current);
2422
2423         if (current == bdflush_tsk)
2424                 return;
2425
2426         if (!block) {
2427                 wake_up_process(bdflush_tsk);
2428                 return;
2429         }
2430
2431         /* kflushd can wakeup us before we have a chance to
2432            go to sleep so we must be smart in handling
2433            this wakeup event from kflushd to avoid deadlocking in SMP
2434            (we are not holding any lock anymore in these two paths). */
2435         __set_current_state(TASK_UNINTERRUPTIBLE);
2436         add_wait_queue(&bdflush_done, &wait);
2437
2438         wake_up_process(bdflush_tsk);
2439         schedule();
2440
2441         remove_wait_queue(&bdflush_done, &wait);
2442         __set_current_state(TASK_RUNNING);
2443 }
2444
2445 /* This is the _only_ function that deals with flushing async writes
2446    to disk.
2447    NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
2448    as all dirty buffers lives _only_ in the DIRTY lru list.
2449    As we never browse the LOCKED and CLEAN lru lists they are infact
2450    completly useless. */
2451 static int flush_dirty_buffers(int check_flushtime)
2452 {
2453         struct buffer_head * bh, *next;
2454         int flushed = 0, i;
2455
2456  restart:
2457         spin_lock(&lru_list_lock);
2458         bh = lru_list[BUF_DIRTY];
2459         if (!bh)
2460                 goto out_unlock;
2461         for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next) {
2462                 next = bh->b_next_free;
2463
2464                 if (!buffer_dirty(bh)) {
2465                         __refile_buffer(bh);
2466                         continue;
2467                 }
2468                 if (buffer_locked(bh))
2469                         continue;
2470
2471                 if (check_flushtime) {
2472                         /* The dirty lru list is chronologically ordered so
2473                            if the current bh is not yet timed out,
2474                            then also all the following bhs
2475                            will be too young. */
2476                         if (time_before(jiffies, bh->b_flushtime))
2477                                 goto out_unlock;
2478                 } else {
2479                         if (++flushed > bdf_prm.b_un.ndirty)
2480                                 goto out_unlock;
2481                 }
2482
2483                 /* OK, now we are committed to write it out. */
2484                 atomic_inc(&bh->b_count);
2485                 spin_unlock(&lru_list_lock);
2486                 ll_rw_block(WRITE, 1, &bh);
2487                 atomic_dec(&bh->b_count);
2488
2489                 if (current->need_resched)
2490                         schedule();
2491                 goto restart;
2492         }
2493  out_unlock:
2494         spin_unlock(&lru_list_lock);
2495
2496         return flushed;
2497 }
2498
2499 /*
2500  * Here we attempt to write back old buffers.  We also try to flush inodes
2501  * and supers as well, since this function is essentially "update", and
2502  * otherwise there would be no way of ensuring that these quantities ever
2503  * get written back.  Ideally, we would have a timestamp on the inodes
2504  * and superblocks so that we could write back only the old ones as well
2505  */
2506
2507 static int sync_old_buffers(void)
2508 {
2509         lock_kernel();
2510         sync_supers(0);
2511         sync_inodes(0);
2512         unlock_kernel();
2513
2514         flush_dirty_buffers(1);
2515         /* must really sync all the active I/O request to disk here */
2516         run_task_queue(&tq_disk);
2517         return 0;
2518 }
2519
2520 int block_sync_page(struct page *page)
2521 {
2522         run_task_queue(&tq_disk);
2523         return 0;
2524 }
2525
2526 /* This is the interface to bdflush.  As we get more sophisticated, we can
2527  * pass tuning parameters to this "process", to adjust how it behaves.
2528  * We would want to verify each parameter, however, to make sure that it
2529  * is reasonable. */
2530
2531 asmlinkage long sys_bdflush(int func, long data)
2532 {
2533         if (!capable(CAP_SYS_ADMIN))
2534                 return -EPERM;
2535
2536         if (func == 1) {
2537                 /* do_exit directly and let kupdate to do its work alone. */
2538                 do_exit(0);
2539 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2540          a syscall that doesn't care about the current mm context. */
2541                 int error;
2542                 struct mm_struct *user_mm;
2543
2544                 /*
2545                  * bdflush will spend all of it's time in kernel-space,
2546                  * without touching user-space, so we can switch it into
2547                  * 'lazy TLB mode' to reduce the cost of context-switches
2548                  * to and from bdflush.
2549                  */
2550                 user_mm = start_lazy_tlb();
2551                 error = sync_old_buffers();
2552                 end_lazy_tlb(user_mm);
2553                 return error;
2554 #endif
2555         }
2556
2557         /* Basically func 1 means read param 1, 2 means write param 1, etc */
2558         if (func >= 2) {
2559                 int i = (func-2) >> 1;
2560                 if (i >= 0 && i < N_PARAM) {
2561                         if ((func & 1) == 0)
2562                                 return put_user(bdf_prm.data[i], (int*)data);
2563
2564                         if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
2565                                 bdf_prm.data[i] = data;
2566                                 return 0;
2567                         }
2568                 }
2569                 return -EINVAL;
2570         }
2571
2572         /* Having func 0 used to launch the actual bdflush and then never
2573          * return (unless explicitly killed). We return zero here to
2574          * remain semi-compatible with present update(8) programs.
2575          */
2576         return 0;
2577 }
2578
2579 /*
2580  * This is the actual bdflush daemon itself. It used to be started from
2581  * the syscall above, but now we launch it ourselves internally with
2582  * kernel_thread(...)  directly after the first thread in init/main.c
2583  */
2584 int bdflush(void *sem)
2585 {
2586         struct task_struct *tsk = current;
2587         int flushed;
2588         /*
2589          *      We have a bare-bones task_struct, and really should fill
2590          *      in a few more things so "top" and /proc/2/{exe,root,cwd}
2591          *      display semi-sane things. Not real crucial though...
2592          */
2593
2594         tsk->session = 1;
2595         tsk->pgrp = 1;
2596         strcpy(tsk->comm, "kflushd");
2597         bdflush_tsk = tsk;
2598
2599         /* avoid getting signals */
2600         spin_lock_irq(&tsk->sigmask_lock);
2601         flush_signals(tsk);
2602         sigfillset(&tsk->blocked);
2603         recalc_sigpending(tsk);
2604         spin_unlock_irq(&tsk->sigmask_lock);
2605
2606         up((struct semaphore *)sem);
2607
2608         for (;;) {
2609                 CHECK_EMERGENCY_SYNC
2610
2611                 flushed = flush_dirty_buffers(0);
2612
2613                 /* If wakeup_bdflush will wakeup us
2614                    after our bdflush_done wakeup, then
2615                    we must make sure to not sleep
2616                    in schedule_timeout otherwise
2617                    wakeup_bdflush may wait for our
2618                    bdflush_done wakeup that would never arrive
2619                    (as we would be sleeping) and so it would
2620                    deadlock in SMP. */
2621                 __set_current_state(TASK_INTERRUPTIBLE);
2622                 wake_up(&bdflush_done);
2623                 /*
2624                  * If there are still a lot of dirty buffers around,
2625                  * skip the sleep and flush some more. Otherwise, we
2626                  * go to sleep waiting a wakeup.
2627                  */
2628                 if (!flushed || balance_dirty_state(NODEV) < 0)
2629                         schedule();
2630                 /* Remember to mark us as running otherwise
2631                    the next schedule will block. */
2632                 __set_current_state(TASK_RUNNING);
2633         }
2634 }
2635
2636 /*
2637  * This is the kernel update daemon. It was used to live in userspace
2638  * but since it's need to run safely we want it unkillable by mistake.
2639  * You don't need to change your userspace configuration since
2640  * the userspace `update` will do_exit(0) at the first sys_bdflush().
2641  */
2642 int kupdate(void *sem)
2643 {
2644         struct task_struct * tsk = current;
2645         int interval;
2646
2647         tsk->session = 1;
2648         tsk->pgrp = 1;
2649         strcpy(tsk->comm, "kupdate");
2650
2651         /* sigstop and sigcont will stop and wakeup kupdate */
2652         spin_lock_irq(&tsk->sigmask_lock);
2653         sigfillset(&tsk->blocked);
2654         siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
2655         recalc_sigpending(tsk);
2656         spin_unlock_irq(&tsk->sigmask_lock);
2657
2658         up((struct semaphore *)sem);
2659
2660         for (;;) {
2661                 /* update interval */
2662                 interval = bdf_prm.b_un.interval;
2663                 if (interval) {
2664                         tsk->state = TASK_INTERRUPTIBLE;
2665                         schedule_timeout(interval);
2666                 } else {
2667                 stop_kupdate:
2668                         tsk->state = TASK_STOPPED;
2669                         schedule(); /* wait for SIGCONT */
2670                 }
2671                 /* check for sigstop */
2672                 if (signal_pending(tsk)) {
2673                         int stopped = 0;
2674                         spin_lock_irq(&tsk->sigmask_lock);
2675                         if (sigismember(&tsk->pending.signal, SIGSTOP)) {
2676                                 sigdelset(&tsk->pending.signal, SIGSTOP);
2677                                 stopped = 1;
2678                         }
2679                         recalc_sigpending(tsk);
2680                         spin_unlock_irq(&tsk->sigmask_lock);
2681                         if (stopped)
2682                                 goto stop_kupdate;
2683                 }
2684 #ifdef DEBUG
2685                 printk("kupdate() activated...\n");
2686 #endif
2687                 sync_old_buffers();
2688         }
2689 }
2690
2691 static int __init bdflush_init(void)
2692 {
2693         DECLARE_MUTEX_LOCKED(sem);
2694         kernel_thread(bdflush, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
2695         down(&sem);
2696         kernel_thread(kupdate, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
2697         down(&sem);
2698         return 0;
2699 }
2700
2701 module_init(bdflush_init)
2702