4 * Copyright (C) 1991, 1992 Linus Torvalds
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
15 /* Removed a lot of unnecessary code and simplified things now that
16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20 * hash table, use SLAB cache for buffer heads. -DaveM
23 /* Added 32k buffer block sizes - these are required on older ARM systems.
27 /* Thread it... -DaveM */
29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
31 #include <linux/config.h>
32 #include <linux/sched.h>
34 #include <linux/malloc.h>
35 #include <linux/locks.h>
36 #include <linux/errno.h>
37 #include <linux/swap.h>
38 #include <linux/smp_lock.h>
39 #include <linux/vmalloc.h>
40 #include <linux/blkdev.h>
41 #include <linux/sysrq.h>
42 #include <linux/file.h>
43 #include <linux/init.h>
44 #include <linux/quotaops.h>
45 #include <linux/iobuf.h>
46 #include <linux/highmem.h>
48 #include <asm/uaccess.h>
50 #include <asm/bitops.h>
51 #include <asm/mmu_context.h>
54 static char buffersize_index
[65] =
55 {-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
56 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
57 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
58 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
61 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
62 #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
63 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
64 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
65 number of unused buffer heads */
67 /* Anti-deadlock ordering:
68 * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
74 static unsigned int bh_hash_mask
;
75 static unsigned int bh_hash_shift
;
76 static struct buffer_head
**hash_table
;
77 static rwlock_t hash_table_lock
= RW_LOCK_UNLOCKED
;
79 static struct buffer_head
*lru_list
[NR_LIST
];
80 static spinlock_t lru_list_lock
= SPIN_LOCK_UNLOCKED
;
81 static int nr_buffers_type
[NR_LIST
];
82 static unsigned long size_buffers_type
[NR_LIST
];
84 static struct buffer_head
* unused_list
;
85 static int nr_unused_buffer_heads
;
86 static spinlock_t unused_list_lock
= SPIN_LOCK_UNLOCKED
;
87 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait
);
90 struct buffer_head
*list
;
93 static struct bh_free_head free_list
[NR_SIZES
];
95 kmem_cache_t
*bh_cachep
;
97 static int grow_buffers(int size
);
98 static void __refile_buffer(struct buffer_head
*);
100 /* This is used by some architectures to estimate available memory. */
101 atomic_t buffermem_pages
= ATOMIC_INIT(0);
103 /* Here is the parameter block for the bdflush process. If you add or
104 * remove any of the parameters, make sure to update kernel/sysctl.c.
109 /* The dummy values in this structure are left in there for compatibility
110 * with old programs that play with the /proc entries.
112 union bdflush_param
{
114 int nfract
; /* Percentage of buffer cache dirty to
116 int ndirty
; /* Maximum number of dirty blocks to write out per
118 int nrefill
; /* Number of clean buffers to try to obtain
119 each time we call refill */
120 int nref_dirt
; /* Dirty buffer threshold for activating bdflush
121 when trying to refill buffers. */
122 int interval
; /* jiffies delay between kupdate flushes */
123 int age_buffer
; /* Time for normal buffer to age before we flush it */
124 int age_super
; /* Time for superblock to age before we flush it */
125 int dummy2
; /* unused */
126 int dummy3
; /* unused */
128 unsigned int data
[N_PARAM
];
129 } bdf_prm
= {{40, 500, 64, 256, 5*HZ
, 30*HZ
, 5*HZ
, 1884, 2}};
131 /* These are the min and max parameter values that we will allow to be assigned */
132 int bdflush_min
[N_PARAM
] = { 0, 10, 5, 25, 0, 1*HZ
, 1*HZ
, 1, 1};
133 int bdflush_max
[N_PARAM
] = {100,50000, 20000, 20000,600*HZ
, 6000*HZ
, 6000*HZ
, 2047, 5};
136 * Rewrote the wait-routines to use the "new" wait-queue functionality,
137 * and getting rid of the cli-sti pairs. The wait-queue routines still
138 * need cli-sti, but now it's just a couple of 386 instructions or so.
140 * Note that the real wait_on_buffer() is an inline function that checks
141 * if 'b_wait' is set before calling this, so that the queues aren't set
/*
 * Sleep until the buffer's in-flight I/O completes (buffer unlocked).
 * The buffer is pinned via b_count around the wait so it cannot be
 * recycled while we sleep.
 * NOTE(review): this extraction is missing interior lines of the wait
 * loop (the "do {" opening and the break/schedule() pair) — verify
 * against a pristine copy of the source before relying on the shape.
 */
144 void __wait_on_buffer(struct buffer_head
* bh
)
146 struct task_struct
*tsk
= current
;
147 DECLARE_WAITQUEUE(wait
, tsk
);
/* Take a reference and register on the buffer's wait queue. */
149 atomic_inc(&bh
->b_count
);
150 add_wait_queue(&bh
->b_wait
, &wait
);
/* Kick the disk task queue so the I/O we wait on makes progress. */
152 run_task_queue(&tq_disk
);
153 set_task_state(tsk
, TASK_UNINTERRUPTIBLE
);
154 if (!buffer_locked(bh
))
157 } while (buffer_locked(bh
));
/* Done: restore runnable state, dequeue, and drop our reference. */
158 tsk
->state
= TASK_RUNNING
;
159 remove_wait_queue(&bh
->b_wait
, &wait
);
160 atomic_dec(&bh
->b_count
);
163 /* Call sync_buffers with wait!=0 to ensure that the call does not
164 * return until all buffer writes have completed. Sync() may return
165 * before the writes have finished; fsync() may not.
168 /* Godamity-damn. Some buffers (bitmaps for filesystems)
169 * spontaneously dirty themselves without ever brelse being called.
170 * We will ultimately want to put these in a separate list, but for
171 * now we search all of the lists for dirty buffers.
/*
 * Write dirty buffers for <dev> out to disk (dev == 0 appears to mean
 * "all devices", given the "if (dev && bh->b_dev != dev)" skip below).
 * With wait == 0 a single best-effort pass is made; with wait != 0 up
 * to three passes run (see the pass description comment below).
 * NOTE(review): extraction has dropped interior lines (loop openings,
 * continue/schedule statements, retry bookkeeping and the final
 * "return err") — confirm the control flow against pristine source.
 */
173 static int sync_buffers(kdev_t dev
, int wait
)
175 int i
, retry
, pass
= 0, err
= 0;
176 struct buffer_head
* bh
, *next
;
178 /* One pass for no-wait, three for wait:
179 * 0) write out all dirty, unlocked buffers;
180 * 1) write out all dirty buffers, waiting if locked;
181 * 2) wait for completion by waiting for all buffers to unlock.
186 /* We search all lists as a failsafe mechanism, not because we expect
187 * there to be dirty buffers on any of the other lists.
/* First scan: the BUF_DIRTY LRU list, under lru_list_lock. */
190 spin_lock(&lru_list_lock
);
191 bh
= lru_list
[BUF_DIRTY
];
/* The *2 bound tolerates the list changing while the lock is dropped. */
195 for (i
= nr_buffers_type
[BUF_DIRTY
]*2 ; i
-- > 0 ; bh
= next
) {
196 next
= bh
->b_next_free
;
198 if (!lru_list
[BUF_DIRTY
])
200 if (dev
&& bh
->b_dev
!= dev
)
202 if (buffer_locked(bh
)) {
203 /* Buffer is locked; skip it unless wait is
204 * requested AND pass > 0.
206 if (!wait
|| !pass
) {
/* Pin the buffer across the lock drop while we wait on it. */
210 atomic_inc(&bh
->b_count
);
211 spin_unlock(&lru_list_lock
);
213 atomic_dec(&bh
->b_count
);
217 /* If an unlocked buffer is not uptodate, there has
218 * been an IO error. Skip it.
220 if (wait
&& buffer_req(bh
) && !buffer_locked(bh
) &&
221 !buffer_dirty(bh
) && !buffer_uptodate(bh
)) {
226 /* Don't write clean buffers. Don't write ANY buffers
229 if (!buffer_dirty(bh
) || pass
>= 2)
/* Submit the write with the LRU lock dropped; bh stays pinned. */
232 atomic_inc(&bh
->b_count
);
233 spin_unlock(&lru_list_lock
);
234 ll_rw_block(WRITE
, 1, &bh
);
235 atomic_dec(&bh
->b_count
);
/* Second scan: the BUF_LOCKED list, waiting for in-flight I/O. */
241 bh
= lru_list
[BUF_LOCKED
];
243 spin_unlock(&lru_list_lock
);
246 for (i
= nr_buffers_type
[BUF_LOCKED
]*2 ; i
-- > 0 ; bh
= next
) {
247 next
= bh
->b_next_free
;
249 if (!lru_list
[BUF_LOCKED
])
251 if (dev
&& bh
->b_dev
!= dev
)
253 if (buffer_locked(bh
)) {
254 /* Buffer is locked; skip it unless wait is
255 * requested AND pass > 0.
257 if (!wait
|| !pass
) {
261 atomic_inc(&bh
->b_count
);
262 spin_unlock(&lru_list_lock
);
264 spin_lock(&lru_list_lock
);
265 atomic_dec(&bh
->b_count
);
269 spin_unlock(&lru_list_lock
);
271 /* If we are waiting for the sync to succeed, and if any dirty
272 * blocks were written, then repeat; on the second pass, only
273 * wait for buffers being written (do not pass to write any
274 * more buffers on the second pass).
276 } while (wait
&& retry
&& ++pass
<=2);
/*
 * Flush everything dirty for a device to the buffer cache and then to
 * disk.  The trailing sync_buffers(dev, 0) is deliberately placed last
 * (see the comment below); it does not wait for I/O completion.
 * NOTE(review): the higher-level sync calls that precede the visible
 * sync_buffers call are missing from this extraction.
 */
280 void sync_dev(kdev_t dev
)
285 /* sync all the dirty buffers out to disk only _after_ all the
286 high level layers have finished generating buffer dirty data
287 (or we'll return with some buffer still dirty on the blockdevice
288 so breaking the semantics of this call) */
289 sync_buffers(dev
, 0);
291 * FIXME(eric) we need to sync the physical devices here.
292 * This is because some (scsi) controllers have huge amounts of
293 * cache onboard (hundreds of Mb), and we need to instruct
294 * them to commit all of the dirty memory to disk, and we should
295 * not return until this has happened.
297 * This would need to get implemented by going through the assorted
298 * layers so that each block major number can be synced, and this
299 * would call down into the upper and mid-layer scsi.
/*
 * Like sync_dev(), but waits: the final sync_buffers(dev, 1) does not
 * return until the writes have completed, and its result is returned
 * to the caller.
 * NOTE(review): the intermediate sync calls between the two visible
 * sync_buffers calls are missing from this extraction.
 */
303 int fsync_dev(kdev_t dev
)
305 sync_buffers(dev
, 0);
313 return sync_buffers(dev
, 1);
316 asmlinkage
long sys_sync(void)
323 * filp may be NULL if called via the msync of a vma.
/*
 * Generic fsync implementation usable by filesystems: write the inode
 * to its buffers, write the superblock (via s_op->write_super when the
 * operation exists), then push the device's buffers to disk and wait
 * (sync_buffers(dev, 1)).  filp may be NULL when invoked via msync of
 * a vma (per the comment preceding this function).
 * NOTE(review): the lines assigning sb and dev, and the return, are
 * missing from this extraction.
 */
326 int file_fsync(struct file
*filp
, struct dentry
*dentry
, int datasync
)
328 struct inode
* inode
= dentry
->d_inode
;
329 struct super_block
* sb
;
334 /* sync the inode to buffers */
335 write_inode_now(inode
, 0);
337 /* sync the superblock to buffers */
340 if (sb
->s_op
&& sb
->s_op
->write_super
)
341 sb
->s_op
->write_super(sb
);
343 /* .. finally sync the buffers to disk */
345 ret
= sync_buffers(dev
, 1);
/*
 * fsync(2): flush data and metadata of the file behind fd.
 * Delegates to the file's f_op->fsync with the third argument 0,
 * i.e. a full sync (contrast sys_fdatasync below, which passes 1).
 * NOTE(review): the fget/fput bookkeeping, error-path returns, and the
 * locking around the ->fsync call are missing from this extraction.
 */
350 asmlinkage
long sys_fsync(unsigned int fd
)
353 struct dentry
* dentry
;
354 struct inode
* inode
;
362 dentry
= file
->f_dentry
;
363 inode
= dentry
->d_inode
;
/* Files whose f_op provides no fsync method cannot be synced. */
366 if (!file
->f_op
|| !file
->f_op
->fsync
)
369 /* We need to protect against concurrent writers.. */
371 err
= file
->f_op
->fsync(file
, dentry
, 0);
/*
 * fdatasync(2): like sys_fsync, but the datasync flag (third argument
 * to ->fsync) is 1, allowing the filesystem to skip non-essential
 * metadata updates.
 * NOTE(review): the fget/fput bookkeeping and error-path returns are
 * missing from this extraction.
 */
380 asmlinkage
long sys_fdatasync(unsigned int fd
)
383 struct dentry
* dentry
;
384 struct inode
* inode
;
392 dentry
= file
->f_dentry
;
393 inode
= dentry
->d_inode
;
396 if (!file
->f_op
|| !file
->f_op
->fsync
)
400 err
= file
->f_op
->fsync(file
, dentry
, 1);
409 /* After several hours of tedious analysis, the following hash
410 * function won. Do not mess with it... -DaveM
412 #define _hashfn(dev,block) \
413 ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
414 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
415 #define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
/*
 * Link bh at the head of the hash chain rooted at *head.  Caller holds
 * hash_table_lock for writing (see insert_into_queues below).
 * NOTE(review): the tail of this helper (storing bh into *head and
 * setting bh->b_pprev) is missing from this extraction.
 */
417 static __inline__
void __hash_link(struct buffer_head
*bh
, struct buffer_head
**head
)
419 if ((bh
->b_next
= *head
) != NULL
)
420 bh
->b_next
->b_pprev
= &bh
->b_next
;
/*
 * Unlink bh from its hash chain by splicing b_pprev/b_next around it.
 * NOTE(review): the surrounding guard (presumably a check that bh is
 * actually hashed) and the pointer-clearing tail are missing from this
 * extraction — confirm against pristine source.
 */
425 static __inline__
void __hash_unlink(struct buffer_head
*bh
)
429 bh
->b_next
->b_pprev
= bh
->b_pprev
;
430 *(bh
->b_pprev
) = bh
->b_next
;
/*
 * Insert bh into the circular doubly-linked LRU list <blist> and bump
 * the per-list buffer count and byte total.  Caller holds
 * lru_list_lock (see insert_into_queues).
 * NOTE(review): the empty-list branch around the "bh->b_prev_free = bh"
 * self-link is incomplete in this extraction.
 */
435 static void __insert_into_lru_list(struct buffer_head
* bh
, int blist
)
437 struct buffer_head
**bhp
= &lru_list
[blist
];
441 bh
->b_prev_free
= bh
;
/* Splice bh in just before the current list head (i.e. at the tail). */
443 bh
->b_next_free
= *bhp
;
444 bh
->b_prev_free
= (*bhp
)->b_prev_free
;
445 (*bhp
)->b_prev_free
->b_next_free
= bh
;
446 (*bhp
)->b_prev_free
= bh
;
/* Keep the per-list accounting in sync with the list contents. */
447 nr_buffers_type
[blist
]++;
448 size_buffers_type
[blist
] += bh
->b_size
;
/*
 * Remove bh from LRU list <blist>.  A buffer not on any list (both
 * b_prev_free and b_next_free NULL) is left alone.  Fixes up the list
 * head pointer — the double check handles the single-element case,
 * where after the splice the head still points at bh and the list must
 * become empty.  Updates the per-list count and byte accounting.
 */
451 static void __remove_from_lru_list(struct buffer_head
* bh
, int blist
)
453 if (bh
->b_prev_free
|| bh
->b_next_free
) {
454 bh
->b_prev_free
->b_next_free
= bh
->b_next_free
;
455 bh
->b_next_free
->b_prev_free
= bh
->b_prev_free
;
456 if (lru_list
[blist
] == bh
)
457 lru_list
[blist
] = bh
->b_next_free
;
/* Single-element list: head still points at bh, so the list is empty. */
458 if (lru_list
[blist
] == bh
)
459 lru_list
[blist
] = NULL
;
460 bh
->b_next_free
= bh
->b_prev_free
= NULL
;
461 nr_buffers_type
[blist
]--;
462 size_buffers_type
[blist
] -= bh
->b_size
;
/*
 * Remove bh from the per-size free list <index>.  A self-linked buffer
 * (b_next_free == bh) is the only element, so the list becomes empty;
 * otherwise bh is spliced out and the head pointer advanced past it if
 * necessary.  Caller holds the free list's lock (see getblk).
 */
466 static void __remove_from_free_list(struct buffer_head
* bh
, int index
)
468 if(bh
->b_next_free
== bh
)
469 free_list
[index
].list
= NULL
;
471 bh
->b_prev_free
->b_next_free
= bh
->b_next_free
;
472 bh
->b_next_free
->b_prev_free
= bh
->b_prev_free
;
473 if (free_list
[index
].list
== bh
)
474 free_list
[index
].list
= bh
->b_next_free
;
/* Mark bh as being on no list. */
476 bh
->b_next_free
= bh
->b_prev_free
= NULL
;
479 /* must be called with both the hash_table_lock and the lru_list_lock
/*
 * Detach bh from the lookup structures: the LRU list (visible below)
 * and, per the original, the hash chain.
 * NOTE(review): the __hash_unlink() call is missing from this
 * extraction — only the LRU removal is visible.
 */
481 static void __remove_from_queues(struct buffer_head
*bh
)
484 __remove_from_lru_list(bh
, bh
->b_list
);
/*
 * Make bh visible to lookups: link it into its hash chain and onto the
 * LRU list selected by bh->b_list.  Takes lru_list_lock before
 * hash_table_lock, matching the anti-deadlock ordering documented at
 * the top of this file.
 */
487 static void insert_into_queues(struct buffer_head
*bh
)
489 struct buffer_head
**head
= &hash(bh
->b_dev
, bh
->b_blocknr
);
491 spin_lock(&lru_list_lock
);
492 write_lock(&hash_table_lock
);
493 __hash_link(bh
, head
);
494 __insert_into_lru_list(bh
, bh
->b_list
);
495 write_unlock(&hash_table_lock
);
496 spin_unlock(&lru_list_lock
);
499 /* This function must only run if there are no other
500 * references _anywhere_ to this buffer head.
/*
 * Return bh to the free list for its size class, inserting it at the
 * tail (just before the current head), under that list's own lock.
 * NOTE(review): the empty-list branch and bh state resets between the
 * visible lines are missing from this extraction.
 */
502 static void put_last_free(struct buffer_head
* bh
)
504 struct bh_free_head
*head
= &free_list
[BUFSIZE_INDEX(bh
->b_size
)];
505 struct buffer_head
**bhp
= &head
->list
;
509 spin_lock(&head
->lock
);
513 bh
->b_prev_free
= bh
;
515 bh
->b_next_free
= *bhp
;
516 bh
->b_prev_free
= (*bhp
)->b_prev_free
;
517 (*bhp
)->b_prev_free
->b_next_free
= bh
;
518 (*bhp
)->b_prev_free
= bh
;
519 spin_unlock(&head
->lock
);
523 * Why like this, I hear you say... The reason is race-conditions.
524 * As we don't lock buffers (unless we are reading them, that is),
525 * something might happen to it while we sleep (ie a read-error
526 * will force it bad). This shouldn't really happen currently, but
/*
 * Look up the buffer for (dev, block, size) in the hash table under
 * the read lock, taking a reference (b_count) on a match before the
 * lock is dropped so the buffer cannot be recycled underneath the
 * caller.  Returns the matched buffer head.
 * NOTE(review): the device-comparison clause of the match condition
 * and the NULL-return path are missing from this extraction.
 */
529 struct buffer_head
* get_hash_table(kdev_t dev
, int block
, int size
)
531 struct buffer_head
**head
= &hash(dev
, block
);
532 struct buffer_head
*bh
;
534 read_lock(&hash_table_lock
);
535 for(bh
= *head
; bh
; bh
= bh
->b_next
)
536 if (bh
->b_blocknr
== block
&&
537 bh
->b_size
== size
&&
541 atomic_inc(&bh
->b_count
);
542 read_unlock(&hash_table_lock
);
547 unsigned int get_hardblocksize(kdev_t dev
)
550 * Get the hard sector size for the given device. If we don't know
551 * what it is, return 0.
553 if (hardsect_size
[MAJOR(dev
)] != NULL
) {
554 int blksize
= hardsect_size
[MAJOR(dev
)][MINOR(dev
)];
560 * We don't know what the hardware sector size for this device is.
561 * Return 0 indicating that we don't know.
566 /* If invalidate_buffers() will trash dirty buffers, it means some kind
567 of fs corruption is going on. Trashing dirty data always imply losing
568 information that was supposed to be just stored on the physical layer
571 Thus invalidate_buffers in general usage is not allowed to trash dirty
572 buffers. For example ioctl(BLKFLSBUF) expects dirty data to be preserved.
574 NOTE: In the case where the user removed a removable-media-disk even if
575 there's still dirty data not synced on disk (due to a bug in the device driver
576 or due to an error of the user), by not destroying the dirty buffers we could
577 generate corruption also on the next media inserted, thus a parameter is
578 necessary to handle this case in the most safe way possible (trying
579 to not corrupt also the new disk inserted with the data belonging to
580 the old now corrupted disk). Also for the ramdisk the natural thing
581 to do in order to release the ramdisk memory is to destroy dirty buffers.
583 These are two special cases. Normal usage implies that the device driver
584 issues a sync on the device (without waiting for I/O completion) and
585 then an invalidate_buffers call that doesn't trash dirty buffers. */
586 void __invalidate_buffers(kdev_t dev
, int destroy_dirty_buffers
)
589 struct buffer_head
* bh
, * bh_next
;
593 spin_lock(&lru_list_lock
);
594 for(nlist
= 0; nlist
< NR_LIST
; nlist
++) {
595 bh
= lru_list
[nlist
];
598 for (i
= nr_buffers_type
[nlist
]; i
> 0 ; bh
= bh_next
, i
--) {
599 bh_next
= bh
->b_next_free
;
600 if (bh
->b_dev
!= dev
)
602 if (buffer_locked(bh
)) {
603 atomic_inc(&bh
->b_count
);
604 spin_unlock(&lru_list_lock
);
607 spin_lock(&lru_list_lock
);
608 atomic_dec(&bh
->b_count
);
611 write_lock(&hash_table_lock
);
612 if (!atomic_read(&bh
->b_count
) &&
613 (destroy_dirty_buffers
|| !buffer_dirty(bh
))) {
614 __remove_from_queues(bh
);
617 write_unlock(&hash_table_lock
);
623 spin_unlock(&lru_list_lock
);
628 void set_blocksize(kdev_t dev
, int size
)
630 extern int *blksize_size
[];
632 struct buffer_head
* bh
, * bh_next
;
634 if (!blksize_size
[MAJOR(dev
)])
637 /* Size must be a power of two, and between 512 and PAGE_SIZE */
638 if (size
> PAGE_SIZE
|| size
< 512 || (size
& (size
-1)))
639 panic("Invalid blocksize passed to set_blocksize");
641 if (blksize_size
[MAJOR(dev
)][MINOR(dev
)] == 0 && size
== BLOCK_SIZE
) {
642 blksize_size
[MAJOR(dev
)][MINOR(dev
)] = size
;
645 if (blksize_size
[MAJOR(dev
)][MINOR(dev
)] == size
)
647 sync_buffers(dev
, 2);
648 blksize_size
[MAJOR(dev
)][MINOR(dev
)] = size
;
652 spin_lock(&lru_list_lock
);
653 for(nlist
= 0; nlist
< NR_LIST
; nlist
++) {
654 bh
= lru_list
[nlist
];
657 for (i
= nr_buffers_type
[nlist
]; i
> 0 ; bh
= bh_next
, i
--) {
658 bh_next
= bh
->b_next_free
;
659 if (bh
->b_dev
!= dev
|| bh
->b_size
== size
)
661 if (buffer_locked(bh
)) {
662 atomic_inc(&bh
->b_count
);
663 spin_unlock(&lru_list_lock
);
666 spin_lock(&lru_list_lock
);
667 atomic_dec(&bh
->b_count
);
670 write_lock(&hash_table_lock
);
671 if (!atomic_read(&bh
->b_count
)) {
672 if (buffer_dirty(bh
))
674 "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
675 kdevname(dev
), bh
->b_blocknr
, bh
->b_size
);
676 __remove_from_queues(bh
);
679 if (atomic_set_buffer_clean(bh
))
681 clear_bit(BH_Uptodate
, &bh
->b_state
);
684 "b_count %d, dev %s, block %lu, from %p\n",
685 atomic_read(&bh
->b_count
), bdevname(bh
->b_dev
),
686 bh
->b_blocknr
, __builtin_return_address(0));
688 write_unlock(&hash_table_lock
);
694 spin_unlock(&lru_list_lock
);
700 * We used to try various strange things. Let's not.
702 static void refill_freelist(int size
)
704 if (!grow_buffers(size
)) {
706 current
->policy
|= SCHED_YIELD
;
711 void init_buffer(struct buffer_head
*bh
, bh_end_io_t
*handler
, void *private)
713 bh
->b_list
= BUF_CLEAN
;
714 bh
->b_end_io
= handler
;
715 bh
->b_private
= private;
718 static void end_buffer_io_sync(struct buffer_head
*bh
, int uptodate
)
720 mark_buffer_uptodate(bh
, uptodate
);
724 static void end_buffer_io_bad(struct buffer_head
*bh
, int uptodate
)
726 mark_buffer_uptodate(bh
, uptodate
);
731 static void end_buffer_io_async(struct buffer_head
* bh
, int uptodate
)
733 static spinlock_t page_uptodate_lock
= SPIN_LOCK_UNLOCKED
;
735 struct buffer_head
*tmp
;
738 mark_buffer_uptodate(bh
, uptodate
);
740 /* This is a temporary buffer used for page I/O. */
747 * Be _very_ careful from here on. Bad things can happen if
748 * two buffer heads end IO at almost the same time and both
749 * decide that the page is now completely done.
751 * Async buffer_heads are here only as labels for IO, and get
752 * thrown away once the IO for this page is complete. IO is
753 * deemed complete once all buffers have been visited
754 * (b_count==0) and are now unlocked. We must make sure that
755 * only the _last_ buffer that decrements its count is the one
756 * that unlock the page..
758 spin_lock_irqsave(&page_uptodate_lock
, flags
);
760 atomic_dec(&bh
->b_count
);
761 tmp
= bh
->b_this_page
;
763 if (tmp
->b_end_io
== end_buffer_io_async
&& buffer_locked(tmp
))
765 tmp
= tmp
->b_this_page
;
768 /* OK, the async IO on this page is complete. */
769 spin_unlock_irqrestore(&page_uptodate_lock
, flags
);
772 * if none of the buffers had errors then we can set the
775 if (!PageError(page
))
776 SetPageUptodate(page
);
779 * Run the hooks that have to be done when a page I/O has completed.
781 if (PageTestandClearDecrAfter(page
))
782 atomic_dec(&nr_async_pages
);
789 spin_unlock_irqrestore(&page_uptodate_lock
, flags
);
794 * Ok, this is getblk, and it isn't very clear, again to hinder
795 * race-conditions. Most of the code is seldom used, (ie repeating),
796 * so it should be much more efficient than it looks.
798 * The algorithm is changed: hopefully better, and an elusive bug removed.
800 * 14.02.92: changed it to sync dirty buffers a bit: better performance
801 * when the filesystem starts to get full of dirty blocks (I hope).
803 struct buffer_head
* getblk(kdev_t dev
, int block
, int size
)
805 struct buffer_head
* bh
;
809 bh
= get_hash_table(dev
, block
, size
);
813 isize
= BUFSIZE_INDEX(size
);
814 spin_lock(&free_list
[isize
].lock
);
815 bh
= free_list
[isize
].list
;
817 __remove_from_free_list(bh
, isize
);
818 atomic_set(&bh
->b_count
, 1);
820 spin_unlock(&free_list
[isize
].lock
);
823 * OK, FINALLY we know that this buffer is the only one of
824 * its kind, we hold a reference (b_count>0), it is unlocked,
828 init_buffer(bh
, end_buffer_io_sync
, NULL
);
830 bh
->b_blocknr
= block
;
831 bh
->b_state
= 1 << BH_Mapped
;
833 /* Insert the buffer into the regular lists */
834 insert_into_queues(bh
);
841 * If we block while refilling the free list, somebody may
842 * create the buffer first ... search the hashes again.
844 refill_freelist(size
);
848 /* -1 -> no need to flush
850 1 -> sync flush (wait for I/O completion) */
851 static int balance_dirty_state(kdev_t dev
)
853 unsigned long dirty
, tot
, hard_dirty_limit
, soft_dirty_limit
;
855 dirty
= size_buffers_type
[BUF_DIRTY
] >> PAGE_SHIFT
;
856 tot
= nr_free_buffer_pages();
857 tot
-= size_buffers_type
[BUF_PROTECTED
] >> PAGE_SHIFT
;
860 soft_dirty_limit
= tot
* bdf_prm
.b_un
.nfract
;
861 hard_dirty_limit
= soft_dirty_limit
* 2;
863 if (dirty
> soft_dirty_limit
) {
864 if (dirty
> hard_dirty_limit
)
872 * if a new dirty buffer is created we need to balance bdflush.
874 * in the future we might want to make bdflush aware of different
875 * pressures on different devices - thus the (currently unused)
878 void balance_dirty(kdev_t dev
)
880 int state
= balance_dirty_state(dev
);
884 wakeup_bdflush(state
);
887 static __inline__
void __mark_dirty(struct buffer_head
*bh
, int flag
)
889 bh
->b_flushtime
= jiffies
+ (flag
? bdf_prm
.b_un
.age_super
: bdf_prm
.b_un
.age_buffer
);
893 /* atomic version, the user must call balance_dirty() by hand
894 as soon as it become possible to block */
895 void __mark_buffer_dirty(struct buffer_head
*bh
, int flag
)
897 if (!atomic_set_buffer_dirty(bh
))
898 __mark_dirty(bh
, flag
);
901 void mark_buffer_dirty(struct buffer_head
*bh
, int flag
)
903 __mark_buffer_dirty(bh
, flag
);
904 balance_dirty(bh
->b_dev
);
908 * A buffer may need to be moved from one buffer list to another
909 * (e.g. in case it is not shared any more). Handle this.
/*
 * Recompute which LRU list bh belongs on from its state flags and move
 * it there when it differs from its current b_list.  Precedence is the
 * textual order below: locked, then dirty, then protected (later
 * checks override earlier ones).  Caller holds lru_list_lock (see
 * refile_buffer).
 * NOTE(review): the dirty branch's assignment (presumably
 * dispose = BUF_DIRTY) is missing from this extraction.
 */
911 static void __refile_buffer(struct buffer_head
*bh
)
913 int dispose
= BUF_CLEAN
;
914 if (buffer_locked(bh
))
915 dispose
= BUF_LOCKED
;
916 if (buffer_dirty(bh
))
918 if (buffer_protected(bh
))
919 dispose
= BUF_PROTECTED
;
920 if (dispose
!= bh
->b_list
) {
921 __remove_from_lru_list(bh
, bh
->b_list
);
922 bh
->b_list
= dispose
;
923 __insert_into_lru_list(bh
, dispose
);
/*
 * Locked wrapper around __refile_buffer().
 * NOTE(review): the __refile_buffer(bh) call between the lock/unlock
 * pair is missing from this extraction.
 */
927 void refile_buffer(struct buffer_head
*bh
)
929 spin_lock(&lru_list_lock
);
931 spin_unlock(&lru_list_lock
);
935 * Release a buffer head
/*
 * Drop one reference to a buffer head.  Warns (rather than crashing)
 * when the count is already zero, which would indicate a double
 * release by the caller.
 */
937 void __brelse(struct buffer_head
* buf
)
939 if (atomic_read(&buf
->b_count
)) {
940 atomic_dec(&buf
->b_count
);
943 printk("VFS: brelse: Trying to free free buffer\n");
947 * bforget() is like brelse(), except it puts the buffer on the
948 * free list if it can.. We can NOT free the buffer if:
949 * - there are other users of it
950 * - it is locked and thus can have active IO
/*
 * Forget a buffer: drop the reference and, when that was the last
 * reference and the buffer is not locked for I/O, remove it from the
 * hash chain and LRU list entirely.  Otherwise (still referenced or
 * locked) only the unlock path at the bottom runs.  Lock order is
 * lru_list_lock then hash_table_lock, per the ordering comment at the
 * top of this file; the lru lock also blocks bdflush (per the comment
 * below).
 * NOTE(review): the branch structure and the __hash_unlink /
 * put_last_free calls between the visible lines are missing from this
 * extraction.
 */
952 void __bforget(struct buffer_head
* buf
)
954 /* grab the lru lock here to block bdflush. */
955 spin_lock(&lru_list_lock
);
956 write_lock(&hash_table_lock
);
957 if (!atomic_dec_and_test(&buf
->b_count
) || buffer_locked(buf
))
960 write_unlock(&hash_table_lock
);
961 __remove_from_lru_list(buf
, buf
->b_list
);
962 spin_unlock(&lru_list_lock
);
967 write_unlock(&hash_table_lock
);
968 spin_unlock(&lru_list_lock
);
972 * bread() reads a specified block and returns the buffer that contains
973 * it. It returns NULL if the block was unreadable.
/*
 * Read block <block> of <dev> at blocksize <size> and return the
 * buffer containing it (NULL on read failure, per the comment above).
 * Fast path: getblk() finds the block already uptodate in the cache.
 * Slow path: submit a READ via ll_rw_block and re-check uptodate.
 * NOTE(review): the return statements, the wait_on_buffer between
 * submit and re-check, and the failure path (brelse + NULL) are
 * missing from this extraction.
 */
975 struct buffer_head
* bread(kdev_t dev
, int block
, int size
)
977 struct buffer_head
* bh
;
979 bh
= getblk(dev
, block
, size
);
980 if (buffer_uptodate(bh
))
982 ll_rw_block(READ
, 1, &bh
);
984 if (buffer_uptodate(bh
))
991 * Ok, breada can be used as bread, but additionally to mark other
992 * blocks for reading as well. End the argument list with a negative
998 struct buffer_head
* breada(kdev_t dev
, int block
, int bufsize
,
999 unsigned int pos
, unsigned int filesize
)
1001 struct buffer_head
* bhlist
[NBUF
];
1002 unsigned int blocks
;
1003 struct buffer_head
* bh
;
1007 if (pos
>= filesize
)
1013 bh
= getblk(dev
, block
, bufsize
);
1014 index
= BUFSIZE_INDEX(bh
->b_size
);
1016 if (buffer_uptodate(bh
))
1018 else ll_rw_block(READ
, 1, &bh
);
1020 blocks
= (filesize
- pos
) >> (9+index
);
1022 if (blocks
< (read_ahead
[MAJOR(dev
)] >> index
))
1023 blocks
= read_ahead
[MAJOR(dev
)] >> index
;
1027 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */
1031 for(i
=1; i
<blocks
; i
++) {
1032 bh
= getblk(dev
,block
+i
,bufsize
);
1033 if (buffer_uptodate(bh
)) {
1037 else bhlist
[j
++] = bh
;
1040 /* Request the read for these buffers, and then release them. */
1042 ll_rw_block(READA
, (j
-1), bhlist
+1);
1046 /* Wait for this buffer, and then continue on. */
1049 if (buffer_uptodate(bh
))
1056 * Note: the caller should wake up the buffer_wait list if needed.
1058 static __inline__
void __put_unused_buffer_head(struct buffer_head
* bh
)
1060 if (nr_unused_buffer_heads
>= MAX_UNUSED_BUFFERS
) {
1061 kmem_cache_free(bh_cachep
, bh
);
1064 init_waitqueue_head(&bh
->b_wait
);
1065 nr_unused_buffer_heads
++;
1066 bh
->b_next_free
= unused_list
;
1067 bh
->b_this_page
= NULL
;
1073 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1074 * no-buffer-head deadlock. Return NULL on failure; waiting for
1075 * buffer heads is now handled in create_buffers().
1077 static struct buffer_head
* get_unused_buffer_head(int async
)
1079 struct buffer_head
* bh
;
1081 spin_lock(&unused_list_lock
);
1082 if (nr_unused_buffer_heads
> NR_RESERVED
) {
1084 unused_list
= bh
->b_next_free
;
1085 nr_unused_buffer_heads
--;
1086 spin_unlock(&unused_list_lock
);
1089 spin_unlock(&unused_list_lock
);
1091 /* This is critical. We can't swap out pages to get
1092 * more buffer heads, because the swap-out may need
1093 * more buffer-heads itself. Thus SLAB_BUFFER.
1095 if((bh
= kmem_cache_alloc(bh_cachep
, SLAB_BUFFER
)) != NULL
) {
1096 memset(bh
, 0, sizeof(*bh
));
1097 init_waitqueue_head(&bh
->b_wait
);
1102 * If we need an async buffer, use the reserved buffer heads.
1105 spin_lock(&unused_list_lock
);
1108 unused_list
= bh
->b_next_free
;
1109 nr_unused_buffer_heads
--;
1110 spin_unlock(&unused_list_lock
);
1113 spin_unlock(&unused_list_lock
);
1117 * (Pending further analysis ...)
1118 * Ordinary (non-async) requests can use a different memory priority
1119 * to free up pages. Any swapping thus generated will use async
1123 (bh
= kmem_cache_alloc(bh_cachep
, SLAB_KERNEL
)) != NULL
) {
1124 memset(bh
, 0, sizeof(*bh
));
1125 init_waitqueue_head(&bh
->b_wait
);
1133 void set_bh_page (struct buffer_head
*bh
, struct page
*page
, unsigned long offset
)
1136 if (offset
>= PAGE_SIZE
)
1138 if (PageHighMem(page
))
1140 * This catches illegal uses and preserves the offset:
1142 bh
->b_data
= (char *)(0 + offset
);
1144 bh
->b_data
= (char *)(page_address(page
) + offset
);
1148 * Create the appropriate buffers when given a page for data area and
1149 * the size of each buffer.. Use the bh->b_this_page linked list to
1150 * follow the buffers created. Return NULL if unable to create more
1152 * The async flag is used to differentiate async IO (paging, swapping)
1153 * from ordinary buffer allocations, and only async requests are allowed
1154 * to sleep waiting for buffer heads.
1156 static struct buffer_head
* create_buffers(struct page
* page
, unsigned long size
, int async
)
1158 struct buffer_head
*bh
, *head
;
1164 while ((offset
-= size
) >= 0) {
1165 bh
= get_unused_buffer_head(async
);
1169 bh
->b_dev
= B_FREE
; /* Flag as unused */
1170 bh
->b_this_page
= head
;
1174 bh
->b_next_free
= NULL
;
1176 atomic_set(&bh
->b_count
, 0);
1179 set_bh_page(bh
, page
, offset
);
1181 bh
->b_list
= BUF_CLEAN
;
1182 bh
->b_end_io
= end_buffer_io_bad
;
1186 * In case anything failed, we just free everything we got.
1190 spin_lock(&unused_list_lock
);
1193 head
= head
->b_this_page
;
1194 __put_unused_buffer_head(bh
);
1196 spin_unlock(&unused_list_lock
);
1198 /* Wake up any waiters ... */
1199 wake_up(&buffer_wait
);
1203 * Return failure for non-async IO requests. Async IO requests
1204 * are not allowed to fail, so we have to wait until buffer heads
1205 * become available. But we don't want tasks sleeping with
1206 * partially complete buffers, so all were released above.
1211 /* We're _really_ low on memory. Now we just
1212 * wait for old buffer heads to become free due to
1213 * finishing IO. Since this is an async request and
1214 * the reserve list is empty, we're sure there are
1215 * async buffer heads in use.
1217 run_task_queue(&tq_disk
);
1220 * Set our state for sleeping, then check again for buffer heads.
1221 * This ensures we won't miss a wake_up from an interrupt.
1223 wait_event(buffer_wait
, nr_unused_buffer_heads
>= MAX_BUF_PER_PAGE
);
1227 static int create_page_buffers(int rw
, struct page
*page
, kdev_t dev
, int b
[], int size
)
1229 struct buffer_head
*head
, *bh
, *tail
;
1232 if (!PageLocked(page
))
1235 * Allocate async buffer heads pointing to this page, just for I/O.
1236 * They don't show up in the buffer hash table, but they *are*
1237 * registered in page->buffers.
1239 head
= create_buffers(page
, size
, 1);
1245 for (bh
= head
; bh
; bh
= bh
->b_this_page
) {
1249 init_buffer(bh
, end_buffer_io_async
, NULL
);
1251 bh
->b_blocknr
= block
;
1253 set_bit(BH_Mapped
, &bh
->b_state
);
1255 tail
->b_this_page
= head
;
1256 page_cache_get(page
);
1257 page
->buffers
= head
;
1261 static void unmap_buffer(struct buffer_head
* bh
)
1263 if (buffer_mapped(bh
)) {
1264 mark_buffer_clean(bh
);
1266 clear_bit(BH_Uptodate
, &bh
->b_state
);
1267 clear_bit(BH_Mapped
, &bh
->b_state
);
1268 clear_bit(BH_Req
, &bh
->b_state
);
1269 clear_bit(BH_New
, &bh
->b_state
);
1274 * We don't have to release all buffers here, but
1275 * we have to be sure that no dirty buffer is left
1276 * and no IO is going on (no buffer is locked), because
1277 * we have truncated the file and are going to free the
1280 int block_flushpage(struct page
*page
, unsigned long offset
)
1282 struct buffer_head
*head
, *bh
, *next
;
1283 unsigned int curr_off
= 0;
1285 if (!PageLocked(page
))
1290 head
= page
->buffers
;
1293 unsigned int next_off
= curr_off
+ bh
->b_size
;
1294 next
= bh
->b_this_page
;
1297 * is this block fully flushed?
1299 if (offset
<= curr_off
)
1301 curr_off
= next_off
;
1303 } while (bh
!= head
);
1306 * subtle. We release buffer-heads only if this is
1307 * the 'final' flushpage. We have invalidated the get_block
1308 * cached value unconditionally, so real IO is not
1311 * If the free doesn't work out, the buffers can be
1312 * left around - they just turn into anonymous buffers
1316 if (!try_to_free_buffers(page
, 0)) {
1317 atomic_inc(&buffermem_pages
);
1325 static void create_empty_buffers(struct page
*page
, struct inode
*inode
, unsigned long blocksize
)
1327 struct buffer_head
*bh
, *head
, *tail
;
1329 head
= create_buffers(page
, blocksize
, 1);
1335 bh
->b_dev
= inode
->i_dev
;
1337 bh
->b_end_io
= end_buffer_io_bad
;
1339 bh
= bh
->b_this_page
;
1341 tail
->b_this_page
= head
;
1342 page
->buffers
= head
;
1343 page_cache_get(page
);
1346 static void unmap_underlying_metadata(struct buffer_head
* bh
)
1348 struct buffer_head
*old_bh
;
1350 old_bh
= get_hash_table(bh
->b_dev
, bh
->b_blocknr
, bh
->b_size
);
1352 unmap_buffer(old_bh
);
1353 /* Here we could run brelse or bforget. We use
1354 bforget because it will try to put the buffer
1361 * block_write_full_page() is SMP-safe - currently it's still
1362 * being called with the kernel lock held, but the code is ready.
1364 static int __block_write_full_page(struct inode
*inode
, struct page
*page
, get_block_t
*get_block
)
1366 int err
, i
, need_balance_dirty
= 0;
1367 unsigned long block
;
1368 struct buffer_head
*bh
, *head
;
1370 if (!PageLocked(page
))
1374 create_empty_buffers(page
, inode
, inode
->i_sb
->s_blocksize
);
1375 head
= page
->buffers
;
1377 block
= page
->index
<< (PAGE_CACHE_SHIFT
- inode
->i_sb
->s_blocksize_bits
);
1383 * If the buffer isn't up-to-date, we can't be sure
1384 * that the buffer has been initialized with the proper
1385 * block number information etc..
1387 * Leave it to the low-level FS to make all those
1388 * decisions (block #0 may actually be a valid block)
1390 bh
->b_end_io
= end_buffer_io_sync
;
1391 if (!buffer_mapped(bh
)) {
1392 err
= get_block(inode
, block
, bh
, 1);
1396 unmap_underlying_metadata(bh
);
1398 set_bit(BH_Uptodate
, &bh
->b_state
);
1399 if (!atomic_set_buffer_dirty(bh
)) {
1400 __mark_dirty(bh
, 0);
1401 need_balance_dirty
= 1;
1404 bh
= bh
->b_this_page
;
1406 } while (bh
!= head
);
1408 if (need_balance_dirty
)
1409 balance_dirty(bh
->b_dev
);
1411 SetPageUptodate(page
);
1414 ClearPageUptodate(page
);
1418 static int __block_prepare_write(struct inode
*inode
, struct page
*page
,
1419 unsigned from
, unsigned to
, get_block_t
*get_block
)
1421 unsigned block_start
, block_end
;
1422 unsigned long block
;
1424 unsigned blocksize
, bbits
;
1425 struct buffer_head
*bh
, *head
, *wait
[2], **wait_bh
=wait
;
1426 char *kaddr
= (char *)kmap(page
);
1428 blocksize
= inode
->i_sb
->s_blocksize
;
1430 create_empty_buffers(page
, inode
, blocksize
);
1431 head
= page
->buffers
;
1433 bbits
= inode
->i_sb
->s_blocksize_bits
;
1434 block
= page
->index
<< (PAGE_CACHE_SHIFT
- bbits
);
1436 for(bh
= head
, block_start
= 0; bh
!= head
|| !block_start
;
1437 block
++, block_start
=block_end
, bh
= bh
->b_this_page
) {
1440 block_end
= block_start
+blocksize
;
1441 if (block_end
<= from
)
1443 if (block_start
>= to
)
1445 bh
->b_end_io
= end_buffer_io_sync
;
1446 if (!buffer_mapped(bh
)) {
1447 err
= get_block(inode
, block
, bh
, 1);
1450 if (buffer_new(bh
)) {
1451 unmap_underlying_metadata(bh
);
1453 memset(kaddr
+to
, 0, block_end
-to
);
1454 if (block_start
< from
)
1455 memset(kaddr
+block_start
, 0, from
-block_start
);
1459 if (!buffer_uptodate(bh
) &&
1460 (block_start
< from
|| block_end
> to
)) {
1461 ll_rw_block(READ
, 1, &bh
);
1466 * If we issued read requests - let them complete.
1468 while(wait_bh
> wait
) {
1469 wait_on_buffer(*--wait_bh
);
1471 if (!buffer_uptodate(*wait_bh
))
1479 static int __block_commit_write(struct inode
*inode
, struct page
*page
,
1480 unsigned from
, unsigned to
)
1482 unsigned block_start
, block_end
;
1483 int partial
= 0, need_balance_dirty
= 0;
1485 struct buffer_head
*bh
, *head
;
1487 blocksize
= inode
->i_sb
->s_blocksize
;
1489 for(bh
= head
= page
->buffers
, block_start
= 0;
1490 bh
!= head
|| !block_start
;
1491 block_start
=block_end
, bh
= bh
->b_this_page
) {
1492 block_end
= block_start
+ blocksize
;
1493 if (block_end
<= from
|| block_start
>= to
) {
1494 if (!buffer_uptodate(bh
))
1497 set_bit(BH_Uptodate
, &bh
->b_state
);
1498 if (!atomic_set_buffer_dirty(bh
)) {
1499 __mark_dirty(bh
, 0);
1500 need_balance_dirty
= 1;
1505 if (need_balance_dirty
)
1506 balance_dirty(bh
->b_dev
);
1508 * is this a partial write that happened to make all buffers
1509 * uptodate then we can optimize away a bogus readpage() for
1510 * the next read(). Here we 'discover' wether the page went
1511 * uptodate as a result of this (potentially partial) write.
1514 SetPageUptodate(page
);
1519 * Generic "read page" function for block devices that have the normal
1520 * get_block functionality. This is most of the block device filesystems.
1521 * Reads the page asynchronously --- the unlock_buffer() and
1522 * mark_buffer_uptodate() functions propagate buffer state into the
1523 * page struct once IO has completed.
1525 int block_read_full_page(struct page
*page
, get_block_t
*get_block
)
1527 struct inode
*inode
= (struct inode
*)page
->mapping
->host
;
1528 unsigned long iblock
, lblock
;
1529 struct buffer_head
*bh
, *head
, *arr
[MAX_BUF_PER_PAGE
];
1530 unsigned int blocksize
, blocks
;
1531 unsigned long kaddr
= 0;
1534 if (!PageLocked(page
))
1536 blocksize
= inode
->i_sb
->s_blocksize
;
1538 create_empty_buffers(page
, inode
, blocksize
);
1539 head
= page
->buffers
;
1541 blocks
= PAGE_CACHE_SIZE
>> inode
->i_sb
->s_blocksize_bits
;
1542 iblock
= page
->index
<< (PAGE_CACHE_SHIFT
- inode
->i_sb
->s_blocksize_bits
);
1543 lblock
= (inode
->i_size
+blocksize
-1) >> inode
->i_sb
->s_blocksize_bits
;
1549 if (buffer_uptodate(bh
))
1552 if (!buffer_mapped(bh
)) {
1553 if (iblock
< lblock
)
1554 get_block(inode
, iblock
, bh
, 0);
1555 if (!buffer_mapped(bh
)) {
1558 memset((char *)(kaddr
+ i
*blocksize
), 0, blocksize
);
1559 set_bit(BH_Uptodate
, &bh
->b_state
);
1564 init_buffer(bh
, end_buffer_io_async
, NULL
);
1565 atomic_inc(&bh
->b_count
);
1568 } while (i
++, iblock
++, (bh
= bh
->b_this_page
) != head
);
1571 if (Page_Uptodate(page
))
1573 ll_rw_block(READ
, nr
, arr
);
1576 * all buffers are uptodate - we can set the page
1579 SetPageUptodate(page
);
1588 * For moronic filesystems that do not allow holes in file.
1589 * We may have to extend the file.
1592 int cont_prepare_write(struct page
*page
, unsigned offset
, unsigned to
, get_block_t
*get_block
, unsigned long *bytes
)
1594 struct address_space
*mapping
= page
->mapping
;
1595 struct inode
*inode
= (struct inode
*)mapping
->host
;
1596 struct page
*new_page
;
1597 unsigned long pgpos
;
1600 unsigned blocksize
= inode
->i_sb
->s_blocksize
;
1603 while(page
->index
> (pgpos
= *bytes
>>PAGE_CACHE_SHIFT
)) {
1605 new_page
= grab_cache_page(mapping
, pgpos
);
1608 /* we might sleep */
1609 if (*bytes
>>PAGE_CACHE_SHIFT
!= pgpos
) {
1610 UnlockPage(new_page
);
1611 page_cache_release(new_page
);
1614 zerofrom
= *bytes
& ~PAGE_CACHE_MASK
;
1615 if (zerofrom
& (blocksize
-1)) {
1616 *bytes
|= (blocksize
-1);
1619 status
= __block_prepare_write(inode
, new_page
, zerofrom
,
1620 PAGE_CACHE_SIZE
, get_block
);
1623 kaddr
= (char*)page_address(new_page
);
1624 memset(kaddr
+zerofrom
, 0, PAGE_CACHE_SIZE
-zerofrom
);
1625 __block_commit_write(inode
, new_page
, zerofrom
, PAGE_CACHE_SIZE
);
1627 UnlockPage(new_page
);
1628 page_cache_release(new_page
);
1631 if (page
->index
< pgpos
) {
1632 /* completely inside the area */
1635 /* page covers the boundary, find the boundary offset */
1636 zerofrom
= *bytes
& ~PAGE_CACHE_MASK
;
1638 /* if we will expand the thing last block will be filled */
1639 if (to
> zerofrom
&& (zerofrom
& (blocksize
-1))) {
1640 *bytes
|= (blocksize
-1);
1644 /* starting below the boundary? Nothing to zero out */
1645 if (offset
<= zerofrom
)
1648 status
= __block_prepare_write(inode
, page
, zerofrom
, to
, get_block
);
1651 kaddr
= (char*)page_address(page
);
1652 if (zerofrom
< offset
) {
1653 memset(kaddr
+zerofrom
, 0, offset
-zerofrom
);
1654 __block_commit_write(inode
, page
, zerofrom
, offset
);
1658 ClearPageUptodate(page
);
1663 ClearPageUptodate(new_page
);
1665 UnlockPage(new_page
);
1666 page_cache_release(new_page
);
1671 int block_prepare_write(struct page
*page
, unsigned from
, unsigned to
,
1672 get_block_t
*get_block
)
1674 struct inode
*inode
= (struct inode
*)page
->mapping
->host
;
1675 int err
= __block_prepare_write(inode
, page
, from
, to
, get_block
);
1677 ClearPageUptodate(page
);
1683 int generic_commit_write(struct file
*file
, struct page
*page
,
1684 unsigned from
, unsigned to
)
1686 struct inode
*inode
= (struct inode
*)page
->mapping
->host
;
1687 loff_t pos
= ((loff_t
)page
->index
<< PAGE_CACHE_SHIFT
) + to
;
1688 __block_commit_write(inode
,page
,from
,to
);
1690 if (pos
> inode
->i_size
)
1691 inode
->i_size
= pos
;
1695 int block_write_full_page(struct page
*page
, get_block_t
*get_block
)
1697 struct inode
*inode
= (struct inode
*)page
->mapping
->host
;
1698 unsigned long end_index
= inode
->i_size
>> PAGE_CACHE_SHIFT
;
1703 if (page
->index
< end_index
)
1704 return __block_write_full_page(inode
, page
, get_block
);
1706 /* things got complicated... */
1707 offset
= inode
->i_size
& (PAGE_CACHE_SIZE
-1);
1708 /* OK, are we completely out? */
1709 if (page
->index
>= end_index
+1 || !offset
)
1711 /* Sigh... will have to work, then... */
1712 err
= __block_prepare_write(inode
, page
, 0, offset
, get_block
);
1714 memset((char *)page_address(page
)+offset
, 0, PAGE_CACHE_SIZE
-offset
);
1715 __block_commit_write(inode
,page
,0,offset
);
1720 ClearPageUptodate(page
);
1724 int generic_block_bmap(struct address_space
*mapping
, long block
, get_block_t
*get_block
)
1726 struct buffer_head tmp
;
1727 struct inode
*inode
= (struct inode
*)mapping
->host
;
1730 get_block(inode
, block
, &tmp
, 0);
1731 return tmp
.b_blocknr
;
1735 * IO completion routine for a buffer_head being used for kiobuf IO: we
1736 * can't dispatch the kiobuf callback until io_count reaches 0.
1739 static void end_buffer_io_kiobuf(struct buffer_head
*bh
, int uptodate
)
1741 struct kiobuf
*kiobuf
;
1743 mark_buffer_uptodate(bh
, uptodate
);
1745 kiobuf
= bh
->b_private
;
1747 end_kio_request(kiobuf
, uptodate
);
1752 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
1753 * for them to complete. Clean up the buffer_heads afterwards.
1756 static int do_kio(int rw
, int nr
, struct buffer_head
*bh
[], int size
)
1760 struct buffer_head
*tmp
;
1764 ll_rw_block(rw
, nr
, bh
);
1767 spin_lock(&unused_list_lock
);
1769 for (i
= nr
; --i
>= 0; ) {
1772 if (buffer_locked(tmp
)) {
1773 spin_unlock(&unused_list_lock
);
1774 wait_on_buffer(tmp
);
1775 spin_lock(&unused_list_lock
);
1778 if (!buffer_uptodate(tmp
)) {
1779 /* We are traversing bh'es in reverse order so
1780 clearing iosize on error calculates the
1781 amount of IO before the first error. */
1784 __put_unused_buffer_head(tmp
);
1787 spin_unlock(&unused_list_lock
);
1793 * Start I/O on a physical range of kernel memory, defined by a vector
1794 * of kiobuf structs (much like a user-space iovec list).
1796 * The kiobuf must already be locked for IO. IO is submitted
1797 * asynchronously: you need to check page->locked, page->uptodate, and
1798 * maybe wait on page->wait.
1800 * It is up to the caller to make sure that there are enough blocks
1801 * passed in to completely map the iobufs to disk.
1804 int brw_kiovec(int rw
, int nr
, struct kiobuf
*iovec
[],
1805 kdev_t dev
, unsigned long b
[], int size
)
1815 unsigned long blocknr
;
1816 struct kiobuf
* iobuf
= NULL
;
1818 struct buffer_head
*tmp
, *bh
[KIO_MAX_SECTORS
];
1824 * First, do some alignment and validity checks
1826 for (i
= 0; i
< nr
; i
++) {
1828 if ((iobuf
->offset
& (size
-1)) ||
1829 (iobuf
->length
& (size
-1)))
1831 if (!iobuf
->nr_pages
)
1832 panic("brw_kiovec: iobuf not initialised");
1836 * OK to walk down the iovec doing page IO on each page we find.
1838 bufind
= bhind
= transferred
= err
= 0;
1839 for (i
= 0; i
< nr
; i
++) {
1841 offset
= iobuf
->offset
;
1842 length
= iobuf
->length
;
1845 for (pageind
= 0; pageind
< iobuf
->nr_pages
; pageind
++) {
1846 map
= iobuf
->maplist
[pageind
];
1852 while (length
> 0) {
1853 blocknr
= b
[bufind
++];
1854 tmp
= get_unused_buffer_head(0);
1860 tmp
->b_dev
= B_FREE
;
1862 set_bh_page(tmp
, map
, offset
);
1863 tmp
->b_this_page
= tmp
;
1865 init_buffer(tmp
, end_buffer_io_kiobuf
, iobuf
);
1867 tmp
->b_blocknr
= blocknr
;
1868 tmp
->b_state
= 1 << BH_Mapped
;
1871 set_bit(BH_Uptodate
, &tmp
->b_state
);
1872 set_bit(BH_Dirty
, &tmp
->b_state
);
1879 atomic_inc(&iobuf
->io_count
);
1882 * Start the IO if we have got too much
1884 if (bhind
>= KIO_MAX_SECTORS
) {
1885 err
= do_kio(rw
, bhind
, bh
, size
);
1893 if (offset
>= PAGE_SIZE
) {
1897 } /* End of block loop */
1898 } /* End of page loop */
1899 } /* End of iovec loop */
1901 /* Is there any IO still left to submit? */
1903 err
= do_kio(rw
, bhind
, bh
, size
);
1916 /* We got an error allocating the bh'es. Just free the current
1917 buffer_heads and exit. */
1918 spin_lock(&unused_list_lock
);
1919 for (i
= bhind
; --i
>= 0; ) {
1920 __put_unused_buffer_head(bh
[bhind
]);
1922 spin_unlock(&unused_list_lock
);
1927 * Start I/O on a page.
1928 * This function expects the page to be locked and may return
1929 * before I/O is complete. You then have to check page->locked,
1930 * page->uptodate, and maybe wait on page->wait.
1932 * brw_page() is SMP-safe, although it's being called with the
1933 * kernel lock held - but the code is ready.
1935 * FIXME: we need a swapper_inode->get_block function to remove
1936 * some of the bmap kludges and interface ugliness here.
1938 int brw_page(int rw
, struct page
*page
, kdev_t dev
, int b
[], int size
)
1940 struct buffer_head
*head
, *bh
, *arr
[MAX_BUF_PER_PAGE
];
1941 int nr
, fresh
/* temporary debugging flag */, block
;
1943 if (!PageLocked(page
))
1944 panic("brw_page: page not locked for I/O");
1945 // ClearPageError(page);
1947 * We pretty much rely on the page lock for this, because
1948 * create_page_buffers() might sleep.
1951 if (!page
->buffers
) {
1952 create_page_buffers(rw
, page
, dev
, b
, size
);
1958 head
= page
->buffers
;
1964 if (fresh
&& (atomic_read(&bh
->b_count
) != 0))
1969 if (!buffer_uptodate(bh
)) {
1971 atomic_inc(&bh
->b_count
);
1973 } else { /* WRITE */
1974 if (!bh
->b_blocknr
) {
1977 bh
->b_blocknr
= block
;
1982 set_bit(BH_Uptodate
, &bh
->b_state
);
1983 set_bit(BH_Dirty
, &bh
->b_state
);
1985 atomic_inc(&bh
->b_count
);
1987 bh
= bh
->b_this_page
;
1988 } while (bh
!= head
);
1989 if ((rw
== READ
) && nr
) {
1990 if (Page_Uptodate(page
))
1992 ll_rw_block(rw
, nr
, arr
);
1994 if (!nr
&& rw
== READ
) {
1995 SetPageUptodate(page
);
1998 if (nr
&& (rw
== WRITE
))
1999 ll_rw_block(rw
, nr
, arr
);
2004 int block_symlink(struct inode
*inode
, const char *symname
, int len
)
2006 struct address_space
*mapping
= inode
->i_mapping
;
2007 struct page
*page
= grab_cache_page(mapping
, 0);
2013 err
= mapping
->a_ops
->prepare_write(NULL
, page
, 0, len
-1);
2016 kaddr
= (char*)page_address(page
);
2017 memcpy(kaddr
, symname
, len
-1);
2018 mapping
->a_ops
->commit_write(NULL
, page
, 0, len
-1);
2020 * Notice that we are _not_ going to block here - end of page is
2021 * unmapped, so this will only try to map the rest of page, see
2022 * that it is unmapped (typically even will not look into inode -
2023 * ->i_size will be enough for everything) and zero it out.
2024 * OTOH it's obviously correct and should make the page up-to-date.
2026 err
= mapping
->a_ops
->readpage(NULL
, page
);
2028 page_cache_release(page
);
2031 mark_inode_dirty(inode
);
2035 page_cache_release(page
);
2041 * Try to increase the number of buffers available: the size argument
2042 * is used to determine what kind of buffers we want.
2044 static int grow_buffers(int size
)
2047 struct buffer_head
*bh
, *tmp
;
2048 struct buffer_head
* insert_point
;
2051 if ((size
& 511) || (size
> PAGE_SIZE
)) {
2052 printk("VFS: grow_buffers: size = %d\n",size
);
2056 page
= alloc_page(GFP_BUFFER
);
2059 bh
= create_buffers(page
, size
, 0);
2061 goto no_buffer_head
;
2063 isize
= BUFSIZE_INDEX(size
);
2065 spin_lock(&free_list
[isize
].lock
);
2066 insert_point
= free_list
[isize
].list
;
2070 tmp
->b_next_free
= insert_point
->b_next_free
;
2071 tmp
->b_prev_free
= insert_point
;
2072 insert_point
->b_next_free
->b_prev_free
= tmp
;
2073 insert_point
->b_next_free
= tmp
;
2075 tmp
->b_prev_free
= tmp
;
2076 tmp
->b_next_free
= tmp
;
2079 if (tmp
->b_this_page
)
2080 tmp
= tmp
->b_this_page
;
2084 tmp
->b_this_page
= bh
;
2085 free_list
[isize
].list
= bh
;
2086 spin_unlock(&free_list
[isize
].lock
);
2089 page
->flags
&= ~(1 << PG_referenced
);
2090 lru_cache_add(page
);
2091 atomic_inc(&buffermem_pages
);
2095 page_cache_release(page
);
2101 * Sync all the buffers on one page..
2103 * If we have old buffers that are locked, we'll
2104 * wait on them, but we won't wait on the new ones
2105 * we're writing out now.
2107 * This all is required so that we can free up memory
2110 static void sync_page_buffers(struct buffer_head
*bh
, int wait
)
2112 struct buffer_head
* tmp
= bh
;
2115 struct buffer_head
*p
= tmp
;
2116 tmp
= tmp
->b_this_page
;
2117 if (buffer_locked(p
)) {
2119 __wait_on_buffer(p
);
2120 } else if (buffer_dirty(p
))
2121 ll_rw_block(WRITE
, 1, &p
);
2122 } while (tmp
!= bh
);
2126 * Can the buffer be thrown out?
2128 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
2129 #define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2132 * try_to_free_buffers() checks if all the buffers on this particular page
2133 * are unused, and free's the page if so.
2135 * Wake up bdflush() if this fails - if we're running low on memory due
2136 * to dirty buffers, we need to flush them out as quickly as possible.
2138 * NOTE: There are quite a number of ways that threads of control can
2139 * obtain a reference to a buffer head within a page. So we must
2140 * lock out all of these paths to cleanly toss the page.
2142 int try_to_free_buffers(struct page
* page
, int wait
)
2144 struct buffer_head
* tmp
, * bh
= page
->buffers
;
2145 int index
= BUFSIZE_INDEX(bh
->b_size
);
2147 spin_lock(&lru_list_lock
);
2148 write_lock(&hash_table_lock
);
2149 spin_lock(&free_list
[index
].lock
);
2152 struct buffer_head
*p
= tmp
;
2154 tmp
= tmp
->b_this_page
;
2156 goto busy_buffer_page
;
2157 } while (tmp
!= bh
);
2159 spin_lock(&unused_list_lock
);
2162 struct buffer_head
* p
= tmp
;
2163 tmp
= tmp
->b_this_page
;
2165 /* The buffer can be either on the regular
2166 * queues or on the free list..
2168 if (p
->b_dev
!= B_FREE
)
2169 __remove_from_queues(p
);
2171 __remove_from_free_list(p
, index
);
2172 __put_unused_buffer_head(p
);
2173 } while (tmp
!= bh
);
2174 spin_unlock(&unused_list_lock
);
2176 /* Wake up anyone waiting for buffer heads */
2177 wake_up(&buffer_wait
);
2179 /* And free the page */
2180 page
->buffers
= NULL
;
2181 page_cache_release(page
);
2182 spin_unlock(&free_list
[index
].lock
);
2183 write_unlock(&hash_table_lock
);
2184 spin_unlock(&lru_list_lock
);
2188 /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2189 spin_unlock(&free_list
[index
].lock
);
2190 write_unlock(&hash_table_lock
);
2191 spin_unlock(&lru_list_lock
);
2192 sync_page_buffers(bh
, wait
);
2196 /* ================== Debugging =================== */
2198 void show_buffers(void)
2201 struct buffer_head
* bh
;
2202 int found
= 0, locked
= 0, dirty
= 0, used
= 0, lastused
= 0;
2205 static char *buf_types
[NR_LIST
] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
2208 printk("Buffer memory: %6dkB\n",
2209 atomic_read(&buffermem_pages
) << (PAGE_SHIFT
-10));
2211 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2212 if (!spin_trylock(&lru_list_lock
))
2214 for(nlist
= 0; nlist
< NR_LIST
; nlist
++) {
2215 found
= locked
= dirty
= used
= lastused
= protected = 0;
2216 bh
= lru_list
[nlist
];
2221 if (buffer_locked(bh
))
2223 if (buffer_protected(bh
))
2225 if (buffer_dirty(bh
))
2227 if (atomic_read(&bh
->b_count
))
2228 used
++, lastused
= found
;
2229 bh
= bh
->b_next_free
;
2230 } while (bh
!= lru_list
[nlist
]);
2232 int tmp
= nr_buffers_type
[nlist
];
2234 printk("%9s: BUG -> found %d, reported %d\n",
2235 buf_types
[nlist
], found
, tmp
);
2237 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2238 "%d locked, %d protected, %d dirty\n",
2239 buf_types
[nlist
], found
, size_buffers_type
[nlist
]>>10,
2240 used
, lastused
, locked
, protected, dirty
);
2242 spin_unlock(&lru_list_lock
);
2246 /* ===================== Init ======================= */
2249 * allocate the hash table and init the free list
2250 * Use gfp() for the hash table to decrease TLB misses, use
2251 * SLAB cache for buffer heads.
2253 void __init
buffer_init(unsigned long mempages
)
2256 unsigned int nr_hash
;
2258 /* The buffer cache hash table is less important these days,
2263 mempages
*= sizeof(struct buffer_head
*);
2265 for (order
= 0; (1 << order
) < mempages
; order
++)
2268 /* try to allocate something until we get it or we're asking
2269 for something that is really too small */
2274 nr_hash
= (PAGE_SIZE
<< order
) / sizeof(struct buffer_head
*);
2275 bh_hash_mask
= (nr_hash
- 1);
2279 while((tmp
>>= 1UL) != 0UL)
2282 hash_table
= (struct buffer_head
**)
2283 __get_free_pages(GFP_ATOMIC
, order
);
2284 } while (hash_table
== NULL
&& --order
> 0);
2285 printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2286 nr_hash
, order
, (PAGE_SIZE
<< order
));
2289 panic("Failed to allocate buffer hash table\n");
2291 /* Setup hash chains. */
2292 for(i
= 0; i
< nr_hash
; i
++)
2293 hash_table
[i
] = NULL
;
2295 /* Setup free lists. */
2296 for(i
= 0; i
< NR_SIZES
; i
++) {
2297 free_list
[i
].list
= NULL
;
2298 free_list
[i
].lock
= SPIN_LOCK_UNLOCKED
;
2301 /* Setup lru lists. */
2302 for(i
= 0; i
< NR_LIST
; i
++)
2305 bh_cachep
= kmem_cache_create("buffer_head",
2306 sizeof(struct buffer_head
),
2308 SLAB_HWCACHE_ALIGN
, NULL
, NULL
);
2310 panic("Cannot create buffer head SLAB cache\n");
2314 /* ====================== bdflush support =================== */
2316 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2317 * response to dirty buffers. Once this process is activated, we write back
2318 * a limited number of buffers to the disks and then go back to sleep again.
2320 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done
);
2321 struct task_struct
*bdflush_tsk
= 0;
2323 void wakeup_bdflush(int block
)
2325 DECLARE_WAITQUEUE(wait
, current
);
2327 if (current
== bdflush_tsk
)
2331 wake_up_process(bdflush_tsk
);
2335 /* kflushd can wakeup us before we have a chance to
2336 go to sleep so we must be smart in handling
2337 this wakeup event from kflushd to avoid deadlocking in SMP
2338 (we are not holding any lock anymore in these two paths). */
2339 __set_current_state(TASK_UNINTERRUPTIBLE
);
2340 add_wait_queue(&bdflush_done
, &wait
);
2342 wake_up_process(bdflush_tsk
);
2345 remove_wait_queue(&bdflush_done
, &wait
);
2346 __set_current_state(TASK_RUNNING
);
2349 /* This is the _only_ function that deals with flushing async writes
2351 NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
2352 as all dirty buffers lives _only_ in the DIRTY lru list.
2353 As we never browse the LOCKED and CLEAN lru lists they are infact
2354 completly useless. */
2355 static int flush_dirty_buffers(int check_flushtime
)
2357 struct buffer_head
* bh
, *next
;
2361 spin_lock(&lru_list_lock
);
2362 bh
= lru_list
[BUF_DIRTY
];
2365 for (i
= nr_buffers_type
[BUF_DIRTY
]; i
-- > 0; bh
= next
) {
2366 next
= bh
->b_next_free
;
2368 if (!buffer_dirty(bh
)) {
2369 __refile_buffer(bh
);
2372 if (buffer_locked(bh
))
2375 if (check_flushtime
) {
2376 /* The dirty lru list is chronologically ordered so
2377 if the current bh is not yet timed out,
2378 then also all the following bhs
2379 will be too young. */
2380 if (time_before(jiffies
, bh
->b_flushtime
))
2383 if (++flushed
> bdf_prm
.b_un
.ndirty
)
2387 /* OK, now we are committed to write it out. */
2388 atomic_inc(&bh
->b_count
);
2389 spin_unlock(&lru_list_lock
);
2390 ll_rw_block(WRITE
, 1, &bh
);
2391 atomic_dec(&bh
->b_count
);
2393 if (current
->need_resched
)
2398 spin_unlock(&lru_list_lock
);
2404 * Here we attempt to write back old buffers. We also try to flush inodes
2405 * and supers as well, since this function is essentially "update", and
2406 * otherwise there would be no way of ensuring that these quantities ever
2407 * get written back. Ideally, we would have a timestamp on the inodes
2408 * and superblocks so that we could write back only the old ones as well
2411 static int sync_old_buffers(void)
2418 flush_dirty_buffers(1);
2419 /* must really sync all the active I/O request to disk here */
2420 run_task_queue(&tq_disk
);
2424 int block_sync_page(struct page
*page
)
2426 run_task_queue(&tq_disk
);
2430 /* This is the interface to bdflush. As we get more sophisticated, we can
2431 * pass tuning parameters to this "process", to adjust how it behaves.
2432 * We would want to verify each parameter, however, to make sure that it
2435 asmlinkage
long sys_bdflush(int func
, long data
)
2437 if (!capable(CAP_SYS_ADMIN
))
2441 /* do_exit directly and let kupdate to do its work alone. */
2443 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2444 a syscall that doesn't care about the current mm context. */
2446 struct mm_struct
*user_mm
;
2449 * bdflush will spend all of it's time in kernel-space,
2450 * without touching user-space, so we can switch it into
2451 * 'lazy TLB mode' to reduce the cost of context-switches
2452 * to and from bdflush.
2454 user_mm
= start_lazy_tlb();
2455 error
= sync_old_buffers();
2456 end_lazy_tlb(user_mm
);
2461 /* Basically func 1 means read param 1, 2 means write param 1, etc */
2463 int i
= (func
-2) >> 1;
2464 if (i
>= 0 && i
< N_PARAM
) {
2465 if ((func
& 1) == 0)
2466 return put_user(bdf_prm
.data
[i
], (int*)data
);
2468 if (data
>= bdflush_min
[i
] && data
<= bdflush_max
[i
]) {
2469 bdf_prm
.data
[i
] = data
;
2476 /* Having func 0 used to launch the actual bdflush and then never
2477 * return (unless explicitly killed). We return zero here to
2478 * remain semi-compatible with present update(8) programs.
2484 * This is the actual bdflush daemon itself. It used to be started from
2485 * the syscall above, but now we launch it ourselves internally with
2486 * kernel_thread(...) directly after the first thread in init/main.c
2488 int bdflush(void *sem
)
2490 struct task_struct
*tsk
= current
;
2493 * We have a bare-bones task_struct, and really should fill
2494 * in a few more things so "top" and /proc/2/{exe,root,cwd}
2495 * display semi-sane things. Not real crucial though...
2500 strcpy(tsk
->comm
, "kflushd");
2503 /* avoid getting signals */
2504 spin_lock_irq(&tsk
->sigmask_lock
);
2506 sigfillset(&tsk
->blocked
);
2507 recalc_sigpending(tsk
);
2508 spin_unlock_irq(&tsk
->sigmask_lock
);
2510 up((struct semaphore
*)sem
);
2513 CHECK_EMERGENCY_SYNC
2515 flushed
= flush_dirty_buffers(0);
2517 /* If wakeup_bdflush will wakeup us
2518 after our bdflush_done wakeup, then
2519 we must make sure to not sleep
2520 in schedule_timeout otherwise
2521 wakeup_bdflush may wait for our
2522 bdflush_done wakeup that would never arrive
2523 (as we would be sleeping) and so it would
2525 __set_current_state(TASK_INTERRUPTIBLE
);
2526 wake_up(&bdflush_done
);
2528 * If there are still a lot of dirty buffers around,
2529 * skip the sleep and flush some more. Otherwise, we
2530 * go to sleep waiting a wakeup.
2532 if (!flushed
|| balance_dirty_state(NODEV
) < 0)
2534 /* Remember to mark us as running otherwise
2535 the next schedule will block. */
2536 __set_current_state(TASK_RUNNING
);
2541 * This is the kernel update daemon. It was used to live in userspace
2542 * but since it's need to run safely we want it unkillable by mistake.
2543 * You don't need to change your userspace configuration since
2544 * the userspace `update` will do_exit(0) at the first sys_bdflush().
2546 int kupdate(void *sem
)
2548 struct task_struct
* tsk
= current
;
2553 strcpy(tsk
->comm
, "kupdate");
2555 /* sigstop and sigcont will stop and wakeup kupdate */
2556 spin_lock_irq(&tsk
->sigmask_lock
);
2557 sigfillset(&tsk
->blocked
);
2558 siginitsetinv(¤t
->blocked
, sigmask(SIGCONT
) | sigmask(SIGSTOP
));
2559 recalc_sigpending(tsk
);
2560 spin_unlock_irq(&tsk
->sigmask_lock
);
2562 up((struct semaphore
*)sem
);
2565 /* update interval */
2566 interval
= bdf_prm
.b_un
.interval
;
2568 tsk
->state
= TASK_INTERRUPTIBLE
;
2569 schedule_timeout(interval
);
2572 tsk
->state
= TASK_STOPPED
;
2573 schedule(); /* wait for SIGCONT */
2575 /* check for sigstop */
2576 if (signal_pending(tsk
)) {
2578 spin_lock_irq(&tsk
->sigmask_lock
);
2579 if (sigismember(&tsk
->signal
, SIGSTOP
)) {
2580 sigdelset(&tsk
->signal
, SIGSTOP
);
2583 recalc_sigpending(tsk
);
2584 spin_unlock_irq(&tsk
->sigmask_lock
);
2589 printk("kupdate() activated...\n");
2595 static int __init
bdflush_init(void)
2597 DECLARE_MUTEX_LOCKED(sem
);
2598 kernel_thread(bdflush
, &sem
, CLONE_FS
| CLONE_FILES
| CLONE_SIGHAND
);
2600 kernel_thread(kupdate
, &sem
, CLONE_FS
| CLONE_FILES
| CLONE_SIGHAND
);
2605 module_init(bdflush_init
)