fs/buffer.c

   1 /*
   2  *  linux/fs/buffer.c
   3  *
   4  *  Copyright (C) 1991, 1992  Linus Torvalds
   5  */
   6
   7 /*
   8  *  'buffer.c' implements the buffer-cache functions. Race-conditions have
   9  * been avoided by NEVER letting an interrupt change a buffer (except for the
  10  * data, of course), but instead letting the caller do it.
  11  */
  12
  13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
  14
  15 /* Removed a lot of unnecessary code and simplified things now that
  16  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  17  */
  18
  19 /* Speed up hash, lru, and free list operations.  Use gfp() for allocating
  20  * hash table, use SLAB cache for buffer heads. -DaveM
  21  */
  22
  23 /* Added 32k buffer block sizes - these are required older ARM systems.
  24  * - RMK
  25  */
  26
  27 /* Thread it... -DaveM */
  28
  29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
  30
  31 #include <linux/config.h>
  32 #include <linux/sched.h>
  33 #include <linux/fs.h>
  34 #include <linux/malloc.h>
  35 #include <linux/locks.h>
  36 #include <linux/errno.h>
  37 #include <linux/swap.h>
  38 #include <linux/swapctl.h>
  39 #include <linux/smp_lock.h>
  40 #include <linux/vmalloc.h>
  41 #include <linux/blkdev.h>
  42 #include <linux/sysrq.h>
  43 #include <linux/file.h>
  44 #include <linux/init.h>
  45 #include <linux/quotaops.h>
  46 #include <linux/iobuf.h>
  47 #include <linux/highmem.h>
  48
  49 #include <asm/uaccess.h>
  50 #include <asm/io.h>
  51 #include <asm/bitops.h>
  52 #include <asm/mmu_context.h>
  53
  54 #define NR_SIZES 7
  55 static char buffersize_index[65] =
  56 {-1,  0,  1, -1,  2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
  57   4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  58   5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  59  -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  60   6};
  61
  62 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
  63 #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
  64 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
  65 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
  66                                              number of unused buffer heads */
  67
  68 /* Anti-deadlock ordering:
  69  *      lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
  70  */
  71
  72 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers)
  73
  74 /*
  75  * Hash table gook..
  76  */
  77 static unsigned int bh_hash_mask;
  78 static unsigned int bh_hash_shift;
  79 static struct buffer_head **hash_table;
  80 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
  81
  82 static struct buffer_head *lru_list[NR_LIST];
  83 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
  84 static int nr_buffers_type[NR_LIST];
  85 static unsigned long size_buffers_type[NR_LIST];
  86
  87 static struct buffer_head * unused_list;
  88 static int nr_unused_buffer_heads;
  89 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
  90 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
  91
  92 struct bh_free_head {
  93         struct buffer_head *list;
  94         spinlock_t lock;
  95 };
  96 static struct bh_free_head free_list[NR_SIZES];
  97
  98 static int grow_buffers(int size);
  99 static void __refile_buffer(struct buffer_head *);
 100
 101 /* This is used by some architectures to estimate available memory. */
 102 atomic_t buffermem_pages = ATOMIC_INIT(0);
 103
 104 /* Here is the parameter block for the bdflush process. If you add or
 105  * remove any of the parameters, make sure to update kernel/sysctl.c.
 106  */
 107
 108 #define N_PARAM 9
 109
 110 /* The dummy values in this structure are left in there for compatibility
 111  * with old programs that play with the /proc entries.
 112  */
 113 union bdflush_param {
 114         struct {
 115                 int nfract;  /* Percentage of buffer cache dirty to
 116                                 activate bdflush */
 117                 int ndirty;  /* Maximum number of dirty blocks to write out per
 118                                 wake-cycle */
 119                 int nrefill; /* Number of clean buffers to try to obtain
 120                                 each time we call refill */
 121                 int nref_dirt; /* Dirty buffer threshold for activating bdflush
 122                                   when trying to refill buffers. */
 123                 int interval; /* jiffies delay between kupdate flushes */
 124                 int age_buffer;  /* Time for normal buffer to age before we flush it */
 125                 int dummy1;    /* unused, was age_super */
 126                 int dummy2;    /* unused */
 127                 int dummy3;    /* unused */
 128         } b_un;
 129         unsigned int data[N_PARAM];
 130 } bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}};
 131
 132 /* These are the min and max parameter values that we will allow to be assigned */
 133 int bdflush_min[N_PARAM] = {  0,  10,    5,   25,  0,   1*HZ,   1*HZ, 1, 1};
 134 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 6000*HZ, 2047, 5};
 135
 136 /*
 137  * Rewrote the wait-routines to use the "new" wait-queue functionality,
 138  * and getting rid of the cli-sti pairs. The wait-queue routines still
 139  * need cli-sti, but now it's just a couple of 386 instructions or so.
 140  *
 141  * Note that the real wait_on_buffer() is an inline function that checks
 142  * if 'b_wait' is set before calling this, so that the queues aren't set
 143  * up unnecessarily.
 144  */
 145 void __wait_on_buffer(struct buffer_head * bh)
 146 {
 147         struct task_struct *tsk = current;
 148         DECLARE_WAITQUEUE(wait, tsk);
 149
 150         atomic_inc(&bh->b_count);
 151         add_wait_queue(&bh->b_wait, &wait);
 152         do {
 153                 run_task_queue(&tq_disk);
 154                 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 155                 if (!buffer_locked(bh))
 156                         break;
 157                 schedule();
 158         } while (buffer_locked(bh));
 159         tsk->state = TASK_RUNNING;
 160         remove_wait_queue(&bh->b_wait, &wait);
 161         atomic_dec(&bh->b_count);
 162 }
 163
 164 /* Call sync_buffers with wait!=0 to ensure that the call does not
 165  * return until all buffer writes have completed.  Sync() may return
 166  * before the writes have finished; fsync() may not.
 167  */
 168
 169 /* Godamity-damn.  Some buffers (bitmaps for filesystems)
 170  * spontaneously dirty themselves without ever brelse being called.
 171  * We will ultimately want to put these in a separate list, but for
 172  * now we search all of the lists for dirty buffers.
 173  */
 174 static int sync_buffers(kdev_t dev, int wait)
 175 {
 176         int i, retry, pass = 0, err = 0;
 177         struct buffer_head * bh, *next;
 178
 179         /* One pass for no-wait, three for wait:
 180          * 0) write out all dirty, unlocked buffers;
 181          * 1) write out all dirty buffers, waiting if locked;
 182          * 2) wait for completion by waiting for all buffers to unlock.
 183          */
 184         do {
 185                 retry = 0;
 186
 187                 /* We search all lists as a failsafe mechanism, not because we expect
 188                  * there to be dirty buffers on any of the other lists.
 189                  */
 190 repeat:
 191                 spin_lock(&lru_list_lock);
 192                 bh = lru_list[BUF_DIRTY];
 193                 if (!bh)
 194                         goto repeat2;
 195
 196                 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
 197                         next = bh->b_next_free;
 198
 199                         if (!lru_list[BUF_DIRTY])
 200                                 break;
 201                         if (dev && bh->b_dev != dev)
 202                                 continue;
 203                         if (buffer_locked(bh)) {
 204                                 /* Buffer is locked; skip it unless wait is
 205                                  * requested AND pass > 0.
 206                                  */
 207                                 if (!wait || !pass) {
 208                                         retry = 1;
 209                                         continue;
 210                                 }
 211                                 atomic_inc(&bh->b_count);
 212                                 spin_unlock(&lru_list_lock);
 213                                 wait_on_buffer (bh);
 214                                 atomic_dec(&bh->b_count);
 215                                 goto repeat;
 216                         }
 217
 218                         /* If an unlocked buffer is not uptodate, there has
 219                          * been an IO error. Skip it.
 220                          */
 221                         if (wait && buffer_req(bh) && !buffer_locked(bh) &&
 222                             !buffer_dirty(bh) && !buffer_uptodate(bh)) {
 223                                 err = -EIO;
 224                                 continue;
 225                         }
 226
 227                         /* Don't write clean buffers.  Don't write ANY buffers
 228                          * on the third pass.
 229                          */
 230                         if (!buffer_dirty(bh) || pass >= 2)
 231                                 continue;
 232
 233                         atomic_inc(&bh->b_count);
 234                         spin_unlock(&lru_list_lock);
 235                         ll_rw_block(WRITE, 1, &bh);
 236                         atomic_dec(&bh->b_count);
 237                         retry = 1;
 238                         goto repeat;
 239                 }
 240
 241     repeat2:
 242                 bh = lru_list[BUF_LOCKED];
 243                 if (!bh) {
 244                         spin_unlock(&lru_list_lock);
 245                         break;
 246                 }
 247                 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
 248                         next = bh->b_next_free;
 249
 250                         if (!lru_list[BUF_LOCKED])
 251                                 break;
 252                         if (dev && bh->b_dev != dev)
 253                                 continue;
 254                         if (buffer_locked(bh)) {
 255                                 /* Buffer is locked; skip it unless wait is
 256                                  * requested AND pass > 0.
 257                                  */
 258                                 if (!wait || !pass) {
 259                                         retry = 1;
 260                                         continue;
 261                                 }
 262                                 atomic_inc(&bh->b_count);
 263                                 spin_unlock(&lru_list_lock);
 264                                 wait_on_buffer (bh);
 265                                 spin_lock(&lru_list_lock);
 266                                 atomic_dec(&bh->b_count);
 267                                 goto repeat2;
 268                         }
 269                 }
 270                 spin_unlock(&lru_list_lock);
 271
 272                 /* If we are waiting for the sync to succeed, and if any dirty
 273                  * blocks were written, then repeat; on the second pass, only
 274                  * wait for buffers being written (do not pass to write any
 275                  * more buffers on the second pass).
 276                  */
 277         } while (wait && retry && ++pass<=2);
 278         return err;
 279 }
 280
 281 void sync_dev(kdev_t dev)
 282 {
 283         sync_supers(dev);
 284         sync_inodes(dev);
 285         DQUOT_SYNC(dev);
 286         /* sync all the dirty buffers out to disk only _after_ all the
 287            high level layers finished generated buffer dirty data
 288            (or we'll return with some buffer still dirty on the blockdevice
 289            so breaking the semantics of this call) */
 290         sync_buffers(dev, 0);
 291         /*
 292          * FIXME(eric) we need to sync the physical devices here.
 293          * This is because some (scsi) controllers have huge amounts of
 294          * cache onboard (hundreds of Mb), and we need to instruct
 295          * them to commit all of the dirty memory to disk, and we should
 296          * not return until this has happened.
 297          *
 298          * This would need to get implemented by going through the assorted
 299          * layers so that each block major number can be synced, and this
 300          * would call down into the upper and mid-layer scsi.
 301          */
 302 }
 303
 304 int fsync_dev(kdev_t dev)
 305 {
 306         sync_buffers(dev, 0);
 307
 308         lock_kernel();
 309         sync_supers(dev);
 310         sync_inodes(dev);
 311         DQUOT_SYNC(dev);
 312         unlock_kernel();
 313
 314         return sync_buffers(dev, 1);
 315 }
 316
 317 asmlinkage long sys_sync(void)
 318 {
 319         fsync_dev(0);
 320         return 0;
 321 }
 322
 323 /*
 324  *      filp may be NULL if called via the msync of a vma.
 325  */
 326
 327 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
 328 {
 329         struct inode * inode = dentry->d_inode;
 330         struct super_block * sb;
 331         kdev_t dev;
 332         int ret;
 333
 334         lock_kernel();
 335         /* sync the inode to buffers */
 336         write_inode_now(inode, 0);
 337
 338         /* sync the superblock to buffers */
 339         sb = inode->i_sb;
 340         wait_on_super(sb);
 341         if (sb->s_op && sb->s_op->write_super)
 342                 sb->s_op->write_super(sb);
 343
 344         /* .. finally sync the buffers to disk */
 345         dev = inode->i_dev;
 346         ret = sync_buffers(dev, 1);
 347         unlock_kernel();
 348         return ret;
 349 }
 350
 351 asmlinkage long sys_fsync(unsigned int fd)
 352 {
 353         struct file * file;
 354         struct dentry * dentry;
 355         struct inode * inode;
 356         int err;
 357
 358         err = -EBADF;
 359         file = fget(fd);
 360         if (!file)
 361                 goto out;
 362
 363         dentry = file->f_dentry;
 364         inode = dentry->d_inode;
 365
 366         err = -EINVAL;
 367         if (!file->f_op || !file->f_op->fsync)
 368                 goto out_putf;
 369
 370         /* We need to protect against concurrent writers.. */
 371         down(&inode->i_sem);
 372         err = file->f_op->fsync(file, dentry, 0);
 373         up(&inode->i_sem);
 374
 375 out_putf:
 376         fput(file);
 377 out:
 378         return err;
 379 }
 380
 381 asmlinkage long sys_fdatasync(unsigned int fd)
 382 {
 383         struct file * file;
 384         struct dentry * dentry;
 385         struct inode * inode;
 386         int err;
 387
 388         err = -EBADF;
 389         file = fget(fd);
 390         if (!file)
 391                 goto out;
 392
 393         dentry = file->f_dentry;
 394         inode = dentry->d_inode;
 395
 396         err = -EINVAL;
 397         if (!file->f_op || !file->f_op->fsync)
 398                 goto out_putf;
 399
 400         down(&inode->i_sem);
 401         err = file->f_op->fsync(file, dentry, 1);
 402         up(&inode->i_sem);
 403
 404 out_putf:
 405         fput(file);
 406 out:
 407         return err;
 408 }
 409
 410 /* After several hours of tedious analysis, the following hash
 411  * function won.  Do not mess with it... -DaveM
 412  */
 413 #define _hashfn(dev,block)      \
 414         ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
 415          (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ \
 416           ((block) << (bh_hash_shift - 12))))
 417 #define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)]
 418
 419 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
 420 {
 421         if ((bh->b_next = *head) != NULL)
 422                 bh->b_next->b_pprev = &bh->b_next;
 423         *head = bh;
 424         bh->b_pprev = head;
 425 }
 426
 427 static __inline__ void __hash_unlink(struct buffer_head *bh)
 428 {
 429         if (bh->b_pprev) {
 430                 if (bh->b_next)
 431                         bh->b_next->b_pprev = bh->b_pprev;
 432                 *(bh->b_pprev) = bh->b_next;
 433                 bh->b_pprev = NULL;
 434         }
 435 }
 436
 437 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
 438 {
 439         struct buffer_head **bhp = &lru_list[blist];
 440
 441         if(!*bhp) {
 442                 *bhp = bh;
 443                 bh->b_prev_free = bh;
 444         }
 445         bh->b_next_free = *bhp;
 446         bh->b_prev_free = (*bhp)->b_prev_free;
 447         (*bhp)->b_prev_free->b_next_free = bh;
 448         (*bhp)->b_prev_free = bh;
 449         nr_buffers_type[blist]++;
 450         size_buffers_type[blist] += bh->b_size;
 451 }
 452
 453 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
 454 {
 455         if (bh->b_prev_free || bh->b_next_free) {
 456                 bh->b_prev_free->b_next_free = bh->b_next_free;
 457                 bh->b_next_free->b_prev_free = bh->b_prev_free;
 458                 if (lru_list[blist] == bh)
 459                         lru_list[blist] = bh->b_next_free;
 460                 if (lru_list[blist] == bh)
 461                         lru_list[blist] = NULL;
 462                 bh->b_next_free = bh->b_prev_free = NULL;
 463                 nr_buffers_type[blist]--;
 464                 size_buffers_type[blist] -= bh->b_size;
 465         }
 466 }
 467
 468 static void __remove_from_free_list(struct buffer_head * bh, int index)
 469 {
 470         if(bh->b_next_free == bh)
 471                  free_list[index].list = NULL;
 472         else {
 473                 bh->b_prev_free->b_next_free = bh->b_next_free;
 474                 bh->b_next_free->b_prev_free = bh->b_prev_free;
 475                 if (free_list[index].list == bh)
 476                          free_list[index].list = bh->b_next_free;
 477         }
 478         bh->b_next_free = bh->b_prev_free = NULL;
 479 }
 480
 481 /* must be called with both the hash_table_lock and the lru_list_lock
 482    held */
 483 static void __remove_from_queues(struct buffer_head *bh)
 484 {
 485         __hash_unlink(bh);
 486         __remove_from_lru_list(bh, bh->b_list);
 487 }
 488
 489 static void __insert_into_queues(struct buffer_head *bh)
 490 {
 491         struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
 492
 493         __hash_link(bh, head);
 494         __insert_into_lru_list(bh, bh->b_list);
 495 }
 496
 497 /* This function must only run if there are no other
 498  * references _anywhere_ to this buffer head.
 499  */
 500 static void put_last_free(struct buffer_head * bh)
 501 {
 502         struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
 503         struct buffer_head **bhp = &head->list;
 504
 505         bh->b_state = 0;
 506
 507         spin_lock(&head->lock);
 508         bh->b_dev = B_FREE;
 509         if(!*bhp) {
 510                 *bhp = bh;
 511                 bh->b_prev_free = bh;
 512         }
 513         bh->b_next_free = *bhp;
 514         bh->b_prev_free = (*bhp)->b_prev_free;
 515         (*bhp)->b_prev_free->b_next_free = bh;
 516         (*bhp)->b_prev_free = bh;
 517         spin_unlock(&head->lock);
 518 }
 519
 520 /*
 521  * Why like this, I hear you say... The reason is race-conditions.
 522  * As we don't lock buffers (unless we are reading them, that is),
 523  * something might happen to it while we sleep (ie a read-error
 524  * will force it bad). This shouldn't really happen currently, but
 525  * the code is ready.
 526  */
 527 static inline struct buffer_head * __get_hash_table(kdev_t dev, int block, int size)
 528 {
 529         struct buffer_head *bh = hash(dev, block);
 530
 531         for (; bh; bh = bh->b_next)
 532                 if (bh->b_blocknr == block      &&
 533                     bh->b_size    == size       &&
 534                     bh->b_dev     == dev)
 535                         break;
 536         if (bh)
 537                 atomic_inc(&bh->b_count);
 538
 539         return bh;
 540 }
 541
 542 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
 543 {
 544         struct buffer_head *bh;
 545
 546         read_lock(&hash_table_lock);
 547         bh = __get_hash_table(dev, block, size);
 548         read_unlock(&hash_table_lock);
 549
 550         return bh;
 551 }
 552
 553 unsigned int get_hardblocksize(kdev_t dev)
 554 {
 555         /*
 556          * Get the hard sector size for the given device.  If we don't know
 557          * what it is, return 0.
 558          */
 559         if (hardsect_size[MAJOR(dev)] != NULL) {
 560                 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
 561                 if (blksize != 0)
 562                         return blksize;
 563         }
 564
 565         /*
 566          * We don't know what the hardware sector size for this device is.
 567          * Return 0 indicating that we don't know.
 568          */
 569         return 0;
 570 }
 571
 572 void buffer_insert_inode_queue(struct buffer_head *bh, struct inode *inode)
 573 {
 574         spin_lock(&lru_list_lock);
 575         if (bh->b_inode)
 576                 list_del(&bh->b_inode_buffers);
 577         bh->b_inode = inode;
 578         list_add(&bh->b_inode_buffers, &inode->i_dirty_buffers);
 579         spin_unlock(&lru_list_lock);
 580 }
 581
 582 /* The caller must have the lru_list lock before calling the
 583    remove_inode_queue functions.  */
 584 static void __remove_inode_queue(struct buffer_head *bh)
 585 {
 586         bh->b_inode = NULL;
 587         list_del(&bh->b_inode_buffers);
 588 }
 589
 590 static inline void remove_inode_queue(struct buffer_head *bh)
 591 {
 592         if (bh->b_inode)
 593                 __remove_inode_queue(bh);
 594 }
 595
 596 int inode_has_buffers(struct inode *inode)
 597 {
 598         int ret;
 599
 600         spin_lock(&lru_list_lock);
 601         ret = !list_empty(&inode->i_dirty_buffers);
 602         spin_unlock(&lru_list_lock);
 603
 604         return ret;
 605 }
 606
 607
 608 /* If invalidate_buffers() will trash dirty buffers, it means some kind
 609    of fs corruption is going on. Trashing dirty data always imply losing
 610    information that was supposed to be just stored on the physical layer
 611    by the user.
 612
 613    Thus invalidate_buffers in general usage is not allwowed to trash dirty
 614    buffers. For example ioctl(FLSBLKBUF) expects dirty data to be preserved.
 615
 616    NOTE: In the case where the user removed a removable-media-disk even if
 617    there's still dirty data not synced on disk (due a bug in the device driver
 618    or due an error of the user), by not destroying the dirty buffers we could
 619    generate corruption also on the next media inserted, thus a parameter is
 620    necessary to handle this case in the most safe way possible (trying
 621    to not corrupt also the new disk inserted with the data belonging to
 622    the old now corrupted disk). Also for the ramdisk the natural thing
 623    to do in order to release the ramdisk memory is to destroy dirty buffers.
 624
 625    These are two special cases. Normal usage imply the device driver
 626    to issue a sync on the device (without waiting I/O completation) and
 627    then an invalidate_buffers call that doesn't trash dirty buffers. */
 628 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
 629 {
 630         int i, nlist, slept;
 631         struct buffer_head * bh, * bh_next;
 632
 633  retry:
 634         slept = 0;
 635         spin_lock(&lru_list_lock);
 636         for(nlist = 0; nlist < NR_LIST; nlist++) {
 637                 bh = lru_list[nlist];
 638                 if (!bh)
 639                         continue;
 640                 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
 641                         bh_next = bh->b_next_free;
 642                         if (bh->b_dev != dev)
 643                                 continue;
 644                         if (buffer_locked(bh)) {
 645                                 atomic_inc(&bh->b_count);
 646                                 spin_unlock(&lru_list_lock);
 647                                 wait_on_buffer(bh);
 648                                 slept = 1;
 649                                 spin_lock(&lru_list_lock);
 650                                 atomic_dec(&bh->b_count);
 651                         }
 652
 653                         write_lock(&hash_table_lock);
 654                         if (!atomic_read(&bh->b_count) &&
 655                             (destroy_dirty_buffers || !buffer_dirty(bh))) {
 656                                 remove_inode_queue(bh);
 657                                 __remove_from_queues(bh);
 658                                 put_last_free(bh);
 659                         }
 660                         /* else complain loudly? */
 661
 662                         write_unlock(&hash_table_lock);
 663                         if (slept)
 664                                 goto out;
 665                 }
 666         }
 667 out:
 668         spin_unlock(&lru_list_lock);
 669         if (slept)
 670                 goto retry;
 671 }
 672
 673 void set_blocksize(kdev_t dev, int size)
 674 {
 675         extern int *blksize_size[];
 676         int i, nlist, slept;
 677         struct buffer_head * bh, * bh_next;
 678
 679         if (!blksize_size[MAJOR(dev)])
 680                 return;
 681
 682         /* Size must be a power of two, and between 512 and PAGE_SIZE */
 683         if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
 684                 panic("Invalid blocksize passed to set_blocksize");
 685
 686         if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
 687                 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
 688                 return;
 689         }
 690         if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
 691                 return;
 692         sync_buffers(dev, 2);
 693         blksize_size[MAJOR(dev)][MINOR(dev)] = size;
 694
 695  retry:
 696         slept = 0;
 697         spin_lock(&lru_list_lock);
 698         for(nlist = 0; nlist < NR_LIST; nlist++) {
 699                 bh = lru_list[nlist];
 700                 if (!bh)
 701                         continue;
 702                 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
 703                         bh_next = bh->b_next_free;
 704                         if (bh->b_dev != dev || bh->b_size == size)
 705                                 continue;
 706                         if (buffer_locked(bh)) {
 707                                 atomic_inc(&bh->b_count);
 708                                 spin_unlock(&lru_list_lock);
 709                                 wait_on_buffer(bh);
 710                                 slept = 1;
 711                                 spin_lock(&lru_list_lock);
 712                                 atomic_dec(&bh->b_count);
 713                         }
 714
 715                         write_lock(&hash_table_lock);
 716                         if (!atomic_read(&bh->b_count)) {
 717                                 if (buffer_dirty(bh))
 718                                         printk(KERN_WARNING
 719                                                "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
 720                                                kdevname(dev), bh->b_blocknr, bh->b_size);
 721                                 remove_inode_queue(bh);
 722                                 __remove_from_queues(bh);
 723                                 put_last_free(bh);
 724                         } else {
 725                                 if (atomic_set_buffer_clean(bh))
 726                                         __refile_buffer(bh);
 727                                 clear_bit(BH_Uptodate, &bh->b_state);
 728                                 printk(KERN_WARNING
 729                                        "set_blocksize: "
 730                                        "b_count %d, dev %s, block %lu, from %p\n",
 731                                        atomic_read(&bh->b_count), bdevname(bh->b_dev),
 732                                        bh->b_blocknr, __builtin_return_address(0));
 733                         }
 734                         write_unlock(&hash_table_lock);
 735                         if (slept)
 736                                 goto out;
 737                 }
 738         }
 739  out:
 740         spin_unlock(&lru_list_lock);
 741         if (slept)
 742                 goto retry;
 743 }
 744
 745 /*
 746  * We used to try various strange things. Let's not.
 747  */
 748 static void refill_freelist(int size)
 749 {
 750         if (!grow_buffers(size))
 751                 wakeup_bdflush(1);  /* Sets task->state to TASK_RUNNING */
 752 }
 753
 754 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
 755 {
 756         bh->b_list = BUF_CLEAN;
 757         bh->b_end_io = handler;
 758         bh->b_private = private;
 759 }
 760
 761 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 762 {
 763         mark_buffer_uptodate(bh, uptodate);
 764         unlock_buffer(bh);
 765 }
 766
 767 static void end_buffer_io_bad(struct buffer_head *bh, int uptodate)
 768 {
 769         mark_buffer_uptodate(bh, uptodate);
 770         unlock_buffer(bh);
 771         BUG();
 772 }
 773
 774 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
 775 {
 776         static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
 777         unsigned long flags;
 778         struct buffer_head *tmp;
 779         struct page *page;
 780
 781         mark_buffer_uptodate(bh, uptodate);
 782
 783         /* This is a temporary buffer used for page I/O. */
 784         page = bh->b_page;
 785
 786         if (!uptodate)
 787                 SetPageError(page);
 788
 789         /*
 790          * Be _very_ careful from here on. Bad things can happen if
 791          * two buffer heads end IO at almost the same time and both
 792          * decide that the page is now completely done.
 793          *
 794          * Async buffer_heads are here only as labels for IO, and get
 795          * thrown away once the IO for this page is complete.  IO is
 796          * deemed complete once all buffers have been visited
 797          * (b_count==0) and are now unlocked. We must make sure that
 798          * only the _last_ buffer that decrements its count is the one
 799          * that unlock the page..
 800          */
 801         spin_lock_irqsave(&page_uptodate_lock, flags);
 802         unlock_buffer(bh);
 803         atomic_dec(&bh->b_count);
 804         tmp = bh->b_this_page;
 805         while (tmp != bh) {
 806                 if (tmp->b_end_io == end_buffer_io_async && buffer_locked(tmp))
 807                         goto still_busy;
 808                 tmp = tmp->b_this_page;
 809         }
 810
 811         /* OK, the async IO on this page is complete. */
 812         spin_unlock_irqrestore(&page_uptodate_lock, flags);
 813
 814         /*
 815          * if none of the buffers had errors then we can set the
 816          * page uptodate:
 817          */
 818         if (!PageError(page))
 819                 SetPageUptodate(page);
 820
 821         /*
 822          * Run the hooks that have to be done when a page I/O has completed.
 823          */
 824         if (PageTestandClearDecrAfter(page))
 825                 atomic_dec(&nr_async_pages);
 826
 827         UnlockPage(page);
 828
 829         return;
 830
 831 still_busy:
 832         spin_unlock_irqrestore(&page_uptodate_lock, flags);
 833         return;
 834 }
 835
 836 /*
 837  * Synchronise all the inode's dirty buffers to the disk.
 838  *
 839  * We have conflicting pressures: we want to make sure that all
 840  * initially dirty buffers get waited on, but that any subsequently
 841  * dirtied buffers don't.  After all, we don't want fsync to last
 842  * forever if somebody is actively writing to the file.
 843  *
 844  * Do this in two main stages: first we copy dirty buffers to a
 845  * temporary inode list, queueing the writes as we go.  Then we clean
 846  * up, waiting for those writes to complete.
 847  *
 848  * During this second stage, any subsequent updates to the file may end
 849  * up refiling the buffer on the original inode's dirty list again, so
 850  * there is a chance we will end up with a buffer queued for write but
 851  * not yet completed on that list.  So, as a final cleanup we go through
 852  * the osync code to catch these locked, dirty buffers without requeuing
 853  * any newly dirty buffers for write.
 854  */
 855
 856 int fsync_inode_buffers(struct inode *inode)
 857 {
 858         struct buffer_head *bh;
 859         struct inode tmp;
 860         int err = 0, err2;
 861
 862         INIT_LIST_HEAD(&tmp.i_dirty_buffers);
 863
 864         spin_lock(&lru_list_lock);
 865
 866         while (!list_empty(&inode->i_dirty_buffers)) {
 867                 bh = BH_ENTRY(inode->i_dirty_buffers.next);
 868                 list_del(&bh->b_inode_buffers);
 869                 if (!buffer_dirty(bh) && !buffer_locked(bh))
 870                         bh->b_inode = NULL;
 871                 else {
 872                         bh->b_inode = &tmp;
 873                         list_add(&bh->b_inode_buffers, &tmp.i_dirty_buffers);
 874                         if (buffer_dirty(bh)) {
 875                                 atomic_inc(&bh->b_count);
 876                                 spin_unlock(&lru_list_lock);
 877                                 ll_rw_block(WRITE, 1, &bh);
 878                                 brelse(bh);
 879                                 spin_lock(&lru_list_lock);
 880                         }
 881                 }
 882         }
 883
 884         while (!list_empty(&tmp.i_dirty_buffers)) {
 885                 bh = BH_ENTRY(tmp.i_dirty_buffers.prev);
 886                 remove_inode_queue(bh);
 887                 atomic_inc(&bh->b_count);
 888                 spin_unlock(&lru_list_lock);
 889                 wait_on_buffer(bh);
 890                 if (!buffer_uptodate(bh))
 891                         err = -EIO;
 892                 brelse(bh);
 893                 spin_lock(&lru_list_lock);
 894         }
 895
 896         spin_unlock(&lru_list_lock);
 897         err2 = osync_inode_buffers(inode);
 898
 899         if (err)
 900                 return err;
 901         else
 902                 return err2;
 903 }
 904
 905
 906 /*
 907  * osync is designed to support O_SYNC io.  It waits synchronously for
 908  * all already-submitted IO to complete, but does not queue any new
 909  * writes to the disk.
 910  *
 911  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
 912  * you dirty the buffers, and then use osync_inode_buffers to wait for
 913  * completion.  Any other dirty buffers which are not yet queued for
 914  * write will not be flushed to disk by the osync.
 915  */
 916
 917 int osync_inode_buffers(struct inode *inode)
 918 {
 919         struct buffer_head *bh;
 920         struct list_head *list;
 921         int err = 0;
 922
 923         spin_lock(&lru_list_lock);
 924
 925  repeat:
 926
 927         for (list = inode->i_dirty_buffers.prev;
 928              bh = BH_ENTRY(list), list != &inode->i_dirty_buffers;
 929              list = bh->b_inode_buffers.prev) {
 930                 if (buffer_locked(bh)) {
 931                         atomic_inc(&bh->b_count);
 932                         spin_unlock(&lru_list_lock);
 933                         wait_on_buffer(bh);
 934                         if (!buffer_uptodate(bh))
 935                                 err = -EIO;
 936                         brelse(bh);
 937                         spin_lock(&lru_list_lock);
 938                         goto repeat;
 939                 }
 940         }
 941
 942         spin_unlock(&lru_list_lock);
 943         return err;
 944 }
 945
 946
 947 /*
 948  * Invalidate any and all dirty buffers on a given inode.  We are
 949  * probably unmounting the fs, but that doesn't mean we have already
 950  * done a sync().  Just drop the buffers from the inode list.
 951  */
 952 void invalidate_inode_buffers(struct inode *inode)
 953 {
 954         struct list_head *list, *next;
 955
 956         spin_lock(&lru_list_lock);
 957         list = inode->i_dirty_buffers.next;
 958         while (list != &inode->i_dirty_buffers) {
 959                 next = list->next;
 960                 remove_inode_queue(BH_ENTRY(list));
 961                 list = next;
 962         }
 963         spin_unlock(&lru_list_lock);
 964 }
 965
 966
 967 /*
 968  * Ok, this is getblk, and it isn't very clear, again to hinder
 969  * race-conditions. Most of the code is seldom used, (ie repeating),
 970  * so it should be much more efficient than it looks.
 971  *
 972  * The algorithm is changed: hopefully better, and an elusive bug removed.
 973  *
 974  * 14.02.92: changed it to sync dirty buffers a bit: better performance
 975  * when the filesystem starts to get full of dirty blocks (I hope).
 976  */
 977 struct buffer_head * getblk(kdev_t dev, int block, int size)
 978 {
 979         struct buffer_head * bh;
 980         int isize;
 981
 982 repeat:
 983         spin_lock(&lru_list_lock);
 984         write_lock(&hash_table_lock);
 985         bh = __get_hash_table(dev, block, size);
 986         if (bh)
 987                 goto out;
 988
 989         isize = BUFSIZE_INDEX(size);
 990         spin_lock(&free_list[isize].lock);
 991         bh = free_list[isize].list;
 992         if (bh) {
 993                 __remove_from_free_list(bh, isize);
 994                 atomic_set(&bh->b_count, 1);
 995         }
 996         spin_unlock(&free_list[isize].lock);
 997
 998         /*
 999          * OK, FINALLY we know that this buffer is the only one of
1000          * its kind, we hold a reference (b_count>0), it is unlocked,
1001          * and it is clean.
1002          */
1003         if (bh) {
1004                 init_buffer(bh, end_buffer_io_sync, NULL);
1005                 bh->b_dev = dev;
1006                 bh->b_blocknr = block;
1007                 bh->b_state = 1 << BH_Mapped;
1008
1009                 /* Insert the buffer into the regular lists */
1010                 __insert_into_queues(bh);
1011         out:
1012                 write_unlock(&hash_table_lock);
1013                 spin_unlock(&lru_list_lock);
1014                 touch_buffer(bh);
1015                 return bh;
1016         }
1017
1018         /*
1019          * If we block while refilling the free list, somebody may
1020          * create the buffer first ... search the hashes again.
1021          */
1022         write_unlock(&hash_table_lock);
1023         spin_unlock(&lru_list_lock);
1024         refill_freelist(size);
1025         goto repeat;
1026 }
1027
1028 /* -1 -> no need to flush
1029     0 -> async flush
1030     1 -> sync flush (wait for I/O completation) */
1031 int balance_dirty_state(kdev_t dev)
1032 {
1033         unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
1034         int shortage;
1035
1036         dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
1037         tot = nr_free_buffer_pages();
1038
1039         dirty *= 200;
1040         soft_dirty_limit = tot * bdf_prm.b_un.nfract;
1041         hard_dirty_limit = soft_dirty_limit * 2;
1042
1043         /* First, check for the "real" dirty limit. */
1044         if (dirty > soft_dirty_limit) {
1045                 if (dirty > hard_dirty_limit)
1046                         return 1;
1047                 return 0;
1048         }
1049
1050         /*
1051          * If we are about to get low on free pages and
1052          * cleaning the inactive_dirty pages would help
1053          * fix this, wake up bdflush.
1054          */
1055         shortage = free_shortage();
1056         if (shortage && nr_inactive_dirty_pages > shortage &&
1057                         nr_inactive_dirty_pages > freepages.high)
1058                 return 0;
1059
1060         return -1;
1061 }
1062
1063 /*
1064  * if a new dirty buffer is created we need to balance bdflush.
1065  *
1066  * in the future we might want to make bdflush aware of different
1067  * pressures on different devices - thus the (currently unused)
1068  * 'dev' parameter.
1069  */
1070 void balance_dirty(kdev_t dev)
1071 {
1072         int state = balance_dirty_state(dev);
1073
1074         if (state < 0)
1075                 return;
1076         wakeup_bdflush(state);
1077 }
1078
1079 static __inline__ void __mark_dirty(struct buffer_head *bh)
1080 {
1081         bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
1082         refile_buffer(bh);
1083 }
1084
1085 /* atomic version, the user must call balance_dirty() by hand
1086    as soon as it become possible to block */
1087 void __mark_buffer_dirty(struct buffer_head *bh)
1088 {
1089         if (!atomic_set_buffer_dirty(bh))
1090                 __mark_dirty(bh);
1091 }
1092
1093 void mark_buffer_dirty(struct buffer_head *bh)
1094 {
1095         __mark_buffer_dirty(bh);
1096         balance_dirty(bh->b_dev);
1097 }
1098
1099 /*
1100  * A buffer may need to be moved from one buffer list to another
1101  * (e.g. in case it is not shared any more). Handle this.
1102  */
1103 static void __refile_buffer(struct buffer_head *bh)
1104 {
1105         int dispose = BUF_CLEAN;
1106         if (buffer_locked(bh))
1107                 dispose = BUF_LOCKED;
1108         if (buffer_dirty(bh))
1109                 dispose = BUF_DIRTY;
1110         if (buffer_protected(bh))
1111                 dispose = BUF_PROTECTED;
1112         if (dispose != bh->b_list) {
1113                 __remove_from_lru_list(bh, bh->b_list);
1114                 bh->b_list = dispose;
1115                 if (dispose == BUF_CLEAN)
1116                         remove_inode_queue(bh);
1117                 __insert_into_lru_list(bh, dispose);
1118         }
1119 }
1120
1121 void refile_buffer(struct buffer_head *bh)
1122 {
1123         spin_lock(&lru_list_lock);
1124         __refile_buffer(bh);
1125         spin_unlock(&lru_list_lock);
1126 }
1127
1128 /*
1129  * Release a buffer head
1130  */
1131 void __brelse(struct buffer_head * buf)
1132 {
1133         if (atomic_read(&buf->b_count)) {
1134                 atomic_dec(&buf->b_count);
1135                 return;
1136         }
1137         printk("VFS: brelse: Trying to free free buffer\n");
1138 }
1139
1140 /*
1141  * bforget() is like brelse(), except it puts the buffer on the
1142  * free list if it can.. We can NOT free the buffer if:
1143  *  - there are other users of it
1144  *  - it is locked and thus can have active IO
1145  */
1146 void __bforget(struct buffer_head * buf)
1147 {
1148         /* grab the lru lock here to block bdflush. */
1149         spin_lock(&lru_list_lock);
1150         write_lock(&hash_table_lock);
1151         if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf))
1152                 goto in_use;
1153         __hash_unlink(buf);
1154         remove_inode_queue(buf);
1155         write_unlock(&hash_table_lock);
1156         __remove_from_lru_list(buf, buf->b_list);
1157         spin_unlock(&lru_list_lock);
1158         put_last_free(buf);
1159         return;
1160
1161  in_use:
1162         write_unlock(&hash_table_lock);
1163         spin_unlock(&lru_list_lock);
1164 }
1165
1166 /*
1167  * bread() reads a specified block and returns the buffer that contains
1168  * it. It returns NULL if the block was unreadable.
1169  */
1170 struct buffer_head * bread(kdev_t dev, int block, int size)
1171 {
1172         struct buffer_head * bh;
1173
1174         bh = getblk(dev, block, size);
1175         if (buffer_uptodate(bh))
1176                 return bh;
1177         ll_rw_block(READ, 1, &bh);
1178         wait_on_buffer(bh);
1179         if (buffer_uptodate(bh))
1180                 return bh;
1181         brelse(bh);
1182         return NULL;
1183 }
1184
1185 /*
1186  * Ok, breada can be used as bread, but additionally to mark other
1187  * blocks for reading as well. End the argument list with a negative
1188  * number.
1189  */
1190
1191 #define NBUF 16
1192
1193 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
1194         unsigned int pos, unsigned int filesize)
1195 {
1196         struct buffer_head * bhlist[NBUF];
1197         unsigned int blocks;
1198         struct buffer_head * bh;
1199         int index;
1200         int i, j;
1201
1202         if (pos >= filesize)
1203                 return NULL;
1204
1205         if (block < 0)
1206                 return NULL;
1207
1208         bh = getblk(dev, block, bufsize);
1209         index = BUFSIZE_INDEX(bh->b_size);
1210
1211         if (buffer_uptodate(bh))
1212                 return(bh);
1213         else ll_rw_block(READ, 1, &bh);
1214
1215         blocks = (filesize - pos) >> (9+index);
1216
1217         if (blocks > NBUF)
1218                 blocks = NBUF;
1219
1220         bhlist[0] = bh;
1221         j = 1;
1222         for(i=1; i<blocks; i++) {
1223                 bh = getblk(dev,block+i,bufsize);
1224                 if (buffer_uptodate(bh)) {
1225                         brelse(bh);
1226                         break;
1227                 }
1228                 else bhlist[j++] = bh;
1229         }
1230
1231         /* Request the read for these buffers, and then release them. */
1232         if (j>1)
1233                 ll_rw_block(READA, (j-1), bhlist+1);
1234         for(i=1; i<j; i++)
1235                 brelse(bhlist[i]);
1236
1237         /* Wait for this buffer, and then continue on. */
1238         bh = bhlist[0];
1239         wait_on_buffer(bh);
1240         if (buffer_uptodate(bh))
1241                 return bh;
1242         brelse(bh);
1243         return NULL;
1244 }
1245
1246 /*
1247  * Note: the caller should wake up the buffer_wait list if needed.
1248  */
1249 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
1250 {
1251         if (bh->b_inode)
1252                 BUG();
1253         if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1254                 kmem_cache_free(bh_cachep, bh);
1255         } else {
1256                 bh->b_blocknr = -1;
1257                 init_waitqueue_head(&bh->b_wait);
1258                 nr_unused_buffer_heads++;
1259                 bh->b_next_free = unused_list;
1260                 bh->b_this_page = NULL;
1261                 unused_list = bh;
1262         }
1263 }
1264
1265 /*
1266  * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1267  * no-buffer-head deadlock.  Return NULL on failure; waiting for
1268  * buffer heads is now handled in create_buffers().
1269  */
1270 static struct buffer_head * get_unused_buffer_head(int async)
1271 {
1272         struct buffer_head * bh;
1273
1274         spin_lock(&unused_list_lock);
1275         if (nr_unused_buffer_heads > NR_RESERVED) {
1276                 bh = unused_list;
1277                 unused_list = bh->b_next_free;
1278                 nr_unused_buffer_heads--;
1279                 spin_unlock(&unused_list_lock);
1280                 return bh;
1281         }
1282         spin_unlock(&unused_list_lock);
1283
1284         /* This is critical.  We can't swap out pages to get
1285          * more buffer heads, because the swap-out may need
1286          * more buffer-heads itself.  Thus SLAB_BUFFER.
1287          */
1288         if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
1289                 memset(bh, 0, sizeof(*bh));
1290                 init_waitqueue_head(&bh->b_wait);
1291                 return bh;
1292         }
1293
1294         /*
1295          * If we need an async buffer, use the reserved buffer heads.
1296          */
1297         if (async) {
1298                 spin_lock(&unused_list_lock);
1299                 if (unused_list) {
1300                         bh = unused_list;
1301                         unused_list = bh->b_next_free;
1302                         nr_unused_buffer_heads--;
1303                         spin_unlock(&unused_list_lock);
1304                         return bh;
1305                 }
1306                 spin_unlock(&unused_list_lock);
1307         }
1308 #if 0
1309         /*
1310          * (Pending further analysis ...)
1311          * Ordinary (non-async) requests can use a different memory priority
1312          * to free up pages. Any swapping thus generated will use async
1313          * buffer heads.
1314          */
1315         if(!async &&
1316            (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1317                 memset(bh, 0, sizeof(*bh));
1318                 init_waitqueue_head(&bh->b_wait);
1319                 return bh;
1320         }
1321 #endif
1322
1323         return NULL;
1324 }
1325
1326 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1327 {
1328         bh->b_page = page;
1329         if (offset >= PAGE_SIZE)
1330                 BUG();
1331         if (PageHighMem(page))
1332                 /*
1333                  * This catches illegal uses and preserves the offset:
1334                  */
1335                 bh->b_data = (char *)(0 + offset);
1336         else
1337                 bh->b_data = page_address(page) + offset;
1338 }
1339
1340 /*
1341  * Create the appropriate buffers when given a page for data area and
1342  * the size of each buffer.. Use the bh->b_this_page linked list to
1343  * follow the buffers created.  Return NULL if unable to create more
1344  * buffers.
1345  * The async flag is used to differentiate async IO (paging, swapping)
1346  * from ordinary buffer allocations, and only async requests are allowed
1347  * to sleep waiting for buffer heads.
1348  */
1349 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1350 {
1351         struct buffer_head *bh, *head;
1352         long offset;
1353
1354 try_again:
1355         head = NULL;
1356         offset = PAGE_SIZE;
1357         while ((offset -= size) >= 0) {
1358                 bh = get_unused_buffer_head(async);
1359                 if (!bh)
1360                         goto no_grow;
1361
1362                 bh->b_dev = B_FREE;  /* Flag as unused */
1363                 bh->b_this_page = head;
1364                 head = bh;
1365
1366                 bh->b_state = 0;
1367                 bh->b_next_free = NULL;
1368                 bh->b_pprev = NULL;
1369                 atomic_set(&bh->b_count, 0);
1370                 bh->b_size = size;
1371
1372                 set_bh_page(bh, page, offset);
1373
1374                 bh->b_list = BUF_CLEAN;
1375                 bh->b_end_io = end_buffer_io_bad;
1376         }
1377         return head;
1378 /*
1379  * In case anything failed, we just free everything we got.
1380  */
1381 no_grow:
1382         if (head) {
1383                 spin_lock(&unused_list_lock);
1384                 do {
1385                         bh = head;
1386                         head = head->b_this_page;
1387                         __put_unused_buffer_head(bh);
1388                 } while (head);
1389                 spin_unlock(&unused_list_lock);
1390
1391                 /* Wake up any waiters ... */
1392                 wake_up(&buffer_wait);
1393         }
1394
1395         /*
1396          * Return failure for non-async IO requests.  Async IO requests
1397          * are not allowed to fail, so we have to wait until buffer heads
1398          * become available.  But we don't want tasks sleeping with
1399          * partially complete buffers, so all were released above.
1400          */
1401         if (!async)
1402                 return NULL;
1403
1404         /* We're _really_ low on memory. Now we just
1405          * wait for old buffer heads to become free due to
1406          * finishing IO.  Since this is an async request and
1407          * the reserve list is empty, we're sure there are
1408          * async buffer heads in use.
1409          */
1410         run_task_queue(&tq_disk);
1411
1412         /*
1413          * Set our state for sleeping, then check again for buffer heads.
1414          * This ensures we won't miss a wake_up from an interrupt.
1415          */
1416         wait_event(buffer_wait, nr_unused_buffer_heads >= MAX_BUF_PER_PAGE);
1417         goto try_again;
1418 }
1419
1420 static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size)
1421 {
1422         struct buffer_head *head, *bh, *tail;
1423         int block;
1424
1425         if (!PageLocked(page))
1426                 BUG();
1427         /*
1428          * Allocate async buffer heads pointing to this page, just for I/O.
1429          * They don't show up in the buffer hash table, but they *are*
1430          * registered in page->buffers.
1431          */
1432         head = create_buffers(page, size, 1);
1433         if (page->buffers)
1434                 BUG();
1435         if (!head)
1436                 BUG();
1437         tail = head;
1438         for (bh = head; bh; bh = bh->b_this_page) {
1439                 block = *(b++);
1440
1441                 tail = bh;
1442                 init_buffer(bh, end_buffer_io_async, NULL);
1443                 bh->b_dev = dev;
1444                 bh->b_blocknr = block;
1445
1446                 set_bit(BH_Mapped, &bh->b_state);
1447         }
1448         tail->b_this_page = head;
1449         page_cache_get(page);
1450         page->buffers = head;
1451         return 0;
1452 }
1453
1454 static void unmap_buffer(struct buffer_head * bh)
1455 {
1456         if (buffer_mapped(bh)) {
1457                 mark_buffer_clean(bh);
1458                 wait_on_buffer(bh);
1459                 clear_bit(BH_Uptodate, &bh->b_state);
1460                 clear_bit(BH_Mapped, &bh->b_state);
1461                 clear_bit(BH_Req, &bh->b_state);
1462                 clear_bit(BH_New, &bh->b_state);
1463         }
1464 }
1465
1466 /*
1467  * We don't have to release all buffers here, but
1468  * we have to be sure that no dirty buffer is left
1469  * and no IO is going on (no buffer is locked), because
1470  * we have truncated the file and are going to free the
1471  * blocks on-disk..
1472  */
1473 int block_flushpage(struct page *page, unsigned long offset)
1474 {
1475         struct buffer_head *head, *bh, *next;
1476         unsigned int curr_off = 0;
1477
1478         if (!PageLocked(page))
1479                 BUG();
1480         if (!page->buffers)
1481                 return 1;
1482
1483         head = page->buffers;
1484         bh = head;
1485         do {
1486                 unsigned int next_off = curr_off + bh->b_size;
1487                 next = bh->b_this_page;
1488
1489                 /*
1490                  * is this block fully flushed?
1491                  */
1492                 if (offset <= curr_off)
1493                         unmap_buffer(bh);
1494                 curr_off = next_off;
1495                 bh = next;
1496         } while (bh != head);
1497
1498         /*
1499          * subtle. We release buffer-heads only if this is
1500          * the 'final' flushpage. We have invalidated the get_block
1501          * cached value unconditionally, so real IO is not
1502          * possible anymore.
1503          *
1504          * If the free doesn't work out, the buffers can be
1505          * left around - they just turn into anonymous buffers
1506          * instead.
1507          */
1508         if (!offset) {
1509                 if (!try_to_free_buffers(page, 0)) {
1510                         atomic_inc(&buffermem_pages);
1511                         return 0;
1512                 }
1513         }
1514
1515         return 1;
1516 }
1517
1518 static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
1519 {
1520         struct buffer_head *bh, *head, *tail;
1521
1522         head = create_buffers(page, blocksize, 1);
1523         if (page->buffers)
1524                 BUG();
1525
1526         bh = head;
1527         do {
1528                 bh->b_dev = inode->i_dev;
1529                 bh->b_blocknr = 0;
1530                 bh->b_end_io = end_buffer_io_bad;
1531                 tail = bh;
1532                 bh = bh->b_this_page;
1533         } while (bh);
1534         tail->b_this_page = head;
1535         page->buffers = head;
1536         page_cache_get(page);
1537 }
1538
1539 /*
1540  * We are taking a block for data and we don't want any output from any
1541  * buffer-cache aliases starting from return from that function and
1542  * until the moment when something will explicitly mark the buffer
1543  * dirty (hopefully that will not happen until we will free that block ;-)
1544  * We don't even need to mark it not-uptodate - nobody can expect
1545  * anything from a newly allocated buffer anyway. We used to used
1546  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1547  * don't want to mark the alias unmapped, for example - it would confuse
1548  * anyone who might pick it with bread() afterwards...
1549  */
1550
1551 static void unmap_underlying_metadata(struct buffer_head * bh)
1552 {
1553         struct buffer_head *old_bh;
1554
1555         old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1556         if (old_bh) {
1557                 mark_buffer_clean(old_bh);
1558                 wait_on_buffer(old_bh);
1559                 clear_bit(BH_Req, &old_bh->b_state);
1560                 /* Here we could run brelse or bforget. We use
1561                    bforget because it will try to put the buffer
1562                    in the freelist. */
1563                 __bforget(old_bh);
1564         }
1565 }
1566
1567 /*
1568  * NOTE! All mapped/uptodate combinations are valid:
1569  *
1570  *      Mapped  Uptodate        Meaning
1571  *
1572  *      No      No              "unknown" - must do get_block()
1573  *      No      Yes             "hole" - zero-filled
1574  *      Yes     No              "allocated" - allocated on disk, not read in
1575  *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1576  *
1577  * "Dirty" is valid only with the last case (mapped+uptodate).
1578  */
1579
1580 /*
1581  * block_write_full_page() is SMP-safe - currently it's still
1582  * being called with the kernel lock held, but the code is ready.
1583  */
1584 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1585 {
1586         int err, i, need_balance_dirty = 0;
1587         unsigned long block;
1588         struct buffer_head *bh, *head;
1589
1590         if (!PageLocked(page))
1591                 BUG();
1592
1593         if (!page->buffers)
1594                 create_empty_buffers(page, inode, inode->i_sb->s_blocksize);
1595         head = page->buffers;
1596
1597         block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1598
1599         bh = head;
1600         i = 0;
1601         do {
1602                 /*
1603                  * If the buffer isn't up-to-date, we can't be sure
1604                  * that the buffer has been initialized with the proper
1605                  * block number information etc..
1606                  *
1607                  * Leave it to the low-level FS to make all those
1608                  * decisions (block #0 may actually be a valid block)
1609                  */
1610                 bh->b_end_io = end_buffer_io_sync;
1611                 if (!buffer_mapped(bh)) {
1612                         err = get_block(inode, block, bh, 1);
1613                         if (err)
1614                                 goto out;
1615                         if (buffer_new(bh))
1616                                 unmap_underlying_metadata(bh);
1617                 }
1618                 set_bit(BH_Uptodate, &bh->b_state);
1619                 if (!atomic_set_buffer_dirty(bh)) {
1620                         buffer_insert_inode_queue(bh, inode);
1621                         __mark_dirty(bh);
1622                         need_balance_dirty = 1;
1623                 }
1624
1625                 bh = bh->b_this_page;
1626                 block++;
1627         } while (bh != head);
1628
1629         if (need_balance_dirty)
1630                 balance_dirty(bh->b_dev);
1631
1632         SetPageUptodate(page);
1633         return 0;
1634 out:
1635         ClearPageUptodate(page);
1636         return err;
1637 }
1638
1639 static int __block_prepare_write(struct inode *inode, struct page *page,
1640                 unsigned from, unsigned to, get_block_t *get_block)
1641 {
1642         unsigned block_start, block_end;
1643         unsigned long block;
1644         int err = 0;
1645         unsigned blocksize, bbits;
1646         struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1647         char *kaddr = kmap(page);
1648
1649         blocksize = inode->i_sb->s_blocksize;
1650         if (!page->buffers)
1651                 create_empty_buffers(page, inode, blocksize);
1652         head = page->buffers;
1653
1654         bbits = inode->i_sb->s_blocksize_bits;
1655         block = page->index << (PAGE_CACHE_SHIFT - bbits);
1656
1657         for(bh = head, block_start = 0; bh != head || !block_start;
1658             block++, block_start=block_end, bh = bh->b_this_page) {
1659                 if (!bh)
1660                         BUG();
1661                 block_end = block_start+blocksize;
1662                 if (block_end <= from)
1663                         continue;
1664                 if (block_start >= to)
1665                         break;
1666                 bh->b_end_io = end_buffer_io_sync;
1667                 if (!buffer_mapped(bh)) {
1668                         err = get_block(inode, block, bh, 1);
1669                         if (err)
1670                                 goto out;
1671                         if (buffer_new(bh)) {
1672                                 unmap_underlying_metadata(bh);
1673                                 if (Page_Uptodate(page)) {
1674                                         set_bit(BH_Uptodate, &bh->b_state);
1675                                         continue;
1676                                 }
1677                                 if (block_end > to)
1678                                         memset(kaddr+to, 0, block_end-to);
1679                                 if (block_start < from)
1680                                         memset(kaddr+block_start, 0, from-block_start);
1681                                 if (block_end > to || block_start < from)
1682                                         flush_dcache_page(page);
1683                                 continue;
1684                         }
1685                 }
1686                 if (Page_Uptodate(page)) {
1687                         set_bit(BH_Uptodate, &bh->b_state);
1688                         continue;
1689                 }
1690                 if (!buffer_uptodate(bh) &&
1691                      (block_start < from || block_end > to)) {
1692                         ll_rw_block(READ, 1, &bh);
1693                         *wait_bh++=bh;
1694                 }
1695         }
1696         /*
1697          * If we issued read requests - let them complete.
1698          */
1699         while(wait_bh > wait) {
1700                 wait_on_buffer(*--wait_bh);
1701                 err = -EIO;
1702                 if (!buffer_uptodate(*wait_bh))
1703                         goto out;
1704         }
1705         return 0;
1706 out:
1707         return err;
1708 }
1709
1710 static int __block_commit_write(struct inode *inode, struct page *page,
1711                 unsigned from, unsigned to)
1712 {
1713         unsigned block_start, block_end;
1714         int partial = 0, need_balance_dirty = 0;
1715         unsigned blocksize;
1716         struct buffer_head *bh, *head;
1717
1718         blocksize = inode->i_sb->s_blocksize;
1719
1720         for(bh = head = page->buffers, block_start = 0;
1721             bh != head || !block_start;
1722             block_start=block_end, bh = bh->b_this_page) {
1723                 block_end = block_start + blocksize;
1724                 if (block_end <= from || block_start >= to) {
1725                         if (!buffer_uptodate(bh))
1726                                 partial = 1;
1727                 } else {
1728                         set_bit(BH_Uptodate, &bh->b_state);
1729                         if (!atomic_set_buffer_dirty(bh)) {
1730                                 __mark_dirty(bh);
1731                                 buffer_insert_inode_queue(bh, inode);
1732                                 need_balance_dirty = 1;
1733                         }
1734                 }
1735         }
1736
1737         if (need_balance_dirty)
1738                 balance_dirty(bh->b_dev);
1739         /*
1740          * is this a partial write that happened to make all buffers
1741          * uptodate then we can optimize away a bogus readpage() for
1742          * the next read(). Here we 'discover' wether the page went
1743          * uptodate as a result of this (potentially partial) write.
1744          */
1745         if (!partial)
1746                 SetPageUptodate(page);
1747         return 0;
1748 }
1749
1750 /*
1751  * Generic "read page" function for block devices that have the normal
1752  * get_block functionality. This is most of the block device filesystems.
1753  * Reads the page asynchronously --- the unlock_buffer() and
1754  * mark_buffer_uptodate() functions propagate buffer state into the
1755  * page struct once IO has completed.
1756  */
1757 int block_read_full_page(struct page *page, get_block_t *get_block)
1758 {
1759         struct inode *inode = (struct inode*)page->mapping->host;
1760         unsigned long iblock, lblock;
1761         struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1762         unsigned int blocksize, blocks;
1763         char *kaddr = NULL;
1764         int nr, i;
1765
1766         if (!PageLocked(page))
1767                 PAGE_BUG(page);
1768         blocksize = inode->i_sb->s_blocksize;
1769         if (!page->buffers)
1770                 create_empty_buffers(page, inode, blocksize);
1771         head = page->buffers;
1772
1773         blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits;
1774         iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1775         lblock = (inode->i_size+blocksize-1) >> inode->i_sb->s_blocksize_bits;
1776         bh = head;
1777         nr = 0;
1778         i = 0;
1779
1780         do {
1781                 if (buffer_uptodate(bh))
1782                         continue;
1783
1784                 if (!buffer_mapped(bh)) {
1785                         if (iblock < lblock) {
1786                                 if (get_block(inode, iblock, bh, 0))
1787                                         continue;
1788                         }
1789                         if (!buffer_mapped(bh)) {
1790                                 if (!kaddr)
1791                                         kaddr = kmap(page);
1792                                 memset(kaddr + i*blocksize, 0, blocksize);
1793                                 flush_dcache_page(page);
1794                                 set_bit(BH_Uptodate, &bh->b_state);
1795                                 continue;
1796                         }
1797                 }
1798
1799                 init_buffer(bh, end_buffer_io_async, NULL);
1800                 atomic_inc(&bh->b_count);
1801                 arr[nr] = bh;
1802                 nr++;
1803         } while (i++, iblock++, (bh = bh->b_this_page) != head);
1804
1805         if (nr) {
1806                 if (Page_Uptodate(page))
1807                         BUG();
1808                 ll_rw_block(READ, nr, arr);
1809         } else {
1810                 /*
1811                  * all buffers are uptodate - we can set the page
1812                  * uptodate as well.
1813                  */
1814                 SetPageUptodate(page);
1815                 UnlockPage(page);
1816         }
1817         if (kaddr)
1818                 kunmap(page);
1819         return 0;
1820 }
1821
1822 /*
1823  * For moronic filesystems that do not allow holes in file.
1824  * We may have to extend the file.
1825  */
1826
1827 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1828 {
1829         struct address_space *mapping = page->mapping;
1830         struct inode *inode = (struct inode*)mapping->host;
1831         struct page *new_page;
1832         unsigned long pgpos;
1833         long status;
1834         unsigned zerofrom;
1835         unsigned blocksize = inode->i_sb->s_blocksize;
1836         char *kaddr;
1837
1838         while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1839                 status = -ENOMEM;
1840                 new_page = grab_cache_page(mapping, pgpos);
1841                 if (!new_page)
1842                         goto out;
1843                 /* we might sleep */
1844                 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1845                         UnlockPage(new_page);
1846                         page_cache_release(new_page);
1847                         continue;
1848                 }
1849                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1850                 if (zerofrom & (blocksize-1)) {
1851                         *bytes |= (blocksize-1);
1852                         (*bytes)++;
1853                 }
1854                 status = __block_prepare_write(inode, new_page, zerofrom,
1855                                                 PAGE_CACHE_SIZE, get_block);
1856                 if (status)
1857                         goto out_unmap;
1858                 kaddr = page_address(new_page);
1859                 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1860                 flush_dcache_page(new_page);
1861                 __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
1862                 kunmap(new_page);
1863                 UnlockPage(new_page);
1864                 page_cache_release(new_page);
1865         }
1866
1867         if (page->index < pgpos) {
1868                 /* completely inside the area */
1869                 zerofrom = offset;
1870         } else {
1871                 /* page covers the boundary, find the boundary offset */
1872                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1873
1874                 /* if we will expand the thing last block will be filled */
1875                 if (to > zerofrom && (zerofrom & (blocksize-1))) {
1876                         *bytes |= (blocksize-1);
1877                         (*bytes)++;
1878                 }
1879
1880                 /* starting below the boundary? Nothing to zero out */
1881                 if (offset <= zerofrom)
1882                         zerofrom = offset;
1883         }
1884         status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1885         if (status)
1886                 goto out1;
1887         kaddr = page_address(page);
1888         if (zerofrom < offset) {
1889                 memset(kaddr+zerofrom, 0, offset-zerofrom);
1890                 flush_dcache_page(page);
1891                 __block_commit_write(inode, page, zerofrom, offset);
1892         }
1893         return 0;
1894 out1:
1895         ClearPageUptodate(page);
1896         kunmap(page);
1897         return status;
1898
1899 out_unmap:
1900         ClearPageUptodate(new_page);
1901         kunmap(new_page);
1902         UnlockPage(new_page);
1903         page_cache_release(new_page);
1904 out:
1905         return status;
1906 }
1907
1908 int block_prepare_write(struct page *page, unsigned from, unsigned to,
1909                         get_block_t *get_block)
1910 {
1911         struct inode *inode = (struct inode*)page->mapping->host;
1912         int err = __block_prepare_write(inode, page, from, to, get_block);
1913         if (err) {
1914                 ClearPageUptodate(page);
1915                 kunmap(page);
1916         }
1917         return err;
1918 }
1919
1920 int generic_commit_write(struct file *file, struct page *page,
1921                 unsigned from, unsigned to)
1922 {
1923         struct inode *inode = (struct inode*)page->mapping->host;
1924         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1925         __block_commit_write(inode,page,from,to);
1926         kunmap(page);
1927         if (pos > inode->i_size) {
1928                 inode->i_size = pos;
1929                 mark_inode_dirty(inode);
1930         }
1931         return 0;
1932 }
1933
1934 int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block)
1935 {
1936         unsigned long index = from >> PAGE_CACHE_SHIFT;
1937         unsigned offset = from & (PAGE_CACHE_SIZE-1);
1938         unsigned blocksize, iblock, length, pos;
1939         struct inode *inode = (struct inode *)mapping->host;
1940         struct page *page;
1941         struct buffer_head *bh;
1942         int err;
1943
1944         blocksize = inode->i_sb->s_blocksize;
1945         length = offset & (blocksize - 1);
1946
1947         /* Block boundary? Nothing to do */
1948         if (!length)
1949                 return 0;
1950
1951         length = blocksize - length;
1952         iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1953
1954         page = grab_cache_page(mapping, index);
1955         err = PTR_ERR(page);
1956         if (IS_ERR(page))
1957                 goto out;
1958
1959         if (!page->buffers)
1960                 create_empty_buffers(page, inode, blocksize);
1961
1962         /* Find the buffer that contains "offset" */
1963         bh = page->buffers;
1964         pos = blocksize;
1965         while (offset >= pos) {
1966                 bh = bh->b_this_page;
1967                 iblock++;
1968                 pos += blocksize;
1969         }
1970
1971         err = 0;
1972         if (!buffer_mapped(bh)) {
1973                 /* Hole? Nothing to do */
1974                 if (buffer_uptodate(bh))
1975                         goto unlock;
1976                 get_block(inode, iblock, bh, 0);
1977                 /* Still unmapped? Nothing to do */
1978                 if (!buffer_mapped(bh))
1979                         goto unlock;
1980         }
1981
1982         /* Ok, it's mapped. Make sure it's up-to-date */
1983         if (Page_Uptodate(page))
1984                 set_bit(BH_Uptodate, &bh->b_state);
1985
1986         bh->b_end_io = end_buffer_io_sync;
1987         if (!buffer_uptodate(bh)) {
1988                 err = -EIO;
1989                 ll_rw_block(READ, 1, &bh);
1990                 wait_on_buffer(bh);
1991                 /* Uhhuh. Read error. Complain and punt. */
1992                 if (!buffer_uptodate(bh))
1993                         goto unlock;
1994         }
1995
1996         memset(kmap(page) + offset, 0, length);
1997         flush_dcache_page(page);
1998         kunmap(page);
1999
2000         mark_buffer_dirty(bh);
2001         err = 0;
2002
2003 unlock:
2004         UnlockPage(page);
2005         page_cache_release(page);
2006 out:
2007         return err;
2008 }
2009
2010 int block_write_full_page(struct page *page, get_block_t *get_block)
2011 {
2012         struct inode *inode = (struct inode*)page->mapping->host;
2013         unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
2014         unsigned offset;
2015         int err;
2016
2017         /* easy case */
2018         if (page->index < end_index)
2019                 return __block_write_full_page(inode, page, get_block);
2020
2021         /* things got complicated... */
2022         offset = inode->i_size & (PAGE_CACHE_SIZE-1);
2023         /* OK, are we completely out? */
2024         if (page->index >= end_index+1 || !offset)
2025                 return -EIO;
2026         /* Sigh... will have to work, then... */
2027         err = __block_prepare_write(inode, page, 0, offset, get_block);
2028         if (!err) {
2029                 memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset);
2030                 flush_dcache_page(page);
2031                 __block_commit_write(inode,page,0,offset);
2032 done:
2033                 kunmap(page);
2034                 return err;
2035         }
2036         ClearPageUptodate(page);
2037         goto done;
2038 }
2039
2040 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
2041 {
2042         struct buffer_head tmp;
2043         struct inode *inode = (struct inode*)mapping->host;
2044         tmp.b_state = 0;
2045         tmp.b_blocknr = 0;
2046         get_block(inode, block, &tmp, 0);
2047         return tmp.b_blocknr;
2048 }
2049
2050 /*
2051  * IO completion routine for a buffer_head being used for kiobuf IO: we
2052  * can't dispatch the kiobuf callback until io_count reaches 0.
2053  */
2054
2055 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
2056 {
2057         struct kiobuf *kiobuf;
2058
2059         mark_buffer_uptodate(bh, uptodate);
2060
2061         kiobuf = bh->b_private;
2062         unlock_buffer(bh);
2063         end_kio_request(kiobuf, uptodate);
2064 }
2065
2066
2067 /*
2068  * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
2069  * for them to complete.  Clean up the buffer_heads afterwards.
2070  */
2071
2072 static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size)
2073 {
2074         int iosize;
2075         int i;
2076         struct buffer_head *tmp;
2077
2078
2079         iosize = 0;
2080         spin_lock(&unused_list_lock);
2081
2082         for (i = nr; --i >= 0; ) {
2083                 iosize += size;
2084                 tmp = bh[i];
2085                 if (buffer_locked(tmp)) {
2086                         spin_unlock(&unused_list_lock);
2087                         wait_on_buffer(tmp);
2088                         spin_lock(&unused_list_lock);
2089                 }
2090
2091                 if (!buffer_uptodate(tmp)) {
2092                         /* We are traversing bh'es in reverse order so
2093                            clearing iosize on error calculates the
2094                            amount of IO before the first error. */
2095                         iosize = 0;
2096                 }
2097                 __put_unused_buffer_head(tmp);
2098         }
2099
2100         spin_unlock(&unused_list_lock);
2101
2102         return iosize;
2103 }
2104
2105 /*
2106  * Start I/O on a physical range of kernel memory, defined by a vector
2107  * of kiobuf structs (much like a user-space iovec list).
2108  *
2109  * The kiobuf must already be locked for IO.  IO is submitted
2110  * asynchronously: you need to check page->locked, page->uptodate, and
2111  * maybe wait on page->wait.
2112  *
2113  * It is up to the caller to make sure that there are enough blocks
2114  * passed in to completely map the iobufs to disk.
2115  */
2116
2117 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
2118                kdev_t dev, unsigned long b[], int size)
2119 {
2120         int             err;
2121         int             length;
2122         int             transferred;
2123         int             i;
2124         int             bufind;
2125         int             pageind;
2126         int             bhind;
2127         int             offset;
2128         int             sectors = size>>9;
2129         unsigned long   blocknr;
2130         struct kiobuf * iobuf = NULL;
2131         struct page *   map;
2132         struct buffer_head *tmp, *bh[KIO_MAX_SECTORS];
2133
2134         if (!nr)
2135                 return 0;
2136
2137         /*
2138          * First, do some alignment and validity checks
2139          */
2140         for (i = 0; i < nr; i++) {
2141                 iobuf = iovec[i];
2142                 if ((iobuf->offset & (size-1)) ||
2143                     (iobuf->length & (size-1)))
2144                         return -EINVAL;
2145                 if (!iobuf->nr_pages)
2146                         panic("brw_kiovec: iobuf not initialised");
2147         }
2148
2149         /*
2150          * OK to walk down the iovec doing page IO on each page we find.
2151          */
2152         bufind = bhind = transferred = err = 0;
2153         for (i = 0; i < nr; i++) {
2154                 iobuf = iovec[i];
2155                 offset = iobuf->offset;
2156                 length = iobuf->length;
2157                 iobuf->errno = 0;
2158
2159                 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
2160                         map  = iobuf->maplist[pageind];
2161                         if (!map) {
2162                                 err = -EFAULT;
2163                                 goto error;
2164                         }
2165
2166                         while (length > 0) {
2167                                 blocknr = b[bufind++];
2168                                 tmp = get_unused_buffer_head(0);
2169                                 if (!tmp) {
2170                                         err = -ENOMEM;
2171                                         goto error;
2172                                 }
2173
2174                                 tmp->b_dev = B_FREE;
2175                                 tmp->b_size = size;
2176                                 set_bh_page(tmp, map, offset);
2177                                 tmp->b_this_page = tmp;
2178
2179                                 init_buffer(tmp, end_buffer_io_kiobuf, iobuf);
2180                                 tmp->b_rdev = tmp->b_dev = dev;
2181                                 tmp->b_blocknr = blocknr;
2182                                 tmp->b_rsector = blocknr*sectors;
2183                                 tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req);
2184
2185                                 if (rw == WRITE) {
2186                                         set_bit(BH_Uptodate, &tmp->b_state);
2187                                         set_bit(BH_Dirty, &tmp->b_state);
2188                                 }
2189
2190                                 bh[bhind++] = tmp;
2191                                 length -= size;
2192                                 offset += size;
2193
2194                                 atomic_inc(&iobuf->io_count);
2195
2196                                 generic_make_request(rw, tmp);
2197                                 /*
2198                                  * Wait for IO if we have got too much
2199                                  */
2200                                 if (bhind >= KIO_MAX_SECTORS) {
2201                                         err = wait_kio(rw, bhind, bh, size);
2202                                         if (err >= 0)
2203                                                 transferred += err;
2204                                         else
2205                                                 goto finished;
2206                                         bhind = 0;
2207                                 }
2208
2209                                 if (offset >= PAGE_SIZE) {
2210                                         offset = 0;
2211                                         break;
2212                                 }
2213                         } /* End of block loop */
2214                 } /* End of page loop */
2215         } /* End of iovec loop */
2216
2217         /* Is there any IO still left to submit? */
2218         if (bhind) {
2219                 err = wait_kio(rw, bhind, bh, size);
2220                 if (err >= 0)
2221                         transferred += err;
2222                 else
2223                         goto finished;
2224         }
2225
2226  finished:
2227         if (transferred)
2228                 return transferred;
2229         return err;
2230
2231  error:
2232         /* We got an error allocating the bh'es.  Just free the current
2233            buffer_heads and exit. */
2234         spin_lock(&unused_list_lock);
2235         for (i = bhind; --i >= 0; ) {
2236                 __put_unused_buffer_head(bh[i]);
2237         }
2238         spin_unlock(&unused_list_lock);
2239         goto finished;
2240 }
2241
2242 /*
2243  * Start I/O on a page.
2244  * This function expects the page to be locked and may return
2245  * before I/O is complete. You then have to check page->locked,
2246  * page->uptodate, and maybe wait on page->wait.
2247  *
2248  * brw_page() is SMP-safe, although it's being called with the
2249  * kernel lock held - but the code is ready.
2250  *
2251  * FIXME: we need a swapper_inode->get_block function to remove
2252  *        some of the bmap kludges and interface ugliness here.
2253  */
2254 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
2255 {
2256         struct buffer_head *head, *bh, *arr[MAX_BUF_PER_PAGE];
2257         int nr, fresh /* temporary debugging flag */, block;
2258
2259         if (!PageLocked(page))
2260                 panic("brw_page: page not locked for I/O");
2261 //      ClearPageError(page);
2262         /*
2263          * We pretty much rely on the page lock for this, because
2264          * create_page_buffers() might sleep.
2265          */
2266         fresh = 0;
2267         if (!page->buffers) {
2268                 create_page_buffers(rw, page, dev, b, size);
2269                 fresh = 1;
2270         }
2271         if (!page->buffers)
2272                 BUG();
2273
2274         head = page->buffers;
2275         bh = head;
2276         nr = 0;
2277         do {
2278                 block = *(b++);
2279
2280                 if (fresh && (atomic_read(&bh->b_count) != 0))
2281                         BUG();
2282                 if (rw == READ) {
2283                         if (!fresh)
2284                                 BUG();
2285                         if (!buffer_uptodate(bh)) {
2286                                 arr[nr++] = bh;
2287                                 atomic_inc(&bh->b_count);
2288                         }
2289                 } else { /* WRITE */
2290                         if (!bh->b_blocknr) {
2291                                 if (!block)
2292                                         BUG();
2293                                 bh->b_blocknr = block;
2294                         } else {
2295                                 if (!block)
2296                                         BUG();
2297                         }
2298                         set_bit(BH_Uptodate, &bh->b_state);
2299                         set_bit(BH_Dirty, &bh->b_state);
2300                         arr[nr++] = bh;
2301                         atomic_inc(&bh->b_count);
2302                 }
2303                 bh = bh->b_this_page;
2304         } while (bh != head);
2305         if ((rw == READ) && nr) {
2306                 if (Page_Uptodate(page))
2307                         BUG();
2308                 ll_rw_block(rw, nr, arr);
2309         } else {
2310                 if (!nr && rw == READ) {
2311                         SetPageUptodate(page);
2312                         UnlockPage(page);
2313                 }
2314                 if (nr && (rw == WRITE))
2315                         ll_rw_block(rw, nr, arr);
2316         }
2317         return 0;
2318 }
2319
2320 int block_symlink(struct inode *inode, const char *symname, int len)
2321 {
2322         struct address_space *mapping = inode->i_mapping;
2323         struct page *page = grab_cache_page(mapping, 0);
2324         int err = -ENOMEM;
2325         char *kaddr;
2326
2327         if (!page)
2328                 goto fail;
2329         err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2330         if (err)
2331                 goto fail_map;
2332         kaddr = page_address(page);
2333         memcpy(kaddr, symname, len-1);
2334         mapping->a_ops->commit_write(NULL, page, 0, len-1);
2335         /*
2336          * Notice that we are _not_ going to block here - end of page is
2337          * unmapped, so this will only try to map the rest of page, see
2338          * that it is unmapped (typically even will not look into inode -
2339          * ->i_size will be enough for everything) and zero it out.
2340          * OTOH it's obviously correct and should make the page up-to-date.
2341          */
2342         err = mapping->a_ops->readpage(NULL, page);
2343         wait_on_page(page);
2344         page_cache_release(page);
2345         if (err < 0)
2346                 goto fail;
2347         mark_inode_dirty(inode);
2348         return 0;
2349 fail_map:
2350         UnlockPage(page);
2351         page_cache_release(page);
2352 fail:
2353         return err;
2354 }
2355
2356 /*
2357  * Try to increase the number of buffers available: the size argument
2358  * is used to determine what kind of buffers we want.
2359  */
2360 static int grow_buffers(int size)
2361 {
2362         struct page * page;
2363         struct buffer_head *bh, *tmp;
2364         struct buffer_head * insert_point;
2365         int isize;
2366
2367         if ((size & 511) || (size > PAGE_SIZE)) {
2368                 printk("VFS: grow_buffers: size = %d\n",size);
2369                 return 0;
2370         }
2371
2372         page = alloc_page(GFP_BUFFER);
2373         if (!page)
2374                 goto out;
2375         LockPage(page);
2376         bh = create_buffers(page, size, 0);
2377         if (!bh)
2378                 goto no_buffer_head;
2379
2380         isize = BUFSIZE_INDEX(size);
2381
2382         spin_lock(&free_list[isize].lock);
2383         insert_point = free_list[isize].list;
2384         tmp = bh;
2385         while (1) {
2386                 if (insert_point) {
2387                         tmp->b_next_free = insert_point->b_next_free;
2388                         tmp->b_prev_free = insert_point;
2389                         insert_point->b_next_free->b_prev_free = tmp;
2390                         insert_point->b_next_free = tmp;
2391                 } else {
2392                         tmp->b_prev_free = tmp;
2393                         tmp->b_next_free = tmp;
2394                 }
2395                 insert_point = tmp;
2396                 if (tmp->b_this_page)
2397                         tmp = tmp->b_this_page;
2398                 else
2399                         break;
2400         }
2401         tmp->b_this_page = bh;
2402         free_list[isize].list = bh;
2403         spin_unlock(&free_list[isize].lock);
2404
2405         page->buffers = bh;
2406         page->flags &= ~(1 << PG_referenced);
2407         lru_cache_add(page);
2408         UnlockPage(page);
2409         atomic_inc(&buffermem_pages);
2410         return 1;
2411
2412 no_buffer_head:
2413         UnlockPage(page);
2414         page_cache_release(page);
2415 out:
2416         return 0;
2417 }
2418
2419 /*
2420  * Sync all the buffers on one page..
2421  *
2422  * If we have old buffers that are locked, we'll
2423  * wait on them, but we won't wait on the new ones
2424  * we're writing out now.
2425  *
2426  * This all is required so that we can free up memory
2427  * later.
2428  *
2429  * Wait:
2430  *      0 - no wait (this does not get called - see try_to_free_buffers below)
2431  *      1 - start IO for dirty buffers
2432  *      2 - wait for completion of locked buffers
2433  */
2434 static void sync_page_buffers(struct buffer_head *bh, int wait)
2435 {
2436         struct buffer_head * tmp = bh;
2437
2438         do {
2439                 struct buffer_head *p = tmp;
2440                 tmp = tmp->b_this_page;
2441                 if (buffer_locked(p)) {
2442                         if (wait > 1)
2443                                 __wait_on_buffer(p);
2444                 } else if (buffer_dirty(p))
2445                         ll_rw_block(WRITE, 1, &p);
2446         } while (tmp != bh);
2447 }
2448
2449 /*
2450  * Can the buffer be thrown out?
2451  */
2452 #define BUFFER_BUSY_BITS        ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
2453 #define buffer_busy(bh)         (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2454
2455 /*
2456  * try_to_free_buffers() checks if all the buffers on this particular page
2457  * are unused, and free's the page if so.
2458  *
2459  * Wake up bdflush() if this fails - if we're running low on memory due
2460  * to dirty buffers, we need to flush them out as quickly as possible.
2461  *
2462  * NOTE: There are quite a number of ways that threads of control can
2463  *       obtain a reference to a buffer head within a page.  So we must
2464  *       lock out all of these paths to cleanly toss the page.
2465  */
2466 int try_to_free_buffers(struct page * page, int wait)
2467 {
2468         struct buffer_head * tmp, * bh = page->buffers;
2469         int index = BUFSIZE_INDEX(bh->b_size);
2470         int loop = 0;
2471
2472 cleaned_buffers_try_again:
2473         spin_lock(&lru_list_lock);
2474         write_lock(&hash_table_lock);
2475         spin_lock(&free_list[index].lock);
2476         tmp = bh;
2477         do {
2478                 struct buffer_head *p = tmp;
2479
2480                 tmp = tmp->b_this_page;
2481                 if (buffer_busy(p))
2482                         goto busy_buffer_page;
2483         } while (tmp != bh);
2484
2485         spin_lock(&unused_list_lock);
2486         tmp = bh;
2487         do {
2488                 struct buffer_head * p = tmp;
2489                 tmp = tmp->b_this_page;
2490
2491                 /* The buffer can be either on the regular
2492                  * queues or on the free list..
2493                  */
2494                 if (p->b_dev != B_FREE) {
2495                         remove_inode_queue(p);
2496                         __remove_from_queues(p);
2497                 } else
2498                         __remove_from_free_list(p, index);
2499                 __put_unused_buffer_head(p);
2500         } while (tmp != bh);
2501         spin_unlock(&unused_list_lock);
2502
2503         /* Wake up anyone waiting for buffer heads */
2504         wake_up(&buffer_wait);
2505
2506         /* And free the page */
2507         page->buffers = NULL;
2508         page_cache_release(page);
2509         spin_unlock(&free_list[index].lock);
2510         write_unlock(&hash_table_lock);
2511         spin_unlock(&lru_list_lock);
2512         return 1;
2513
2514 busy_buffer_page:
2515         /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2516         spin_unlock(&free_list[index].lock);
2517         write_unlock(&hash_table_lock);
2518         spin_unlock(&lru_list_lock);
2519         if (wait) {
2520                 sync_page_buffers(bh, wait);
2521                 /* We waited synchronously, so we can free the buffers. */
2522                 if (wait > 1 && !loop) {
2523                         loop = 1;
2524                         goto cleaned_buffers_try_again;
2525                 }
2526         }
2527         return 0;
2528 }
2529
2530 /* ================== Debugging =================== */
2531
2532 void show_buffers(void)
2533 {
2534 #ifdef CONFIG_SMP
2535         struct buffer_head * bh;
2536         int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2537         int protected = 0;
2538         int nlist;
2539         static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
2540 #endif
2541
2542         printk("Buffer memory:   %6dkB\n",
2543                         atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2544
2545 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2546         if (!spin_trylock(&lru_list_lock))
2547                 return;
2548         for(nlist = 0; nlist < NR_LIST; nlist++) {
2549                 found = locked = dirty = used = lastused = protected = 0;
2550                 bh = lru_list[nlist];
2551                 if(!bh) continue;
2552
2553                 do {
2554                         found++;
2555                         if (buffer_locked(bh))
2556                                 locked++;
2557                         if (buffer_protected(bh))
2558                                 protected++;
2559                         if (buffer_dirty(bh))
2560                                 dirty++;
2561                         if (atomic_read(&bh->b_count))
2562                                 used++, lastused = found;
2563                         bh = bh->b_next_free;
2564                 } while (bh != lru_list[nlist]);
2565                 {
2566                         int tmp = nr_buffers_type[nlist];
2567                         if (found != tmp)
2568                                 printk("%9s: BUG -> found %d, reported %d\n",
2569                                        buf_types[nlist], found, tmp);
2570                 }
2571                 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2572                        "%d locked, %d protected, %d dirty\n",
2573                        buf_types[nlist], found, size_buffers_type[nlist]>>10,
2574                        used, lastused, locked, protected, dirty);
2575         }
2576         spin_unlock(&lru_list_lock);
2577 #endif
2578 }
2579
2580 /* ===================== Init ======================= */
2581
2582 /*
2583  * allocate the hash table and init the free list
2584  * Use gfp() for the hash table to decrease TLB misses, use
2585  * SLAB cache for buffer heads.
2586  */
2587 void __init buffer_init(unsigned long mempages)
2588 {
2589         int order, i;
2590         unsigned int nr_hash;
2591
2592         /* The buffer cache hash table is less important these days,
2593          * trim it a bit.
2594          */
2595         mempages >>= 14;
2596
2597         mempages *= sizeof(struct buffer_head *);
2598
2599         for (order = 0; (1 << order) < mempages; order++)
2600                 ;
2601
2602         /* try to allocate something until we get it or we're asking
2603            for something that is really too small */
2604
2605         do {
2606                 unsigned long tmp;
2607
2608                 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2609                 bh_hash_mask = (nr_hash - 1);
2610
2611                 tmp = nr_hash;
2612                 bh_hash_shift = 0;
2613                 while((tmp >>= 1UL) != 0UL)
2614                         bh_hash_shift++;
2615
2616                 hash_table = (struct buffer_head **)
2617                     __get_free_pages(GFP_ATOMIC, order);
2618         } while (hash_table == NULL && --order > 0);
2619         printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2620                nr_hash, order, (PAGE_SIZE << order));
2621
2622         if (!hash_table)
2623                 panic("Failed to allocate buffer hash table\n");
2624
2625         /* Setup hash chains. */
2626         for(i = 0; i < nr_hash; i++)
2627                 hash_table[i] = NULL;
2628
2629         /* Setup free lists. */
2630         for(i = 0; i < NR_SIZES; i++) {
2631                 free_list[i].list = NULL;
2632                 free_list[i].lock = SPIN_LOCK_UNLOCKED;
2633         }
2634
2635         /* Setup lru lists. */
2636         for(i = 0; i < NR_LIST; i++)
2637                 lru_list[i] = NULL;
2638
2639 }
2640
2641
2642 /* ====================== bdflush support =================== */
2643
2644 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2645  * response to dirty buffers.  Once this process is activated, we write back
2646  * a limited number of buffers to the disks and then go back to sleep again.
2647  */
2648 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
2649 struct task_struct *bdflush_tsk = 0;
2650
2651 void wakeup_bdflush(int block)
2652 {
2653         DECLARE_WAITQUEUE(wait, current);
2654
2655         if (current == bdflush_tsk)
2656                 return;
2657
2658         if (!block) {
2659                 wake_up_process(bdflush_tsk);
2660                 return;
2661         }
2662
2663         /* bdflush can wakeup us before we have a chance to
2664            go to sleep so we must be smart in handling
2665            this wakeup event from bdflush to avoid deadlocking in SMP
2666            (we are not holding any lock anymore in these two paths). */
2667         __set_current_state(TASK_UNINTERRUPTIBLE);
2668         add_wait_queue(&bdflush_done, &wait);
2669
2670         wake_up_process(bdflush_tsk);
2671         schedule();
2672
2673         remove_wait_queue(&bdflush_done, &wait);
2674         __set_current_state(TASK_RUNNING);
2675 }
2676
2677 /* This is the _only_ function that deals with flushing async writes
2678    to disk.
2679    NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
2680    as all dirty buffers lives _only_ in the DIRTY lru list.
2681    As we never browse the LOCKED and CLEAN lru lists they are infact
2682    completly useless. */
2683 static int flush_dirty_buffers(int check_flushtime)
2684 {
2685         struct buffer_head * bh, *next;
2686         int flushed = 0, i;
2687
2688  restart:
2689         spin_lock(&lru_list_lock);
2690         bh = lru_list[BUF_DIRTY];
2691         if (!bh)
2692                 goto out_unlock;
2693         for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next) {
2694                 next = bh->b_next_free;
2695
2696                 if (!buffer_dirty(bh)) {
2697                         __refile_buffer(bh);
2698                         continue;
2699                 }
2700                 if (buffer_locked(bh))
2701                         continue;
2702
2703                 if (check_flushtime) {
2704                         /* The dirty lru list is chronologically ordered so
2705                            if the current bh is not yet timed out,
2706                            then also all the following bhs
2707                            will be too young. */
2708                         if (time_before(jiffies, bh->b_flushtime))
2709                                 goto out_unlock;
2710                 } else {
2711                         if (++flushed > bdf_prm.b_un.ndirty)
2712                                 goto out_unlock;
2713                 }
2714
2715                 /* OK, now we are committed to write it out. */
2716                 atomic_inc(&bh->b_count);
2717                 spin_unlock(&lru_list_lock);
2718                 ll_rw_block(WRITE, 1, &bh);
2719                 atomic_dec(&bh->b_count);
2720
2721                 if (current->need_resched)
2722                         schedule();
2723                 goto restart;
2724         }
2725  out_unlock:
2726         spin_unlock(&lru_list_lock);
2727
2728         return flushed;
2729 }
2730
2731 /*
2732  * Here we attempt to write back old buffers.  We also try to flush inodes
2733  * and supers as well, since this function is essentially "update", and
2734  * otherwise there would be no way of ensuring that these quantities ever
2735  * get written back.  Ideally, we would have a timestamp on the inodes
2736  * and superblocks so that we could write back only the old ones as well
2737  */
2738
2739 static int sync_old_buffers(void)
2740 {
2741         lock_kernel();
2742         sync_supers(0);
2743         sync_inodes(0);
2744         unlock_kernel();
2745
2746         flush_dirty_buffers(1);
2747         /* must really sync all the active I/O request to disk here */
2748         run_task_queue(&tq_disk);
2749         return 0;
2750 }
2751
2752 int block_sync_page(struct page *page)
2753 {
2754         run_task_queue(&tq_disk);
2755         return 0;
2756 }
2757
2758 /* This is the interface to bdflush.  As we get more sophisticated, we can
2759  * pass tuning parameters to this "process", to adjust how it behaves.
2760  * We would want to verify each parameter, however, to make sure that it
2761  * is reasonable. */
2762
2763 asmlinkage long sys_bdflush(int func, long data)
2764 {
2765         if (!capable(CAP_SYS_ADMIN))
2766                 return -EPERM;
2767
2768         if (func == 1) {
2769                 /* do_exit directly and let kupdate to do its work alone. */
2770                 do_exit(0);
2771 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2772          a syscall that doesn't care about the current mm context. */
2773                 int error;
2774                 struct mm_struct *user_mm;
2775
2776                 /*
2777                  * bdflush will spend all of it's time in kernel-space,
2778                  * without touching user-space, so we can switch it into
2779                  * 'lazy TLB mode' to reduce the cost of context-switches
2780                  * to and from bdflush.
2781                  */
2782                 user_mm = start_lazy_tlb();
2783                 error = sync_old_buffers();
2784                 end_lazy_tlb(user_mm);
2785                 return error;
2786 #endif
2787         }
2788
2789         /* Basically func 1 means read param 1, 2 means write param 1, etc */
2790         if (func >= 2) {
2791                 int i = (func-2) >> 1;
2792                 if (i >= 0 && i < N_PARAM) {
2793                         if ((func & 1) == 0)
2794                                 return put_user(bdf_prm.data[i], (int*)data);
2795
2796                         if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
2797                                 bdf_prm.data[i] = data;
2798                                 return 0;
2799                         }
2800                 }
2801                 return -EINVAL;
2802         }
2803
2804         /* Having func 0 used to launch the actual bdflush and then never
2805          * return (unless explicitly killed). We return zero here to
2806          * remain semi-compatible with present update(8) programs.
2807          */
2808         return 0;
2809 }
2810
2811 /*
2812  * This is the actual bdflush daemon itself. It used to be started from
2813  * the syscall above, but now we launch it ourselves internally with
2814  * kernel_thread(...)  directly after the first thread in init/main.c
2815  */
2816 int bdflush(void *sem)
2817 {
2818         struct task_struct *tsk = current;
2819         int flushed;
2820         /*
2821          *      We have a bare-bones task_struct, and really should fill
2822          *      in a few more things so "top" and /proc/2/{exe,root,cwd}
2823          *      display semi-sane things. Not real crucial though...
2824          */
2825
2826         tsk->session = 1;
2827         tsk->pgrp = 1;
2828         strcpy(tsk->comm, "bdflush");
2829         bdflush_tsk = tsk;
2830
2831         /* avoid getting signals */
2832         spin_lock_irq(&tsk->sigmask_lock);
2833         flush_signals(tsk);
2834         sigfillset(&tsk->blocked);
2835         recalc_sigpending(tsk);
2836         spin_unlock_irq(&tsk->sigmask_lock);
2837
2838         up((struct semaphore *)sem);
2839
2840         for (;;) {
2841                 CHECK_EMERGENCY_SYNC
2842
2843                 flushed = flush_dirty_buffers(0);
2844                 if (free_shortage())
2845                         flushed += page_launder(GFP_BUFFER, 0);
2846
2847                 /* If wakeup_bdflush will wakeup us
2848                    after our bdflush_done wakeup, then
2849                    we must make sure to not sleep
2850                    in schedule_timeout otherwise
2851                    wakeup_bdflush may wait for our
2852                    bdflush_done wakeup that would never arrive
2853                    (as we would be sleeping) and so it would
2854                    deadlock in SMP. */
2855                 __set_current_state(TASK_INTERRUPTIBLE);
2856                 wake_up_all(&bdflush_done);
2857                 /*
2858                  * If there are still a lot of dirty buffers around,
2859                  * skip the sleep and flush some more. Otherwise, we
2860                  * go to sleep waiting a wakeup.
2861                  */
2862                 if (!flushed || balance_dirty_state(NODEV) < 0) {
2863                         run_task_queue(&tq_disk);
2864                         schedule();
2865                 }
2866                 /* Remember to mark us as running otherwise
2867                    the next schedule will block. */
2868                 __set_current_state(TASK_RUNNING);
2869         }
2870 }
2871
2872 /*
2873  * This is the kernel update daemon. It was used to live in userspace
2874  * but since it's need to run safely we want it unkillable by mistake.
2875  * You don't need to change your userspace configuration since
2876  * the userspace `update` will do_exit(0) at the first sys_bdflush().
2877  */
2878 int kupdate(void *sem)
2879 {
2880         struct task_struct * tsk = current;
2881         int interval;
2882
2883         tsk->session = 1;
2884         tsk->pgrp = 1;
2885         strcpy(tsk->comm, "kupdate");
2886
2887         /* sigstop and sigcont will stop and wakeup kupdate */
2888         spin_lock_irq(&tsk->sigmask_lock);
2889         sigfillset(&tsk->blocked);
2890         siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
2891         recalc_sigpending(tsk);
2892         spin_unlock_irq(&tsk->sigmask_lock);
2893
2894         up((struct semaphore *)sem);
2895
2896         for (;;) {
2897                 /* update interval */
2898                 interval = bdf_prm.b_un.interval;
2899                 if (interval) {
2900                         tsk->state = TASK_INTERRUPTIBLE;
2901                         schedule_timeout(interval);
2902                 } else {
2903                 stop_kupdate:
2904                         tsk->state = TASK_STOPPED;
2905                         schedule(); /* wait for SIGCONT */
2906                 }
2907                 /* check for sigstop */
2908                 if (signal_pending(tsk)) {
2909                         int stopped = 0;
2910                         spin_lock_irq(&tsk->sigmask_lock);
2911                         if (sigismember(&tsk->pending.signal, SIGSTOP)) {
2912                                 sigdelset(&tsk->pending.signal, SIGSTOP);
2913                                 stopped = 1;
2914                         }
2915                         recalc_sigpending(tsk);
2916                         spin_unlock_irq(&tsk->sigmask_lock);
2917                         if (stopped)
2918                                 goto stop_kupdate;
2919                 }
2920 #ifdef DEBUG
2921                 printk("kupdate() activated...\n");
2922 #endif
2923                 sync_old_buffers();
2924         }
2925 }
2926
2927 static int __init bdflush_init(void)
2928 {
2929         DECLARE_MUTEX_LOCKED(sem);
2930         kernel_thread(bdflush, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
2931         down(&sem);
2932         kernel_thread(kupdate, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
2933         down(&sem);
2934         return 0;
2935 }
2936
2937 module_init(bdflush_init)
2938