1 /*
2 * linux/fs/buffer.c
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
7 /*
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
15 /* Removed a lot of unnecessary code and simplified things now that
16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20 * hash table, use SLAB cache for buffer heads. -DaveM
23 /* Added 32k buffer block sizes - these are required for older ARM systems.
24 * - RMK
27 /* Thread it... -DaveM */
29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
31 #include <linux/config.h>
32 #include <linux/sched.h>
33 #include <linux/fs.h>
34 #include <linux/malloc.h>
35 #include <linux/locks.h>
36 #include <linux/errno.h>
37 #include <linux/swap.h>
38 #include <linux/swapctl.h>
39 #include <linux/smp_lock.h>
40 #include <linux/vmalloc.h>
41 #include <linux/blkdev.h>
42 #include <linux/sysrq.h>
43 #include <linux/file.h>
44 #include <linux/init.h>
45 #include <linux/quotaops.h>
46 #include <linux/iobuf.h>
47 #include <linux/highmem.h>
49 #include <asm/uaccess.h>
50 #include <asm/io.h>
51 #include <asm/bitops.h>
52 #include <asm/mmu_context.h>
54 #define NR_SIZES 7
55 static char buffersize_index[65] =
56 {-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
57 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
58 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
59 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
60 6};
62 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
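/* For example, a 4096-byte buffer gives 4096>>9 == 8 and
 * buffersize_index[8] == 3, so 4k buffers hang off free_list[3]; the
 * largest supported size, 32k, maps to index 6 (hence NR_SIZES 7). */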
63 #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
64 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
65 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
66 number of unused buffer heads */
68 /* Anti-deadlock ordering:
69 * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
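/* In practice this means code that needs more than one of these locks
 * nests them outermost-first, e.g.:
 *
 *	spin_lock(&lru_list_lock);
 *	write_lock(&hash_table_lock);
 *	...
 *	write_unlock(&hash_table_lock);
 *	spin_unlock(&lru_list_lock);
 *
 * as getblk() and __invalidate_buffers() below do. */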
72 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers)
75 * Hash table gook..
77 static unsigned int bh_hash_mask;
78 static unsigned int bh_hash_shift;
79 static struct buffer_head **hash_table;
80 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
82 static struct buffer_head *lru_list[NR_LIST];
83 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
84 static int nr_buffers_type[NR_LIST];
85 static unsigned long size_buffers_type[NR_LIST];
87 static struct buffer_head * unused_list;
88 static int nr_unused_buffer_heads;
89 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
90 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
92 struct bh_free_head {
93 struct buffer_head *list;
94 spinlock_t lock;
96 static struct bh_free_head free_list[NR_SIZES];
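/* One free list per supported buffer size (512 bytes up to 32k),
 * indexed by BUFSIZE_INDEX() above. */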
98 static int grow_buffers(int size);
99 static void __refile_buffer(struct buffer_head *);
101 /* This is used by some architectures to estimate available memory. */
102 atomic_t buffermem_pages = ATOMIC_INIT(0);
104 /* Here is the parameter block for the bdflush process. If you add or
105 * remove any of the parameters, make sure to update kernel/sysctl.c.
108 #define N_PARAM 9
110 /* The dummy values in this structure are left in there for compatibility
111 * with old programs that play with the /proc entries.
113 union bdflush_param {
114 struct {
115 int nfract; /* Percentage of buffer cache dirty to
116 activate bdflush */
117 int ndirty; /* Maximum number of dirty blocks to write out per
118 wake-cycle */
119 int nrefill; /* Number of clean buffers to try to obtain
120 each time we call refill */
121 int nref_dirt; /* Dirty buffer threshold for activating bdflush
122 when trying to refill buffers. */
123 int interval; /* jiffies delay between kupdate flushes */
124 int age_buffer; /* Time for normal buffer to age before we flush it */
125 int dummy1; /* unused, was age_super */
126 int dummy2; /* unused */
127 int dummy3; /* unused */
128 } b_un;
129 unsigned int data[N_PARAM];
130 } bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}};
132 /* These are the min and max parameter values that we will allow to be assigned */
133 int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 1*HZ, 1, 1};
134 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 6000*HZ, 2047, 5};
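/* With the defaults above: bdflush writes back at most 500 dirty blocks
 * per wakeup (ndirty), kupdate runs every 5 seconds (interval), and a
 * dirty buffer ages for 30 seconds (age_buffer) before it is flushed. */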
137 * Rewrote the wait-routines to use the "new" wait-queue functionality,
138 * getting rid of the cli-sti pairs. The wait-queue routines still
139 * need cli-sti, but now it's just a couple of 386 instructions or so.
141 * Note that the real wait_on_buffer() is an inline function that checks
142 * if 'b_wait' is set before calling this, so that the queues aren't set
143 * up unnecessarily.
145 void __wait_on_buffer(struct buffer_head * bh)
147 struct task_struct *tsk = current;
148 DECLARE_WAITQUEUE(wait, tsk);
150 atomic_inc(&bh->b_count);
151 add_wait_queue(&bh->b_wait, &wait);
152 do {
153 run_task_queue(&tq_disk);
154 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
155 if (!buffer_locked(bh))
156 break;
157 schedule();
158 } while (buffer_locked(bh));
159 tsk->state = TASK_RUNNING;
160 remove_wait_queue(&bh->b_wait, &wait);
161 atomic_dec(&bh->b_count);
164 /* Call sync_buffers with wait!=0 to ensure that the call does not
165 * return until all buffer writes have completed. Sync() may return
166 * before the writes have finished; fsync() may not.
169 /* Godamity-damn. Some buffers (bitmaps for filesystems)
170 * spontaneously dirty themselves without ever brelse being called.
171 * We will ultimately want to put these in a separate list, but for
172 * now we search all of the lists for dirty buffers.
174 static int sync_buffers(kdev_t dev, int wait)
176 int i, retry, pass = 0, err = 0;
177 struct buffer_head * bh, *next;
179 /* One pass for no-wait, three for wait:
180 * 0) write out all dirty, unlocked buffers;
181 * 1) write out all dirty buffers, waiting if locked;
182 * 2) wait for completion by waiting for all buffers to unlock.
184 do {
185 retry = 0;
187 /* We search all lists as a failsafe mechanism, not because we expect
188 * there to be dirty buffers on any of the other lists.
190 repeat:
191 spin_lock(&lru_list_lock);
192 bh = lru_list[BUF_DIRTY];
193 if (!bh)
194 goto repeat2;
196 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
197 next = bh->b_next_free;
199 if (!lru_list[BUF_DIRTY])
200 break;
201 if (dev && bh->b_dev != dev)
202 continue;
203 if (buffer_locked(bh)) {
204 /* Buffer is locked; skip it unless wait is
205 * requested AND pass > 0.
207 if (!wait || !pass) {
208 retry = 1;
209 continue;
211 atomic_inc(&bh->b_count);
212 spin_unlock(&lru_list_lock);
213 wait_on_buffer (bh);
214 atomic_dec(&bh->b_count);
215 goto repeat;
218 /* If an unlocked buffer is not uptodate, there has
219 * been an IO error. Skip it.
221 if (wait && buffer_req(bh) && !buffer_locked(bh) &&
222 !buffer_dirty(bh) && !buffer_uptodate(bh)) {
223 err = -EIO;
224 continue;
227 /* Don't write clean buffers. Don't write ANY buffers
228 * on the third pass.
230 if (!buffer_dirty(bh) || pass >= 2)
231 continue;
233 atomic_inc(&bh->b_count);
234 spin_unlock(&lru_list_lock);
235 ll_rw_block(WRITE, 1, &bh);
236 atomic_dec(&bh->b_count);
237 retry = 1;
238 goto repeat;
241 repeat2:
242 bh = lru_list[BUF_LOCKED];
243 if (!bh) {
244 spin_unlock(&lru_list_lock);
245 break;
247 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
248 next = bh->b_next_free;
250 if (!lru_list[BUF_LOCKED])
251 break;
252 if (dev && bh->b_dev != dev)
253 continue;
254 if (buffer_locked(bh)) {
255 /* Buffer is locked; skip it unless wait is
256 * requested AND pass > 0.
258 if (!wait || !pass) {
259 retry = 1;
260 continue;
262 atomic_inc(&bh->b_count);
263 spin_unlock(&lru_list_lock);
264 wait_on_buffer (bh);
265 spin_lock(&lru_list_lock);
266 atomic_dec(&bh->b_count);
267 goto repeat2;
270 spin_unlock(&lru_list_lock);
272 /* If we are waiting for the sync to succeed, and if any dirty
273 * blocks were written, then repeat; on the second pass, only
274 * wait for buffers being written (do not pass to write any
275 * more buffers on the second pass).
277 } while (wait && retry && ++pass<=2);
278 return err;
281 void sync_dev(kdev_t dev)
283 sync_supers(dev);
284 sync_inodes(dev);
285 DQUOT_SYNC(dev);
286 /* sync all the dirty buffers out to disk only _after_ all the
287 high level layers have finished generating dirty buffer data
288 (or we'd return with some buffers still dirty on the block device,
289 breaking the semantics of this call) */
290 sync_buffers(dev, 0);
292 * FIXME(eric) we need to sync the physical devices here.
293 * This is because some (scsi) controllers have huge amounts of
294 * cache onboard (hundreds of Mb), and we need to instruct
295 * them to commit all of the dirty memory to disk, and we should
296 * not return until this has happened.
298 * This would need to get implemented by going through the assorted
299 * layers so that each block major number can be synced, and this
300 * would call down into the upper and mid-layer scsi.
304 int fsync_dev(kdev_t dev)
306 sync_buffers(dev, 0);
308 lock_kernel();
309 sync_supers(dev);
310 sync_inodes(dev);
311 DQUOT_SYNC(dev);
312 unlock_kernel();
314 return sync_buffers(dev, 1);
317 asmlinkage long sys_sync(void)
319 fsync_dev(0);
320 return 0;
324 * filp may be NULL if called via the msync of a vma.
327 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
329 struct inode * inode = dentry->d_inode;
330 struct super_block * sb;
331 kdev_t dev;
332 int ret;
334 lock_kernel();
335 /* sync the inode to buffers */
336 write_inode_now(inode, 0);
338 /* sync the superblock to buffers */
339 sb = inode->i_sb;
340 wait_on_super(sb);
341 if (sb->s_op && sb->s_op->write_super)
342 sb->s_op->write_super(sb);
344 /* .. finally sync the buffers to disk */
345 dev = inode->i_dev;
346 ret = sync_buffers(dev, 1);
347 unlock_kernel();
348 return ret;
351 asmlinkage long sys_fsync(unsigned int fd)
353 struct file * file;
354 struct dentry * dentry;
355 struct inode * inode;
356 int err;
358 err = -EBADF;
359 file = fget(fd);
360 if (!file)
361 goto out;
363 dentry = file->f_dentry;
364 inode = dentry->d_inode;
366 err = -EINVAL;
367 if (!file->f_op || !file->f_op->fsync)
368 goto out_putf;
370 /* We need to protect against concurrent writers.. */
371 down(&inode->i_sem);
372 err = file->f_op->fsync(file, dentry, 0);
373 up(&inode->i_sem);
375 out_putf:
376 fput(file);
377 out:
378 return err;
381 asmlinkage long sys_fdatasync(unsigned int fd)
383 struct file * file;
384 struct dentry * dentry;
385 struct inode * inode;
386 int err;
388 err = -EBADF;
389 file = fget(fd);
390 if (!file)
391 goto out;
393 dentry = file->f_dentry;
394 inode = dentry->d_inode;
396 err = -EINVAL;
397 if (!file->f_op || !file->f_op->fsync)
398 goto out_putf;
400 down(&inode->i_sem);
401 err = file->f_op->fsync(file, dentry, 1);
402 up(&inode->i_sem);
404 out_putf:
405 fput(file);
406 out:
407 return err;
410 /* After several hours of tedious analysis, the following hash
411 * function won. Do not mess with it... -DaveM
413 #define _hashfn(dev,block) \
414 ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
415 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ \
416 ((block) << (bh_hash_shift - 12))))
417 #define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)]
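/* hash() picks a bucket by folding the device and block numbers over
 * bh_hash_shift bits and masking with bh_hash_mask (one less than the
 * hash table size). */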
419 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
421 if ((bh->b_next = *head) != NULL)
422 bh->b_next->b_pprev = &bh->b_next;
423 *head = bh;
424 bh->b_pprev = head;
427 static __inline__ void __hash_unlink(struct buffer_head *bh)
429 if (bh->b_pprev) {
430 if (bh->b_next)
431 bh->b_next->b_pprev = bh->b_pprev;
432 *(bh->b_pprev) = bh->b_next;
433 bh->b_pprev = NULL;
437 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
439 struct buffer_head **bhp = &lru_list[blist];
441 if(!*bhp) {
442 *bhp = bh;
443 bh->b_prev_free = bh;
445 bh->b_next_free = *bhp;
446 bh->b_prev_free = (*bhp)->b_prev_free;
447 (*bhp)->b_prev_free->b_next_free = bh;
448 (*bhp)->b_prev_free = bh;
449 nr_buffers_type[blist]++;
450 size_buffers_type[blist] += bh->b_size;
453 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
455 if (bh->b_prev_free || bh->b_next_free) {
456 bh->b_prev_free->b_next_free = bh->b_next_free;
457 bh->b_next_free->b_prev_free = bh->b_prev_free;
458 if (lru_list[blist] == bh)
459 lru_list[blist] = bh->b_next_free;
460 if (lru_list[blist] == bh)
461 lru_list[blist] = NULL;
462 bh->b_next_free = bh->b_prev_free = NULL;
463 nr_buffers_type[blist]--;
464 size_buffers_type[blist] -= bh->b_size;
468 static void __remove_from_free_list(struct buffer_head * bh, int index)
470 if(bh->b_next_free == bh)
471 free_list[index].list = NULL;
472 else {
473 bh->b_prev_free->b_next_free = bh->b_next_free;
474 bh->b_next_free->b_prev_free = bh->b_prev_free;
475 if (free_list[index].list == bh)
476 free_list[index].list = bh->b_next_free;
478 bh->b_next_free = bh->b_prev_free = NULL;
481 /* must be called with both the hash_table_lock and the lru_list_lock
482 held */
483 static void __remove_from_queues(struct buffer_head *bh)
485 __hash_unlink(bh);
486 __remove_from_lru_list(bh, bh->b_list);
489 static void __insert_into_queues(struct buffer_head *bh)
491 struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
493 __hash_link(bh, head);
494 __insert_into_lru_list(bh, bh->b_list);
497 /* This function must only run if there are no other
498 * references _anywhere_ to this buffer head.
500 static void put_last_free(struct buffer_head * bh)
502 struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
503 struct buffer_head **bhp = &head->list;
505 bh->b_state = 0;
507 spin_lock(&head->lock);
508 bh->b_dev = B_FREE;
509 if(!*bhp) {
510 *bhp = bh;
511 bh->b_prev_free = bh;
513 bh->b_next_free = *bhp;
514 bh->b_prev_free = (*bhp)->b_prev_free;
515 (*bhp)->b_prev_free->b_next_free = bh;
516 (*bhp)->b_prev_free = bh;
517 spin_unlock(&head->lock);
521 * Why like this, I hear you say... The reason is race-conditions.
522 * As we don't lock buffers (unless we are reading them, that is),
523 * something might happen to it while we sleep (ie a read-error
524 * will force it bad). This shouldn't really happen currently, but
525 * the code is ready.
527 static inline struct buffer_head * __get_hash_table(kdev_t dev, int block, int size)
529 struct buffer_head *bh = hash(dev, block);
531 for (; bh; bh = bh->b_next)
532 if (bh->b_blocknr == block &&
533 bh->b_size == size &&
534 bh->b_dev == dev)
535 break;
536 if (bh)
537 atomic_inc(&bh->b_count);
539 return bh;
542 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
544 struct buffer_head *bh;
546 read_lock(&hash_table_lock);
547 bh = __get_hash_table(dev, block, size);
548 read_unlock(&hash_table_lock);
550 return bh;
553 unsigned int get_hardblocksize(kdev_t dev)
556 * Get the hard sector size for the given device. If we don't know
557 * what it is, return 0.
559 if (hardsect_size[MAJOR(dev)] != NULL) {
560 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
561 if (blksize != 0)
562 return blksize;
566 * We don't know what the hardware sector size for this device is.
567 * Return 0 indicating that we don't know.
569 return 0;
572 void buffer_insert_inode_queue(struct buffer_head *bh, struct inode *inode)
574 spin_lock(&lru_list_lock);
575 if (bh->b_inode)
576 list_del(&bh->b_inode_buffers);
577 bh->b_inode = inode;
578 list_add(&bh->b_inode_buffers, &inode->i_dirty_buffers);
579 spin_unlock(&lru_list_lock);
582 /* The caller must have the lru_list lock before calling the
583 remove_inode_queue functions. */
584 static void __remove_inode_queue(struct buffer_head *bh)
586 bh->b_inode = NULL;
587 list_del(&bh->b_inode_buffers);
590 static inline void remove_inode_queue(struct buffer_head *bh)
592 if (bh->b_inode)
593 __remove_inode_queue(bh);
596 int inode_has_buffers(struct inode *inode)
598 int ret;
600 spin_lock(&lru_list_lock);
601 ret = !list_empty(&inode->i_dirty_buffers);
602 spin_unlock(&lru_list_lock);
604 return ret;
608 /* If invalidate_buffers() will trash dirty buffers, it means some kind
609 of fs corruption is going on. Trashing dirty data always implies losing
610 information that was supposed to have been stored on the physical layer
611 by the user.
613 Thus invalidate_buffers in general usage is not allowed to trash dirty
614 buffers. For example, ioctl(BLKFLSBUF) expects dirty data to be preserved.
616 NOTE: in the case where the user removes a removable-media disk while
617 there's still dirty data not synced to disk (due to a bug in the device driver
618 or to an error by the user), by not destroying the dirty buffers we could
619 also corrupt the next media inserted; thus a parameter is
620 necessary to handle this case in the safest way possible (trying
621 not to corrupt the newly inserted disk with data belonging to
622 the old, now corrupted one). For the ramdisk, the natural thing
623 to do in order to release its memory is to destroy the dirty buffers.
625 These are the two special cases. Normal usage implies that the device driver
626 issues a sync on the device (without waiting for I/O completion) and
627 then an invalidate_buffers call that doesn't trash dirty buffers. */
628 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
630 int i, nlist, slept;
631 struct buffer_head * bh, * bh_next;
633 retry:
634 slept = 0;
635 spin_lock(&lru_list_lock);
636 for(nlist = 0; nlist < NR_LIST; nlist++) {
637 bh = lru_list[nlist];
638 if (!bh)
639 continue;
640 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
641 bh_next = bh->b_next_free;
642 if (bh->b_dev != dev)
643 continue;
644 if (buffer_locked(bh)) {
645 atomic_inc(&bh->b_count);
646 spin_unlock(&lru_list_lock);
647 wait_on_buffer(bh);
648 slept = 1;
649 spin_lock(&lru_list_lock);
650 atomic_dec(&bh->b_count);
653 write_lock(&hash_table_lock);
654 if (!atomic_read(&bh->b_count) &&
655 (destroy_dirty_buffers || !buffer_dirty(bh))) {
656 remove_inode_queue(bh);
657 __remove_from_queues(bh);
658 put_last_free(bh);
660 /* else complain loudly? */
662 write_unlock(&hash_table_lock);
663 if (slept)
664 goto out;
667 out:
668 spin_unlock(&lru_list_lock);
669 if (slept)
670 goto retry;
673 void set_blocksize(kdev_t dev, int size)
675 extern int *blksize_size[];
676 int i, nlist, slept;
677 struct buffer_head * bh, * bh_next;
679 if (!blksize_size[MAJOR(dev)])
680 return;
682 /* Size must be a power of two, and between 512 and PAGE_SIZE */
683 if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
684 panic("Invalid blocksize passed to set_blocksize");
686 if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
687 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
688 return;
690 if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
691 return;
692 sync_buffers(dev, 2);
693 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
695 retry:
696 slept = 0;
697 spin_lock(&lru_list_lock);
698 for(nlist = 0; nlist < NR_LIST; nlist++) {
699 bh = lru_list[nlist];
700 if (!bh)
701 continue;
702 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
703 bh_next = bh->b_next_free;
704 if (bh->b_dev != dev || bh->b_size == size)
705 continue;
706 if (buffer_locked(bh)) {
707 atomic_inc(&bh->b_count);
708 spin_unlock(&lru_list_lock);
709 wait_on_buffer(bh);
710 slept = 1;
711 spin_lock(&lru_list_lock);
712 atomic_dec(&bh->b_count);
715 write_lock(&hash_table_lock);
716 if (!atomic_read(&bh->b_count)) {
717 if (buffer_dirty(bh))
718 printk(KERN_WARNING
719 "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
720 kdevname(dev), bh->b_blocknr, bh->b_size);
721 remove_inode_queue(bh);
722 __remove_from_queues(bh);
723 put_last_free(bh);
724 } else {
725 if (atomic_set_buffer_clean(bh))
726 __refile_buffer(bh);
727 clear_bit(BH_Uptodate, &bh->b_state);
728 printk(KERN_WARNING
729 "set_blocksize: "
730 "b_count %d, dev %s, block %lu, from %p\n",
731 atomic_read(&bh->b_count), bdevname(bh->b_dev),
732 bh->b_blocknr, __builtin_return_address(0));
734 write_unlock(&hash_table_lock);
735 if (slept)
736 goto out;
739 out:
740 spin_unlock(&lru_list_lock);
741 if (slept)
742 goto retry;
746 * We used to try various strange things. Let's not.
748 static void refill_freelist(int size)
750 if (!grow_buffers(size))
751 wakeup_bdflush(1); /* Sets task->state to TASK_RUNNING */
754 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
756 bh->b_list = BUF_CLEAN;
757 bh->b_end_io = handler;
758 bh->b_private = private;
761 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
763 mark_buffer_uptodate(bh, uptodate);
764 unlock_buffer(bh);
767 static void end_buffer_io_bad(struct buffer_head *bh, int uptodate)
769 mark_buffer_uptodate(bh, uptodate);
770 unlock_buffer(bh);
771 BUG();
774 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
776 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
777 unsigned long flags;
778 struct buffer_head *tmp;
779 struct page *page;
781 mark_buffer_uptodate(bh, uptodate);
783 /* This is a temporary buffer used for page I/O. */
784 page = bh->b_page;
786 if (!uptodate)
787 SetPageError(page);
790 * Be _very_ careful from here on. Bad things can happen if
791 * two buffer heads end IO at almost the same time and both
792 * decide that the page is now completely done.
794 * Async buffer_heads are here only as labels for IO, and get
795 * thrown away once the IO for this page is complete. IO is
796 * deemed complete once all buffers have been visited
797 * (b_count==0) and are now unlocked. We must make sure that
798 * only the _last_ buffer that decrements its count is the one
799 * that unlocks the page..
801 spin_lock_irqsave(&page_uptodate_lock, flags);
802 unlock_buffer(bh);
803 atomic_dec(&bh->b_count);
804 tmp = bh->b_this_page;
805 while (tmp != bh) {
806 if (tmp->b_end_io == end_buffer_io_async && buffer_locked(tmp))
807 goto still_busy;
808 tmp = tmp->b_this_page;
811 /* OK, the async IO on this page is complete. */
812 spin_unlock_irqrestore(&page_uptodate_lock, flags);
815 * if none of the buffers had errors then we can set the
816 * page uptodate:
818 if (!PageError(page))
819 SetPageUptodate(page);
822 * Run the hooks that have to be done when a page I/O has completed.
824 if (PageTestandClearDecrAfter(page))
825 atomic_dec(&nr_async_pages);
827 UnlockPage(page);
829 return;
831 still_busy:
832 spin_unlock_irqrestore(&page_uptodate_lock, flags);
833 return;
837 * Synchronise all the inode's dirty buffers to the disk.
839 * We have conflicting pressures: we want to make sure that all
840 * initially dirty buffers get waited on, but that any subsequently
841 * dirtied buffers don't. After all, we don't want fsync to last
842 * forever if somebody is actively writing to the file.
844 * Do this in two main stages: first we copy dirty buffers to a
845 * temporary inode list, queueing the writes as we go. Then we clean
846 * up, waiting for those writes to complete.
848 * During this second stage, any subsequent updates to the file may end
849 * up refiling the buffer on the original inode's dirty list again, so
850 * there is a chance we will end up with a buffer queued for write but
851 * not yet completed on that list. So, as a final cleanup we go through
852 * the osync code to catch these locked, dirty buffers without requeuing
853 * any newly dirty buffers for write.
856 int fsync_inode_buffers(struct inode *inode)
858 struct buffer_head *bh;
859 struct inode tmp;
860 int err = 0, err2;
862 INIT_LIST_HEAD(&tmp.i_dirty_buffers);
864 spin_lock(&lru_list_lock);
866 while (!list_empty(&inode->i_dirty_buffers)) {
867 bh = BH_ENTRY(inode->i_dirty_buffers.next);
868 list_del(&bh->b_inode_buffers);
869 if (!buffer_dirty(bh) && !buffer_locked(bh))
870 bh->b_inode = NULL;
871 else {
872 bh->b_inode = &tmp;
873 list_add(&bh->b_inode_buffers, &tmp.i_dirty_buffers);
874 atomic_inc(&bh->b_count);
875 if (buffer_dirty(bh)) {
876 spin_unlock(&lru_list_lock);
877 ll_rw_block(WRITE, 1, &bh);
878 spin_lock(&lru_list_lock);
883 while (!list_empty(&tmp.i_dirty_buffers)) {
884 bh = BH_ENTRY(tmp.i_dirty_buffers.prev);
885 remove_inode_queue(bh);
886 spin_unlock(&lru_list_lock);
887 wait_on_buffer(bh);
888 if (!buffer_uptodate(bh))
889 err = -EIO;
890 brelse(bh);
891 spin_lock(&lru_list_lock);
894 spin_unlock(&lru_list_lock);
895 err2 = osync_inode_buffers(inode);
897 if (err)
898 return err;
899 else
900 return err2;
905 * osync is designed to support O_SYNC io. It waits synchronously for
906 * all already-submitted IO to complete, but does not queue any new
907 * writes to the disk.
909 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
910 * you dirty the buffers, and then use osync_inode_buffers to wait for
911 * completion. Any other dirty buffers which are not yet queued for
912 * write will not be flushed to disk by the osync.
915 int osync_inode_buffers(struct inode *inode)
917 struct buffer_head *bh;
918 struct list_head *list;
919 int err = 0;
921 spin_lock(&lru_list_lock);
923 repeat:
925 for (list = inode->i_dirty_buffers.prev;
926 bh = BH_ENTRY(list), list != &inode->i_dirty_buffers;
927 list = bh->b_inode_buffers.prev) {
928 if (buffer_locked(bh)) {
929 atomic_inc(&bh->b_count);
930 spin_unlock(&lru_list_lock);
931 wait_on_buffer(bh);
932 brelse(bh);
933 if (!buffer_uptodate(bh))
934 err = -EIO;
935 spin_lock(&lru_list_lock);
936 goto repeat;
940 spin_unlock(&lru_list_lock);
941 return err;
946 * Invalidate any and all dirty buffers on a given inode. We are
947 * probably unmounting the fs, but that doesn't mean we have already
948 * done a sync(). Just drop the buffers from the inode list.
951 void invalidate_inode_buffers(struct inode *inode)
953 struct list_head *list, *next;
955 spin_lock(&lru_list_lock);
956 list = inode->i_dirty_buffers.next;
957 while (list != &inode->i_dirty_buffers) {
958 next = list->next;
959 remove_inode_queue(BH_ENTRY(list));
960 list = next;
962 spin_unlock(&lru_list_lock);
967 * Ok, this is getblk, and it isn't very clear, again to hinder
968 * race-conditions. Most of the code is seldom used, (ie repeating),
969 * so it should be much more efficient than it looks.
971 * The algorithm is changed: hopefully better, and an elusive bug removed.
973 * 14.02.92: changed it to sync dirty buffers a bit: better performance
974 * when the filesystem starts to get full of dirty blocks (I hope).
976 struct buffer_head * getblk(kdev_t dev, int block, int size)
978 struct buffer_head * bh;
979 int isize;
981 repeat:
982 spin_lock(&lru_list_lock);
983 write_lock(&hash_table_lock);
984 bh = __get_hash_table(dev, block, size);
985 if (bh)
986 goto out;
988 isize = BUFSIZE_INDEX(size);
989 spin_lock(&free_list[isize].lock);
990 bh = free_list[isize].list;
991 if (bh) {
992 __remove_from_free_list(bh, isize);
993 atomic_set(&bh->b_count, 1);
995 spin_unlock(&free_list[isize].lock);
998 * OK, FINALLY we know that this buffer is the only one of
999 * its kind, we hold a reference (b_count>0), it is unlocked,
1000 * and it is clean.
1002 if (bh) {
1003 init_buffer(bh, end_buffer_io_sync, NULL);
1004 bh->b_dev = dev;
1005 bh->b_blocknr = block;
1006 bh->b_state = 1 << BH_Mapped;
1008 /* Insert the buffer into the regular lists */
1009 __insert_into_queues(bh);
1010 out:
1011 write_unlock(&hash_table_lock);
1012 spin_unlock(&lru_list_lock);
1013 touch_buffer(bh);
1014 return bh;
1018 * If we block while refilling the free list, somebody may
1019 * create the buffer first ... search the hashes again.
1021 write_unlock(&hash_table_lock);
1022 spin_unlock(&lru_list_lock);
1023 refill_freelist(size);
1024 goto repeat;
1027 /* -1 -> no need to flush
1028 0 -> async flush
1029 1 -> sync flush (wait for I/O completion) */
1030 int balance_dirty_state(kdev_t dev)
1032 unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
1033 int shortage;
1035 dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
1036 tot = nr_free_buffer_pages();
1038 dirty *= 200;
1039 soft_dirty_limit = tot * bdf_prm.b_un.nfract;
1040 hard_dirty_limit = soft_dirty_limit * 2;
1042 /* First, check for the "real" dirty limit. */
1043 if (dirty > soft_dirty_limit) {
1044 if (dirty > hard_dirty_limit)
1045 return 1;
1046 return 0;
1050 * If we are about to get low on free pages and
1051 * cleaning the inactive_dirty pages would help
1052 * fix this, wake up bdflush.
1054 shortage = free_shortage();
1055 if (shortage && nr_inactive_dirty_pages > shortage &&
1056 nr_inactive_dirty_pages > freepages.high)
1057 return 0;
1059 return -1;
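/* Worked example with the default nfract of 40: since "dirty" is scaled
 * by 200 above, the soft limit is crossed once dirty buffer pages exceed
 * 40/200 = 20% of nr_free_buffer_pages(), and the hard limit (sync
 * flush) at twice that, i.e. 40%. */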
1063 * if a new dirty buffer is created we need to balance bdflush.
1065 * in the future we might want to make bdflush aware of different
1066 * pressures on different devices - thus the (currently unused)
1067 * 'dev' parameter.
1069 void balance_dirty(kdev_t dev)
1071 int state = balance_dirty_state(dev);
1073 if (state < 0)
1074 return;
1075 wakeup_bdflush(state);
1078 static __inline__ void __mark_dirty(struct buffer_head *bh)
1080 bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
1081 refile_buffer(bh);
1084 /* atomic version, the user must call balance_dirty() by hand
1085 as soon as it becomes possible to block */
1086 void __mark_buffer_dirty(struct buffer_head *bh)
1088 if (!atomic_set_buffer_dirty(bh))
1089 __mark_dirty(bh);
1092 void mark_buffer_dirty(struct buffer_head *bh)
1094 __mark_buffer_dirty(bh);
1095 balance_dirty(bh->b_dev);
1099 * A buffer may need to be moved from one buffer list to another
1100 * (e.g. in case it is not shared any more). Handle this.
1102 static void __refile_buffer(struct buffer_head *bh)
1104 int dispose = BUF_CLEAN;
1105 if (buffer_locked(bh))
1106 dispose = BUF_LOCKED;
1107 if (buffer_dirty(bh))
1108 dispose = BUF_DIRTY;
1109 if (buffer_protected(bh))
1110 dispose = BUF_PROTECTED;
1111 if (dispose != bh->b_list) {
1112 __remove_from_lru_list(bh, bh->b_list);
1113 bh->b_list = dispose;
1114 if (dispose == BUF_CLEAN)
1115 remove_inode_queue(bh);
1116 __insert_into_lru_list(bh, dispose);
1120 void refile_buffer(struct buffer_head *bh)
1122 spin_lock(&lru_list_lock);
1123 __refile_buffer(bh);
1124 spin_unlock(&lru_list_lock);
1128 * Release a buffer head
1130 void __brelse(struct buffer_head * buf)
1132 if (atomic_read(&buf->b_count)) {
1133 atomic_dec(&buf->b_count);
1134 return;
1136 printk("VFS: brelse: Trying to free free buffer\n");
1140 * bforget() is like brelse(), except it puts the buffer on the
1141 * free list if it can.. We can NOT free the buffer if:
1142 * - there are other users of it
1143 * - it is locked and thus can have active IO
1145 void __bforget(struct buffer_head * buf)
1147 /* grab the lru lock here to block bdflush. */
1148 spin_lock(&lru_list_lock);
1149 write_lock(&hash_table_lock);
1150 if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf))
1151 goto in_use;
1152 __hash_unlink(buf);
1153 remove_inode_queue(buf);
1154 write_unlock(&hash_table_lock);
1155 __remove_from_lru_list(buf, buf->b_list);
1156 spin_unlock(&lru_list_lock);
1157 put_last_free(buf);
1158 return;
1160 in_use:
1161 write_unlock(&hash_table_lock);
1162 spin_unlock(&lru_list_lock);
1166 * bread() reads a specified block and returns the buffer that contains
1167 * it. It returns NULL if the block was unreadable.
1169 struct buffer_head * bread(kdev_t dev, int block, int size)
1171 struct buffer_head * bh;
1173 bh = getblk(dev, block, size);
1174 if (buffer_uptodate(bh))
1175 return bh;
1176 ll_rw_block(READ, 1, &bh);
1177 wait_on_buffer(bh);
1178 if (buffer_uptodate(bh))
1179 return bh;
1180 brelse(bh);
1181 return NULL;
1185 * Ok, breada can be used as bread, but additionally it schedules
1186 * read-ahead of the blocks that follow (at most NBUF buffers in total),
1187 * bounded by the given file size.
1190 #define NBUF 16
1192 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
1193 unsigned int pos, unsigned int filesize)
1195 struct buffer_head * bhlist[NBUF];
1196 unsigned int blocks;
1197 struct buffer_head * bh;
1198 int index;
1199 int i, j;
1201 if (pos >= filesize)
1202 return NULL;
1204 if (block < 0)
1205 return NULL;
1207 bh = getblk(dev, block, bufsize);
1208 index = BUFSIZE_INDEX(bh->b_size);
1210 if (buffer_uptodate(bh))
1211 return(bh);
1212 else ll_rw_block(READ, 1, &bh);
1214 blocks = (filesize - pos) >> (9+index);
1216 if (blocks > NBUF)
1217 blocks = NBUF;
1219 bhlist[0] = bh;
1220 j = 1;
1221 for(i=1; i<blocks; i++) {
1222 bh = getblk(dev,block+i,bufsize);
1223 if (buffer_uptodate(bh)) {
1224 brelse(bh);
1225 break;
1227 else bhlist[j++] = bh;
1230 /* Request the read for these buffers, and then release them. */
1231 if (j>1)
1232 ll_rw_block(READA, (j-1), bhlist+1);
1233 for(i=1; i<j; i++)
1234 brelse(bhlist[i]);
1236 /* Wait for this buffer, and then continue on. */
1237 bh = bhlist[0];
1238 wait_on_buffer(bh);
1239 if (buffer_uptodate(bh))
1240 return bh;
1241 brelse(bh);
1242 return NULL;
1246 * Note: the caller should wake up the buffer_wait list if needed.
1248 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
1250 if (bh->b_inode)
1251 BUG();
1252 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1253 kmem_cache_free(bh_cachep, bh);
1254 } else {
1255 bh->b_blocknr = -1;
1256 init_waitqueue_head(&bh->b_wait);
1257 nr_unused_buffer_heads++;
1258 bh->b_next_free = unused_list;
1259 bh->b_this_page = NULL;
1260 unused_list = bh;
1265 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1266 * no-buffer-head deadlock. Return NULL on failure; waiting for
1267 * buffer heads is now handled in create_buffers().
1269 static struct buffer_head * get_unused_buffer_head(int async)
1271 struct buffer_head * bh;
1273 spin_lock(&unused_list_lock);
1274 if (nr_unused_buffer_heads > NR_RESERVED) {
1275 bh = unused_list;
1276 unused_list = bh->b_next_free;
1277 nr_unused_buffer_heads--;
1278 spin_unlock(&unused_list_lock);
1279 return bh;
1281 spin_unlock(&unused_list_lock);
1283 /* This is critical. We can't swap out pages to get
1284 * more buffer heads, because the swap-out may need
1285 * more buffer-heads itself. Thus SLAB_BUFFER.
1287 if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
1288 memset(bh, 0, sizeof(*bh));
1289 init_waitqueue_head(&bh->b_wait);
1290 return bh;
1294 * If we need an async buffer, use the reserved buffer heads.
1296 if (async) {
1297 spin_lock(&unused_list_lock);
1298 if (unused_list) {
1299 bh = unused_list;
1300 unused_list = bh->b_next_free;
1301 nr_unused_buffer_heads--;
1302 spin_unlock(&unused_list_lock);
1303 return bh;
1305 spin_unlock(&unused_list_lock);
1307 #if 0
1309 * (Pending further analysis ...)
1310 * Ordinary (non-async) requests can use a different memory priority
1311 * to free up pages. Any swapping thus generated will use async
1312 * buffer heads.
1314 if(!async &&
1315 (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1316 memset(bh, 0, sizeof(*bh));
1317 init_waitqueue_head(&bh->b_wait);
1318 return bh;
1320 #endif
1322 return NULL;
1325 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1327 bh->b_page = page;
1328 if (offset >= PAGE_SIZE)
1329 BUG();
1330 if (PageHighMem(page))
1332 * This catches illegal uses and preserves the offset:
1334 bh->b_data = (char *)(0 + offset);
1335 else
1336 bh->b_data = page_address(page) + offset;
1340 * Create the appropriate buffers when given a page for data area and
1341 * the size of each buffer.. Use the bh->b_this_page linked list to
1342 * follow the buffers created. Return NULL if unable to create more
1343 * buffers.
1344 * The async flag is used to differentiate async IO (paging, swapping)
1345 * from ordinary buffer allocations, and only async requests are allowed
1346 * to sleep waiting for buffer heads.
1348 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1350 struct buffer_head *bh, *head;
1351 long offset;
1353 try_again:
1354 head = NULL;
1355 offset = PAGE_SIZE;
1356 while ((offset -= size) >= 0) {
1357 bh = get_unused_buffer_head(async);
1358 if (!bh)
1359 goto no_grow;
1361 bh->b_dev = B_FREE; /* Flag as unused */
1362 bh->b_this_page = head;
1363 head = bh;
1365 bh->b_state = 0;
1366 bh->b_next_free = NULL;
1367 bh->b_pprev = NULL;
1368 atomic_set(&bh->b_count, 0);
1369 bh->b_size = size;
1371 set_bh_page(bh, page, offset);
1373 bh->b_list = BUF_CLEAN;
1374 bh->b_end_io = end_buffer_io_bad;
1376 return head;
1378 * In case anything failed, we just free everything we got.
1380 no_grow:
1381 if (head) {
1382 spin_lock(&unused_list_lock);
1383 do {
1384 bh = head;
1385 head = head->b_this_page;
1386 __put_unused_buffer_head(bh);
1387 } while (head);
1388 spin_unlock(&unused_list_lock);
1390 /* Wake up any waiters ... */
1391 wake_up(&buffer_wait);
1395 * Return failure for non-async IO requests. Async IO requests
1396 * are not allowed to fail, so we have to wait until buffer heads
1397 * become available. But we don't want tasks sleeping with
1398 * partially complete buffers, so all were released above.
1400 if (!async)
1401 return NULL;
1403 /* We're _really_ low on memory. Now we just
1404 * wait for old buffer heads to become free due to
1405 * finishing IO. Since this is an async request and
1406 * the reserve list is empty, we're sure there are
1407 * async buffer heads in use.
1409 run_task_queue(&tq_disk);
1412 * Set our state for sleeping, then check again for buffer heads.
1413 * This ensures we won't miss a wake_up from an interrupt.
1415 wait_event(buffer_wait, nr_unused_buffer_heads >= MAX_BUF_PER_PAGE);
1416 goto try_again;
1419 static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size)
1421 struct buffer_head *head, *bh, *tail;
1422 int block;
1424 if (!PageLocked(page))
1425 BUG();
1427 * Allocate async buffer heads pointing to this page, just for I/O.
1428 * They don't show up in the buffer hash table, but they *are*
1429 * registered in page->buffers.
1431 head = create_buffers(page, size, 1);
1432 if (page->buffers)
1433 BUG();
1434 if (!head)
1435 BUG();
1436 tail = head;
1437 for (bh = head; bh; bh = bh->b_this_page) {
1438 block = *(b++);
1440 tail = bh;
1441 init_buffer(bh, end_buffer_io_async, NULL);
1442 bh->b_dev = dev;
1443 bh->b_blocknr = block;
1445 set_bit(BH_Mapped, &bh->b_state);
1447 tail->b_this_page = head;
1448 page_cache_get(page);
1449 page->buffers = head;
1450 return 0;
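/*
 * unmap_buffer() detaches a buffer from its on-disk block: the dirty bit
 * is cleared, any I/O in flight is waited for, and the Uptodate, Mapped,
 * Req and New bits are dropped, so the block being truncated away can be
 * freed safely.
 */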
1453 static void unmap_buffer(struct buffer_head * bh)
1455 if (buffer_mapped(bh)) {
1456 mark_buffer_clean(bh);
1457 wait_on_buffer(bh);
1458 clear_bit(BH_Uptodate, &bh->b_state);
1459 clear_bit(BH_Mapped, &bh->b_state);
1460 clear_bit(BH_Req, &bh->b_state);
1461 clear_bit(BH_New, &bh->b_state);
1466 * We don't have to release all buffers here, but
1467 * we have to be sure that no dirty buffer is left
1468 * and no IO is going on (no buffer is locked), because
1469 * we have truncated the file and are going to free the
1470 * blocks on-disk..
1472 int block_flushpage(struct page *page, unsigned long offset)
1474 struct buffer_head *head, *bh, *next;
1475 unsigned int curr_off = 0;
1477 if (!PageLocked(page))
1478 BUG();
1479 if (!page->buffers)
1480 return 1;
1482 head = page->buffers;
1483 bh = head;
1484 do {
1485 unsigned int next_off = curr_off + bh->b_size;
1486 next = bh->b_this_page;
1489 * is this block fully flushed?
1491 if (offset <= curr_off)
1492 unmap_buffer(bh);
1493 curr_off = next_off;
1494 bh = next;
1495 } while (bh != head);
1498 * subtle. We release buffer-heads only if this is
1499 * the 'final' flushpage. We have invalidated the get_block
1500 * cached value unconditionally, so real IO is not
1501 * possible anymore.
1503 * If the free doesn't work out, the buffers can be
1504 * left around - they just turn into anonymous buffers
1505 * instead.
1507 if (!offset) {
1508 if (!try_to_free_buffers(page, 0)) {
1509 atomic_inc(&buffermem_pages);
1510 return 0;
1514 return 1;
1517 static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
1519 struct buffer_head *bh, *head, *tail;
1521 head = create_buffers(page, blocksize, 1);
1522 if (page->buffers)
1523 BUG();
1525 bh = head;
1526 do {
1527 bh->b_dev = inode->i_dev;
1528 bh->b_blocknr = 0;
1529 bh->b_end_io = end_buffer_io_bad;
1530 tail = bh;
1531 bh = bh->b_this_page;
1532 } while (bh);
1533 tail->b_this_page = head;
1534 page->buffers = head;
1535 page_cache_get(page);
1539 * We are taking a block for data and we don't want any output from any
1540 * buffer-cache aliases starting from the return of this function and
1541 * until the moment when something explicitly marks the buffer
1542 * dirty (hopefully that will not happen until we free that block ;-)
1543 * We don't even need to mark it not-uptodate - nobody can expect
1544 * anything from a newly allocated buffer anyway. We used to use
1545 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1546 * don't want to mark the alias unmapped, for example - it would confuse
1547 * anyone who might pick it up with bread() afterwards...
1550 static void unmap_underlying_metadata(struct buffer_head * bh)
1552 struct buffer_head *old_bh;
1554 old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1555 if (old_bh) {
1556 mark_buffer_clean(old_bh);
1557 wait_on_buffer(old_bh);
1558 clear_bit(BH_Req, &old_bh->b_state);
1559 /* Here we could run brelse or bforget. We use
1560 bforget because it will try to put the buffer
1561 in the freelist. */
1562 __bforget(old_bh);
1567 * NOTE! All mapped/uptodate combinations are valid:
1569 * Mapped Uptodate Meaning
1571 * No No "unknown" - must do get_block()
1572 * No Yes "hole" - zero-filled
1573 * Yes No "allocated" - allocated on disk, not read in
1574 * Yes Yes "valid" - allocated and up-to-date in memory.
1576 * "Dirty" is valid only with the last case (mapped+uptodate).
1580 * block_write_full_page() is SMP-safe - currently it's still
1581 * being called with the kernel lock held, but the code is ready.
1583 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1585 int err, i, need_balance_dirty = 0;
1586 unsigned long block;
1587 struct buffer_head *bh, *head;
1589 if (!PageLocked(page))
1590 BUG();
1592 if (!page->buffers)
1593 create_empty_buffers(page, inode, inode->i_sb->s_blocksize);
1594 head = page->buffers;
1596 block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1598 bh = head;
1599 i = 0;
1600 do {
1602 * If the buffer isn't up-to-date, we can't be sure
1603 * that the buffer has been initialized with the proper
1604 * block number information etc..
1606 * Leave it to the low-level FS to make all those
1607 * decisions (block #0 may actually be a valid block)
1609 bh->b_end_io = end_buffer_io_sync;
1610 if (!buffer_mapped(bh)) {
1611 err = get_block(inode, block, bh, 1);
1612 if (err)
1613 goto out;
1614 if (buffer_new(bh))
1615 unmap_underlying_metadata(bh);
1617 set_bit(BH_Uptodate, &bh->b_state);
1618 if (!atomic_set_buffer_dirty(bh)) {
1619 buffer_insert_inode_queue(bh, inode);
1620 __mark_dirty(bh);
1621 need_balance_dirty = 1;
1624 bh = bh->b_this_page;
1625 block++;
1626 } while (bh != head);
1628 if (need_balance_dirty)
1629 balance_dirty(bh->b_dev);
1631 SetPageUptodate(page);
1632 return 0;
1633 out:
1634 ClearPageUptodate(page);
1635 return err;
1638 static int __block_prepare_write(struct inode *inode, struct page *page,
1639 unsigned from, unsigned to, get_block_t *get_block)
1641 unsigned block_start, block_end;
1642 unsigned long block;
1643 int err = 0;
1644 unsigned blocksize, bbits;
1645 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1646 char *kaddr = kmap(page);
1648 blocksize = inode->i_sb->s_blocksize;
1649 if (!page->buffers)
1650 create_empty_buffers(page, inode, blocksize);
1651 head = page->buffers;
1653 bbits = inode->i_sb->s_blocksize_bits;
1654 block = page->index << (PAGE_CACHE_SHIFT - bbits);
1656 for(bh = head, block_start = 0; bh != head || !block_start;
1657 block++, block_start=block_end, bh = bh->b_this_page) {
1658 if (!bh)
1659 BUG();
1660 block_end = block_start+blocksize;
1661 if (block_end <= from)
1662 continue;
1663 if (block_start >= to)
1664 break;
1665 bh->b_end_io = end_buffer_io_sync;
1666 if (!buffer_mapped(bh)) {
1667 err = get_block(inode, block, bh, 1);
1668 if (err)
1669 goto out;
1670 if (buffer_new(bh)) {
1671 unmap_underlying_metadata(bh);
1672 if (Page_Uptodate(page)) {
1673 set_bit(BH_Uptodate, &bh->b_state);
1674 continue;
1676 if (block_end > to)
1677 memset(kaddr+to, 0, block_end-to);
1678 if (block_start < from)
1679 memset(kaddr+block_start, 0, from-block_start);
1680 if (block_end > to || block_start < from)
1681 flush_dcache_page(page);
1682 continue;
1685 if (Page_Uptodate(page)) {
1686 set_bit(BH_Uptodate, &bh->b_state);
1687 continue;
1689 if (!buffer_uptodate(bh) &&
1690 (block_start < from || block_end > to)) {
1691 ll_rw_block(READ, 1, &bh);
1692 *wait_bh++=bh;
1696 * If we issued read requests - let them complete.
1698 while(wait_bh > wait) {
1699 wait_on_buffer(*--wait_bh);
1700 err = -EIO;
1701 if (!buffer_uptodate(*wait_bh))
1702 goto out;
1704 return 0;
1705 out:
1706 return err;
1709 static int __block_commit_write(struct inode *inode, struct page *page,
1710 unsigned from, unsigned to)
1712 unsigned block_start, block_end;
1713 int partial = 0, need_balance_dirty = 0;
1714 unsigned blocksize;
1715 struct buffer_head *bh, *head;
1717 blocksize = inode->i_sb->s_blocksize;
1719 for(bh = head = page->buffers, block_start = 0;
1720 bh != head || !block_start;
1721 block_start=block_end, bh = bh->b_this_page) {
1722 block_end = block_start + blocksize;
1723 if (block_end <= from || block_start >= to) {
1724 if (!buffer_uptodate(bh))
1725 partial = 1;
1726 } else {
1727 set_bit(BH_Uptodate, &bh->b_state);
1728 if (!atomic_set_buffer_dirty(bh)) {
1729 __mark_dirty(bh);
1730 buffer_insert_inode_queue(bh, inode);
1731 need_balance_dirty = 1;
1736 if (need_balance_dirty)
1737 balance_dirty(bh->b_dev);
1739 * If this is a partial write that happened to make all buffers
1740 * uptodate then we can optimize away a bogus readpage() for
1741 * the next read(). Here we 'discover' whether the page went
1742 * uptodate as a result of this (potentially partial) write.
1744 if (!partial)
1745 SetPageUptodate(page);
1746 return 0;
1750 * Generic "read page" function for block devices that have the normal
1751 * get_block functionality. This is most of the block device filesystems.
1752 * Reads the page asynchronously --- the unlock_buffer() and
1753 * mark_buffer_uptodate() functions propagate buffer state into the
1754 * page struct once IO has completed.
1756 int block_read_full_page(struct page *page, get_block_t *get_block)
1758 struct inode *inode = (struct inode*)page->mapping->host;
1759 unsigned long iblock, lblock;
1760 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1761 unsigned int blocksize, blocks;
1762 char *kaddr = NULL;
1763 int nr, i;
1765 if (!PageLocked(page))
1766 PAGE_BUG(page);
1767 blocksize = inode->i_sb->s_blocksize;
1768 if (!page->buffers)
1769 create_empty_buffers(page, inode, blocksize);
1770 head = page->buffers;
1772 blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits;
1773 iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1774 lblock = (inode->i_size+blocksize-1) >> inode->i_sb->s_blocksize_bits;
1775 bh = head;
1776 nr = 0;
1777 i = 0;
1779 do {
1780 if (buffer_uptodate(bh))
1781 continue;
1783 if (!buffer_mapped(bh)) {
1784 if (iblock < lblock) {
1785 if (get_block(inode, iblock, bh, 0))
1786 continue;
1788 if (!buffer_mapped(bh)) {
1789 if (!kaddr)
1790 kaddr = kmap(page);
1791 memset(kaddr + i*blocksize, 0, blocksize);
1792 flush_dcache_page(page);
1793 set_bit(BH_Uptodate, &bh->b_state);
1794 continue;
1798 init_buffer(bh, end_buffer_io_async, NULL);
1799 atomic_inc(&bh->b_count);
1800 arr[nr] = bh;
1801 nr++;
1802 } while (i++, iblock++, (bh = bh->b_this_page) != head);
1804 if (nr) {
1805 if (Page_Uptodate(page))
1806 BUG();
1807 ll_rw_block(READ, nr, arr);
1808 } else {
1810 * all buffers are uptodate - we can set the page
1811 * uptodate as well.
1813 SetPageUptodate(page);
1814 UnlockPage(page);
1816 if (kaddr)
1817 kunmap(page);
1818 return 0;
1822 * For moronic filesystems that do not allow holes in files.
1823 * We may have to extend the file.
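/*
 * In other words: before allowing a write at [offset, to) in this page,
 * every page between the current end of data (*bytes) and this one is
 * grabbed and zero-filled, and the gap between *bytes and "offset" on
 * this page is zeroed too, so the file never ends up with a hole.
 */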
1826 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1828 struct address_space *mapping = page->mapping;
1829 struct inode *inode = (struct inode*)mapping->host;
1830 struct page *new_page;
1831 unsigned long pgpos;
1832 long status;
1833 unsigned zerofrom;
1834 unsigned blocksize = inode->i_sb->s_blocksize;
1835 char *kaddr;
1837 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1838 status = -ENOMEM;
1839 new_page = grab_cache_page(mapping, pgpos);
1840 if (!new_page)
1841 goto out;
1842 /* we might sleep */
1843 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1844 UnlockPage(new_page);
1845 page_cache_release(new_page);
1846 continue;
1848 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1849 if (zerofrom & (blocksize-1)) {
1850 *bytes |= (blocksize-1);
1851 (*bytes)++;
1853 status = __block_prepare_write(inode, new_page, zerofrom,
1854 PAGE_CACHE_SIZE, get_block);
1855 if (status)
1856 goto out_unmap;
1857 kaddr = page_address(new_page);
1858 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1859 flush_dcache_page(new_page);
1860 __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
1861 kunmap(new_page);
1862 UnlockPage(new_page);
1863 page_cache_release(new_page);
1866 if (page->index < pgpos) {
1867 /* completely inside the area */
1868 zerofrom = offset;
1869 } else {
1870 /* page covers the boundary, find the boundary offset */
1871 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1873 /* if we are going to expand the file, the last block will be filled */
1874 if (to > zerofrom && (zerofrom & (blocksize-1))) {
1875 *bytes |= (blocksize-1);
1876 (*bytes)++;
1879 /* starting below the boundary? Nothing to zero out */
1880 if (offset <= zerofrom)
1881 zerofrom = offset;
1883 status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1884 if (status)
1885 goto out1;
1886 kaddr = page_address(page);
1887 if (zerofrom < offset) {
1888 memset(kaddr+zerofrom, 0, offset-zerofrom);
1889 flush_dcache_page(page);
1890 __block_commit_write(inode, page, zerofrom, offset);
1892 return 0;
1893 out1:
1894 ClearPageUptodate(page);
1895 kunmap(page);
1896 return status;
1898 out_unmap:
1899 ClearPageUptodate(new_page);
1900 kunmap(new_page);
1901 UnlockPage(new_page);
1902 page_cache_release(new_page);
1903 out:
1904 return status;
1907 int block_prepare_write(struct page *page, unsigned from, unsigned to,
1908 get_block_t *get_block)
1910 struct inode *inode = (struct inode*)page->mapping->host;
1911 int err = __block_prepare_write(inode, page, from, to, get_block);
1912 if (err) {
1913 ClearPageUptodate(page);
1914 kunmap(page);
1916 return err;
1919 int generic_commit_write(struct file *file, struct page *page,
1920 unsigned from, unsigned to)
1922 struct inode *inode = (struct inode*)page->mapping->host;
1923 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1924 __block_commit_write(inode,page,from,to);
1925 kunmap(page);
1926 if (pos > inode->i_size) {
1927 inode->i_size = pos;
1928 mark_inode_dirty(inode);
1930 return 0;
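/*
 * block_truncate_page() zeroes out the remainder of the block that
 * contains byte offset "from" (reading the block in first if needed and
 * marking it dirty), so that truncating a file does not leave stale data
 * visible in the final partial block.
 */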
1933 int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block)
1935 unsigned long index = from >> PAGE_CACHE_SHIFT;
1936 unsigned offset = from & (PAGE_CACHE_SIZE-1);
1937 unsigned blocksize, iblock, length, pos;
1938 struct inode *inode = (struct inode *)mapping->host;
1939 struct page *page;
1940 struct buffer_head *bh;
1941 int err;
1943 blocksize = inode->i_sb->s_blocksize;
1944 length = offset & (blocksize - 1);
1946 /* Block boundary? Nothing to do */
1947 if (!length)
1948 return 0;
1950 length = blocksize - length;
1951 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1953 page = grab_cache_page(mapping, index);
1954 err = PTR_ERR(page);
1955 if (IS_ERR(page))
1956 goto out;
1958 if (!page->buffers)
1959 create_empty_buffers(page, inode, blocksize);
1961 /* Find the buffer that contains "offset" */
1962 bh = page->buffers;
1963 pos = blocksize;
1964 while (offset >= pos) {
1965 bh = bh->b_this_page;
1966 iblock++;
1967 pos += blocksize;
1970 err = 0;
1971 if (!buffer_mapped(bh)) {
1972 /* Hole? Nothing to do */
1973 if (buffer_uptodate(bh))
1974 goto unlock;
1975 get_block(inode, iblock, bh, 0);
1976 /* Still unmapped? Nothing to do */
1977 if (!buffer_mapped(bh))
1978 goto unlock;
1981 /* Ok, it's mapped. Make sure it's up-to-date */
1982 if (Page_Uptodate(page))
1983 set_bit(BH_Uptodate, &bh->b_state);
1985 bh->b_end_io = end_buffer_io_sync;
1986 if (!buffer_uptodate(bh)) {
1987 err = -EIO;
1988 ll_rw_block(READ, 1, &bh);
1989 wait_on_buffer(bh);
1990 /* Uhhuh. Read error. Complain and punt. */
1991 if (!buffer_uptodate(bh))
1992 goto unlock;
1995 memset(kmap(page) + offset, 0, length);
1996 flush_dcache_page(page);
1997 kunmap(page);
1999 mark_buffer_dirty(bh);
2000 err = 0;
2002 unlock:
2003 UnlockPage(page);
2004 page_cache_release(page);
2005 out:
2006 return err;
2009 int block_write_full_page(struct page *page, get_block_t *get_block)
2011 struct inode *inode = (struct inode*)page->mapping->host;
2012 unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
2013 unsigned offset;
2014 int err;
2016 /* easy case */
2017 if (page->index < end_index)
2018 return __block_write_full_page(inode, page, get_block);
2020 /* things got complicated... */
2021 offset = inode->i_size & (PAGE_CACHE_SIZE-1);
2022 /* OK, are we completely out? */
2023 if (page->index >= end_index+1 || !offset)
2024 return -EIO;
2025 /* Sigh... will have to work, then... */
2026 err = __block_prepare_write(inode, page, 0, offset, get_block);
2027 if (!err) {
2028 memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset);
2029 flush_dcache_page(page);
2030 __block_commit_write(inode,page,0,offset);
2031 done:
2032 kunmap(page);
2033 return err;
2035 ClearPageUptodate(page);
2036 goto done;
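/*
 * generic_block_bmap() resolves a logical file block to its on-disk
 * block number by probing the filesystem's get_block() with create == 0
 * on a throwaway buffer_head; an unmapped block (a hole) comes back as 0.
 */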
2039 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
2041 struct buffer_head tmp;
2042 struct inode *inode = (struct inode*)mapping->host;
2043 tmp.b_state = 0;
2044 tmp.b_blocknr = 0;
2045 get_block(inode, block, &tmp, 0);
2046 return tmp.b_blocknr;
2050 * IO completion routine for a buffer_head being used for kiobuf IO: we
2051 * can't dispatch the kiobuf callback until io_count reaches 0.
2054 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
2056 struct kiobuf *kiobuf;
2058 mark_buffer_uptodate(bh, uptodate);
2060 kiobuf = bh->b_private;
2061 unlock_buffer(bh);
2062 end_kio_request(kiobuf, uptodate);
2067 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
2068 * for them to complete. Clean up the buffer_heads afterwards.
2071 static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size)
2073 int iosize;
2074 int i;
2075 struct buffer_head *tmp;
2078 iosize = 0;
2079 spin_lock(&unused_list_lock);
2081 for (i = nr; --i >= 0; ) {
2082 iosize += size;
2083 tmp = bh[i];
2084 if (buffer_locked(tmp)) {
2085 spin_unlock(&unused_list_lock);
2086 wait_on_buffer(tmp);
2087 spin_lock(&unused_list_lock);
2090 if (!buffer_uptodate(tmp)) {
2091 /* We are traversing bh'es in reverse order so
2092 clearing iosize on error calculates the
2093 amount of IO before the first error. */
2094 iosize = 0;
2096 __put_unused_buffer_head(tmp);
2099 spin_unlock(&unused_list_lock);
2101 return iosize;
2105 * Start I/O on a physical range of kernel memory, defined by a vector
2106 * of kiobuf structs (much like a user-space iovec list).
2108 * The kiobuf must already be locked for IO. IO is submitted
2109 * asynchronously: you need to check page->locked, page->uptodate, and
2110 * maybe wait on page->wait.
2112 * It is up to the caller to make sure that there are enough blocks
2113 * passed in to completely map the iobufs to disk.
2116 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
2117 kdev_t dev, unsigned long b[], int size)
2119 int err;
2120 int length;
2121 int transferred;
2122 int i;
2123 int bufind;
2124 int pageind;
2125 int bhind;
2126 int offset;
2127 int sectors = size>>9;
2128 unsigned long blocknr;
2129 struct kiobuf * iobuf = NULL;
2130 struct page * map;
2131 struct buffer_head *tmp, *bh[KIO_MAX_SECTORS];
2133 if (!nr)
2134 return 0;
2137 * First, do some alignment and validity checks
2139 for (i = 0; i < nr; i++) {
2140 iobuf = iovec[i];
2141 if ((iobuf->offset & (size-1)) ||
2142 (iobuf->length & (size-1)))
2143 return -EINVAL;
2144 if (!iobuf->nr_pages)
2145 panic("brw_kiovec: iobuf not initialised");
2149 * OK to walk down the iovec doing page IO on each page we find.
2151 bufind = bhind = transferred = err = 0;
2152 for (i = 0; i < nr; i++) {
2153 iobuf = iovec[i];
2154 offset = iobuf->offset;
2155 length = iobuf->length;
2156 iobuf->errno = 0;
2158 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
2159 map = iobuf->maplist[pageind];
2160 if (!map) {
2161 err = -EFAULT;
2162 goto error;
2165 while (length > 0) {
2166 blocknr = b[bufind++];
2167 tmp = get_unused_buffer_head(0);
2168 if (!tmp) {
2169 err = -ENOMEM;
2170 goto error;
2173 tmp->b_dev = B_FREE;
2174 tmp->b_size = size;
2175 set_bh_page(tmp, map, offset);
2176 tmp->b_this_page = tmp;
2178 init_buffer(tmp, end_buffer_io_kiobuf, iobuf);
2179 tmp->b_rdev = tmp->b_dev = dev;
2180 tmp->b_blocknr = blocknr;
2181 tmp->b_rsector = blocknr*sectors;
2182 tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req);
2184 if (rw == WRITE) {
2185 set_bit(BH_Uptodate, &tmp->b_state);
2186 set_bit(BH_Dirty, &tmp->b_state);
2189 bh[bhind++] = tmp;
2190 length -= size;
2191 offset += size;
2193 atomic_inc(&iobuf->io_count);
2195 generic_make_request(rw, tmp);
2197 * Wait for IO if we have got too much
2199 if (bhind >= KIO_MAX_SECTORS) {
2200 err = wait_kio(rw, bhind, bh, size);
2201 if (err >= 0)
2202 transferred += err;
2203 else
2204 goto finished;
2205 bhind = 0;
2208 if (offset >= PAGE_SIZE) {
2209 offset = 0;
2210 break;
2212 } /* End of block loop */
2213 } /* End of page loop */
2214 } /* End of iovec loop */
2216 /* Is there any IO still left to submit? */
2217 if (bhind) {
2218 err = wait_kio(rw, bhind, bh, size);
2219 if (err >= 0)
2220 transferred += err;
2221 else
2222 goto finished;
2225 finished:
2226 if (transferred)
2227 return transferred;
2228 return err;
2230 error:
2231 /* We got an error allocating the bh'es. Just free the current
2232 buffer_heads and exit. */
2233 spin_lock(&unused_list_lock);
2234 for (i = bhind; --i >= 0; ) {
2235 __put_unused_buffer_head(bh[i]);
2237 spin_unlock(&unused_list_lock);
2238 goto finished;
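/*
 * Editor's illustrative sketch (not part of the original source): a
 * hypothetical caller of brw_kiovec().  The kiobuf is assumed to have been
 * allocated and mapped already (e.g. via alloc_kiovec()/map_user_kiobuf())
 * and the block numbers in blk[] to have been resolved by the caller;
 * brw_kiovec() returns the number of bytes transferred or a negative errno.
 */
#if 0
static int example_kio_read(struct kiobuf *iobuf, kdev_t dev,
			    unsigned long blk[])
{
	/* one kiobuf, 512-byte blocks */
	return brw_kiovec(READ, 1, &iobuf, dev, blk, 512);
}
#endif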
2242 * Start I/O on a page.
2243 * This function expects the page to be locked and may return
2244 * before I/O is complete. You then have to check page->locked,
2245 * page->uptodate, and maybe wait on page->wait.
2247 * brw_page() is SMP-safe, although it's currently called with the
2248 * kernel lock held - the code itself is ready.
2250 * FIXME: we need a swapper_inode->get_block function to remove
2251 * some of the bmap kludges and interface ugliness here.
2253 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
2255 struct buffer_head *head, *bh, *arr[MAX_BUF_PER_PAGE];
2256 int nr, fresh /* temporary debugging flag */, block;
2258 if (!PageLocked(page))
2259 panic("brw_page: page not locked for I/O");
2260 // ClearPageError(page);
2262 * We pretty much rely on the page lock for this, because
2263 * create_page_buffers() might sleep.
2265 fresh = 0;
2266 if (!page->buffers) {
2267 create_page_buffers(rw, page, dev, b, size);
2268 fresh = 1;
2270 if (!page->buffers)
2271 BUG();
2273 head = page->buffers;
2274 bh = head;
2275 nr = 0;
2276 do {
2277 block = *(b++);
2279 if (fresh && (atomic_read(&bh->b_count) != 0))
2280 BUG();
2281 if (rw == READ) {
2282 if (!fresh)
2283 BUG();
2284 if (!buffer_uptodate(bh)) {
2285 arr[nr++] = bh;
2286 atomic_inc(&bh->b_count);
2288 } else { /* WRITE */
2289 if (!bh->b_blocknr) {
2290 if (!block)
2291 BUG();
2292 bh->b_blocknr = block;
2293 } else {
2294 if (!block)
2295 BUG();
2297 set_bit(BH_Uptodate, &bh->b_state);
2298 set_bit(BH_Dirty, &bh->b_state);
2299 arr[nr++] = bh;
2300 atomic_inc(&bh->b_count);
2302 bh = bh->b_this_page;
2303 } while (bh != head);
2304 if ((rw == READ) && nr) {
2305 if (Page_Uptodate(page))
2306 BUG();
2307 ll_rw_block(rw, nr, arr);
2308 } else {
2309 if (!nr && rw == READ) {
2310 SetPageUptodate(page);
2311 UnlockPage(page);
2313 if (nr && (rw == WRITE))
2314 ll_rw_block(rw, nr, arr);
2316 return 0;
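/*
 * Editor's illustrative sketch (not part of the original source): a
 * hypothetical caller reading a single page from "dev".  brw_page() expects
 * the page to be locked and one block number per "size"-byte buffer
 * (PAGE_SIZE/size entries in blocks[]); it may return before I/O completes.
 * The page is assumed to be freshly allocated and owned by the caller.
 */
#if 0
static int example_read_page(struct page *page, kdev_t dev,
			     int blocks[], int size)
{
	LockPage(page);		/* brw_page() wants the page locked */
	return brw_page(READ, page, dev, blocks, size);
}
#endif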
2319 int block_symlink(struct inode *inode, const char *symname, int len)
2321 struct address_space *mapping = inode->i_mapping;
2322 struct page *page = grab_cache_page(mapping, 0);
2323 int err = -ENOMEM;
2324 char *kaddr;
2326 if (!page)
2327 goto fail;
2328 err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2329 if (err)
2330 goto fail_map;
2331 kaddr = page_address(page);
2332 memcpy(kaddr, symname, len-1);
2333 mapping->a_ops->commit_write(NULL, page, 0, len-1);
2335 * Notice that we are _not_ going to block here - the end of the page is
2336 * unmapped, so this will only try to map the rest of the page, see
2337 * that it is unmapped (typically it will not even look into the inode -
2338 * ->i_size will be enough for everything) and zero it out.
2339 * OTOH it's obviously correct and should make the page up-to-date.
2341 err = mapping->a_ops->readpage(NULL, page);
2342 wait_on_page(page);
2343 page_cache_release(page);
2344 if (err < 0)
2345 goto fail;
2346 mark_inode_dirty(inode);
2347 return 0;
2348 fail_map:
2349 UnlockPage(page);
2350 page_cache_release(page);
2351 fail:
2352 return err;
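/*
 * Editor's illustrative sketch (not part of the original source): a
 * hypothetical filesystem symlink() method handing the target string to
 * block_symlink() (the trailing NUL is included in the length, as the
 * len-1 arithmetic above expects).  "myfs_symlink", "myfs_new_inode" and
 * "myfs_aops" are invented names; dentry instantiation and error handling
 * around inode creation are omitted.
 */
#if 0
static int myfs_symlink(struct inode *dir, struct dentry *dentry,
			const char *symname)
{
	struct inode *inode = myfs_new_inode(dir);	/* hypothetical helper */

	if (!inode)
		return -ENOSPC;
	inode->i_op = &page_symlink_inode_operations;
	inode->i_mapping->a_ops = &myfs_aops;		/* hypothetical a_ops */
	return block_symlink(inode, symname, strlen(symname) + 1);
}
#endif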
2356 * Try to increase the number of buffers available: the size argument
2357 * is used to determine what kind of buffers we want.
2359 static int grow_buffers(int size)
2361 struct page * page;
2362 struct buffer_head *bh, *tmp;
2363 struct buffer_head * insert_point;
2364 int isize;
2366 if ((size & 511) || (size > PAGE_SIZE)) {
2367 printk("VFS: grow_buffers: size = %d\n",size);
2368 return 0;
2371 page = alloc_page(GFP_BUFFER);
2372 if (!page)
2373 goto out;
2374 LockPage(page);
2375 bh = create_buffers(page, size, 0);
2376 if (!bh)
2377 goto no_buffer_head;
2379 isize = BUFSIZE_INDEX(size);
2381 spin_lock(&free_list[isize].lock);
2382 insert_point = free_list[isize].list;
2383 tmp = bh;
2384 while (1) {
2385 if (insert_point) {
2386 tmp->b_next_free = insert_point->b_next_free;
2387 tmp->b_prev_free = insert_point;
2388 insert_point->b_next_free->b_prev_free = tmp;
2389 insert_point->b_next_free = tmp;
2390 } else {
2391 tmp->b_prev_free = tmp;
2392 tmp->b_next_free = tmp;
2394 insert_point = tmp;
2395 if (tmp->b_this_page)
2396 tmp = tmp->b_this_page;
2397 else
2398 break;
2400 tmp->b_this_page = bh;
2401 free_list[isize].list = bh;
2402 spin_unlock(&free_list[isize].lock);
2404 page->buffers = bh;
2405 page->flags &= ~(1 << PG_referenced);
2406 lru_cache_add(page);
2407 UnlockPage(page);
2408 atomic_inc(&buffermem_pages);
2409 return 1;
2411 no_buffer_head:
2412 UnlockPage(page);
2413 page_cache_release(page);
2414 out:
2415 return 0;
2419 * Sync all the buffers on one page..
2421 * If we have old buffers that are locked, we'll
2422 * wait on them, but we won't wait on the new ones
2423 * we're writing out now.
2425 * This all is required so that we can free up memory
2426 * later.
2428 * Wait:
2429 * 0 - no wait (this does not get called - see try_to_free_buffers below)
2430 * 1 - start IO for dirty buffers
2431 * 2 - wait for completion of locked buffers
2433 static void sync_page_buffers(struct buffer_head *bh, int wait)
2435 struct buffer_head * tmp = bh;
2437 do {
2438 struct buffer_head *p = tmp;
2439 tmp = tmp->b_this_page;
2440 if (buffer_locked(p)) {
2441 if (wait > 1)
2442 __wait_on_buffer(p);
2443 } else if (buffer_dirty(p))
2444 ll_rw_block(WRITE, 1, &p);
2445 } while (tmp != bh);
2449 * Can the buffer be thrown out?
2451 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
2452 #define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2455 * try_to_free_buffers() checks if all the buffers on this particular page
2456 * are unused, and frees the page if so.
2458 * Wake up bdflush() if this fails - if we're running low on memory due
2459 * to dirty buffers, we need to flush them out as quickly as possible.
2461 * NOTE: There are quite a number of ways that threads of control can
2462 * obtain a reference to a buffer head within a page. So we must
2463 * lock out all of these paths to cleanly toss the page.
2465 int try_to_free_buffers(struct page * page, int wait)
2467 struct buffer_head * tmp, * bh = page->buffers;
2468 int index = BUFSIZE_INDEX(bh->b_size);
2469 int loop = 0;
2471 cleaned_buffers_try_again:
2472 spin_lock(&lru_list_lock);
2473 write_lock(&hash_table_lock);
2474 spin_lock(&free_list[index].lock);
2475 tmp = bh;
2476 do {
2477 struct buffer_head *p = tmp;
2479 tmp = tmp->b_this_page;
2480 if (buffer_busy(p))
2481 goto busy_buffer_page;
2482 } while (tmp != bh);
2484 spin_lock(&unused_list_lock);
2485 tmp = bh;
2486 do {
2487 struct buffer_head * p = tmp;
2488 tmp = tmp->b_this_page;
2490 /* The buffer can be either on the regular
2491 * queues or on the free list..
2493 if (p->b_dev != B_FREE) {
2494 remove_inode_queue(p);
2495 __remove_from_queues(p);
2496 } else
2497 __remove_from_free_list(p, index);
2498 __put_unused_buffer_head(p);
2499 } while (tmp != bh);
2500 spin_unlock(&unused_list_lock);
2502 /* Wake up anyone waiting for buffer heads */
2503 wake_up(&buffer_wait);
2505 /* And free the page */
2506 page->buffers = NULL;
2507 page_cache_release(page);
2508 spin_unlock(&free_list[index].lock);
2509 write_unlock(&hash_table_lock);
2510 spin_unlock(&lru_list_lock);
2511 return 1;
2513 busy_buffer_page:
2514 /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2515 spin_unlock(&free_list[index].lock);
2516 write_unlock(&hash_table_lock);
2517 spin_unlock(&lru_list_lock);
2518 if (wait) {
2519 sync_page_buffers(bh, wait);
2520 /* We waited synchronously, so we can free the buffers. */
2521 if (wait > 1 && !loop) {
2522 loop = 1;
2523 goto cleaned_buffers_try_again;
2526 return 0;
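/*
 * Editor's illustrative sketch (not part of the original source): the VM
 * is the usual caller of try_to_free_buffers(); a page-reclaim path would
 * try it roughly like this on a locked page, passing wait > 0 when it is
 * allowed to block on buffer writeback.  This is a simplified, hypothetical
 * caller, not the actual page_launder()/shrink_cache() code.
 */
#if 0
static int example_release_page(struct page *page, int can_block)
{
	int freed = 0;

	if (page->buffers)
		freed = try_to_free_buffers(page, can_block ? 2 : 0);
	UnlockPage(page);
	return freed;
}
#endif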
2529 /* ================== Debugging =================== */
2531 void show_buffers(void)
2533 #ifdef CONFIG_SMP
2534 struct buffer_head * bh;
2535 int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2536 int protected = 0;
2537 int nlist;
2538 static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
2539 #endif
2541 printk("Buffer memory: %6dkB\n",
2542 atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2544 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2545 if (!spin_trylock(&lru_list_lock))
2546 return;
2547 for(nlist = 0; nlist < NR_LIST; nlist++) {
2548 found = locked = dirty = used = lastused = protected = 0;
2549 bh = lru_list[nlist];
2550 if(!bh) continue;
2552 do {
2553 found++;
2554 if (buffer_locked(bh))
2555 locked++;
2556 if (buffer_protected(bh))
2557 protected++;
2558 if (buffer_dirty(bh))
2559 dirty++;
2560 if (atomic_read(&bh->b_count))
2561 used++, lastused = found;
2562 bh = bh->b_next_free;
2563 } while (bh != lru_list[nlist]);
2565 int tmp = nr_buffers_type[nlist];
2566 if (found != tmp)
2567 printk("%9s: BUG -> found %d, reported %d\n",
2568 buf_types[nlist], found, tmp);
2570 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2571 "%d locked, %d protected, %d dirty\n",
2572 buf_types[nlist], found, size_buffers_type[nlist]>>10,
2573 used, lastused, locked, protected, dirty);
2575 spin_unlock(&lru_list_lock);
2576 #endif
2579 /* ===================== Init ======================= */
2582 * allocate the hash table and init the free list
2583 * Use gfp() for the hash table to decrease TLB misses, use
2584 * SLAB cache for buffer heads.
2586 void __init buffer_init(unsigned long mempages)
2588 int order, i;
2589 unsigned int nr_hash;
2591 /* The buffer cache hash table is less important these days,
2592 * trim it a bit.
2594 mempages >>= 14;
2596 mempages *= sizeof(struct buffer_head *);
2598 for (order = 0; (1 << order) < mempages; order++)
2601 /* try to allocate something until we get it or we're asking
2602 for something that is really too small */
2604 do {
2605 unsigned long tmp;
2607 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2608 bh_hash_mask = (nr_hash - 1);
2610 tmp = nr_hash;
2611 bh_hash_shift = 0;
2612 while((tmp >>= 1UL) != 0UL)
2613 bh_hash_shift++;
2615 hash_table = (struct buffer_head **)
2616 __get_free_pages(GFP_ATOMIC, order);
2617 } while (hash_table == NULL && --order > 0);
2618 printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2619 nr_hash, order, (PAGE_SIZE << order));
2621 if (!hash_table)
2622 panic("Failed to allocate buffer hash table\n");
2624 /* Setup hash chains. */
2625 for(i = 0; i < nr_hash; i++)
2626 hash_table[i] = NULL;
2628 /* Setup free lists. */
2629 for(i = 0; i < NR_SIZES; i++) {
2630 free_list[i].list = NULL;
2631 free_list[i].lock = SPIN_LOCK_UNLOCKED;
2634 /* Setup lru lists. */
2635 for(i = 0; i < NR_LIST; i++)
2636 lru_list[i] = NULL;
2641 /* ====================== bdflush support =================== */
2643 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2644 * response to dirty buffers. Once this process is activated, we write back
2645 * a limited number of buffers to the disks and then go back to sleep again.
2647 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
2648 struct task_struct *bdflush_tsk = 0;
2650 void wakeup_bdflush(int block)
2652 DECLARE_WAITQUEUE(wait, current);
2654 if (current == bdflush_tsk)
2655 return;
2657 if (!block) {
2658 wake_up_process(bdflush_tsk);
2659 return;
2662 /* bdflush can wake us up before we have a chance to
2663 go to sleep, so we must be smart in handling
2664 this wakeup event from bdflush to avoid deadlocking on SMP
2665 (we are not holding any lock anymore in these two paths). */
2666 __set_current_state(TASK_UNINTERRUPTIBLE);
2667 add_wait_queue(&bdflush_done, &wait);
2669 wake_up_process(bdflush_tsk);
2670 schedule();
2672 remove_wait_queue(&bdflush_done, &wait);
2673 __set_current_state(TASK_RUNNING);
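/*
 * Editor's illustrative sketch (not part of the original source): how a
 * hypothetical caller that just dirtied a lot of buffers might poke bdflush.
 * Passing 0 only wakes the daemon; passing 1 also blocks until bdflush has
 * signalled bdflush_done, which is what memory-pressure paths want.
 */
#if 0
static void example_after_dirtying(int memory_pressure)
{
	wakeup_bdflush(memory_pressure ? 1 : 0);
}
#endif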
2676 /* This is the _only_ function that deals with flushing async writes
2677 to disk.
2678 NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
2679 as all dirty buffers live _only_ in the DIRTY lru list.
2680 As we never browse the LOCKED and CLEAN lru lists they are in fact
2681 completely useless. */
2682 static int flush_dirty_buffers(int check_flushtime)
2684 struct buffer_head * bh, *next;
2685 int flushed = 0, i;
2687 restart:
2688 spin_lock(&lru_list_lock);
2689 bh = lru_list[BUF_DIRTY];
2690 if (!bh)
2691 goto out_unlock;
2692 for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next) {
2693 next = bh->b_next_free;
2695 if (!buffer_dirty(bh)) {
2696 __refile_buffer(bh);
2697 continue;
2699 if (buffer_locked(bh))
2700 continue;
2702 if (check_flushtime) {
2703 /* The dirty lru list is chronologically ordered so
2704 if the current bh is not yet timed out,
2705 then all the following bhs
2706 will be too young as well. */
2707 if (time_before(jiffies, bh->b_flushtime))
2708 goto out_unlock;
2709 } else {
2710 if (++flushed > bdf_prm.b_un.ndirty)
2711 goto out_unlock;
2714 /* OK, now we are committed to write it out. */
2715 atomic_inc(&bh->b_count);
2716 spin_unlock(&lru_list_lock);
2717 ll_rw_block(WRITE, 1, &bh);
2718 atomic_dec(&bh->b_count);
2720 if (current->need_resched)
2721 schedule();
2722 goto restart;
2724 out_unlock:
2725 spin_unlock(&lru_list_lock);
2727 return flushed;
2731 * Here we attempt to write back old buffers. We also try to flush inodes
2732 * and supers, since this function is essentially "update", and
2733 * otherwise there would be no way of ensuring that these quantities ever
2734 * get written back. Ideally, we would have a timestamp on the inodes
2735 * and superblocks so that we could write back only the old ones.
2738 static int sync_old_buffers(void)
2740 lock_kernel();
2741 sync_supers(0);
2742 sync_inodes(0);
2743 unlock_kernel();
2745 flush_dirty_buffers(1);
2746 /* must really sync all the active I/O request to disk here */
2747 run_task_queue(&tq_disk);
2748 return 0;
2751 int block_sync_page(struct page *page)
2753 run_task_queue(&tq_disk);
2754 return 0;
2757 /* This is the interface to bdflush. As we get more sophisticated, we can
2758 * pass tuning parameters to this "process", to adjust how it behaves.
2759 * We would want to verify each parameter, however, to make sure that it
2760 * is reasonable. */
2762 asmlinkage long sys_bdflush(int func, long data)
2764 if (!capable(CAP_SYS_ADMIN))
2765 return -EPERM;
2767 if (func == 1) {
2768 /* do_exit() directly and let kupdate do its work alone. */
2769 do_exit(0);
2770 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2771 a syscall that doesn't care about the current mm context. */
2772 int error;
2773 struct mm_struct *user_mm;
2776 * bdflush will spend all of its time in kernel-space,
2777 * without touching user-space, so we can switch it into
2778 * 'lazy TLB mode' to reduce the cost of context-switches
2779 * to and from bdflush.
2781 user_mm = start_lazy_tlb();
2782 error = sync_old_buffers();
2783 end_lazy_tlb(user_mm);
2784 return error;
2785 #endif
2788 /* Basically, for func >= 2: an even func reads param (func-2)/2, an odd func writes it */
2789 if (func >= 2) {
2790 int i = (func-2) >> 1;
2791 if (i >= 0 && i < N_PARAM) {
2792 if ((func & 1) == 0)
2793 return put_user(bdf_prm.data[i], (int*)data);
2795 if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
2796 bdf_prm.data[i] = data;
2797 return 0;
2800 return -EINVAL;
2803 /* Func 0 used to launch the actual bdflush and then never
2804 * return (unless explicitly killed). We return zero here to
2805 * remain semi-compatible with present update(8) programs.
2807 return 0;
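/*
 * Editor's illustrative user-space sketch (not part of the original source):
 * tuning bdflush through the func >= 2 interface described above.  An even
 * func reads parameter (func-2)/2, an odd func writes it; parameter 0 is
 * ndirty.  The raw syscall is used here rather than any libc wrapper.
 */
#if 0
#include <unistd.h>
#include <sys/syscall.h>

int example_tune_bdflush(void)
{
	int ndirty;

	if (syscall(SYS_bdflush, 2, (long) &ndirty) < 0)	/* read param 0 */
		return -1;
	return syscall(SYS_bdflush, 3, 500L);			/* set param 0 to 500 */
}
#endif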
2811 * This is the actual bdflush daemon itself. It used to be started from
2812 * the syscall above, but now we launch it ourselves internally with
2813 * kernel_thread(...) directly after the first thread in init/main.c
2815 int bdflush(void *sem)
2817 struct task_struct *tsk = current;
2818 int flushed;
2820 * We have a bare-bones task_struct, and really should fill
2821 * in a few more things so "top" and /proc/2/{exe,root,cwd}
2822 * display semi-sane things. Not really crucial though...
2825 tsk->session = 1;
2826 tsk->pgrp = 1;
2827 strcpy(tsk->comm, "bdflush");
2828 bdflush_tsk = tsk;
2830 /* avoid getting signals */
2831 spin_lock_irq(&tsk->sigmask_lock);
2832 flush_signals(tsk);
2833 sigfillset(&tsk->blocked);
2834 recalc_sigpending(tsk);
2835 spin_unlock_irq(&tsk->sigmask_lock);
2837 up((struct semaphore *)sem);
2839 for (;;) {
2840 CHECK_EMERGENCY_SYNC
2842 flushed = flush_dirty_buffers(0);
2843 if (free_shortage())
2844 flushed += page_launder(GFP_BUFFER, 0);
2846 /* If wakeup_bdflush wakes us up
2847 after our bdflush_done wakeup, then
2848 we must make sure not to sleep
2849 in schedule_timeout, otherwise
2850 wakeup_bdflush may wait for a
2851 bdflush_done wakeup that would never arrive
2852 (as we would be sleeping) and so it would
2853 deadlock on SMP. */
2854 __set_current_state(TASK_INTERRUPTIBLE);
2855 wake_up_all(&bdflush_done);
2857 * If there are still a lot of dirty buffers around,
2858 * skip the sleep and flush some more. Otherwise, we
2859 * go to sleep waiting for a wakeup.
2861 if (!flushed || balance_dirty_state(NODEV) < 0) {
2862 run_task_queue(&tq_disk);
2863 schedule();
2865 /* Remember to mark ourselves as running, otherwise
2866 the next schedule() will block. */
2867 __set_current_state(TASK_RUNNING);
2872 * This is the kernel update daemon. It used to live in userspace
2873 * but since it needs to run safely we want it to be unkillable by mistake.
2874 * You don't need to change your userspace configuration since
2875 * the userspace `update` will do_exit(0) at the first sys_bdflush().
2877 int kupdate(void *sem)
2879 struct task_struct * tsk = current;
2880 int interval;
2882 tsk->session = 1;
2883 tsk->pgrp = 1;
2884 strcpy(tsk->comm, "kupdate");
2886 /* SIGSTOP and SIGCONT will stop and wake up kupdate */
2887 spin_lock_irq(&tsk->sigmask_lock);
2888 sigfillset(&tsk->blocked);
2889 siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
2890 recalc_sigpending(tsk);
2891 spin_unlock_irq(&tsk->sigmask_lock);
2893 up((struct semaphore *)sem);
2895 for (;;) {
2896 /* update interval */
2897 interval = bdf_prm.b_un.interval;
2898 if (interval) {
2899 tsk->state = TASK_INTERRUPTIBLE;
2900 schedule_timeout(interval);
2901 } else {
2902 stop_kupdate:
2903 tsk->state = TASK_STOPPED;
2904 schedule(); /* wait for SIGCONT */
2906 /* check for sigstop */
2907 if (signal_pending(tsk)) {
2908 int stopped = 0;
2909 spin_lock_irq(&tsk->sigmask_lock);
2910 if (sigismember(&tsk->pending.signal, SIGSTOP)) {
2911 sigdelset(&tsk->pending.signal, SIGSTOP);
2912 stopped = 1;
2914 recalc_sigpending(tsk);
2915 spin_unlock_irq(&tsk->sigmask_lock);
2916 if (stopped)
2917 goto stop_kupdate;
2919 #ifdef DEBUG
2920 printk("kupdate() activated...\n");
2921 #endif
2922 sync_old_buffers();
2926 static int __init bdflush_init(void)
2928 DECLARE_MUTEX_LOCKED(sem);
2929 kernel_thread(bdflush, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
2930 down(&sem);
2931 kernel_thread(kupdate, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
2932 down(&sem);
2933 return 0;
2936 module_init(bdflush_init)