fs/buffer.c

   1 /*
   2  *  linux/fs/buffer.c
   3  *
   4  *  Copyright (C) 1991, 1992  Linus Torvalds
   5  */
   6
   7 /*
   8  *  'buffer.c' implements the buffer-cache functions. Race-conditions have
   9  * been avoided by NEVER letting an interrupt change a buffer (except for the
  10  * data, of course), but instead letting the caller do it.
  11  */
  12
  13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
  14
  15 /* Removed a lot of unnecessary code and simplified things now that
  16  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  17  */
  18
  19 /* Speed up hash, lru, and free list operations.  Use gfp() for allocating
  20  * hash table, use SLAB cache for buffer heads. -DaveM
  21  */
  22
  23 /* Added 32k buffer block sizes - these are required older ARM systems.
  24  * - RMK
  25  */
  26
  27 /* Thread it... -DaveM */
  28
  29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
  30
  31 #include <linux/config.h>
  32 #include <linux/sched.h>
  33 #include <linux/fs.h>
  34 #include <linux/malloc.h>
  35 #include <linux/locks.h>
  36 #include <linux/errno.h>
  37 #include <linux/swap.h>
  38 #include <linux/smp_lock.h>
  39 #include <linux/vmalloc.h>
  40 #include <linux/blkdev.h>
  41 #include <linux/sysrq.h>
  42 #include <linux/file.h>
  43 #include <linux/init.h>
  44 #include <linux/quotaops.h>
  45 #include <linux/iobuf.h>
  46 #include <linux/highmem.h>
  47
  48 #include <asm/uaccess.h>
  49 #include <asm/io.h>
  50 #include <asm/bitops.h>
  51 #include <asm/mmu_context.h>
  52
  53 #define NR_SIZES 7
  54 static char buffersize_index[65] =
  55 {-1,  0,  1, -1,  2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
  56   4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  57   5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  58  -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  59   6};
  60
  61 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
  62 #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
  63 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
  64 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
  65                                              number of unused buffer heads */
  66
  67 /* Anti-deadlock ordering:
  68  *      lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
  69  */
  70
  71 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers)
  72
  73 /*
  74  * Hash table gook..
  75  */
  76 static unsigned int bh_hash_mask;
  77 static unsigned int bh_hash_shift;
  78 static struct buffer_head **hash_table;
  79 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
  80
  81 static struct buffer_head *lru_list[NR_LIST];
  82 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
  83 static int nr_buffers_type[NR_LIST];
  84 static unsigned long size_buffers_type[NR_LIST];
  85
  86 static struct buffer_head * unused_list;
  87 static int nr_unused_buffer_heads;
  88 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
  89 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
  90
  91 struct bh_free_head {
  92         struct buffer_head *list;
  93         spinlock_t lock;
  94 };
  95 static struct bh_free_head free_list[NR_SIZES];
  96
  97 kmem_cache_t *bh_cachep;
  98
  99 static int grow_buffers(int size);
 100 static void __refile_buffer(struct buffer_head *);
 101
 102 /* This is used by some architectures to estimate available memory. */
 103 atomic_t buffermem_pages = ATOMIC_INIT(0);
 104
 105 /* Here is the parameter block for the bdflush process. If you add or
 106  * remove any of the parameters, make sure to update kernel/sysctl.c.
 107  */
 108
 109 #define N_PARAM 9
 110
 111 /* The dummy values in this structure are left in there for compatibility
 112  * with old programs that play with the /proc entries.
 113  */
 114 union bdflush_param {
 115         struct {
 116                 int nfract;  /* Percentage of buffer cache dirty to
 117                                 activate bdflush */
 118                 int ndirty;  /* Maximum number of dirty blocks to write out per
 119                                 wake-cycle */
 120                 int nrefill; /* Number of clean buffers to try to obtain
 121                                 each time we call refill */
 122                 int nref_dirt; /* Dirty buffer threshold for activating bdflush
 123                                   when trying to refill buffers. */
 124                 int interval; /* jiffies delay between kupdate flushes */
 125                 int age_buffer;  /* Time for normal buffer to age before we flush it */
 126                 int age_super;  /* Time for superblock to age before we flush it */
 127                 int dummy2;    /* unused */
 128                 int dummy3;    /* unused */
 129         } b_un;
 130         unsigned int data[N_PARAM];
 131 } bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}};
 132
 133 /* These are the min and max parameter values that we will allow to be assigned */
 134 int bdflush_min[N_PARAM] = {  0,  10,    5,   25,  0,   1*HZ,   1*HZ, 1, 1};
 135 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 6000*HZ, 2047, 5};
 136
 137 /*
 138  * Rewrote the wait-routines to use the "new" wait-queue functionality,
 139  * and getting rid of the cli-sti pairs. The wait-queue routines still
 140  * need cli-sti, but now it's just a couple of 386 instructions or so.
 141  *
 142  * Note that the real wait_on_buffer() is an inline function that checks
 143  * if 'b_wait' is set before calling this, so that the queues aren't set
 144  * up unnecessarily.
 145  */
 146 void __wait_on_buffer(struct buffer_head * bh)
 147 {
 148         struct task_struct *tsk = current;
 149         DECLARE_WAITQUEUE(wait, tsk);
 150
 151         atomic_inc(&bh->b_count);
 152         add_wait_queue(&bh->b_wait, &wait);
 153         do {
 154                 run_task_queue(&tq_disk);
 155                 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 156                 if (!buffer_locked(bh))
 157                         break;
 158                 schedule();
 159         } while (buffer_locked(bh));
 160         tsk->state = TASK_RUNNING;
 161         remove_wait_queue(&bh->b_wait, &wait);
 162         atomic_dec(&bh->b_count);
 163 }
 164
 165 /* Call sync_buffers with wait!=0 to ensure that the call does not
 166  * return until all buffer writes have completed.  Sync() may return
 167  * before the writes have finished; fsync() may not.
 168  */
 169
 170 /* Godamity-damn.  Some buffers (bitmaps for filesystems)
 171  * spontaneously dirty themselves without ever brelse being called.
 172  * We will ultimately want to put these in a separate list, but for
 173  * now we search all of the lists for dirty buffers.
 174  */
 175 static int sync_buffers(kdev_t dev, int wait)
 176 {
 177         int i, retry, pass = 0, err = 0;
 178         struct buffer_head * bh, *next;
 179
 180         /* One pass for no-wait, three for wait:
 181          * 0) write out all dirty, unlocked buffers;
 182          * 1) write out all dirty buffers, waiting if locked;
 183          * 2) wait for completion by waiting for all buffers to unlock.
 184          */
 185         do {
 186                 retry = 0;
 187
 188                 /* We search all lists as a failsafe mechanism, not because we expect
 189                  * there to be dirty buffers on any of the other lists.
 190                  */
 191 repeat:
 192                 spin_lock(&lru_list_lock);
 193                 bh = lru_list[BUF_DIRTY];
 194                 if (!bh)
 195                         goto repeat2;
 196
 197                 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
 198                         next = bh->b_next_free;
 199
 200                         if (!lru_list[BUF_DIRTY])
 201                                 break;
 202                         if (dev && bh->b_dev != dev)
 203                                 continue;
 204                         if (buffer_locked(bh)) {
 205                                 /* Buffer is locked; skip it unless wait is
 206                                  * requested AND pass > 0.
 207                                  */
 208                                 if (!wait || !pass) {
 209                                         retry = 1;
 210                                         continue;
 211                                 }
 212                                 atomic_inc(&bh->b_count);
 213                                 spin_unlock(&lru_list_lock);
 214                                 wait_on_buffer (bh);
 215                                 atomic_dec(&bh->b_count);
 216                                 goto repeat;
 217                         }
 218
 219                         /* If an unlocked buffer is not uptodate, there has
 220                          * been an IO error. Skip it.
 221                          */
 222                         if (wait && buffer_req(bh) && !buffer_locked(bh) &&
 223                             !buffer_dirty(bh) && !buffer_uptodate(bh)) {
 224                                 err = -EIO;
 225                                 continue;
 226                         }
 227
 228                         /* Don't write clean buffers.  Don't write ANY buffers
 229                          * on the third pass.
 230                          */
 231                         if (!buffer_dirty(bh) || pass >= 2)
 232                                 continue;
 233
 234                         atomic_inc(&bh->b_count);
 235                         spin_unlock(&lru_list_lock);
 236                         ll_rw_block(WRITE, 1, &bh);
 237                         atomic_dec(&bh->b_count);
 238                         retry = 1;
 239                         goto repeat;
 240                 }
 241
 242     repeat2:
 243                 bh = lru_list[BUF_LOCKED];
 244                 if (!bh) {
 245                         spin_unlock(&lru_list_lock);
 246                         break;
 247                 }
 248                 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
 249                         next = bh->b_next_free;
 250
 251                         if (!lru_list[BUF_LOCKED])
 252                                 break;
 253                         if (dev && bh->b_dev != dev)
 254                                 continue;
 255                         if (buffer_locked(bh)) {
 256                                 /* Buffer is locked; skip it unless wait is
 257                                  * requested AND pass > 0.
 258                                  */
 259                                 if (!wait || !pass) {
 260                                         retry = 1;
 261                                         continue;
 262                                 }
 263                                 atomic_inc(&bh->b_count);
 264                                 spin_unlock(&lru_list_lock);
 265                                 wait_on_buffer (bh);
 266                                 spin_lock(&lru_list_lock);
 267                                 atomic_dec(&bh->b_count);
 268                                 goto repeat2;
 269                         }
 270                 }
 271                 spin_unlock(&lru_list_lock);
 272
 273                 /* If we are waiting for the sync to succeed, and if any dirty
 274                  * blocks were written, then repeat; on the second pass, only
 275                  * wait for buffers being written (do not pass to write any
 276                  * more buffers on the second pass).
 277                  */
 278         } while (wait && retry && ++pass<=2);
 279         return err;
 280 }
 281
 282 void sync_dev(kdev_t dev)
 283 {
 284         sync_supers(dev);
 285         sync_inodes(dev);
 286         DQUOT_SYNC(dev);
 287         /* sync all the dirty buffers out to disk only _after_ all the
 288            high level layers finished generated buffer dirty data
 289            (or we'll return with some buffer still dirty on the blockdevice
 290            so breaking the semantics of this call) */
 291         sync_buffers(dev, 0);
 292         /*
 293          * FIXME(eric) we need to sync the physical devices here.
 294          * This is because some (scsi) controllers have huge amounts of
 295          * cache onboard (hundreds of Mb), and we need to instruct
 296          * them to commit all of the dirty memory to disk, and we should
 297          * not return until this has happened.
 298          *
 299          * This would need to get implemented by going through the assorted
 300          * layers so that each block major number can be synced, and this
 301          * would call down into the upper and mid-layer scsi.
 302          */
 303 }
 304
 305 int fsync_dev(kdev_t dev)
 306 {
 307         sync_buffers(dev, 0);
 308
 309         lock_kernel();
 310         sync_supers(dev);
 311         sync_inodes(dev);
 312         DQUOT_SYNC(dev);
 313         unlock_kernel();
 314
 315         return sync_buffers(dev, 1);
 316 }
 317
 318 asmlinkage long sys_sync(void)
 319 {
 320         fsync_dev(0);
 321         return 0;
 322 }
 323
 324 /*
 325  *      filp may be NULL if called via the msync of a vma.
 326  */
 327
 328 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
 329 {
 330         struct inode * inode = dentry->d_inode;
 331         struct super_block * sb;
 332         kdev_t dev;
 333         int ret;
 334
 335         lock_kernel();
 336         /* sync the inode to buffers */
 337         write_inode_now(inode, 0);
 338
 339         /* sync the superblock to buffers */
 340         sb = inode->i_sb;
 341         wait_on_super(sb);
 342         if (sb->s_op && sb->s_op->write_super)
 343                 sb->s_op->write_super(sb);
 344
 345         /* .. finally sync the buffers to disk */
 346         dev = inode->i_dev;
 347         ret = sync_buffers(dev, 1);
 348         unlock_kernel();
 349         return ret;
 350 }
 351
 352 asmlinkage long sys_fsync(unsigned int fd)
 353 {
 354         struct file * file;
 355         struct dentry * dentry;
 356         struct inode * inode;
 357         int err;
 358
 359         err = -EBADF;
 360         file = fget(fd);
 361         if (!file)
 362                 goto out;
 363
 364         dentry = file->f_dentry;
 365         inode = dentry->d_inode;
 366
 367         err = -EINVAL;
 368         if (!file->f_op || !file->f_op->fsync)
 369                 goto out_putf;
 370
 371         /* We need to protect against concurrent writers.. */
 372         down(&inode->i_sem);
 373         err = file->f_op->fsync(file, dentry, 0);
 374         up(&inode->i_sem);
 375
 376 out_putf:
 377         fput(file);
 378 out:
 379         return err;
 380 }
 381
 382 asmlinkage long sys_fdatasync(unsigned int fd)
 383 {
 384         struct file * file;
 385         struct dentry * dentry;
 386         struct inode * inode;
 387         int err;
 388
 389         err = -EBADF;
 390         file = fget(fd);
 391         if (!file)
 392                 goto out;
 393
 394         dentry = file->f_dentry;
 395         inode = dentry->d_inode;
 396
 397         err = -EINVAL;
 398         if (!file->f_op || !file->f_op->fsync)
 399                 goto out_putf;
 400
 401         down(&inode->i_sem);
 402         err = file->f_op->fsync(file, dentry, 1);
 403         up(&inode->i_sem);
 404
 405 out_putf:
 406         fput(file);
 407 out:
 408         return err;
 409 }
 410
 411 /* After several hours of tedious analysis, the following hash
 412  * function won.  Do not mess with it... -DaveM
 413  */
 414 #define _hashfn(dev,block)      \
 415         ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
 416          (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
 417 #define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
 418
 419 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
 420 {
 421         if ((bh->b_next = *head) != NULL)
 422                 bh->b_next->b_pprev = &bh->b_next;
 423         *head = bh;
 424         bh->b_pprev = head;
 425 }
 426
 427 static __inline__ void __hash_unlink(struct buffer_head *bh)
 428 {
 429         if (bh->b_pprev) {
 430                 if (bh->b_next)
 431                         bh->b_next->b_pprev = bh->b_pprev;
 432                 *(bh->b_pprev) = bh->b_next;
 433                 bh->b_pprev = NULL;
 434         }
 435 }
 436
 437 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
 438 {
 439         struct buffer_head **bhp = &lru_list[blist];
 440
 441         if(!*bhp) {
 442                 *bhp = bh;
 443                 bh->b_prev_free = bh;
 444         }
 445         bh->b_next_free = *bhp;
 446         bh->b_prev_free = (*bhp)->b_prev_free;
 447         (*bhp)->b_prev_free->b_next_free = bh;
 448         (*bhp)->b_prev_free = bh;
 449         nr_buffers_type[blist]++;
 450         size_buffers_type[blist] += bh->b_size;
 451 }
 452
 453 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
 454 {
 455         if (bh->b_prev_free || bh->b_next_free) {
 456                 bh->b_prev_free->b_next_free = bh->b_next_free;
 457                 bh->b_next_free->b_prev_free = bh->b_prev_free;
 458                 if (lru_list[blist] == bh)
 459                         lru_list[blist] = bh->b_next_free;
 460                 if (lru_list[blist] == bh)
 461                         lru_list[blist] = NULL;
 462                 bh->b_next_free = bh->b_prev_free = NULL;
 463                 nr_buffers_type[blist]--;
 464                 size_buffers_type[blist] -= bh->b_size;
 465         }
 466 }
 467
 468 static void __remove_from_free_list(struct buffer_head * bh, int index)
 469 {
 470         if(bh->b_next_free == bh)
 471                  free_list[index].list = NULL;
 472         else {
 473                 bh->b_prev_free->b_next_free = bh->b_next_free;
 474                 bh->b_next_free->b_prev_free = bh->b_prev_free;
 475                 if (free_list[index].list == bh)
 476                          free_list[index].list = bh->b_next_free;
 477         }
 478         bh->b_next_free = bh->b_prev_free = NULL;
 479 }
 480
 481 /* must be called with both the hash_table_lock and the lru_list_lock
 482    held */
 483 static void __remove_from_queues(struct buffer_head *bh)
 484 {
 485         __hash_unlink(bh);
 486         __remove_from_lru_list(bh, bh->b_list);
 487 }
 488
 489 static void insert_into_queues(struct buffer_head *bh)
 490 {
 491         struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
 492
 493         spin_lock(&lru_list_lock);
 494         write_lock(&hash_table_lock);
 495         __hash_link(bh, head);
 496         __insert_into_lru_list(bh, bh->b_list);
 497         write_unlock(&hash_table_lock);
 498         spin_unlock(&lru_list_lock);
 499 }
 500
 501 /* This function must only run if there are no other
 502  * references _anywhere_ to this buffer head.
 503  */
 504 static void put_last_free(struct buffer_head * bh)
 505 {
 506         struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
 507         struct buffer_head **bhp = &head->list;
 508
 509         bh->b_state = 0;
 510
 511         spin_lock(&head->lock);
 512         bh->b_dev = B_FREE;
 513         if(!*bhp) {
 514                 *bhp = bh;
 515                 bh->b_prev_free = bh;
 516         }
 517         bh->b_next_free = *bhp;
 518         bh->b_prev_free = (*bhp)->b_prev_free;
 519         (*bhp)->b_prev_free->b_next_free = bh;
 520         (*bhp)->b_prev_free = bh;
 521         spin_unlock(&head->lock);
 522 }
 523
 524 /*
 525  * Why like this, I hear you say... The reason is race-conditions.
 526  * As we don't lock buffers (unless we are reading them, that is),
 527  * something might happen to it while we sleep (ie a read-error
 528  * will force it bad). This shouldn't really happen currently, but
 529  * the code is ready.  */
 530 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
 531 {
 532         struct buffer_head **head = &hash(dev, block);
 533         struct buffer_head *bh;
 534
 535         read_lock(&hash_table_lock);
 536         for(bh = *head; bh; bh = bh->b_next)
 537                 if (bh->b_blocknr == block      &&
 538                     bh->b_size    == size       &&
 539                     bh->b_dev     == dev)
 540                         break;
 541         if (bh)
 542                 atomic_inc(&bh->b_count);
 543         read_unlock(&hash_table_lock);
 544
 545         return bh;
 546 }
 547
 548 unsigned int get_hardblocksize(kdev_t dev)
 549 {
 550         /*
 551          * Get the hard sector size for the given device.  If we don't know
 552          * what it is, return 0.
 553          */
 554         if (hardsect_size[MAJOR(dev)] != NULL) {
 555                 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
 556                 if (blksize != 0)
 557                         return blksize;
 558         }
 559
 560         /*
 561          * We don't know what the hardware sector size for this device is.
 562          * Return 0 indicating that we don't know.
 563          */
 564         return 0;
 565 }
 566
 567 void buffer_insert_inode_queue(struct buffer_head *bh, struct inode *inode)
 568 {
 569         spin_lock(&lru_list_lock);
 570         if (bh->b_inode)
 571                 list_del(&bh->b_inode_buffers);
 572         bh->b_inode = inode;
 573         list_add(&bh->b_inode_buffers, &inode->i_dirty_buffers);
 574         spin_unlock(&lru_list_lock);
 575 }
 576
 577 /* The caller must have the lru_list lock before calling the
 578    remove_inode_queue functions.  */
 579 static void __remove_inode_queue(struct buffer_head *bh)
 580 {
 581         bh->b_inode = NULL;
 582         list_del(&bh->b_inode_buffers);
 583 }
 584
 585 static inline void remove_inode_queue(struct buffer_head *bh)
 586 {
 587         if (bh->b_inode)
 588                 __remove_inode_queue(bh);
 589 }
 590
 591 int inode_has_buffers(struct inode *inode)
 592 {
 593         int ret;
 594
 595         spin_lock(&lru_list_lock);
 596         ret = !list_empty(&inode->i_dirty_buffers);
 597         spin_unlock(&lru_list_lock);
 598
 599         return ret;
 600 }
 601
 602
 603 /* If invalidate_buffers() will trash dirty buffers, it means some kind
 604    of fs corruption is going on. Trashing dirty data always imply losing
 605    information that was supposed to be just stored on the physical layer
 606    by the user.
 607
 608    Thus invalidate_buffers in general usage is not allwowed to trash dirty
 609    buffers. For example ioctl(FLSBLKBUF) expects dirty data to be preserved.
 610
 611    NOTE: In the case where the user removed a removable-media-disk even if
 612    there's still dirty data not synced on disk (due a bug in the device driver
 613    or due an error of the user), by not destroying the dirty buffers we could
 614    generate corruption also on the next media inserted, thus a parameter is
 615    necessary to handle this case in the most safe way possible (trying
 616    to not corrupt also the new disk inserted with the data belonging to
 617    the old now corrupted disk). Also for the ramdisk the natural thing
 618    to do in order to release the ramdisk memory is to destroy dirty buffers.
 619
 620    These are two special cases. Normal usage imply the device driver
 621    to issue a sync on the device (without waiting I/O completation) and
 622    then an invalidate_buffers call that doesn't trashes dirty buffers. */
 623 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
 624 {
 625         int i, nlist, slept;
 626         struct buffer_head * bh, * bh_next;
 627
 628  retry:
 629         slept = 0;
 630         spin_lock(&lru_list_lock);
 631         for(nlist = 0; nlist < NR_LIST; nlist++) {
 632                 bh = lru_list[nlist];
 633                 if (!bh)
 634                         continue;
 635                 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
 636                         bh_next = bh->b_next_free;
 637                         if (bh->b_dev != dev)
 638                                 continue;
 639                         if (buffer_locked(bh)) {
 640                                 atomic_inc(&bh->b_count);
 641                                 spin_unlock(&lru_list_lock);
 642                                 wait_on_buffer(bh);
 643                                 slept = 1;
 644                                 spin_lock(&lru_list_lock);
 645                                 atomic_dec(&bh->b_count);
 646                         }
 647
 648                         write_lock(&hash_table_lock);
 649                         if (!atomic_read(&bh->b_count) &&
 650                             (destroy_dirty_buffers || !buffer_dirty(bh))) {
 651                                 __remove_from_queues(bh);
 652                                 put_last_free(bh);
 653                         }
 654                         write_unlock(&hash_table_lock);
 655                         if (slept)
 656                                 goto out;
 657                 }
 658         }
 659 out:
 660         spin_unlock(&lru_list_lock);
 661         if (slept)
 662                 goto retry;
 663 }
 664
 665 void set_blocksize(kdev_t dev, int size)
 666 {
 667         extern int *blksize_size[];
 668         int i, nlist, slept;
 669         struct buffer_head * bh, * bh_next;
 670
 671         if (!blksize_size[MAJOR(dev)])
 672                 return;
 673
 674         /* Size must be a power of two, and between 512 and PAGE_SIZE */
 675         if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
 676                 panic("Invalid blocksize passed to set_blocksize");
 677
 678         if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
 679                 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
 680                 return;
 681         }
 682         if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
 683                 return;
 684         sync_buffers(dev, 2);
 685         blksize_size[MAJOR(dev)][MINOR(dev)] = size;
 686
 687  retry:
 688         slept = 0;
 689         spin_lock(&lru_list_lock);
 690         for(nlist = 0; nlist < NR_LIST; nlist++) {
 691                 bh = lru_list[nlist];
 692                 if (!bh)
 693                         continue;
 694                 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
 695                         bh_next = bh->b_next_free;
 696                         if (bh->b_dev != dev || bh->b_size == size)
 697                                 continue;
 698                         if (buffer_locked(bh)) {
 699                                 atomic_inc(&bh->b_count);
 700                                 spin_unlock(&lru_list_lock);
 701                                 wait_on_buffer(bh);
 702                                 slept = 1;
 703                                 spin_lock(&lru_list_lock);
 704                                 atomic_dec(&bh->b_count);
 705                         }
 706
 707                         write_lock(&hash_table_lock);
 708                         if (!atomic_read(&bh->b_count)) {
 709                                 if (buffer_dirty(bh))
 710                                         printk(KERN_WARNING
 711                                                "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
 712                                                kdevname(dev), bh->b_blocknr, bh->b_size);
 713                                 __remove_from_queues(bh);
 714                                 put_last_free(bh);
 715                         } else {
 716                                 if (atomic_set_buffer_clean(bh))
 717                                         __refile_buffer(bh);
 718                                 clear_bit(BH_Uptodate, &bh->b_state);
 719                                 printk(KERN_WARNING
 720                                        "set_blocksize: "
 721                                        "b_count %d, dev %s, block %lu, from %p\n",
 722                                        atomic_read(&bh->b_count), bdevname(bh->b_dev),
 723                                        bh->b_blocknr, __builtin_return_address(0));
 724                         }
 725                         write_unlock(&hash_table_lock);
 726                         if (slept)
 727                                 goto out;
 728                 }
 729         }
 730  out:
 731         spin_unlock(&lru_list_lock);
 732         if (slept)
 733                 goto retry;
 734 }
 735
 736 /*
 737  * We used to try various strange things. Let's not.
 738  */
 739 static void refill_freelist(int size)
 740 {
 741         if (!grow_buffers(size)) {
 742                 wakeup_bdflush(1);
 743                 current->policy |= SCHED_YIELD;
 744                 schedule();
 745         }
 746 }
 747
 748 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *dev_id)
 749 {
 750         bh->b_list = BUF_CLEAN;
 751         bh->b_end_io = handler;
 752         bh->b_dev_id = dev_id;
 753 }
 754
 755 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 756 {
 757         mark_buffer_uptodate(bh, uptodate);
 758         unlock_buffer(bh);
 759 }
 760
 761 static void end_buffer_io_bad(struct buffer_head *bh, int uptodate)
 762 {
 763         mark_buffer_uptodate(bh, uptodate);
 764         unlock_buffer(bh);
 765         BUG();
 766 }
 767
 768 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
 769 {
 770         static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
 771         unsigned long flags;
 772         struct buffer_head *tmp;
 773         struct page *page;
 774
 775         mark_buffer_uptodate(bh, uptodate);
 776
 777         /* This is a temporary buffer used for page I/O. */
 778         page = bh->b_page;
 779
 780         if (!uptodate)
 781                 SetPageError(page);
 782
 783         /*
 784          * Be _very_ careful from here on. Bad things can happen if
 785          * two buffer heads end IO at almost the same time and both
 786          * decide that the page is now completely done.
 787          *
 788          * Async buffer_heads are here only as labels for IO, and get
 789          * thrown away once the IO for this page is complete.  IO is
 790          * deemed complete once all buffers have been visited
 791          * (b_count==0) and are now unlocked. We must make sure that
 792          * only the _last_ buffer that decrements its count is the one
 793          * that unlock the page..
 794          */
 795         spin_lock_irqsave(&page_uptodate_lock, flags);
 796         unlock_buffer(bh);
 797         atomic_dec(&bh->b_count);
 798         tmp = bh->b_this_page;
 799         while (tmp != bh) {
 800                 if (tmp->b_end_io == end_buffer_io_async && buffer_locked(tmp))
 801                         goto still_busy;
 802                 tmp = tmp->b_this_page;
 803         }
 804
 805         /* OK, the async IO on this page is complete. */
 806         spin_unlock_irqrestore(&page_uptodate_lock, flags);
 807
 808         /*
 809          * if none of the buffers had errors then we can set the
 810          * page uptodate:
 811          */
 812         if (!PageError(page))
 813                 SetPageUptodate(page);
 814
 815         /*
 816          * Run the hooks that have to be done when a page I/O has completed.
 817          */
 818         if (PageTestandClearDecrAfter(page))
 819                 atomic_dec(&nr_async_pages);
 820
 821         UnlockPage(page);
 822
 823         return;
 824
 825 still_busy:
 826         spin_unlock_irqrestore(&page_uptodate_lock, flags);
 827         return;
 828 }
 829
 830
 831 /*
 832  * Synchronise all the inode's dirty buffers to the disk.
 833  *
 834  * We have conflicting pressures: we want to make sure that all
 835  * initially dirty buffers get waited on, but that any subsequently
 836  * dirtied buffers don't.  After all, we don't want fsync to last
 837  * forever if somebody is actively writing to the file.
 838  *
 839  * Do this in two main stages: first we copy dirty buffers to a
 840  * temporary inode list, queueing the writes as we go.  Then we clean
 841  * up, waiting for those writes to complete.
 842  *
 843  * During this second stage, any subsequent updates to the file may end
 844  * up refiling the buffer on the original inode's dirty list again, so
 845  * there is a chance we will end up with a buffer queued for write but
 846  * not yet completed on that list.  So, as a final cleanup we go through
 847  * the osync code to catch these locked, dirty buffers without requeuing
 848  * any newly dirty buffers for write.
 849  */
 850
 851 int fsync_inode_buffers(struct inode *inode)
 852 {
 853         struct buffer_head *bh;
 854         struct inode tmp;
 855         int err = 0, err2;
 856
 857         INIT_LIST_HEAD(&tmp.i_dirty_buffers);
 858
 859         spin_lock(&lru_list_lock);
 860
 861         while (!list_empty(&inode->i_dirty_buffers)) {
 862                 bh = BH_ENTRY(inode->i_dirty_buffers.next);
 863                 list_del(&bh->b_inode_buffers);
 864                 if (!buffer_dirty(bh) && !buffer_locked(bh))
 865                         bh->b_inode = NULL;
 866                 else {
 867                         bh->b_inode = &tmp;
 868                         list_add(&bh->b_inode_buffers, &tmp.i_dirty_buffers);
 869                         atomic_inc(&bh->b_count);
 870                         if (buffer_dirty(bh)) {
 871                                 spin_unlock(&lru_list_lock);
 872                                 ll_rw_block(WRITE, 1, &bh);
 873                                 spin_lock(&lru_list_lock);
 874                         }
 875                 }
 876         }
 877
 878         while (!list_empty(&tmp.i_dirty_buffers)) {
 879                 bh = BH_ENTRY(tmp.i_dirty_buffers.prev);
 880                 remove_inode_queue(bh);
 881                 spin_unlock(&lru_list_lock);
 882                 wait_on_buffer(bh);
 883                 if (!buffer_uptodate(bh))
 884                         err = -EIO;
 885                 brelse(bh);
 886                 spin_lock(&lru_list_lock);
 887         }
 888
 889         spin_unlock(&lru_list_lock);
 890         err2 = osync_inode_buffers(inode);
 891
 892         if (err)
 893                 return err;
 894         else
 895                 return err2;
 896 }
 897
 898
 899 /*
 900  * osync is designed to support O_SYNC io.  It waits synchronously for
 901  * all already-submitted IO to complete, but does not queue any new
 902  * writes to the disk.
 903  *
 904  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
 905  * you dirty the buffers, and then use osync_inode_buffers to wait for
 906  * completion.  Any other dirty buffers which are not yet queued for
 907  * write will not be flushed to disk by the osync.
 908  */
 909
 910 int osync_inode_buffers(struct inode *inode)
 911 {
 912         struct buffer_head *bh;
 913         struct list_head *list;
 914         int err = 0;
 915
 916         spin_lock(&lru_list_lock);
 917
 918  repeat:
 919
 920         for (list = inode->i_dirty_buffers.prev;
 921              bh = BH_ENTRY(list), list != &inode->i_dirty_buffers;
 922              list = bh->b_inode_buffers.prev) {
 923                 if (buffer_locked(bh)) {
 924                         atomic_inc(&bh->b_count);
 925                         spin_unlock(&lru_list_lock);
 926                         wait_on_buffer(bh);
 927                         brelse(bh);
 928                         if (!buffer_uptodate(bh))
 929                                 err = -EIO;
 930                         spin_lock(&lru_list_lock);
 931                         goto repeat;
 932                 }
 933         }
 934
 935         spin_unlock(&lru_list_lock);
 936         return err;
 937 }
 938
 939
 940 /*
 941  * Invalidate any and all dirty buffers on a given inode.  We are
 942  * probably unmounting the fs, but that doesn't mean we have already
 943  * done a sync().  Just drop the buffers from the inode list.
 944  */
 945
 946 void invalidate_inode_buffers(struct inode *inode)
 947 {
 948         struct list_head *list, *next;
 949
 950         spin_lock(&lru_list_lock);
 951         list = inode->i_dirty_buffers.next;
 952         while (list != &inode->i_dirty_buffers) {
 953                 next = list->next;
 954                 remove_inode_queue(BH_ENTRY(list));
 955                 list = next;
 956         }
 957         spin_unlock(&lru_list_lock);
 958 }
 959
 960
 961 /*
 962  * Ok, this is getblk, and it isn't very clear, again to hinder
 963  * race-conditions. Most of the code is seldom used, (ie repeating),
 964  * so it should be much more efficient than it looks.
 965  *
 966  * The algorithm is changed: hopefully better, and an elusive bug removed.
 967  *
 968  * 14.02.92: changed it to sync dirty buffers a bit: better performance
 969  * when the filesystem starts to get full of dirty blocks (I hope).
 970  */
 971 struct buffer_head * getblk(kdev_t dev, int block, int size)
 972 {
 973         struct buffer_head * bh;
 974         int isize;
 975
 976 repeat:
 977         bh = get_hash_table(dev, block, size);
 978         if (bh)
 979                 goto out;
 980
 981         isize = BUFSIZE_INDEX(size);
 982         spin_lock(&free_list[isize].lock);
 983         bh = free_list[isize].list;
 984         if (bh) {
 985                 __remove_from_free_list(bh, isize);
 986                 atomic_set(&bh->b_count, 1);
 987         }
 988         spin_unlock(&free_list[isize].lock);
 989
 990         /*
 991          * OK, FINALLY we know that this buffer is the only one of
 992          * its kind, we hold a reference (b_count>0), it is unlocked,
 993          * and it is clean.
 994          */
 995         if (bh) {
 996                 init_buffer(bh, end_buffer_io_sync, NULL);
 997                 bh->b_dev = dev;
 998                 bh->b_blocknr = block;
 999                 bh->b_state = 1 << BH_Mapped;
1000
1001                 /* Insert the buffer into the regular lists */
1002                 insert_into_queues(bh);
1003         out:
1004                 touch_buffer(bh);
1005                 return bh;
1006         }
1007
1008         /*
1009          * If we block while refilling the free list, somebody may
1010          * create the buffer first ... search the hashes again.
1011          */
1012         refill_freelist(size);
1013         goto repeat;
1014 }
1015
1016 /* -1 -> no need to flush
1017     0 -> async flush
1018     1 -> sync flush (wait for I/O completation) */
1019 static int balance_dirty_state(kdev_t dev)
1020 {
1021         unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
1022
1023         dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
1024         tot = nr_free_buffer_pages();
1025         tot -= size_buffers_type[BUF_PROTECTED] >> PAGE_SHIFT;
1026
1027         dirty *= 200;
1028         soft_dirty_limit = tot * bdf_prm.b_un.nfract;
1029         hard_dirty_limit = soft_dirty_limit * 2;
1030
1031         if (dirty > soft_dirty_limit) {
1032                 if (dirty > hard_dirty_limit)
1033                         return 1;
1034                 return 0;
1035         }
1036         return -1;
1037 }
1038
1039 /*
1040  * if a new dirty buffer is created we need to balance bdflush.
1041  *
1042  * in the future we might want to make bdflush aware of different
1043  * pressures on different devices - thus the (currently unused)
1044  * 'dev' parameter.
1045  */
1046 void balance_dirty(kdev_t dev)
1047 {
1048         int state = balance_dirty_state(dev);
1049
1050         if (state < 0)
1051                 return;
1052         wakeup_bdflush(state);
1053 }
1054
1055 static __inline__ void __mark_dirty(struct buffer_head *bh, int flag)
1056 {
1057         bh->b_flushtime = jiffies + (flag ? bdf_prm.b_un.age_super : bdf_prm.b_un.age_buffer);
1058         refile_buffer(bh);
1059 }
1060
1061 /* atomic version, the user must call balance_dirty() by hand
1062    as soon as it become possible to block */
1063 void __mark_buffer_dirty(struct buffer_head *bh, int flag)
1064 {
1065         if (!atomic_set_buffer_dirty(bh))
1066                 __mark_dirty(bh, flag);
1067 }
1068
1069 void mark_buffer_dirty(struct buffer_head *bh, int flag)
1070 {
1071         __mark_buffer_dirty(bh, flag);
1072         balance_dirty(bh->b_dev);
1073 }
1074
1075 /*
1076  * A buffer may need to be moved from one buffer list to another
1077  * (e.g. in case it is not shared any more). Handle this.
1078  */
1079 static void __refile_buffer(struct buffer_head *bh)
1080 {
1081         int dispose = BUF_CLEAN;
1082         if (buffer_locked(bh))
1083                 dispose = BUF_LOCKED;
1084         if (buffer_dirty(bh))
1085                 dispose = BUF_DIRTY;
1086         if (buffer_protected(bh))
1087                 dispose = BUF_PROTECTED;
1088         if (dispose != bh->b_list) {
1089                 __remove_from_lru_list(bh, bh->b_list);
1090                 bh->b_list = dispose;
1091                 __insert_into_lru_list(bh, dispose);
1092                 if (dispose == BUF_CLEAN)
1093                         remove_inode_queue(bh);
1094         }
1095 }
1096
1097 void refile_buffer(struct buffer_head *bh)
1098 {
1099         spin_lock(&lru_list_lock);
1100         __refile_buffer(bh);
1101         spin_unlock(&lru_list_lock);
1102 }
1103
1104 /*
1105  * Release a buffer head
1106  */
1107 void __brelse(struct buffer_head * buf)
1108 {
1109         if (atomic_read(&buf->b_count)) {
1110                 atomic_dec(&buf->b_count);
1111                 return;
1112         }
1113         printk("VFS: brelse: Trying to free free buffer\n");
1114 }
1115
1116 /*
1117  * bforget() is like brelse(), except it puts the buffer on the
1118  * free list if it can.. We can NOT free the buffer if:
1119  *  - there are other users of it
1120  *  - it is locked and thus can have active IO
1121  */
1122 void __bforget(struct buffer_head * buf)
1123 {
1124         /* grab the lru lock here to block bdflush. */
1125         spin_lock(&lru_list_lock);
1126         write_lock(&hash_table_lock);
1127         if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf))
1128                 goto in_use;
1129         __hash_unlink(buf);
1130         remove_inode_queue(buf);
1131         write_unlock(&hash_table_lock);
1132         __remove_from_lru_list(buf, buf->b_list);
1133         spin_unlock(&lru_list_lock);
1134         put_last_free(buf);
1135         return;
1136
1137  in_use:
1138         write_unlock(&hash_table_lock);
1139         spin_unlock(&lru_list_lock);
1140 }
1141
1142 /*
1143  * bread() reads a specified block and returns the buffer that contains
1144  * it. It returns NULL if the block was unreadable.
1145  */
1146 struct buffer_head * bread(kdev_t dev, int block, int size)
1147 {
1148         struct buffer_head * bh;
1149
1150         bh = getblk(dev, block, size);
1151         if (buffer_uptodate(bh))
1152                 return bh;
1153         ll_rw_block(READ, 1, &bh);
1154         wait_on_buffer(bh);
1155         if (buffer_uptodate(bh))
1156                 return bh;
1157         brelse(bh);
1158         return NULL;
1159 }
1160
1161 /*
1162  * Ok, breada can be used as bread, but additionally to mark other
1163  * blocks for reading as well. End the argument list with a negative
1164  * number.
1165  */
1166
1167 #define NBUF 16
1168
1169 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
1170         unsigned int pos, unsigned int filesize)
1171 {
1172         struct buffer_head * bhlist[NBUF];
1173         unsigned int blocks;
1174         struct buffer_head * bh;
1175         int index;
1176         int i, j;
1177
1178         if (pos >= filesize)
1179                 return NULL;
1180
1181         if (block < 0)
1182                 return NULL;
1183
1184         bh = getblk(dev, block, bufsize);
1185         index = BUFSIZE_INDEX(bh->b_size);
1186
1187         if (buffer_uptodate(bh))
1188                 return(bh);
1189         else ll_rw_block(READ, 1, &bh);
1190
1191         blocks = (filesize - pos) >> (9+index);
1192
1193         if (blocks < (read_ahead[MAJOR(dev)] >> index))
1194                 blocks = read_ahead[MAJOR(dev)] >> index;
1195         if (blocks > NBUF)
1196                 blocks = NBUF;
1197
1198 /*      if (blocks) printk("breada (new) %d blocks\n",blocks); */
1199
1200         bhlist[0] = bh;
1201         j = 1;
1202         for(i=1; i<blocks; i++) {
1203                 bh = getblk(dev,block+i,bufsize);
1204                 if (buffer_uptodate(bh)) {
1205                         brelse(bh);
1206                         break;
1207                 }
1208                 else bhlist[j++] = bh;
1209         }
1210
1211         /* Request the read for these buffers, and then release them. */
1212         if (j>1)
1213                 ll_rw_block(READA, (j-1), bhlist+1);
1214         for(i=1; i<j; i++)
1215                 brelse(bhlist[i]);
1216
1217         /* Wait for this buffer, and then continue on. */
1218         bh = bhlist[0];
1219         wait_on_buffer(bh);
1220         if (buffer_uptodate(bh))
1221                 return bh;
1222         brelse(bh);
1223         return NULL;
1224 }
1225
1226 /*
1227  * Note: the caller should wake up the buffer_wait list if needed.
1228  */
1229 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
1230 {
1231         if (bh->b_inode)
1232                 BUG();
1233         if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1234                 kmem_cache_free(bh_cachep, bh);
1235         } else {
1236                 bh->b_blocknr = -1;
1237                 init_waitqueue_head(&bh->b_wait);
1238                 nr_unused_buffer_heads++;
1239                 bh->b_next_free = unused_list;
1240                 bh->b_this_page = NULL;
1241                 unused_list = bh;
1242         }
1243 }
1244
1245 /*
1246  * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1247  * no-buffer-head deadlock.  Return NULL on failure; waiting for
1248  * buffer heads is now handled in create_buffers().
1249  */
1250 static struct buffer_head * get_unused_buffer_head(int async)
1251 {
1252         struct buffer_head * bh;
1253
1254         spin_lock(&unused_list_lock);
1255         if (nr_unused_buffer_heads > NR_RESERVED) {
1256                 bh = unused_list;
1257                 unused_list = bh->b_next_free;
1258                 nr_unused_buffer_heads--;
1259                 spin_unlock(&unused_list_lock);
1260                 return bh;
1261         }
1262         spin_unlock(&unused_list_lock);
1263
1264         /* This is critical.  We can't swap out pages to get
1265          * more buffer heads, because the swap-out may need
1266          * more buffer-heads itself.  Thus SLAB_BUFFER.
1267          */
1268         if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
1269                 memset(bh, 0, sizeof(*bh));
1270                 init_waitqueue_head(&bh->b_wait);
1271                 return bh;
1272         }
1273
1274         /*
1275          * If we need an async buffer, use the reserved buffer heads.
1276          */
1277         if (async) {
1278                 spin_lock(&unused_list_lock);
1279                 if (unused_list) {
1280                         bh = unused_list;
1281                         unused_list = bh->b_next_free;
1282                         nr_unused_buffer_heads--;
1283                         spin_unlock(&unused_list_lock);
1284                         return bh;
1285                 }
1286                 spin_unlock(&unused_list_lock);
1287         }
1288 #if 0
1289         /*
1290          * (Pending further analysis ...)
1291          * Ordinary (non-async) requests can use a different memory priority
1292          * to free up pages. Any swapping thus generated will use async
1293          * buffer heads.
1294          */
1295         if(!async &&
1296            (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1297                 memset(bh, 0, sizeof(*bh));
1298                 init_waitqueue_head(&bh->b_wait);
1299                 return bh;
1300         }
1301 #endif
1302
1303         return NULL;
1304 }
1305
1306 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1307 {
1308         bh->b_page = page;
1309         if (offset >= PAGE_SIZE)
1310                 BUG();
1311         if (PageHighMem(page))
1312                 /*
1313                  * This catches illegal uses and preserves the offset:
1314                  */
1315                 bh->b_data = (char *)(0 + offset);
1316         else
1317                 bh->b_data = (char *)(page_address(page) + offset);
1318 }
1319
1320 /*
1321  * Create the appropriate buffers when given a page for data area and
1322  * the size of each buffer.. Use the bh->b_this_page linked list to
1323  * follow the buffers created.  Return NULL if unable to create more
1324  * buffers.
1325  * The async flag is used to differentiate async IO (paging, swapping)
1326  * from ordinary buffer allocations, and only async requests are allowed
1327  * to sleep waiting for buffer heads.
1328  */
1329 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1330 {
1331         struct buffer_head *bh, *head;
1332         long offset;
1333
1334 try_again:
1335         head = NULL;
1336         offset = PAGE_SIZE;
1337         while ((offset -= size) >= 0) {
1338                 bh = get_unused_buffer_head(async);
1339                 if (!bh)
1340                         goto no_grow;
1341
1342                 bh->b_dev = B_FREE;  /* Flag as unused */
1343                 bh->b_this_page = head;
1344                 head = bh;
1345
1346                 bh->b_state = 0;
1347                 bh->b_next_free = NULL;
1348                 bh->b_pprev = NULL;
1349                 atomic_set(&bh->b_count, 0);
1350                 bh->b_size = size;
1351
1352                 set_bh_page(bh, page, offset);
1353
1354                 bh->b_list = BUF_CLEAN;
1355                 bh->b_end_io = end_buffer_io_bad;
1356         }
1357         return head;
1358 /*
1359  * In case anything failed, we just free everything we got.
1360  */
1361 no_grow:
1362         if (head) {
1363                 spin_lock(&unused_list_lock);
1364                 do {
1365                         bh = head;
1366                         head = head->b_this_page;
1367                         __put_unused_buffer_head(bh);
1368                 } while (head);
1369                 spin_unlock(&unused_list_lock);
1370
1371                 /* Wake up any waiters ... */
1372                 wake_up(&buffer_wait);
1373         }
1374
1375         /*
1376          * Return failure for non-async IO requests.  Async IO requests
1377          * are not allowed to fail, so we have to wait until buffer heads
1378          * become available.  But we don't want tasks sleeping with
1379          * partially complete buffers, so all were released above.
1380          */
1381         if (!async)
1382                 return NULL;
1383
1384         /* We're _really_ low on memory. Now we just
1385          * wait for old buffer heads to become free due to
1386          * finishing IO.  Since this is an async request and
1387          * the reserve list is empty, we're sure there are
1388          * async buffer heads in use.
1389          */
1390         run_task_queue(&tq_disk);
1391
1392         /*
1393          * Set our state for sleeping, then check again for buffer heads.
1394          * This ensures we won't miss a wake_up from an interrupt.
1395          */
1396         wait_event(buffer_wait, nr_unused_buffer_heads >= MAX_BUF_PER_PAGE);
1397         goto try_again;
1398 }
1399
1400 static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size)
1401 {
1402         struct buffer_head *head, *bh, *tail;
1403         int block;
1404
1405         if (!PageLocked(page))
1406                 BUG();
1407         /*
1408          * Allocate async buffer heads pointing to this page, just for I/O.
1409          * They don't show up in the buffer hash table, but they *are*
1410          * registered in page->buffers.
1411          */
1412         head = create_buffers(page, size, 1);
1413         if (page->buffers)
1414                 BUG();
1415         if (!head)
1416                 BUG();
1417         tail = head;
1418         for (bh = head; bh; bh = bh->b_this_page) {
1419                 block = *(b++);
1420
1421                 tail = bh;
1422                 init_buffer(bh, end_buffer_io_async, NULL);
1423                 bh->b_dev = dev;
1424                 bh->b_blocknr = block;
1425
1426                 set_bit(BH_Mapped, &bh->b_state);
1427         }
1428         tail->b_this_page = head;
1429         page_cache_get(page);
1430         page->buffers = head;
1431         return 0;
1432 }
1433
1434 static void unmap_buffer(struct buffer_head * bh)
1435 {
1436         if (buffer_mapped(bh)) {
1437                 mark_buffer_clean(bh);
1438                 wait_on_buffer(bh);
1439                 clear_bit(BH_Uptodate, &bh->b_state);
1440                 clear_bit(BH_Mapped, &bh->b_state);
1441                 clear_bit(BH_Req, &bh->b_state);
1442                 clear_bit(BH_New, &bh->b_state);
1443         }
1444 }
1445
1446 /**
1447  * discard_buffer - discard that buffer without doing any IO
1448  * @bh: buffer to discard
1449  *
1450  * This function removes a buffer from all the queues, without doing
1451  * any IO, we are not interested in the contents of the buffer.  This
1452  * function can block if the buffer is locked.
1453  */
1454 static inline struct buffer_head *discard_buffer(struct buffer_head * bh)
1455 {
1456         struct buffer_head *next;
1457
1458         if (bh->b_dev == B_FREE)
1459                 BUG();
1460
1461         next = bh->b_this_page;
1462
1463         unmap_buffer(bh);
1464
1465         spin_lock(&lru_list_lock);
1466         write_lock(&hash_table_lock);
1467         spin_lock(&unused_list_lock);
1468
1469         if (atomic_read(&bh->b_count))
1470                 BUG();
1471
1472         __hash_unlink(bh);
1473         write_unlock(&hash_table_lock);
1474
1475         remove_inode_queue(bh);
1476         __remove_from_lru_list(bh, bh->b_list);
1477         spin_unlock(&lru_list_lock);
1478
1479         __put_unused_buffer_head(bh);
1480         spin_unlock(&unused_list_lock);
1481
1482         return next;
1483 }
1484
1485
1486 /*
1487  * We don't have to release all buffers here, but
1488  * we have to be sure that no dirty buffer is left
1489  * and no IO is going on (no buffer is locked), because
1490  * we have truncated the file and are going to free the
1491  * blocks on-disk..
1492  */
1493 int block_flushpage(struct page *page, unsigned long offset)
1494 {
1495         struct buffer_head *head, *bh, *next;
1496         unsigned int curr_off = 0;
1497
1498         if (!PageLocked(page))
1499                 BUG();
1500         if (!page->buffers)
1501                 return 1;
1502
1503         head = page->buffers;
1504         bh = head;
1505         do {
1506                 unsigned int next_off = curr_off + bh->b_size;
1507                 next = bh->b_this_page;
1508
1509                 /*
1510                  * is this block fully flushed?
1511                  */
1512                 if (offset <= curr_off)
1513                         unmap_buffer(bh);
1514                 curr_off = next_off;
1515                 bh = next;
1516         } while (bh != head);
1517
1518         return 1;
1519 }
1520
1521 /**
1522  * block_destroy_buffers - Will destroy the contents of all the
1523  * buffers in this page
1524  * @page: page to examine the buffers
1525  *
1526  * This function destroy all the buffers in one page without making
1527  * any IO.  The function can block due to the fact that discad_bufferr
1528  * can block.
1529  */
1530 void block_destroy_buffers(struct page *page)
1531 {
1532         struct buffer_head  *bh, *head;
1533
1534         if (!PageLocked(page))
1535                 BUG();
1536         if (!page->buffers)
1537                 return;
1538
1539         head = page->buffers;
1540         bh = head;
1541         do {
1542                 /* We need to get the next buffer from discard buffer
1543                  * because discard buffer can block and anybody else
1544                  * can change the buffer list under our feet.
1545                  */
1546                 bh = discard_buffer(bh);
1547         }while (bh != head);
1548
1549         /* Wake up anyone waiting for buffer heads */
1550         wake_up(&buffer_wait);
1551
1552         /* And free the page */
1553         page->buffers = NULL;
1554         page_cache_release(page);
1555 }
1556
1557 static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
1558 {
1559         struct buffer_head *bh, *head, *tail;
1560
1561         head = create_buffers(page, blocksize, 1);
1562         if (page->buffers)
1563                 BUG();
1564
1565         bh = head;
1566         do {
1567                 bh->b_dev = inode->i_dev;
1568                 bh->b_blocknr = 0;
1569                 bh->b_end_io = end_buffer_io_bad;
1570                 tail = bh;
1571                 bh = bh->b_this_page;
1572         } while (bh);
1573         tail->b_this_page = head;
1574         page->buffers = head;
1575         page_cache_get(page);
1576 }
1577
1578 static void unmap_underlying_metadata(struct buffer_head * bh)
1579 {
1580         struct buffer_head *old_bh;
1581
1582         old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1583         if (old_bh) {
1584                 unmap_buffer(old_bh);
1585                 /* Here we could run brelse or bforget. We use
1586                    bforget because it will try to put the buffer
1587                    in the freelist. */
1588                 __bforget(old_bh);
1589         }
1590 }
1591
1592 /*
1593  * block_write_full_page() is SMP-safe - currently it's still
1594  * being called with the kernel lock held, but the code is ready.
1595  */
1596 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1597 {
1598         int err, i, need_balance_dirty = 0;
1599         unsigned long block;
1600         struct buffer_head *bh, *head;
1601
1602         if (!PageLocked(page))
1603                 BUG();
1604
1605         if (!page->buffers)
1606                 create_empty_buffers(page, inode, inode->i_sb->s_blocksize);
1607         head = page->buffers;
1608
1609         block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1610
1611         bh = head;
1612         i = 0;
1613         do {
1614                 /*
1615                  * If the buffer isn't up-to-date, we can't be sure
1616                  * that the buffer has been initialized with the proper
1617                  * block number information etc..
1618                  *
1619                  * Leave it to the low-level FS to make all those
1620                  * decisions (block #0 may actually be a valid block)
1621                  */
1622                 bh->b_end_io = end_buffer_io_sync;
1623                 if (!buffer_mapped(bh)) {
1624                         err = get_block(inode, block, bh, 1);
1625                         if (err)
1626                                 goto out;
1627                         if (buffer_new(bh))
1628                                 unmap_underlying_metadata(bh);
1629                 }
1630                 set_bit(BH_Uptodate, &bh->b_state);
1631                 if (!atomic_set_buffer_dirty(bh)) {
1632                         __mark_dirty(bh, 0);
1633                         need_balance_dirty = 1;
1634                 }
1635
1636                 bh = bh->b_this_page;
1637                 block++;
1638         } while (bh != head);
1639
1640         if (need_balance_dirty)
1641                 balance_dirty(bh->b_dev);
1642
1643         SetPageUptodate(page);
1644         return 0;
1645 out:
1646         ClearPageUptodate(page);
1647         return err;
1648 }
1649
1650 static int __block_prepare_write(struct inode *inode, struct page *page,
1651                 unsigned from, unsigned to, get_block_t *get_block)
1652 {
1653         unsigned block_start, block_end;
1654         unsigned long block;
1655         int err = 0;
1656         unsigned blocksize, bbits;
1657         struct buffer_head *bh, *head, *wait[MAX_BUF_PER_PAGE], **wait_bh=wait;
1658         char *kaddr = (char *)kmap(page);
1659
1660         blocksize = inode->i_sb->s_blocksize;
1661         if (!page->buffers)
1662                 create_empty_buffers(page, inode, blocksize);
1663         head = page->buffers;
1664
1665         bbits = inode->i_sb->s_blocksize_bits;
1666         block = page->index << (PAGE_CACHE_SHIFT - bbits);
1667
1668         for(bh = head, block_start = 0; bh != head || !block_start;
1669             block++, block_start=block_end, bh = bh->b_this_page) {
1670                 if (!bh)
1671                         BUG();
1672                 block_end = block_start+blocksize;
1673                 if (block_end <= from)
1674                         continue;
1675                 if (block_start >= to)
1676                         break;
1677                 bh->b_end_io = end_buffer_io_sync;
1678                 if (!buffer_mapped(bh)) {
1679                         err = get_block(inode, block, bh, 1);
1680                         if (err)
1681                                 goto out;
1682                         if (buffer_new(bh)) {
1683                                 unmap_underlying_metadata(bh);
1684                                 if (block_end > to)
1685                                         memset(kaddr+to, 0, block_end-to);
1686                                 if (block_start < from)
1687                                         memset(kaddr+block_start, 0, from-block_start);
1688                                 continue;
1689                         }
1690                 }
1691                 if (!buffer_uptodate(bh) &&
1692                      (block_start < from || block_end > to)) {
1693                         ll_rw_block(READ, 1, &bh);
1694                         *wait_bh++=bh;
1695                 }
1696         }
1697         /*
1698          * If we issued read requests - let them complete.
1699          */
1700         while(wait_bh > wait) {
1701                 wait_on_buffer(*--wait_bh);
1702                 err = -EIO;
1703                 if (!buffer_uptodate(*wait_bh))
1704                         goto out;
1705         }
1706         return 0;
1707 out:
1708         return err;
1709 }
1710
1711 static int __block_commit_write(struct inode *inode, struct page *page,
1712                 unsigned from, unsigned to)
1713 {
1714         unsigned block_start, block_end;
1715         int partial = 0, need_balance_dirty = 0;
1716         unsigned blocksize;
1717         struct buffer_head *bh, *head;
1718
1719         blocksize = inode->i_sb->s_blocksize;
1720
1721         for(bh = head = page->buffers, block_start = 0;
1722             bh != head || !block_start;
1723             block_start=block_end, bh = bh->b_this_page) {
1724                 block_end = block_start + blocksize;
1725                 if (block_end <= from || block_start >= to) {
1726                         if (!buffer_uptodate(bh))
1727                                 partial = 1;
1728                 } else {
1729                         set_bit(BH_Uptodate, &bh->b_state);
1730                         if (!atomic_set_buffer_dirty(bh)) {
1731                                 buffer_insert_inode_queue(bh, inode);
1732                                 __mark_dirty(bh, 0);
1733                                 need_balance_dirty = 1;
1734                         }
1735                 }
1736         }
1737
1738         if (need_balance_dirty)
1739                 balance_dirty(bh->b_dev);
1740         /*
1741          * is this a partial write that happened to make all buffers
1742          * uptodate then we can optimize away a bogus readpage() for
1743          * the next read(). Here we 'discover' wether the page went
1744          * uptodate as a result of this (potentially partial) write.
1745          */
1746         if (!partial)
1747                 SetPageUptodate(page);
1748         return 0;
1749 }
1750
1751 /*
1752  * Generic "read page" function for block devices that have the normal
1753  * get_block functionality. This is most of the block device filesystems.
1754  * Reads the page asynchronously --- the unlock_buffer() and
1755  * mark_buffer_uptodate() functions propagate buffer state into the
1756  * page struct once IO has completed.
1757  */
1758 int block_read_full_page(struct page *page, get_block_t *get_block)
1759 {
1760         struct inode *inode = (struct inode*)page->mapping->host;
1761         unsigned long iblock, lblock;
1762         struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1763         unsigned int blocksize, blocks;
1764         unsigned long kaddr = 0;
1765         int nr, i;
1766
1767         if (!PageLocked(page))
1768                 PAGE_BUG(page);
1769         blocksize = inode->i_sb->s_blocksize;
1770         if (!page->buffers)
1771                 create_empty_buffers(page, inode, blocksize);
1772         head = page->buffers;
1773
1774         blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits;
1775         iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1776         lblock = (inode->i_size+blocksize-1) >> inode->i_sb->s_blocksize_bits;
1777         bh = head;
1778         nr = 0;
1779         i = 0;
1780
1781         do {
1782                 if (buffer_uptodate(bh))
1783                         continue;
1784
1785                 if (!buffer_mapped(bh)) {
1786                         if (iblock < lblock)
1787                                 get_block(inode, iblock, bh, 0);
1788                         if (!buffer_mapped(bh)) {
1789                                 if (!kaddr)
1790                                         kaddr = kmap(page);
1791                                 memset((char *)(kaddr + i*blocksize), 0, blocksize);
1792                                 set_bit(BH_Uptodate, &bh->b_state);
1793                                 continue;
1794                         }
1795                 }
1796
1797                 init_buffer(bh, end_buffer_io_async, NULL);
1798                 atomic_inc(&bh->b_count);
1799                 arr[nr] = bh;
1800                 nr++;
1801         } while (i++, iblock++, (bh = bh->b_this_page) != head);
1802
1803         if (nr) {
1804                 if (Page_Uptodate(page))
1805                         BUG();
1806                 ll_rw_block(READ, nr, arr);
1807         } else {
1808                 /*
1809                  * all buffers are uptodate - we can set the page
1810                  * uptodate as well.
1811                  */
1812                 SetPageUptodate(page);
1813                 UnlockPage(page);
1814         }
1815         if (kaddr)
1816                 kunmap(page);
1817         return 0;
1818 }
1819
1820 /*
1821  * For moronic filesystems that do not allow holes in file.
1822  * We may have to extend the file.
1823  */
1824
1825 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1826 {
1827         struct address_space *mapping = page->mapping;
1828         struct inode *inode = (struct inode*)mapping->host;
1829         struct page *new_page;
1830         unsigned long pgpos;
1831         long status;
1832         unsigned zerofrom;
1833         unsigned blocksize = inode->i_sb->s_blocksize;
1834         char *kaddr;
1835
1836         while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1837                 status = -ENOMEM;
1838                 new_page = grab_cache_page(mapping, pgpos);
1839                 if (!new_page)
1840                         goto out;
1841                 /* we might sleep */
1842                 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1843                         UnlockPage(new_page);
1844                         page_cache_release(new_page);
1845                         continue;
1846                 }
1847                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1848                 if (zerofrom & (blocksize-1)) {
1849                         *bytes |= (blocksize-1);
1850                         (*bytes)++;
1851                 }
1852                 status = __block_prepare_write(inode, new_page, zerofrom,
1853                                                 PAGE_CACHE_SIZE, get_block);
1854                 if (status)
1855                         goto out_unmap;
1856                 kaddr = (char*)page_address(page);
1857                 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1858                 __block_commit_write(inode, new_page, zerofrom, to);
1859                 kunmap(new_page);
1860                 UnlockPage(new_page);
1861                 page_cache_release(new_page);
1862         }
1863
1864         if (page->index < pgpos) {
1865                 /* completely inside the area */
1866                 zerofrom = offset;
1867         } else {
1868                 /* page covers the boundary, find the boundary offset */
1869                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1870
1871                 /* if we will expand the thing last block will be filled */
1872                 if (to > zerofrom && (zerofrom & (blocksize-1))) {
1873                         *bytes |= (blocksize-1);
1874                         (*bytes)++;
1875                 }
1876
1877                 /* starting below the boundary? Nothing to zero out */
1878                 if (offset <= zerofrom)
1879                         zerofrom = offset;
1880         }
1881         status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1882         if (status)
1883                 goto out1;
1884         kaddr = (char*)page_address(page);
1885         if (zerofrom < offset) {
1886                 memset(kaddr+zerofrom, 0, offset-zerofrom);
1887                 __block_commit_write(inode, page, zerofrom, offset);
1888         }
1889         return 0;
1890 out1:
1891         ClearPageUptodate(page);
1892         kunmap(page);
1893         return status;
1894
1895 out_unmap:
1896         ClearPageUptodate(new_page);
1897         kunmap(new_page);
1898         UnlockPage(new_page);
1899         page_cache_release(new_page);
1900 out:
1901         return status;
1902 }
1903
1904 int block_prepare_write(struct page *page, unsigned from, unsigned to,
1905                         get_block_t *get_block)
1906 {
1907         struct inode *inode = (struct inode*)page->mapping->host;
1908         int err = __block_prepare_write(inode, page, from, to, get_block);
1909         if (err) {
1910                 ClearPageUptodate(page);
1911                 kunmap(page);
1912         }
1913         return err;
1914 }
1915
1916 int generic_commit_write(struct file *file, struct page *page,
1917                 unsigned from, unsigned to)
1918 {
1919         struct inode *inode = (struct inode*)page->mapping->host;
1920         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1921         __block_commit_write(inode,page,from,to);
1922         kunmap(page);
1923         if (pos > inode->i_size)
1924                 inode->i_size = pos;
1925         return 0;
1926 }
1927
1928 int block_write_full_page(struct page *page, get_block_t *get_block)
1929 {
1930         struct inode *inode = (struct inode*)page->mapping->host;
1931         unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1932         unsigned offset;
1933         int err;
1934
1935         /* easy case */
1936         if (page->index < end_index)
1937                 return __block_write_full_page(inode, page, get_block);
1938
1939         /* things got complicated... */
1940         offset = inode->i_size & (PAGE_CACHE_SIZE-1);
1941         /* OK, are we completely out? */
1942         if (page->index >= end_index+1 || !offset)
1943                 return -EIO;
1944         /* Sigh... will have to work, then... */
1945         err = __block_prepare_write(inode, page, 0, offset, get_block);
1946         if (!err) {
1947                 memset((char *)page_address(page)+offset, 0, PAGE_CACHE_SIZE-offset);
1948                 __block_commit_write(inode,page,0,offset);
1949 done:
1950                 kunmap(page);
1951                 return err;
1952         }
1953         ClearPageUptodate(page);
1954         goto done;
1955 }
1956
1957 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
1958 {
1959         struct buffer_head tmp;
1960         struct inode *inode = (struct inode*)mapping->host;
1961         tmp.b_state = 0;
1962         tmp.b_blocknr = 0;
1963         get_block(inode, block, &tmp, 0);
1964         return tmp.b_blocknr;
1965 }
1966
1967 /*
1968  * IO completion routine for a buffer_head being used for kiobuf IO: we
1969  * can't dispatch the kiobuf callback until io_count reaches 0.
1970  */
1971
1972 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
1973 {
1974         struct kiobuf *kiobuf;
1975
1976         mark_buffer_uptodate(bh, uptodate);
1977
1978         kiobuf = bh->b_kiobuf;
1979         unlock_buffer(bh);
1980         end_kio_request(kiobuf, uptodate);
1981 }
1982
1983
1984 /*
1985  * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
1986  * for them to complete.  Clean up the buffer_heads afterwards.
1987  */
1988
1989 static int do_kio(int rw, int nr, struct buffer_head *bh[], int size)
1990 {
1991         int iosize;
1992         int i;
1993         struct buffer_head *tmp;
1994
1995         struct task_struct *tsk = current;
1996         DECLARE_WAITQUEUE(wait, tsk);
1997
1998         if (rw == WRITE)
1999                 rw = WRITERAW;
2000         ll_rw_block(rw, nr, bh);
2001
2002         iosize = 0;
2003         spin_lock(&unused_list_lock);
2004
2005         for (i = nr; --i >= 0; ) {
2006                 iosize += size;
2007                 tmp = bh[i];
2008                 if (buffer_locked(tmp)) {
2009                         spin_unlock(&unused_list_lock);
2010                         wait_on_buffer(tmp);
2011                         spin_lock(&unused_list_lock);
2012                 }
2013
2014                 if (!buffer_uptodate(tmp)) {
2015                         /* We are traversing bh'es in reverse order so
2016                            clearing iosize on error calculates the
2017                            amount of IO before the first error. */
2018                         iosize = 0;
2019                 }
2020                 __put_unused_buffer_head(tmp);
2021         }
2022
2023         spin_unlock(&unused_list_lock);
2024         wake_up(&buffer_wait);
2025
2026         return iosize;
2027 }
2028
2029 /*
2030  * Start I/O on a physical range of kernel memory, defined by a vector
2031  * of kiobuf structs (much like a user-space iovec list).
2032  *
2033  * The kiobuf must already be locked for IO.  IO is submitted
2034  * asynchronously: you need to check page->locked, page->uptodate, and
2035  * maybe wait on page->wait.
2036  *
2037  * It is up to the caller to make sure that there are enough blocks
2038  * passed in to completely map the iobufs to disk.
2039  */
2040
2041 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
2042                kdev_t dev, unsigned long b[], int size)
2043 {
2044         int             err;
2045         int             length;
2046         int             transferred;
2047         int             i;
2048         int             bufind;
2049         int             pageind;
2050         int             bhind;
2051         int             offset;
2052         unsigned long   blocknr;
2053         struct kiobuf * iobuf = NULL;
2054         struct page *   map;
2055         struct buffer_head *tmp, *bh[KIO_MAX_SECTORS];
2056
2057         if (!nr)
2058                 return 0;
2059
2060         /*
2061          * First, do some alignment and validity checks
2062          */
2063         for (i = 0; i < nr; i++) {
2064                 iobuf = iovec[i];
2065                 if ((iobuf->offset & (size-1)) ||
2066                     (iobuf->length & (size-1)))
2067                         return -EINVAL;
2068                 if (!iobuf->nr_pages)
2069                         panic("brw_kiovec: iobuf not initialised");
2070         }
2071
2072         /*
2073          * OK to walk down the iovec doing page IO on each page we find.
2074          */
2075         bufind = bhind = transferred = err = 0;
2076         for (i = 0; i < nr; i++) {
2077                 iobuf = iovec[i];
2078                 offset = iobuf->offset;
2079                 length = iobuf->length;
2080                 iobuf->errno = 0;
2081
2082                 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
2083                         map  = iobuf->maplist[pageind];
2084                         if (!map) {
2085                                 err = -EFAULT;
2086                                 goto error;
2087                         }
2088
2089                         while (length > 0) {
2090                                 blocknr = b[bufind++];
2091                                 tmp = get_unused_buffer_head(0);
2092                                 if (!tmp) {
2093                                         err = -ENOMEM;
2094                                         goto error;
2095                                 }
2096
2097                                 tmp->b_dev = B_FREE;
2098                                 tmp->b_size = size;
2099                                 set_bh_page(tmp, map, offset);
2100                                 tmp->b_this_page = tmp;
2101
2102                                 init_buffer(tmp, end_buffer_io_kiobuf, NULL);
2103                                 tmp->b_dev = dev;
2104                                 tmp->b_blocknr = blocknr;
2105                                 tmp->b_state = 1 << BH_Mapped;
2106                                 tmp->b_kiobuf = iobuf;
2107
2108                                 if (rw == WRITE) {
2109                                         set_bit(BH_Uptodate, &tmp->b_state);
2110                                         set_bit(BH_Dirty, &tmp->b_state);
2111                                 }
2112
2113                                 bh[bhind++] = tmp;
2114                                 length -= size;
2115                                 offset += size;
2116
2117                                 atomic_inc(&iobuf->io_count);
2118
2119                                 /*
2120                                  * Start the IO if we have got too much
2121                                  */
2122                                 if (bhind >= KIO_MAX_SECTORS) {
2123                                         err = do_kio(rw, bhind, bh, size);
2124                                         if (err >= 0)
2125                                                 transferred += err;
2126                                         else
2127                                                 goto finished;
2128                                         bhind = 0;
2129                                 }
2130
2131                                 if (offset >= PAGE_SIZE) {
2132                                         offset = 0;
2133                                         break;
2134                                 }
2135                         } /* End of block loop */
2136                 } /* End of page loop */
2137         } /* End of iovec loop */
2138
2139         /* Is there any IO still left to submit? */
2140         if (bhind) {
2141                 err = do_kio(rw, bhind, bh, size);
2142                 if (err >= 0)
2143                         transferred += err;
2144                 else
2145                         goto finished;
2146         }
2147
2148  finished:
2149         if (transferred)
2150                 return transferred;
2151         return err;
2152
2153  error:
2154         /* We got an error allocating the bh'es.  Just free the current
2155            buffer_heads and exit. */
2156         spin_lock(&unused_list_lock);
2157         for (i = bhind; --i >= 0; ) {
2158                 __put_unused_buffer_head(bh[bhind]);
2159         }
2160         spin_unlock(&unused_list_lock);
2161         wake_up(&buffer_wait);
2162
2163         goto finished;
2164 }
2165
2166 /*
2167  * Start I/O on a page.
2168  * This function expects the page to be locked and may return
2169  * before I/O is complete. You then have to check page->locked,
2170  * page->uptodate, and maybe wait on page->wait.
2171  *
2172  * brw_page() is SMP-safe, although it's being called with the
2173  * kernel lock held - but the code is ready.
2174  *
2175  * FIXME: we need a swapper_inode->get_block function to remove
2176  *        some of the bmap kludges and interface ugliness here.
2177  */
2178 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
2179 {
2180         struct buffer_head *head, *bh, *arr[MAX_BUF_PER_PAGE];
2181         int nr, fresh /* temporary debugging flag */, block;
2182
2183         if (!PageLocked(page))
2184                 panic("brw_page: page not locked for I/O");
2185 //      ClearPageError(page);
2186         /*
2187          * We pretty much rely on the page lock for this, because
2188          * create_page_buffers() might sleep.
2189          */
2190         fresh = 0;
2191         if (!page->buffers) {
2192                 create_page_buffers(rw, page, dev, b, size);
2193                 fresh = 1;
2194         }
2195         if (!page->buffers)
2196                 BUG();
2197
2198         head = page->buffers;
2199         bh = head;
2200         nr = 0;
2201         do {
2202                 block = *(b++);
2203
2204                 if (fresh && (atomic_read(&bh->b_count) != 0))
2205                         BUG();
2206                 if (rw == READ) {
2207                         if (!fresh)
2208                                 BUG();
2209                         if (!buffer_uptodate(bh)) {
2210                                 arr[nr++] = bh;
2211                                 atomic_inc(&bh->b_count);
2212                         }
2213                 } else { /* WRITE */
2214                         if (!bh->b_blocknr) {
2215                                 if (!block)
2216                                         BUG();
2217                                 bh->b_blocknr = block;
2218                         } else {
2219                                 if (!block)
2220                                         BUG();
2221                         }
2222                         set_bit(BH_Uptodate, &bh->b_state);
2223                         set_bit(BH_Dirty, &bh->b_state);
2224                         arr[nr++] = bh;
2225                         atomic_inc(&bh->b_count);
2226                 }
2227                 bh = bh->b_this_page;
2228         } while (bh != head);
2229         if ((rw == READ) && nr) {
2230                 if (Page_Uptodate(page))
2231                         BUG();
2232                 ll_rw_block(rw, nr, arr);
2233         } else {
2234                 if (!nr && rw == READ) {
2235                         SetPageUptodate(page);
2236                         UnlockPage(page);
2237                 }
2238                 if (nr && (rw == WRITE))
2239                         ll_rw_block(rw, nr, arr);
2240         }
2241         return 0;
2242 }
2243
2244 int block_symlink(struct inode *inode, const char *symname, int len)
2245 {
2246         struct address_space *mapping = inode->i_mapping;
2247         struct page *page = grab_cache_page(mapping, 0);
2248         int err = -ENOMEM;
2249         char *kaddr;
2250
2251         if (!page)
2252                 goto fail;
2253         err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2254         if (err)
2255                 goto fail_map;
2256         kaddr = (char*)page_address(page);
2257         memcpy(kaddr, symname, len-1);
2258         mapping->a_ops->commit_write(NULL, page, 0, len-1);
2259         /*
2260          * Notice that we are _not_ going to block here - end of page is
2261          * unmapped, so this will only try to map the rest of page, see
2262          * that it is unmapped (typically even will not look into inode -
2263          * ->i_size will be enough for everything) and zero it out.
2264          * OTOH it's obviously correct and should make the page up-to-date.
2265          */
2266         err = mapping->a_ops->readpage(NULL, page);
2267         wait_on_page(page);
2268         page_cache_release(page);
2269         if (err < 0)
2270                 goto fail;
2271         mark_inode_dirty(inode);
2272         return 0;
2273 fail_map:
2274         UnlockPage(page);
2275         page_cache_release(page);
2276 fail:
2277         return err;
2278 }
2279
2280 /*
2281  * Try to increase the number of buffers available: the size argument
2282  * is used to determine what kind of buffers we want.
2283  */
2284 static int grow_buffers(int size)
2285 {
2286         struct page * page;
2287         struct buffer_head *bh, *tmp;
2288         struct buffer_head * insert_point;
2289         int isize;
2290
2291         if ((size & 511) || (size > PAGE_SIZE)) {
2292                 printk("VFS: grow_buffers: size = %d\n",size);
2293                 return 0;
2294         }
2295
2296         page = alloc_page(GFP_BUFFER);
2297         if (!page)
2298                 goto out;
2299         bh = create_buffers(page, size, 0);
2300         if (!bh)
2301                 goto no_buffer_head;
2302
2303         isize = BUFSIZE_INDEX(size);
2304
2305         spin_lock(&free_list[isize].lock);
2306         insert_point = free_list[isize].list;
2307         tmp = bh;
2308         while (1) {
2309                 if (insert_point) {
2310                         tmp->b_next_free = insert_point->b_next_free;
2311                         tmp->b_prev_free = insert_point;
2312                         insert_point->b_next_free->b_prev_free = tmp;
2313                         insert_point->b_next_free = tmp;
2314                 } else {
2315                         tmp->b_prev_free = tmp;
2316                         tmp->b_next_free = tmp;
2317                 }
2318                 insert_point = tmp;
2319                 if (tmp->b_this_page)
2320                         tmp = tmp->b_this_page;
2321                 else
2322                         break;
2323         }
2324         tmp->b_this_page = bh;
2325         free_list[isize].list = bh;
2326         spin_unlock(&free_list[isize].lock);
2327
2328         page->buffers = bh;
2329         lru_cache_add(page);
2330         atomic_inc(&buffermem_pages);
2331         return 1;
2332
2333 no_buffer_head:
2334         page_cache_release(page);
2335 out:
2336         return 0;
2337 }
2338
2339 /*
2340  * Can the buffer be thrown out?
2341  */
2342 #define BUFFER_BUSY_BITS        ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
2343 #define buffer_busy(bh)         (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2344
2345 /*
2346  * Sync all the buffers on one page..
2347  *
2348  * If we have old buffers that are locked, we'll
2349  * wait on them, but we won't wait on the new ones
2350  * we're writing out now.
2351  *
2352  * This all is required so that we can free up memory
2353  * later.
2354  */
2355 static int sync_page_buffers(struct buffer_head *bh, int wait)
2356 {
2357         struct buffer_head * tmp = bh;
2358
2359         do {
2360                 struct buffer_head *p = tmp;
2361                 tmp = tmp->b_this_page;
2362                 if (buffer_locked(p)) {
2363                         if (wait)
2364                                 __wait_on_buffer(p);
2365                 } else if (buffer_dirty(p))
2366                         ll_rw_block(WRITE, 1, &p);
2367         } while (tmp != bh);
2368
2369         do {
2370                 struct buffer_head *p = tmp;
2371                 tmp = tmp->b_this_page;
2372                 if (buffer_busy(p))
2373                         return 0;
2374         } while (tmp != bh);
2375
2376         /* Success. Now try_to_free_buffers can free the page. */
2377         return 1;
2378 }
2379
2380 /*
2381  * try_to_free_buffers() checks if all the buffers on this particular page
2382  * are unused, and free's the page if so.
2383  *
2384  * Wake up bdflush() if this fails - if we're running low on memory due
2385  * to dirty buffers, we need to flush them out as quickly as possible.
2386  *
2387  * NOTE: There are quite a number of ways that threads of control can
2388  *       obtain a reference to a buffer head within a page.  So we must
2389  *       lock out all of these paths to cleanly toss the page.
2390  */
2391 int try_to_free_buffers(struct page * page, int wait)
2392 {
2393         struct buffer_head * tmp, * bh = page->buffers;
2394         int index = BUFSIZE_INDEX(bh->b_size);
2395
2396 again:
2397         spin_lock(&lru_list_lock);
2398         write_lock(&hash_table_lock);
2399         spin_lock(&free_list[index].lock);
2400         tmp = bh;
2401         do {
2402                 struct buffer_head *p = tmp;
2403
2404                 tmp = tmp->b_this_page;
2405                 if (buffer_busy(p))
2406                         goto busy_buffer_page;
2407         } while (tmp != bh);
2408
2409         spin_lock(&unused_list_lock);
2410         tmp = bh;
2411         do {
2412                 struct buffer_head * p = tmp;
2413                 tmp = tmp->b_this_page;
2414
2415                 /* The buffer can be either on the regular
2416                  * queues or on the free list..
2417                  */
2418                 if (p->b_dev != B_FREE) {
2419                         remove_inode_queue(p);
2420                         __remove_from_queues(p);
2421                 }
2422                 else
2423                         __remove_from_free_list(p, index);
2424                 __put_unused_buffer_head(p);
2425         } while (tmp != bh);
2426         spin_unlock(&unused_list_lock);
2427
2428         /* Wake up anyone waiting for buffer heads */
2429         wake_up(&buffer_wait);
2430
2431         /* And free the page */
2432         page->buffers = NULL;
2433         page_cache_release(page);
2434         spin_unlock(&free_list[index].lock);
2435         write_unlock(&hash_table_lock);
2436         spin_unlock(&lru_list_lock);
2437         return 1;
2438
2439 busy_buffer_page:
2440         /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2441         spin_unlock(&free_list[index].lock);
2442         write_unlock(&hash_table_lock);
2443         spin_unlock(&lru_list_lock);
2444         if (sync_page_buffers(bh, wait))
2445                 goto again;
2446         return 0;
2447 }
2448
2449 /* ================== Debugging =================== */
2450
2451 void show_buffers(void)
2452 {
2453 #ifdef CONFIG_SMP
2454         struct buffer_head * bh;
2455         int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2456         int protected = 0;
2457         int nlist;
2458         static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
2459 #endif
2460
2461         printk("Buffer memory:   %6dkB\n",
2462                         atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2463
2464 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2465         if (!spin_trylock(&lru_list_lock))
2466                 return;
2467         for(nlist = 0; nlist < NR_LIST; nlist++) {
2468                 found = locked = dirty = used = lastused = protected = 0;
2469                 bh = lru_list[nlist];
2470                 if(!bh) continue;
2471
2472                 do {
2473                         found++;
2474                         if (buffer_locked(bh))
2475                                 locked++;
2476                         if (buffer_protected(bh))
2477                                 protected++;
2478                         if (buffer_dirty(bh))
2479                                 dirty++;
2480                         if (atomic_read(&bh->b_count))
2481                                 used++, lastused = found;
2482                         bh = bh->b_next_free;
2483                 } while (bh != lru_list[nlist]);
2484                 {
2485                         int tmp = nr_buffers_type[nlist];
2486                         if (found != tmp)
2487                                 printk("%9s: BUG -> found %d, reported %d\n",
2488                                        buf_types[nlist], found, tmp);
2489                 }
2490                 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2491                        "%d locked, %d protected, %d dirty\n",
2492                        buf_types[nlist], found, size_buffers_type[nlist]>>10,
2493                        used, lastused, locked, protected, dirty);
2494         }
2495         spin_unlock(&lru_list_lock);
2496 #endif
2497 }
2498
2499 /* ===================== Init ======================= */
2500
2501 /*
2502  * allocate the hash table and init the free list
2503  * Use gfp() for the hash table to decrease TLB misses, use
2504  * SLAB cache for buffer heads.
2505  */
2506 void __init buffer_init(unsigned long mempages)
2507 {
2508         int order, i;
2509         unsigned int nr_hash;
2510
2511         /* The buffer cache hash table is less important these days,
2512          * trim it a bit.
2513          */
2514         mempages >>= 14;
2515
2516         mempages *= sizeof(struct buffer_head *);
2517
2518         for (order = 0; (1 << order) < mempages; order++)
2519                 ;
2520
2521         /* try to allocate something until we get it or we're asking
2522            for something that is really too small */
2523
2524         do {
2525                 unsigned long tmp;
2526
2527                 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2528                 bh_hash_mask = (nr_hash - 1);
2529
2530                 tmp = nr_hash;
2531                 bh_hash_shift = 0;
2532                 while((tmp >>= 1UL) != 0UL)
2533                         bh_hash_shift++;
2534
2535                 hash_table = (struct buffer_head **)
2536                     __get_free_pages(GFP_ATOMIC, order);
2537         } while (hash_table == NULL && --order > 0);
2538         printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2539                nr_hash, order, (PAGE_SIZE << order));
2540
2541         if (!hash_table)
2542                 panic("Failed to allocate buffer hash table\n");
2543
2544         /* Setup hash chains. */
2545         for(i = 0; i < nr_hash; i++)
2546                 hash_table[i] = NULL;
2547
2548         /* Setup free lists. */
2549         for(i = 0; i < NR_SIZES; i++) {
2550                 free_list[i].list = NULL;
2551                 free_list[i].lock = SPIN_LOCK_UNLOCKED;
2552         }
2553
2554         /* Setup lru lists. */
2555         for(i = 0; i < NR_LIST; i++)
2556                 lru_list[i] = NULL;
2557
2558         bh_cachep = kmem_cache_create("buffer_head",
2559                                       sizeof(struct buffer_head),
2560                                       0,
2561                                       SLAB_HWCACHE_ALIGN, NULL, NULL);
2562         if(!bh_cachep)
2563                 panic("Cannot create buffer head SLAB cache\n");
2564 }
2565
2566
2567 /* ====================== bdflush support =================== */
2568
2569 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2570  * response to dirty buffers.  Once this process is activated, we write back
2571  * a limited number of buffers to the disks and then go back to sleep again.
2572  */
2573 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
2574 struct task_struct *bdflush_tsk = 0;
2575
2576 void wakeup_bdflush(int block)
2577 {
2578         DECLARE_WAITQUEUE(wait, current);
2579
2580         if (current == bdflush_tsk)
2581                 return;
2582
2583         if (!block) {
2584                 wake_up_process(bdflush_tsk);
2585                 return;
2586         }
2587
2588         /* kflushd can wakeup us before we have a chance to
2589            go to sleep so we must be smart in handling
2590            this wakeup event from kflushd to avoid deadlocking in SMP
2591            (we are not holding any lock anymore in these two paths). */
2592         __set_current_state(TASK_UNINTERRUPTIBLE);
2593         add_wait_queue(&bdflush_done, &wait);
2594
2595         wake_up_process(bdflush_tsk);
2596         schedule();
2597
2598         remove_wait_queue(&bdflush_done, &wait);
2599         __set_current_state(TASK_RUNNING);
2600 }
2601
2602 /* This is the _only_ function that deals with flushing async writes
2603    to disk.
2604    NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
2605    as all dirty buffers lives _only_ in the DIRTY lru list.
2606    As we never browse the LOCKED and CLEAN lru lists they are infact
2607    completly useless. */
2608 static int flush_dirty_buffers(int check_flushtime)
2609 {
2610         struct buffer_head * bh, *next;
2611         int flushed = 0, i;
2612
2613  restart:
2614         spin_lock(&lru_list_lock);
2615         bh = lru_list[BUF_DIRTY];
2616         if (!bh)
2617                 goto out_unlock;
2618         for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next) {
2619                 next = bh->b_next_free;
2620
2621                 if (!buffer_dirty(bh)) {
2622                         __refile_buffer(bh);
2623                         continue;
2624                 }
2625                 if (buffer_locked(bh))
2626                         continue;
2627
2628                 if (check_flushtime) {
2629                         /* The dirty lru list is chronologically ordered so
2630                            if the current bh is not yet timed out,
2631                            then also all the following bhs
2632                            will be too young. */
2633                         if (time_before(jiffies, bh->b_flushtime))
2634                                 goto out_unlock;
2635                 } else {
2636                         if (++flushed > bdf_prm.b_un.ndirty)
2637                                 goto out_unlock;
2638                 }
2639
2640                 /* OK, now we are committed to write it out. */
2641                 atomic_inc(&bh->b_count);
2642                 spin_unlock(&lru_list_lock);
2643                 ll_rw_block(WRITE, 1, &bh);
2644                 atomic_dec(&bh->b_count);
2645
2646                 if (current->need_resched)
2647                         schedule();
2648                 goto restart;
2649         }
2650  out_unlock:
2651         spin_unlock(&lru_list_lock);
2652
2653         return flushed;
2654 }
2655
2656 /*
2657  * Here we attempt to write back old buffers.  We also try to flush inodes
2658  * and supers as well, since this function is essentially "update", and
2659  * otherwise there would be no way of ensuring that these quantities ever
2660  * get written back.  Ideally, we would have a timestamp on the inodes
2661  * and superblocks so that we could write back only the old ones as well
2662  */
2663
2664 static int sync_old_buffers(void)
2665 {
2666         lock_kernel();
2667         sync_supers(0);
2668         sync_inodes(0);
2669         unlock_kernel();
2670
2671         flush_dirty_buffers(1);
2672         /* must really sync all the active I/O request to disk here */
2673         run_task_queue(&tq_disk);
2674         return 0;
2675 }
2676
2677 int block_sync_page(struct page *page)
2678 {
2679         run_task_queue(&tq_disk);
2680         return 0;
2681 }
2682
2683 /* This is the interface to bdflush.  As we get more sophisticated, we can
2684  * pass tuning parameters to this "process", to adjust how it behaves.
2685  * We would want to verify each parameter, however, to make sure that it
2686  * is reasonable. */
2687
2688 asmlinkage long sys_bdflush(int func, long data)
2689 {
2690         if (!capable(CAP_SYS_ADMIN))
2691                 return -EPERM;
2692
2693         if (func == 1) {
2694                 /* do_exit directly and let kupdate to do its work alone. */
2695                 do_exit(0);
2696 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2697          a syscall that doesn't care about the current mm context. */
2698                 int error;
2699                 struct mm_struct *user_mm;
2700
2701                 /*
2702                  * bdflush will spend all of it's time in kernel-space,
2703                  * without touching user-space, so we can switch it into
2704                  * 'lazy TLB mode' to reduce the cost of context-switches
2705                  * to and from bdflush.
2706                  */
2707                 user_mm = start_lazy_tlb();
2708                 error = sync_old_buffers();
2709                 end_lazy_tlb(user_mm);
2710                 return error;
2711 #endif
2712         }
2713
2714         /* Basically func 1 means read param 1, 2 means write param 1, etc */
2715         if (func >= 2) {
2716                 int i = (func-2) >> 1;
2717                 if (i >= 0 && i < N_PARAM) {
2718                         if ((func & 1) == 0)
2719                                 return put_user(bdf_prm.data[i], (int*)data);
2720
2721                         if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
2722                                 bdf_prm.data[i] = data;
2723                                 return 0;
2724                         }
2725                 }
2726                 return -EINVAL;
2727         }
2728
2729         /* Having func 0 used to launch the actual bdflush and then never
2730          * return (unless explicitly killed). We return zero here to
2731          * remain semi-compatible with present update(8) programs.
2732          */
2733         return 0;
2734 }
2735
2736 /*
2737  * This is the actual bdflush daemon itself. It used to be started from
2738  * the syscall above, but now we launch it ourselves internally with
2739  * kernel_thread(...)  directly after the first thread in init/main.c
2740  */
2741 int bdflush(void *sem)
2742 {
2743         struct task_struct *tsk = current;
2744         int flushed;
2745         /*
2746          *      We have a bare-bones task_struct, and really should fill
2747          *      in a few more things so "top" and /proc/2/{exe,root,cwd}
2748          *      display semi-sane things. Not real crucial though...
2749          */
2750
2751         tsk->session = 1;
2752         tsk->pgrp = 1;
2753         strcpy(tsk->comm, "kflushd");
2754         bdflush_tsk = tsk;
2755
2756         /* avoid getting signals */
2757         spin_lock_irq(&tsk->sigmask_lock);
2758         flush_signals(tsk);
2759         sigfillset(&tsk->blocked);
2760         recalc_sigpending(tsk);
2761         spin_unlock_irq(&tsk->sigmask_lock);
2762
2763         up((struct semaphore *)sem);
2764
2765         for (;;) {
2766                 CHECK_EMERGENCY_SYNC
2767
2768                 flushed = flush_dirty_buffers(0);
2769
2770                 /* If wakeup_bdflush will wakeup us
2771                    after our bdflush_done wakeup, then
2772                    we must make sure to not sleep
2773                    in schedule_timeout otherwise
2774                    wakeup_bdflush may wait for our
2775                    bdflush_done wakeup that would never arrive
2776                    (as we would be sleeping) and so it would
2777                    deadlock in SMP. */
2778                 __set_current_state(TASK_INTERRUPTIBLE);
2779                 wake_up(&bdflush_done);
2780                 /*
2781                  * If there are still a lot of dirty buffers around,
2782                  * skip the sleep and flush some more. Otherwise, we
2783                  * go to sleep waiting a wakeup.
2784                  */
2785                 if (!flushed || balance_dirty_state(NODEV) < 0)
2786                         schedule();
2787                 /* Remember to mark us as running otherwise
2788                    the next schedule will block. */
2789                 __set_current_state(TASK_RUNNING);
2790         }
2791 }
2792
2793 /*
2794  * This is the kernel update daemon. It was used to live in userspace
2795  * but since it's need to run safely we want it unkillable by mistake.
2796  * You don't need to change your userspace configuration since
2797  * the userspace `update` will do_exit(0) at the first sys_bdflush().
2798  */
2799 int kupdate(void *sem)
2800 {
2801         struct task_struct * tsk = current;
2802         int interval;
2803
2804         tsk->session = 1;
2805         tsk->pgrp = 1;
2806         strcpy(tsk->comm, "kupdate");
2807
2808         /* sigstop and sigcont will stop and wakeup kupdate */
2809         spin_lock_irq(&tsk->sigmask_lock);
2810         sigfillset(&tsk->blocked);
2811         siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
2812         recalc_sigpending(tsk);
2813         spin_unlock_irq(&tsk->sigmask_lock);
2814
2815         up((struct semaphore *)sem);
2816
2817         for (;;) {
2818                 /* update interval */
2819                 interval = bdf_prm.b_un.interval;
2820                 if (interval) {
2821                         tsk->state = TASK_INTERRUPTIBLE;
2822                         schedule_timeout(interval);
2823                 } else {
2824                 stop_kupdate:
2825                         tsk->state = TASK_STOPPED;
2826                         schedule(); /* wait for SIGCONT */
2827                 }
2828                 /* check for sigstop */
2829                 if (signal_pending(tsk)) {
2830                         int stopped = 0;
2831                         spin_lock_irq(&tsk->sigmask_lock);
2832                         if (sigismember(&tsk->signal, SIGSTOP)) {
2833                                 sigdelset(&tsk->signal, SIGSTOP);
2834                                 stopped = 1;
2835                         }
2836                         recalc_sigpending(tsk);
2837                         spin_unlock_irq(&tsk->sigmask_lock);
2838                         if (stopped)
2839                                 goto stop_kupdate;
2840                 }
2841 #ifdef DEBUG
2842                 printk("kupdate() activated...\n");
2843 #endif
2844                 sync_old_buffers();
2845         }
2846 }
2847
2848 static int __init bdflush_init(void)
2849 {
2850         DECLARE_MUTEX_LOCKED(sem);
2851         kernel_thread(bdflush, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
2852         down(&sem);
2853         kernel_thread(kupdate, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
2854         down(&sem);
2855         return 0;
2856 }
2857
2858 module_init(bdflush_init)
2859