pre3: [davej-history.git] / fs / buffer.c
blob b9120655e7741f5a8d016fb1d8095d04f895c37b
1 /*
2 * linux/fs/buffer.c
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
7 /*
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
15 /* Removed a lot of unnecessary code and simplified things now that
16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20 * hash table, use SLAB cache for buffer heads. -DaveM
23 /* Added 32k buffer block sizes - these are required for older ARM systems.
24 * - RMK
27 /* Thread it... -DaveM */
29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
31 #include <linux/config.h>
32 #include <linux/sched.h>
33 #include <linux/fs.h>
34 #include <linux/malloc.h>
35 #include <linux/locks.h>
36 #include <linux/errno.h>
37 #include <linux/swap.h>
38 #include <linux/smp_lock.h>
39 #include <linux/vmalloc.h>
40 #include <linux/blkdev.h>
41 #include <linux/sysrq.h>
42 #include <linux/file.h>
43 #include <linux/init.h>
44 #include <linux/quotaops.h>
45 #include <linux/iobuf.h>
46 #include <linux/highmem.h>
48 #include <asm/uaccess.h>
49 #include <asm/io.h>
50 #include <asm/bitops.h>
51 #include <asm/mmu_context.h>
53 #define NR_SIZES 7
54 static char buffersize_index[65] =
55 {-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
56 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
57 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
58 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
59 6};
61 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
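/*
 * The table above maps (size >> 9) to a small index, so BUFSIZE_INDEX()
 * yields 512->0, 1024->1, 2048->2, 4096->3, 8192->4, 16384->5, 32768->6,
 * and -1 for any size that is not a supported power of two.  It is used
 * to pick the per-size free list, e.g. free_list[BUFSIZE_INDEX(1024)].
 */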
62 #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
63 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
64 #define MAX_UNUSED_BUFFERS (NR_RESERVED+20) /* don't ever have more than this
65 number of unused buffer heads */
67 /* Anti-deadlock ordering:
68 * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
72 * Hash table gook..
74 static unsigned int bh_hash_mask;
75 static unsigned int bh_hash_shift;
76 static struct buffer_head **hash_table;
77 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
79 static struct buffer_head *lru_list[NR_LIST];
80 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
81 static int nr_buffers_type[NR_LIST];
82 static unsigned long size_buffers_type[NR_LIST];
84 static struct buffer_head * unused_list;
85 static int nr_unused_buffer_heads;
86 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
87 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
89 struct bh_free_head {
90 struct buffer_head *list;
91 spinlock_t lock;
93 static struct bh_free_head free_list[NR_SIZES];
95 static int grow_buffers(int size);
96 static void __refile_buffer(struct buffer_head *);
98 /* This is used by some architectures to estimate available memory. */
99 atomic_t buffermem_pages = ATOMIC_INIT(0);
101 /* Here is the parameter block for the bdflush process. If you add or
102 * remove any of the parameters, make sure to update kernel/sysctl.c.
105 #define N_PARAM 9
107 /* The dummy values in this structure are left in there for compatibility
108 * with old programs that play with the /proc entries.
110 union bdflush_param {
111 struct {
112 int nfract; /* Percentage of buffer cache dirty to
113 activate bdflush */
114 int ndirty; /* Maximum number of dirty blocks to write out per
115 wake-cycle */
116 int nrefill; /* Number of clean buffers to try to obtain
117 each time we call refill */
118 int nref_dirt; /* Dirty buffer threshold for activating bdflush
119 when trying to refill buffers. */
120 int interval; /* jiffies delay between kupdate flushes */
121 int age_buffer; /* Time for normal buffer to age before we flush it */
122 int dummy1; /* unused, was age_super */
123 int dummy2; /* unused */
124 int dummy3; /* unused */
125 } b_un;
126 unsigned int data[N_PARAM];
127 } bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}};
129 /* These are the min and max parameter values that we will allow to be assigned */
130 int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 1*HZ, 1, 1};
131 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 6000*HZ, 2047, 5};
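/*
 * The nine data[] slots above are what the bdflush syscall and, in this
 * era, the vm.bdflush sysctl expose, in the same order as the b_un
 * fields; bdflush_min[] and bdflush_max[] bound the values that may be
 * stored there.
 */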
134 * Rewrote the wait-routines to use the "new" wait-queue functionality,
135 * getting rid of the cli-sti pairs. The wait-queue routines still
136 * need cli-sti, but now it's just a couple of 386 instructions or so.
138 * Note that the real wait_on_buffer() is an inline function that checks
139 * if 'b_wait' is set before calling this, so that the queues aren't set
140 * up unnecessarily.
142 void __wait_on_buffer(struct buffer_head * bh)
144 struct task_struct *tsk = current;
145 DECLARE_WAITQUEUE(wait, tsk);
147 atomic_inc(&bh->b_count);
148 add_wait_queue(&bh->b_wait, &wait);
149 do {
150 run_task_queue(&tq_disk);
151 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
152 if (!buffer_locked(bh))
153 break;
154 schedule();
155 } while (buffer_locked(bh));
156 tsk->state = TASK_RUNNING;
157 remove_wait_queue(&bh->b_wait, &wait);
158 atomic_dec(&bh->b_count);
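/*
 * The loop above is the classic sleep pattern: take a reference so the
 * buffer head cannot be freed under us, queue ourselves, and only then
 * re-check buffer_locked() after setting TASK_UNINTERRUPTIBLE, so a
 * wake-up arriving between the check and schedule() cannot be lost.
 * run_task_queue(&tq_disk) kicks any queued block I/O so the unlock we
 * are waiting for actually happens.
 */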
161 /* Call sync_buffers with wait!=0 to ensure that the call does not
162 * return until all buffer writes have completed. Sync() may return
163 * before the writes have finished; fsync() may not.
166 /* Godamity-damn. Some buffers (bitmaps for filesystems)
167 * spontaneously dirty themselves without ever brelse being called.
168 * We will ultimately want to put these in a separate list, but for
169 * now we search all of the lists for dirty buffers.
171 static int sync_buffers(kdev_t dev, int wait)
173 int i, retry, pass = 0, err = 0;
174 struct buffer_head * bh, *next;
176 /* One pass for no-wait, three for wait:
177 * 0) write out all dirty, unlocked buffers;
178 * 1) write out all dirty buffers, waiting if locked;
179 * 2) wait for completion by waiting for all buffers to unlock.
181 do {
182 retry = 0;
184 /* We search all lists as a failsafe mechanism, not because we expect
185 * there to be dirty buffers on any of the other lists.
187 repeat:
188 spin_lock(&lru_list_lock);
189 bh = lru_list[BUF_DIRTY];
190 if (!bh)
191 goto repeat2;
193 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
194 next = bh->b_next_free;
196 if (!lru_list[BUF_DIRTY])
197 break;
198 if (dev && bh->b_dev != dev)
199 continue;
200 if (buffer_locked(bh)) {
201 /* Buffer is locked; skip it unless wait is
202 * requested AND pass > 0.
204 if (!wait || !pass) {
205 retry = 1;
206 continue;
208 atomic_inc(&bh->b_count);
209 spin_unlock(&lru_list_lock);
210 wait_on_buffer (bh);
211 atomic_dec(&bh->b_count);
212 goto repeat;
215 /* If an unlocked buffer is not uptodate, there has
216 * been an IO error. Skip it.
218 if (wait && buffer_req(bh) && !buffer_locked(bh) &&
219 !buffer_dirty(bh) && !buffer_uptodate(bh)) {
220 err = -EIO;
221 continue;
224 /* Don't write clean buffers. Don't write ANY buffers
225 * on the third pass.
227 if (!buffer_dirty(bh) || pass >= 2)
228 continue;
230 atomic_inc(&bh->b_count);
231 spin_unlock(&lru_list_lock);
232 ll_rw_block(WRITE, 1, &bh);
233 atomic_dec(&bh->b_count);
234 retry = 1;
235 goto repeat;
238 repeat2:
239 bh = lru_list[BUF_LOCKED];
240 if (!bh) {
241 spin_unlock(&lru_list_lock);
242 break;
244 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
245 next = bh->b_next_free;
247 if (!lru_list[BUF_LOCKED])
248 break;
249 if (dev && bh->b_dev != dev)
250 continue;
251 if (buffer_locked(bh)) {
252 /* Buffer is locked; skip it unless wait is
253 * requested AND pass > 0.
255 if (!wait || !pass) {
256 retry = 1;
257 continue;
259 atomic_inc(&bh->b_count);
260 spin_unlock(&lru_list_lock);
261 wait_on_buffer (bh);
262 spin_lock(&lru_list_lock);
263 atomic_dec(&bh->b_count);
264 goto repeat2;
267 spin_unlock(&lru_list_lock);
269 /* If we are waiting for the sync to succeed, and if any dirty
270 * blocks were written, then repeat; on the second pass, only
271 * wait for buffers being written (do not pass to write any
272 * more buffers on the second pass).
274 } while (wait && retry && ++pass<=2);
275 return err;
278 void sync_dev(kdev_t dev)
280 sync_supers(dev);
281 sync_inodes(dev);
282 DQUOT_SYNC(dev);
283 /* sync all the dirty buffers out to disk only _after_ all the
284 high level layers have finished generating dirty buffer data
285 (or we'll return with some buffer still dirty on the blockdevice
286 so breaking the semantics of this call) */
287 sync_buffers(dev, 0);
289 * FIXME(eric) we need to sync the physical devices here.
290 * This is because some (scsi) controllers have huge amounts of
291 * cache onboard (hundreds of Mb), and we need to instruct
292 * them to commit all of the dirty memory to disk, and we should
293 * not return until this has happened.
295 * This would need to get implemented by going through the assorted
296 * layers so that each block major number can be synced, and this
297 * would call down into the upper and mid-layer scsi.
301 int fsync_dev(kdev_t dev)
303 sync_buffers(dev, 0);
305 lock_kernel();
306 sync_supers(dev);
307 sync_inodes(dev);
308 DQUOT_SYNC(dev);
309 unlock_kernel();
311 return sync_buffers(dev, 1);
314 asmlinkage long sys_sync(void)
316 fsync_dev(0);
317 return 0;
321 * filp may be NULL if called via the msync of a vma.
324 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
326 struct inode * inode = dentry->d_inode;
327 struct super_block * sb;
328 kdev_t dev;
329 int ret;
331 lock_kernel();
332 /* sync the inode to buffers */
333 write_inode_now(inode, 0);
335 /* sync the superblock to buffers */
336 sb = inode->i_sb;
337 wait_on_super(sb);
338 if (sb->s_op && sb->s_op->write_super)
339 sb->s_op->write_super(sb);
341 /* .. finally sync the buffers to disk */
342 dev = inode->i_dev;
343 ret = sync_buffers(dev, 1);
344 unlock_kernel();
345 return ret;
348 asmlinkage long sys_fsync(unsigned int fd)
350 struct file * file;
351 struct dentry * dentry;
352 struct inode * inode;
353 int err;
355 err = -EBADF;
356 file = fget(fd);
357 if (!file)
358 goto out;
360 dentry = file->f_dentry;
361 inode = dentry->d_inode;
363 err = -EINVAL;
364 if (!file->f_op || !file->f_op->fsync)
365 goto out_putf;
367 /* We need to protect against concurrent writers.. */
368 down(&inode->i_sem);
369 err = file->f_op->fsync(file, dentry, 0);
370 up(&inode->i_sem);
372 out_putf:
373 fput(file);
374 out:
375 return err;
378 asmlinkage long sys_fdatasync(unsigned int fd)
380 struct file * file;
381 struct dentry * dentry;
382 struct inode * inode;
383 int err;
385 err = -EBADF;
386 file = fget(fd);
387 if (!file)
388 goto out;
390 dentry = file->f_dentry;
391 inode = dentry->d_inode;
393 err = -EINVAL;
394 if (!file->f_op || !file->f_op->fsync)
395 goto out_putf;
397 down(&inode->i_sem);
398 err = file->f_op->fsync(file, dentry, 1);
399 up(&inode->i_sem);
401 out_putf:
402 fput(file);
403 out:
404 return err;
407 /* After several hours of tedious analysis, the following hash
408 * function won. Do not mess with it... -DaveM
410 #define _hashfn(dev,block) \
411 ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
412 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
413 #define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
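/*
 * A lookup hashes (dev, block), masks the result into the table (the
 * table size is a power of two; bh_hash_mask is presumably set to
 * size-1 in buffer_init(), which is not shown here), and then walks the
 * b_next chain comparing dev, block and size - exactly what
 * __get_hash_table() below does under the hash_table_lock.
 */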
415 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
417 if ((bh->b_next = *head) != NULL)
418 bh->b_next->b_pprev = &bh->b_next;
419 *head = bh;
420 bh->b_pprev = head;
423 static __inline__ void __hash_unlink(struct buffer_head *bh)
425 if (bh->b_pprev) {
426 if (bh->b_next)
427 bh->b_next->b_pprev = bh->b_pprev;
428 *(bh->b_pprev) = bh->b_next;
429 bh->b_pprev = NULL;
433 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
435 struct buffer_head **bhp = &lru_list[blist];
437 if(!*bhp) {
438 *bhp = bh;
439 bh->b_prev_free = bh;
441 bh->b_next_free = *bhp;
442 bh->b_prev_free = (*bhp)->b_prev_free;
443 (*bhp)->b_prev_free->b_next_free = bh;
444 (*bhp)->b_prev_free = bh;
445 nr_buffers_type[blist]++;
446 size_buffers_type[blist] += bh->b_size;
449 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
451 if (bh->b_prev_free || bh->b_next_free) {
452 bh->b_prev_free->b_next_free = bh->b_next_free;
453 bh->b_next_free->b_prev_free = bh->b_prev_free;
454 if (lru_list[blist] == bh)
455 lru_list[blist] = bh->b_next_free;
456 if (lru_list[blist] == bh)
457 lru_list[blist] = NULL;
458 bh->b_next_free = bh->b_prev_free = NULL;
459 nr_buffers_type[blist]--;
460 size_buffers_type[blist] -= bh->b_size;
464 static void __remove_from_free_list(struct buffer_head * bh, int index)
466 if(bh->b_next_free == bh)
467 free_list[index].list = NULL;
468 else {
469 bh->b_prev_free->b_next_free = bh->b_next_free;
470 bh->b_next_free->b_prev_free = bh->b_prev_free;
471 if (free_list[index].list == bh)
472 free_list[index].list = bh->b_next_free;
474 bh->b_next_free = bh->b_prev_free = NULL;
477 /* must be called with both the hash_table_lock and the lru_list_lock
478 held */
479 static void __remove_from_queues(struct buffer_head *bh)
481 __hash_unlink(bh);
482 __remove_from_lru_list(bh, bh->b_list);
485 static void __insert_into_queues(struct buffer_head *bh)
487 struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
489 __hash_link(bh, head);
490 __insert_into_lru_list(bh, bh->b_list);
493 /* This function must only run if there are no other
494 * references _anywhere_ to this buffer head.
496 static void put_last_free(struct buffer_head * bh)
498 struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
499 struct buffer_head **bhp = &head->list;
501 bh->b_state = 0;
503 spin_lock(&head->lock);
504 bh->b_dev = B_FREE;
505 if(!*bhp) {
506 *bhp = bh;
507 bh->b_prev_free = bh;
509 bh->b_next_free = *bhp;
510 bh->b_prev_free = (*bhp)->b_prev_free;
511 (*bhp)->b_prev_free->b_next_free = bh;
512 (*bhp)->b_prev_free = bh;
513 spin_unlock(&head->lock);
517 * Why like this, I hear you say... The reason is race-conditions.
518 * As we don't lock buffers (unless we are reading them, that is),
519 * something might happen to it while we sleep (ie a read-error
520 * will force it bad). This shouldn't really happen currently, but
521 * the code is ready.
523 static inline struct buffer_head * __get_hash_table(kdev_t dev, int block, int size)
525 struct buffer_head *bh = hash(dev, block);
527 for (; bh; bh = bh->b_next)
528 if (bh->b_blocknr == block &&
529 bh->b_size == size &&
530 bh->b_dev == dev)
531 break;
532 if (bh)
533 atomic_inc(&bh->b_count);
535 return bh;
538 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
540 struct buffer_head *bh;
542 read_lock(&hash_table_lock);
543 bh = __get_hash_table(dev, block, size);
544 read_unlock(&hash_table_lock);
546 return bh;
549 unsigned int get_hardblocksize(kdev_t dev)
552 * Get the hard sector size for the given device. If we don't know
553 * what it is, return 0.
555 if (hardsect_size[MAJOR(dev)] != NULL) {
556 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
557 if (blksize != 0)
558 return blksize;
562 * We don't know what the hardware sector size for this device is.
563 * Return 0 indicating that we don't know.
565 return 0;
568 /* If invalidate_buffers() trashes dirty buffers, it means some kind
569    of fs corruption is going on. Trashing dirty data always implies losing
570    information that was supposed to be just stored on the physical layer
571    by the user.
573    Thus invalidate_buffers in general usage is not allowed to trash dirty
574    buffers. For example ioctl(BLKFLSBUF) expects dirty data to be preserved.
576    NOTE: if the user removes a removable-media disk while there is still
577    dirty data not synced to disk (due to a bug in the device driver
578    or to an error of the user), then by not destroying the dirty buffers we
579    could also corrupt the next media inserted; thus a parameter is
580    necessary to handle this case in the safest way possible (trying
581    not to corrupt the newly inserted disk with data belonging to
582    the old, now corrupted, disk). Also, for a ramdisk the natural way
583    to release the ramdisk memory is to destroy its dirty buffers.
585    These are two special cases. Normal usage implies that the device driver
586    issues a sync on the device (without waiting for I/O completion) and
587    then an invalidate_buffers call that doesn't trash dirty buffers. */
588 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
590 int i, nlist, slept;
591 struct buffer_head * bh, * bh_next;
593 retry:
594 slept = 0;
595 spin_lock(&lru_list_lock);
596 for(nlist = 0; nlist < NR_LIST; nlist++) {
597 bh = lru_list[nlist];
598 if (!bh)
599 continue;
600 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
601 bh_next = bh->b_next_free;
602 if (bh->b_dev != dev)
603 continue;
604 if (buffer_locked(bh)) {
605 atomic_inc(&bh->b_count);
606 spin_unlock(&lru_list_lock);
607 wait_on_buffer(bh);
608 slept = 1;
609 spin_lock(&lru_list_lock);
610 atomic_dec(&bh->b_count);
613 write_lock(&hash_table_lock);
614 if (!atomic_read(&bh->b_count) &&
615 (destroy_dirty_buffers || !buffer_dirty(bh))) {
616 __remove_from_queues(bh);
617 put_last_free(bh);
619 write_unlock(&hash_table_lock);
620 if (slept)
621 goto out;
624 out:
625 spin_unlock(&lru_list_lock);
626 if (slept)
627 goto retry;
630 void set_blocksize(kdev_t dev, int size)
632 extern int *blksize_size[];
633 int i, nlist, slept;
634 struct buffer_head * bh, * bh_next;
636 if (!blksize_size[MAJOR(dev)])
637 return;
639 /* Size must be a power of two, and between 512 and PAGE_SIZE */
640 if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
641 panic("Invalid blocksize passed to set_blocksize");
643 if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
644 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
645 return;
647 if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
648 return;
649 sync_buffers(dev, 2);
650 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
652 retry:
653 slept = 0;
654 spin_lock(&lru_list_lock);
655 for(nlist = 0; nlist < NR_LIST; nlist++) {
656 bh = lru_list[nlist];
657 if (!bh)
658 continue;
659 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
660 bh_next = bh->b_next_free;
661 if (bh->b_dev != dev || bh->b_size == size)
662 continue;
663 if (buffer_locked(bh)) {
664 atomic_inc(&bh->b_count);
665 spin_unlock(&lru_list_lock);
666 wait_on_buffer(bh);
667 slept = 1;
668 spin_lock(&lru_list_lock);
669 atomic_dec(&bh->b_count);
672 write_lock(&hash_table_lock);
673 if (!atomic_read(&bh->b_count)) {
674 if (buffer_dirty(bh))
675 printk(KERN_WARNING
676 "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
677 kdevname(dev), bh->b_blocknr, bh->b_size);
678 __remove_from_queues(bh);
679 put_last_free(bh);
680 } else {
681 if (atomic_set_buffer_clean(bh))
682 __refile_buffer(bh);
683 clear_bit(BH_Uptodate, &bh->b_state);
684 printk(KERN_WARNING
685 "set_blocksize: "
686 "b_count %d, dev %s, block %lu, from %p\n",
687 atomic_read(&bh->b_count), bdevname(bh->b_dev),
688 bh->b_blocknr, __builtin_return_address(0));
690 write_unlock(&hash_table_lock);
691 if (slept)
692 goto out;
695 out:
696 spin_unlock(&lru_list_lock);
697 if (slept)
698 goto retry;
702 * We used to try various strange things. Let's not.
704 static void refill_freelist(int size)
706 if (!grow_buffers(size)) {
707 wakeup_bdflush(1);
708 current->policy |= SCHED_YIELD;
709 schedule();
713 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
715 bh->b_list = BUF_CLEAN;
716 bh->b_end_io = handler;
717 bh->b_private = private;
720 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
722 mark_buffer_uptodate(bh, uptodate);
723 unlock_buffer(bh);
726 static void end_buffer_io_bad(struct buffer_head *bh, int uptodate)
728 mark_buffer_uptodate(bh, uptodate);
729 unlock_buffer(bh);
730 BUG();
733 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
735 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
736 unsigned long flags;
737 struct buffer_head *tmp;
738 struct page *page;
740 mark_buffer_uptodate(bh, uptodate);
742 /* This is a temporary buffer used for page I/O. */
743 page = bh->b_page;
745 if (!uptodate)
746 SetPageError(page);
749 * Be _very_ careful from here on. Bad things can happen if
750 * two buffer heads end IO at almost the same time and both
751 * decide that the page is now completely done.
753 * Async buffer_heads are here only as labels for IO, and get
754 * thrown away once the IO for this page is complete. IO is
755 * deemed complete once all buffers have been visited
756 * (b_count==0) and are now unlocked. We must make sure that
757 * only the _last_ buffer that decrements its count is the one
758 * that unlock the page..
760 spin_lock_irqsave(&page_uptodate_lock, flags);
761 unlock_buffer(bh);
762 atomic_dec(&bh->b_count);
763 tmp = bh->b_this_page;
764 while (tmp != bh) {
765 if (tmp->b_end_io == end_buffer_io_async && buffer_locked(tmp))
766 goto still_busy;
767 tmp = tmp->b_this_page;
770 /* OK, the async IO on this page is complete. */
771 spin_unlock_irqrestore(&page_uptodate_lock, flags);
774 * if none of the buffers had errors then we can set the
775 * page uptodate:
777 if (!PageError(page))
778 SetPageUptodate(page);
781 * Run the hooks that have to be done when a page I/O has completed.
783 if (PageTestandClearDecrAfter(page))
784 atomic_dec(&nr_async_pages);
786 UnlockPage(page);
788 return;
790 still_busy:
791 spin_unlock_irqrestore(&page_uptodate_lock, flags);
792 return;
796 * Ok, this is getblk, and it isn't very clear, again to hinder
797 * race-conditions. Most of the code is seldom used, (ie repeating),
798 * so it should be much more efficient than it looks.
800 * The algorithm is changed: hopefully better, and an elusive bug removed.
802 * 14.02.92: changed it to sync dirty buffers a bit: better performance
803 * when the filesystem starts to get full of dirty blocks (I hope).
805 struct buffer_head * getblk(kdev_t dev, int block, int size)
807 struct buffer_head * bh;
808 int isize;
810 repeat:
811 spin_lock(&lru_list_lock);
812 write_lock(&hash_table_lock);
813 bh = __get_hash_table(dev, block, size);
814 if (bh)
815 goto out;
817 isize = BUFSIZE_INDEX(size);
818 spin_lock(&free_list[isize].lock);
819 bh = free_list[isize].list;
820 if (bh) {
821 __remove_from_free_list(bh, isize);
822 atomic_set(&bh->b_count, 1);
824 spin_unlock(&free_list[isize].lock);
827 * OK, FINALLY we know that this buffer is the only one of
828 * its kind, we hold a reference (b_count>0), it is unlocked,
829 * and it is clean.
831 if (bh) {
832 init_buffer(bh, end_buffer_io_sync, NULL);
833 bh->b_dev = dev;
834 bh->b_blocknr = block;
835 bh->b_state = 1 << BH_Mapped;
837 /* Insert the buffer into the regular lists */
838 __insert_into_queues(bh);
839 out:
840 write_unlock(&hash_table_lock);
841 spin_unlock(&lru_list_lock);
842 touch_buffer(bh);
843 return bh;
847 * If we block while refilling the free list, somebody may
848 * create the buffer first ... search the hashes again.
850 write_unlock(&hash_table_lock);
851 spin_unlock(&lru_list_lock);
852 refill_freelist(size);
853 goto repeat;
856 /* -1 -> no need to flush
857 0 -> async flush
858    1 -> sync flush (wait for I/O completion) */
859 static int balance_dirty_state(kdev_t dev)
861 unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
863 dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
864 tot = nr_free_buffer_pages();
865 tot -= size_buffers_type[BUF_PROTECTED] >> PAGE_SHIFT;
867 dirty *= 200;
868 soft_dirty_limit = tot * bdf_prm.b_un.nfract;
869 hard_dirty_limit = soft_dirty_limit * 2;
871 if (dirty > soft_dirty_limit) {
872 if (dirty > hard_dirty_limit)
873 return 1;
874 return 0;
876 return -1;
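/*
 * Working through the arithmetic above: with dirty scaled by 200 and
 * soft_dirty_limit = tot * nfract, "dirty > soft_dirty_limit" means the
 * dirty fraction exceeds nfract/200 of the usable buffer pages - 20%
 * with the default nfract of 40 - and the hard limit (sync rather than
 * async flush) sits at twice that, i.e. 40%.
 */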
880 * if a new dirty buffer is created we need to balance bdflush.
882 * in the future we might want to make bdflush aware of different
883 * pressures on different devices - thus the (currently unused)
884 * 'dev' parameter.
886 void balance_dirty(kdev_t dev)
888 int state = balance_dirty_state(dev);
890 if (state < 0)
891 return;
892 wakeup_bdflush(state);
895 static __inline__ void __mark_dirty(struct buffer_head *bh, int flag)
897 bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
898 refile_buffer(bh);
901 /* atomic version, the user must call balance_dirty() by hand
902    as soon as it becomes possible to block */
903 void __mark_buffer_dirty(struct buffer_head *bh, int flag)
905 if (!atomic_set_buffer_dirty(bh))
906 __mark_dirty(bh, flag);
909 void mark_buffer_dirty(struct buffer_head *bh, int flag)
911 __mark_buffer_dirty(bh, flag);
912 balance_dirty(bh->b_dev);
916 * A buffer may need to be moved from one buffer list to another
917 * (e.g. in case it is not shared any more). Handle this.
919 static void __refile_buffer(struct buffer_head *bh)
921 int dispose = BUF_CLEAN;
922 if (buffer_locked(bh))
923 dispose = BUF_LOCKED;
924 if (buffer_dirty(bh))
925 dispose = BUF_DIRTY;
926 if (buffer_protected(bh))
927 dispose = BUF_PROTECTED;
928 if (dispose != bh->b_list) {
929 __remove_from_lru_list(bh, bh->b_list);
930 bh->b_list = dispose;
931 __insert_into_lru_list(bh, dispose);
935 void refile_buffer(struct buffer_head *bh)
937 spin_lock(&lru_list_lock);
938 __refile_buffer(bh);
939 spin_unlock(&lru_list_lock);
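/*
 * So every buffer lives on exactly one of the BUF_CLEAN, BUF_LOCKED,
 * BUF_DIRTY or BUF_PROTECTED lists, chosen purely from its state bits;
 * refile_buffer() is called whenever those bits change so the lists
 * stay consistent with reality.
 */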
943 * Release a buffer head
945 void __brelse(struct buffer_head * buf)
947 if (atomic_read(&buf->b_count)) {
948 atomic_dec(&buf->b_count);
949 return;
951 printk("VFS: brelse: Trying to free free buffer\n");
955 * bforget() is like brelse(), except it puts the buffer on the
956 * free list if it can.. We can NOT free the buffer if:
957 * - there are other users of it
958 * - it is locked and thus can have active IO
960 void __bforget(struct buffer_head * buf)
962 /* grab the lru lock here to block bdflush. */
963 spin_lock(&lru_list_lock);
964 write_lock(&hash_table_lock);
965 if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf))
966 goto in_use;
967 __hash_unlink(buf);
968 write_unlock(&hash_table_lock);
969 __remove_from_lru_list(buf, buf->b_list);
970 spin_unlock(&lru_list_lock);
971 put_last_free(buf);
972 return;
974 in_use:
975 write_unlock(&hash_table_lock);
976 spin_unlock(&lru_list_lock);
980 * bread() reads a specified block and returns the buffer that contains
981 * it. It returns NULL if the block was unreadable.
983 struct buffer_head * bread(kdev_t dev, int block, int size)
985 struct buffer_head * bh;
987 bh = getblk(dev, block, size);
988 if (buffer_uptodate(bh))
989 return bh;
990 ll_rw_block(READ, 1, &bh);
991 wait_on_buffer(bh);
992 if (buffer_uptodate(bh))
993 return bh;
994 brelse(bh);
995 return NULL;
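/*
 * A typical caller pattern, sketched for illustration (not taken from
 * this file; "sb" is just an assumed superblock pointer):
 *
 *	struct buffer_head *bh = bread(dev, block, sb->s_blocksize);
 *	if (!bh)
 *		return -EIO;		(the block was unreadable)
 *	... examine bh->b_data ...
 *	brelse(bh);
 *
 * bread() returns with b_count raised, so each successful call must be
 * balanced by brelse() or bforget().
 */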
999 * Ok, breada can be used like bread, but additionally marks other
1000 * blocks for reading as well. End the argument list with a negative
1001 * number.
1004 #define NBUF 16
1006 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
1007 unsigned int pos, unsigned int filesize)
1009 struct buffer_head * bhlist[NBUF];
1010 unsigned int blocks;
1011 struct buffer_head * bh;
1012 int index;
1013 int i, j;
1015 if (pos >= filesize)
1016 return NULL;
1018 if (block < 0)
1019 return NULL;
1021 bh = getblk(dev, block, bufsize);
1022 index = BUFSIZE_INDEX(bh->b_size);
1024 if (buffer_uptodate(bh))
1025 return(bh);
1026 else ll_rw_block(READ, 1, &bh);
1028 blocks = (filesize - pos) >> (9+index);
1030 if (blocks < (read_ahead[MAJOR(dev)] >> index))
1031 blocks = read_ahead[MAJOR(dev)] >> index;
1032 if (blocks > NBUF)
1033 blocks = NBUF;
1035 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */
1037 bhlist[0] = bh;
1038 j = 1;
1039 for(i=1; i<blocks; i++) {
1040 bh = getblk(dev,block+i,bufsize);
1041 if (buffer_uptodate(bh)) {
1042 brelse(bh);
1043 break;
1045 else bhlist[j++] = bh;
1048 /* Request the read for these buffers, and then release them. */
1049 if (j>1)
1050 ll_rw_block(READA, (j-1), bhlist+1);
1051 for(i=1; i<j; i++)
1052 brelse(bhlist[i]);
1054 /* Wait for this buffer, and then continue on. */
1055 bh = bhlist[0];
1056 wait_on_buffer(bh);
1057 if (buffer_uptodate(bh))
1058 return bh;
1059 brelse(bh);
1060 return NULL;
1064 * Note: the caller should wake up the buffer_wait list if needed.
1066 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
1068 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1069 kmem_cache_free(bh_cachep, bh);
1070 } else {
1071 bh->b_blocknr = -1;
1072 init_waitqueue_head(&bh->b_wait);
1073 nr_unused_buffer_heads++;
1074 bh->b_next_free = unused_list;
1075 bh->b_this_page = NULL;
1076 unused_list = bh;
1081 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1082 * no-buffer-head deadlock. Return NULL on failure; waiting for
1083 * buffer heads is now handled in create_buffers().
1085 static struct buffer_head * get_unused_buffer_head(int async)
1087 struct buffer_head * bh;
1089 spin_lock(&unused_list_lock);
1090 if (nr_unused_buffer_heads > NR_RESERVED) {
1091 bh = unused_list;
1092 unused_list = bh->b_next_free;
1093 nr_unused_buffer_heads--;
1094 spin_unlock(&unused_list_lock);
1095 return bh;
1097 spin_unlock(&unused_list_lock);
1099 /* This is critical. We can't swap out pages to get
1100 * more buffer heads, because the swap-out may need
1101 * more buffer-heads itself. Thus SLAB_BUFFER.
1103 if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
1104 memset(bh, 0, sizeof(*bh));
1105 init_waitqueue_head(&bh->b_wait);
1106 return bh;
1110 * If we need an async buffer, use the reserved buffer heads.
1112 if (async) {
1113 spin_lock(&unused_list_lock);
1114 if (unused_list) {
1115 bh = unused_list;
1116 unused_list = bh->b_next_free;
1117 nr_unused_buffer_heads--;
1118 spin_unlock(&unused_list_lock);
1119 return bh;
1121 spin_unlock(&unused_list_lock);
1123 #if 0
1125 * (Pending further analysis ...)
1126 * Ordinary (non-async) requests can use a different memory priority
1127 * to free up pages. Any swapping thus generated will use async
1128 * buffer heads.
1130 if(!async &&
1131 (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1132 memset(bh, 0, sizeof(*bh));
1133 init_waitqueue_head(&bh->b_wait);
1134 return bh;
1136 #endif
1138 return NULL;
1141 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1143 bh->b_page = page;
1144 if (offset >= PAGE_SIZE)
1145 BUG();
1146 if (PageHighMem(page))
1148 * This catches illegal uses and preserves the offset:
1150 bh->b_data = (char *)(0 + offset);
1151 else
1152 bh->b_data = page_address(page) + offset;
1156 * Create the appropriate buffers when given a page for data area and
1157 * the size of each buffer.. Use the bh->b_this_page linked list to
1158 * follow the buffers created. Return NULL if unable to create more
1159 * buffers.
1160 * The async flag is used to differentiate async IO (paging, swapping)
1161 * from ordinary buffer allocations, and only async requests are allowed
1162 * to sleep waiting for buffer heads.
1164 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1166 struct buffer_head *bh, *head;
1167 long offset;
1169 try_again:
1170 head = NULL;
1171 offset = PAGE_SIZE;
1172 while ((offset -= size) >= 0) {
1173 bh = get_unused_buffer_head(async);
1174 if (!bh)
1175 goto no_grow;
1177 bh->b_dev = B_FREE; /* Flag as unused */
1178 bh->b_this_page = head;
1179 head = bh;
1181 bh->b_state = 0;
1182 bh->b_next_free = NULL;
1183 bh->b_pprev = NULL;
1184 atomic_set(&bh->b_count, 0);
1185 bh->b_size = size;
1187 set_bh_page(bh, page, offset);
1189 bh->b_list = BUF_CLEAN;
1190 bh->b_end_io = end_buffer_io_bad;
1192 return head;
1194 * In case anything failed, we just free everything we got.
1196 no_grow:
1197 if (head) {
1198 spin_lock(&unused_list_lock);
1199 do {
1200 bh = head;
1201 head = head->b_this_page;
1202 __put_unused_buffer_head(bh);
1203 } while (head);
1204 spin_unlock(&unused_list_lock);
1206 /* Wake up any waiters ... */
1207 wake_up(&buffer_wait);
1211 * Return failure for non-async IO requests. Async IO requests
1212 * are not allowed to fail, so we have to wait until buffer heads
1213 * become available. But we don't want tasks sleeping with
1214 * partially complete buffers, so all were released above.
1216 if (!async)
1217 return NULL;
1219 /* We're _really_ low on memory. Now we just
1220 * wait for old buffer heads to become free due to
1221 * finishing IO. Since this is an async request and
1222 * the reserve list is empty, we're sure there are
1223 * async buffer heads in use.
1225 run_task_queue(&tq_disk);
1228 * Set our state for sleeping, then check again for buffer heads.
1229 * This ensures we won't miss a wake_up from an interrupt.
1231 wait_event(buffer_wait, nr_unused_buffer_heads >= MAX_BUF_PER_PAGE);
1232 goto try_again;
1235 static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size)
1237 struct buffer_head *head, *bh, *tail;
1238 int block;
1240 if (!PageLocked(page))
1241 BUG();
1243 * Allocate async buffer heads pointing to this page, just for I/O.
1244 * They don't show up in the buffer hash table, but they *are*
1245 * registered in page->buffers.
1247 head = create_buffers(page, size, 1);
1248 if (page->buffers)
1249 BUG();
1250 if (!head)
1251 BUG();
1252 tail = head;
1253 for (bh = head; bh; bh = bh->b_this_page) {
1254 block = *(b++);
1256 tail = bh;
1257 init_buffer(bh, end_buffer_io_async, NULL);
1258 bh->b_dev = dev;
1259 bh->b_blocknr = block;
1261 set_bit(BH_Mapped, &bh->b_state);
1263 tail->b_this_page = head;
1264 page_cache_get(page);
1265 page->buffers = head;
1266 return 0;
1269 static void unmap_buffer(struct buffer_head * bh)
1271 if (buffer_mapped(bh)) {
1272 mark_buffer_clean(bh);
1273 wait_on_buffer(bh);
1274 clear_bit(BH_Uptodate, &bh->b_state);
1275 clear_bit(BH_Mapped, &bh->b_state);
1276 clear_bit(BH_Req, &bh->b_state);
1277 clear_bit(BH_New, &bh->b_state);
1282 * We don't have to release all buffers here, but
1283 * we have to be sure that no dirty buffer is left
1284 * and no IO is going on (no buffer is locked), because
1285 * we have truncated the file and are going to free the
1286 * blocks on-disk..
1288 int block_flushpage(struct page *page, unsigned long offset)
1290 struct buffer_head *head, *bh, *next;
1291 unsigned int curr_off = 0;
1293 if (!PageLocked(page))
1294 BUG();
1295 if (!page->buffers)
1296 return 1;
1298 head = page->buffers;
1299 bh = head;
1300 do {
1301 unsigned int next_off = curr_off + bh->b_size;
1302 next = bh->b_this_page;
1305 * is this block fully flushed?
1307 if (offset <= curr_off)
1308 unmap_buffer(bh);
1309 curr_off = next_off;
1310 bh = next;
1311 } while (bh != head);
1314 * subtle. We release buffer-heads only if this is
1315 * the 'final' flushpage. We have invalidated the get_block
1316 * cached value unconditionally, so real IO is not
1317 * possible anymore.
1319 * If the free doesn't work out, the buffers can be
1320 * left around - they just turn into anonymous buffers
1321 * instead.
1323 if (!offset) {
1324 if (!try_to_free_buffers(page, 0)) {
1325 atomic_inc(&buffermem_pages);
1326 return 0;
1330 return 1;
1333 static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
1335 struct buffer_head *bh, *head, *tail;
1337 head = create_buffers(page, blocksize, 1);
1338 if (page->buffers)
1339 BUG();
1341 bh = head;
1342 do {
1343 bh->b_dev = inode->i_dev;
1344 bh->b_blocknr = 0;
1345 bh->b_end_io = end_buffer_io_bad;
1346 tail = bh;
1347 bh = bh->b_this_page;
1348 } while (bh);
1349 tail->b_this_page = head;
1350 page->buffers = head;
1351 page_cache_get(page);
1355 * We are taking a block for data and we don't want any output from any
1356 * buffer-cache aliases starting from return from that function and
1357 * until the moment when something explicitly marks the buffer
1358 * dirty (hopefully that will not happen until we free that block ;-)
1359 * We don't even need to mark it not-uptodate - nobody can expect
1360 * anything from a newly allocated buffer anyway. We used to use
1361 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1362 * don't want to mark the alias unmapped, for example - it would confuse
1363 * anyone who might pick it with bread() afterwards...
1366 static void unmap_underlying_metadata(struct buffer_head * bh)
1368 struct buffer_head *old_bh;
1370 old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1371 if (old_bh) {
1372 mark_buffer_clean(old_bh);
1373 wait_on_buffer(old_bh);
1374 clear_bit(BH_Req, &old_bh->b_state);
1375 /* Here we could run brelse or bforget. We use
1376 bforget because it will try to put the buffer
1377 in the freelist. */
1378 __bforget(old_bh);
1383 * block_write_full_page() is SMP-safe - currently it's still
1384 * being called with the kernel lock held, but the code is ready.
1386 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1388 int err, i, need_balance_dirty = 0;
1389 unsigned long block;
1390 struct buffer_head *bh, *head;
1392 if (!PageLocked(page))
1393 BUG();
1395 if (!page->buffers)
1396 create_empty_buffers(page, inode, inode->i_sb->s_blocksize);
1397 head = page->buffers;
1399 block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1401 bh = head;
1402 i = 0;
1403 do {
1405 * If the buffer isn't up-to-date, we can't be sure
1406 * that the buffer has been initialized with the proper
1407 * block number information etc..
1409 * Leave it to the low-level FS to make all those
1410 * decisions (block #0 may actually be a valid block)
1412 bh->b_end_io = end_buffer_io_sync;
1413 if (!buffer_mapped(bh)) {
1414 err = get_block(inode, block, bh, 1);
1415 if (err)
1416 goto out;
1417 if (buffer_new(bh))
1418 unmap_underlying_metadata(bh);
1420 set_bit(BH_Uptodate, &bh->b_state);
1421 if (!atomic_set_buffer_dirty(bh)) {
1422 __mark_dirty(bh, 0);
1423 need_balance_dirty = 1;
1426 bh = bh->b_this_page;
1427 block++;
1428 } while (bh != head);
1430 if (need_balance_dirty)
1431 balance_dirty(bh->b_dev);
1433 SetPageUptodate(page);
1434 return 0;
1435 out:
1436 ClearPageUptodate(page);
1437 return err;
1440 static int __block_prepare_write(struct inode *inode, struct page *page,
1441 unsigned from, unsigned to, get_block_t *get_block)
1443 unsigned block_start, block_end;
1444 unsigned long block;
1445 int err = 0;
1446 unsigned blocksize, bbits;
1447 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1448 char *kaddr = (char *)kmap(page);
1450 blocksize = inode->i_sb->s_blocksize;
1451 if (!page->buffers)
1452 create_empty_buffers(page, inode, blocksize);
1453 head = page->buffers;
1455 bbits = inode->i_sb->s_blocksize_bits;
1456 block = page->index << (PAGE_CACHE_SHIFT - bbits);
1458 for(bh = head, block_start = 0; bh != head || !block_start;
1459 block++, block_start=block_end, bh = bh->b_this_page) {
1460 if (!bh)
1461 BUG();
1462 block_end = block_start+blocksize;
1463 if (block_end <= from)
1464 continue;
1465 if (block_start >= to)
1466 break;
1467 bh->b_end_io = end_buffer_io_sync;
1468 if (!buffer_mapped(bh)) {
1469 err = get_block(inode, block, bh, 1);
1470 if (err)
1471 goto out;
1472 if (buffer_new(bh)) {
1473 unmap_underlying_metadata(bh);
1474 if (block_end > to)
1475 memset(kaddr+to, 0, block_end-to);
1476 if (block_start < from)
1477 memset(kaddr+block_start, 0, from-block_start);
1478 if (block_end > to || block_start < from)
1479 flush_dcache_page(page);
1480 continue;
1483 if (!buffer_uptodate(bh) &&
1484 (block_start < from || block_end > to)) {
1485 ll_rw_block(READ, 1, &bh);
1486 *wait_bh++=bh;
1490 * If we issued read requests - let them complete.
1492 while(wait_bh > wait) {
1493 wait_on_buffer(*--wait_bh);
1494 err = -EIO;
1495 if (!buffer_uptodate(*wait_bh))
1496 goto out;
1498 return 0;
1499 out:
1500 return err;
1503 static int __block_commit_write(struct inode *inode, struct page *page,
1504 unsigned from, unsigned to)
1506 unsigned block_start, block_end;
1507 int partial = 0, need_balance_dirty = 0;
1508 unsigned blocksize;
1509 struct buffer_head *bh, *head;
1511 blocksize = inode->i_sb->s_blocksize;
1513 for(bh = head = page->buffers, block_start = 0;
1514 bh != head || !block_start;
1515 block_start=block_end, bh = bh->b_this_page) {
1516 block_end = block_start + blocksize;
1517 if (block_end <= from || block_start >= to) {
1518 if (!buffer_uptodate(bh))
1519 partial = 1;
1520 } else {
1521 set_bit(BH_Uptodate, &bh->b_state);
1522 if (!atomic_set_buffer_dirty(bh)) {
1523 __mark_dirty(bh, 0);
1524 need_balance_dirty = 1;
1529 if (need_balance_dirty)
1530 balance_dirty(bh->b_dev);
1532 * If this is a partial write that happened to make all buffers
1533 * uptodate then we can optimize away a bogus readpage() for
1534 * the next read(). Here we 'discover' whether the page went
1535 * uptodate as a result of this (potentially partial) write.
1537 if (!partial)
1538 SetPageUptodate(page);
1539 return 0;
1543 * Generic "read page" function for block devices that have the normal
1544 * get_block functionality. This is most of the block device filesystems.
1545 * Reads the page asynchronously --- the unlock_buffer() and
1546 * mark_buffer_uptodate() functions propagate buffer state into the
1547 * page struct once IO has completed.
1549 int block_read_full_page(struct page *page, get_block_t *get_block)
1551 struct inode *inode = (struct inode*)page->mapping->host;
1552 unsigned long iblock, lblock;
1553 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1554 unsigned int blocksize, blocks;
1555 unsigned long kaddr = 0;
1556 int nr, i;
1558 if (!PageLocked(page))
1559 PAGE_BUG(page);
1560 blocksize = inode->i_sb->s_blocksize;
1561 if (!page->buffers)
1562 create_empty_buffers(page, inode, blocksize);
1563 head = page->buffers;
1565 blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits;
1566 iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1567 lblock = (inode->i_size+blocksize-1) >> inode->i_sb->s_blocksize_bits;
1568 bh = head;
1569 nr = 0;
1570 i = 0;
1572 do {
1573 if (buffer_uptodate(bh))
1574 continue;
1576 if (!buffer_mapped(bh)) {
1577 if (iblock < lblock)
1578 get_block(inode, iblock, bh, 0);
1579 if (!buffer_mapped(bh)) {
1580 if (!kaddr)
1581 kaddr = kmap(page);
1582 memset((char *)(kaddr + i*blocksize), 0, blocksize);
1583 flush_dcache_page(page);
1584 set_bit(BH_Uptodate, &bh->b_state);
1585 continue;
1589 init_buffer(bh, end_buffer_io_async, NULL);
1590 atomic_inc(&bh->b_count);
1591 arr[nr] = bh;
1592 nr++;
1593 } while (i++, iblock++, (bh = bh->b_this_page) != head);
1595 if (nr) {
1596 if (Page_Uptodate(page))
1597 BUG();
1598 ll_rw_block(READ, nr, arr);
1599 } else {
1601 * all buffers are uptodate - we can set the page
1602 * uptodate as well.
1604 SetPageUptodate(page);
1605 UnlockPage(page);
1607 if (kaddr)
1608 kunmap(page);
1609 return 0;
1613 * For moronic filesystems that do not allow holes in files.
1614 * We may have to extend the file.
1617 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1619 struct address_space *mapping = page->mapping;
1620 struct inode *inode = (struct inode*)mapping->host;
1621 struct page *new_page;
1622 unsigned long pgpos;
1623 long status;
1624 unsigned zerofrom;
1625 unsigned blocksize = inode->i_sb->s_blocksize;
1626 char *kaddr;
1628 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1629 status = -ENOMEM;
1630 new_page = grab_cache_page(mapping, pgpos);
1631 if (!new_page)
1632 goto out;
1633 /* we might sleep */
1634 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1635 UnlockPage(new_page);
1636 page_cache_release(new_page);
1637 continue;
1639 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1640 if (zerofrom & (blocksize-1)) {
1641 *bytes |= (blocksize-1);
1642 (*bytes)++;
1644 status = __block_prepare_write(inode, new_page, zerofrom,
1645 PAGE_CACHE_SIZE, get_block);
1646 if (status)
1647 goto out_unmap;
1648 kaddr = page_address(new_page);
1649 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1650 flush_dcache_page(new_page);
1651 __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
1652 kunmap(new_page);
1653 UnlockPage(new_page);
1654 page_cache_release(new_page);
1657 if (page->index < pgpos) {
1658 /* completely inside the area */
1659 zerofrom = offset;
1660 } else {
1661 /* page covers the boundary, find the boundary offset */
1662 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1664 /* if we will expand the thing last block will be filled */
1665 if (to > zerofrom && (zerofrom & (blocksize-1))) {
1666 *bytes |= (blocksize-1);
1667 (*bytes)++;
1670 /* starting below the boundary? Nothing to zero out */
1671 if (offset <= zerofrom)
1672 zerofrom = offset;
1674 status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1675 if (status)
1676 goto out1;
1677 kaddr = page_address(page);
1678 if (zerofrom < offset) {
1679 memset(kaddr+zerofrom, 0, offset-zerofrom);
1680 flush_dcache_page(page);
1681 __block_commit_write(inode, page, zerofrom, offset);
1683 return 0;
1684 out1:
1685 ClearPageUptodate(page);
1686 kunmap(page);
1687 return status;
1689 out_unmap:
1690 ClearPageUptodate(new_page);
1691 kunmap(new_page);
1692 UnlockPage(new_page);
1693 page_cache_release(new_page);
1694 out:
1695 return status;
1698 int block_prepare_write(struct page *page, unsigned from, unsigned to,
1699 get_block_t *get_block)
1701 struct inode *inode = (struct inode*)page->mapping->host;
1702 int err = __block_prepare_write(inode, page, from, to, get_block);
1703 if (err) {
1704 ClearPageUptodate(page);
1705 kunmap(page);
1707 return err;
1710 int generic_commit_write(struct file *file, struct page *page,
1711 unsigned from, unsigned to)
1713 struct inode *inode = (struct inode*)page->mapping->host;
1714 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1715 __block_commit_write(inode,page,from,to);
1716 kunmap(page);
1717 if (pos > inode->i_size) {
1718 inode->i_size = pos;
1719 mark_inode_dirty(inode);
1721 return 0;
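/*
 * prepare_write/commit_write always travel in pairs: the caller locks
 * the page, has ->prepare_write() map (and read, where needed) the
 * buffers in the [from, to) range, copies the new data into the page,
 * and then calls ->commit_write(), which as above dirties the buffers
 * and may extend i_size.  A rough, illustrative sketch of such a caller
 * (not the actual generic_file_write() code; "aops" stands for
 * page->mapping->a_ops and error handling is omitted):
 *
 *	err = aops->prepare_write(file, page, offset, offset + bytes);
 *	if (!err) {
 *		memcpy(page_address(page) + offset, data, bytes);
 *		err = aops->commit_write(file, page, offset, offset + bytes);
 *	}
 *	UnlockPage(page);
 */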
1725 * If it would be '74 that would go into libc...
1727 int mem_is_zero(char *p, unsigned len)
1729 while (len--)
1730 if (*p++)
1731 return 0;
1732 return 1;
1735 int block_zero_page(struct address_space *mapping, loff_t from, unsigned length)
1737 unsigned long index = from >> PAGE_CACHE_SHIFT;
1738 unsigned offset = from & (PAGE_CACHE_SIZE-1);
1739 struct inode *inode = (struct inode *)mapping->host;
1740 struct page *page;
1741 char *kaddr;
1742 int err;
1744 if (!length)
1745 return 0;
1747 page = read_cache_page(mapping, index,
1748 (filler_t *)mapping->a_ops->readpage, NULL);
1749 err = PTR_ERR(page);
1750 if (IS_ERR(page))
1751 goto out;
1752 lock_page(page);
1753 err = -EIO;
1754 if (!Page_Uptodate(page))
1755 goto unlock;
1756 kaddr = (char*)kmap(page);
1757 err = 0;
1758 if (mem_is_zero(kaddr+offset, length))
1759 goto unmap;
1760 memset(kaddr+offset, 0, length);
1761 flush_dcache_page(page);
1762 __block_commit_write(inode, page, offset, offset+length);
1763 unmap:
1764 kunmap(page);
1765 unlock:
1766 UnlockPage(page);
1767 page_cache_release(page);
1768 out:
1769 return err;
1772 int block_write_full_page(struct page *page, get_block_t *get_block)
1774 struct inode *inode = (struct inode*)page->mapping->host;
1775 unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1776 unsigned offset;
1777 int err;
1779 /* easy case */
1780 if (page->index < end_index)
1781 return __block_write_full_page(inode, page, get_block);
1783 /* things got complicated... */
1784 offset = inode->i_size & (PAGE_CACHE_SIZE-1);
1785 /* OK, are we completely out? */
1786 if (page->index >= end_index+1 || !offset)
1787 return -EIO;
1788 /* Sigh... will have to work, then... */
1789 err = __block_prepare_write(inode, page, 0, offset, get_block);
1790 if (!err) {
1791 memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset);
1792 flush_dcache_page(page);
1793 __block_commit_write(inode,page,0,offset);
1794 done:
1795 kunmap(page);
1796 return err;
1798 ClearPageUptodate(page);
1799 goto done;
1802 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
1804 struct buffer_head tmp;
1805 struct inode *inode = (struct inode*)mapping->host;
1806 tmp.b_state = 0;
1807 tmp.b_blocknr = 0;
1808 get_block(inode, block, &tmp, 0);
1809 return tmp.b_blocknr;
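/*
 * generic_block_bmap() answers "which on-disk block backs logical block
 * N" by calling get_block() with create == 0 on a throwaway
 * buffer_head; a result of 0 conventionally means a hole.  This is what
 * the FIBMAP ioctl typically ends up using.
 */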
1813 * IO completion routine for a buffer_head being used for kiobuf IO: we
1814 * can't dispatch the kiobuf callback until io_count reaches 0.
1817 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
1819 struct kiobuf *kiobuf;
1821 mark_buffer_uptodate(bh, uptodate);
1823 kiobuf = bh->b_private;
1824 unlock_buffer(bh);
1825 end_kio_request(kiobuf, uptodate);
1830 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
1831 * for them to complete. Clean up the buffer_heads afterwards.
1834 static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size)
1836 int iosize;
1837 int i;
1838 struct buffer_head *tmp;
1841 iosize = 0;
1842 spin_lock(&unused_list_lock);
1844 for (i = nr; --i >= 0; ) {
1845 iosize += size;
1846 tmp = bh[i];
1847 if (buffer_locked(tmp)) {
1848 spin_unlock(&unused_list_lock);
1849 wait_on_buffer(tmp);
1850 spin_lock(&unused_list_lock);
1853 if (!buffer_uptodate(tmp)) {
1854 /* We are traversing bh'es in reverse order so
1855 clearing iosize on error calculates the
1856 amount of IO before the first error. */
1857 iosize = 0;
1859 __put_unused_buffer_head(tmp);
1862 spin_unlock(&unused_list_lock);
1864 return iosize;
1868 * Start I/O on a physical range of kernel memory, defined by a vector
1869 * of kiobuf structs (much like a user-space iovec list).
1871 * The kiobuf must already be locked for IO. IO is submitted
1872 * asynchronously: you need to check page->locked, page->uptodate, and
1873 * maybe wait on page->wait.
1875 * It is up to the caller to make sure that there are enough blocks
1876 * passed in to completely map the iobufs to disk.
1879 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
1880 kdev_t dev, unsigned long b[], int size)
1882 int err;
1883 int length;
1884 int transferred;
1885 int i;
1886 int bufind;
1887 int pageind;
1888 int bhind;
1889 int offset;
1890 int sectors = size>>9;
1891 unsigned long blocknr;
1892 struct kiobuf * iobuf = NULL;
1893 struct page * map;
1894 struct buffer_head *tmp, *bh[KIO_MAX_SECTORS];
1896 if (!nr)
1897 return 0;
1900 * First, do some alignment and validity checks
1902 for (i = 0; i < nr; i++) {
1903 iobuf = iovec[i];
1904 if ((iobuf->offset & (size-1)) ||
1905 (iobuf->length & (size-1)))
1906 return -EINVAL;
1907 if (!iobuf->nr_pages)
1908 panic("brw_kiovec: iobuf not initialised");
1912 * OK to walk down the iovec doing page IO on each page we find.
1914 bufind = bhind = transferred = err = 0;
1915 for (i = 0; i < nr; i++) {
1916 iobuf = iovec[i];
1917 offset = iobuf->offset;
1918 length = iobuf->length;
1919 iobuf->errno = 0;
1921 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
1922 map = iobuf->maplist[pageind];
1923 if (!map) {
1924 err = -EFAULT;
1925 goto error;
1928 while (length > 0) {
1929 blocknr = b[bufind++];
1930 tmp = get_unused_buffer_head(0);
1931 if (!tmp) {
1932 err = -ENOMEM;
1933 goto error;
1936 tmp->b_dev = B_FREE;
1937 tmp->b_size = size;
1938 set_bh_page(tmp, map, offset);
1939 tmp->b_this_page = tmp;
1941 init_buffer(tmp, end_buffer_io_kiobuf, iobuf);
1942 tmp->b_rdev = tmp->b_dev = dev;
1943 tmp->b_blocknr = blocknr;
1944 tmp->b_rsector = blocknr*sectors;
1945 tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req);
1947 if (rw == WRITE) {
1948 set_bit(BH_Uptodate, &tmp->b_state);
1949 set_bit(BH_Dirty, &tmp->b_state);
1952 bh[bhind++] = tmp;
1953 length -= size;
1954 offset += size;
1956 atomic_inc(&iobuf->io_count);
1958 generic_make_request(rw, tmp);
1960 * Wait for IO if we have got too much
1962 if (bhind >= KIO_MAX_SECTORS) {
1963 err = wait_kio(rw, bhind, bh, size);
1964 if (err >= 0)
1965 transferred += err;
1966 else
1967 goto finished;
1968 bhind = 0;
1971 if (offset >= PAGE_SIZE) {
1972 offset = 0;
1973 break;
1975 } /* End of block loop */
1976 } /* End of page loop */
1977 } /* End of iovec loop */
1979 /* Is there any IO still left to submit? */
1980 if (bhind) {
1981 err = wait_kio(rw, bhind, bh, size);
1982 if (err >= 0)
1983 transferred += err;
1984 else
1985 goto finished;
1988 finished:
1989 if (transferred)
1990 return transferred;
1991 return err;
1993 error:
1994 /* We got an error allocating the bh'es. Just free the current
1995 buffer_heads and exit. */
1996 spin_lock(&unused_list_lock);
1997 for (i = bhind; --i >= 0; ) {
1998 __put_unused_buffer_head(bh[i]);
2000 spin_unlock(&unused_list_lock);
2001 goto finished;
2005 * Start I/O on a page.
2006 * This function expects the page to be locked and may return
2007 * before I/O is complete. You then have to check page->locked,
2008 * page->uptodate, and maybe wait on page->wait.
2010 * brw_page() is SMP-safe, although it's being called with the
2011 * kernel lock held - but the code is ready.
2013 * FIXME: we need a swapper_inode->get_block function to remove
2014 * some of the bmap kludges and interface ugliness here.
2016 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
2018 struct buffer_head *head, *bh, *arr[MAX_BUF_PER_PAGE];
2019 int nr, fresh /* temporary debugging flag */, block;
2021 if (!PageLocked(page))
2022 panic("brw_page: page not locked for I/O");
2023 // ClearPageError(page);
2025 * We pretty much rely on the page lock for this, because
2026 * create_page_buffers() might sleep.
2028 fresh = 0;
2029 if (!page->buffers) {
2030 create_page_buffers(rw, page, dev, b, size);
2031 fresh = 1;
2033 if (!page->buffers)
2034 BUG();
2036 head = page->buffers;
2037 bh = head;
2038 nr = 0;
2039 do {
2040 block = *(b++);
2042 if (fresh && (atomic_read(&bh->b_count) != 0))
2043 BUG();
2044 if (rw == READ) {
2045 if (!fresh)
2046 BUG();
2047 if (!buffer_uptodate(bh)) {
2048 arr[nr++] = bh;
2049 atomic_inc(&bh->b_count);
2051 } else { /* WRITE */
2052 if (!bh->b_blocknr) {
2053 if (!block)
2054 BUG();
2055 bh->b_blocknr = block;
2056 } else {
2057 if (!block)
2058 BUG();
2060 set_bit(BH_Uptodate, &bh->b_state);
2061 set_bit(BH_Dirty, &bh->b_state);
2062 arr[nr++] = bh;
2063 atomic_inc(&bh->b_count);
2065 bh = bh->b_this_page;
2066 } while (bh != head);
2067 if ((rw == READ) && nr) {
2068 if (Page_Uptodate(page))
2069 BUG();
2070 ll_rw_block(rw, nr, arr);
2071 } else {
2072 if (!nr && rw == READ) {
2073 SetPageUptodate(page);
2074 UnlockPage(page);
2076 if (nr && (rw == WRITE))
2077 ll_rw_block(rw, nr, arr);
2079 return 0;
2082 int block_symlink(struct inode *inode, const char *symname, int len)
2084 struct address_space *mapping = inode->i_mapping;
2085 struct page *page = grab_cache_page(mapping, 0);
2086 int err = -ENOMEM;
2087 char *kaddr;
2089 if (!page)
2090 goto fail;
2091 err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2092 if (err)
2093 goto fail_map;
2094 kaddr = page_address(page);
2095 memcpy(kaddr, symname, len-1);
2096 mapping->a_ops->commit_write(NULL, page, 0, len-1);
2098 * Notice that we are _not_ going to block here - end of page is
2099 * unmapped, so this will only try to map the rest of page, see
2100 * that it is unmapped (typically even will not look into inode -
2101 * ->i_size will be enough for everything) and zero it out.
2102 * OTOH it's obviously correct and should make the page up-to-date.
2104 err = mapping->a_ops->readpage(NULL, page);
2105 wait_on_page(page);
2106 page_cache_release(page);
2107 if (err < 0)
2108 goto fail;
2109 mark_inode_dirty(inode);
2110 return 0;
2111 fail_map:
2112 UnlockPage(page);
2113 page_cache_release(page);
2114 fail:
2115 return err;
2119 * Try to increase the number of buffers available: the size argument
2120 * is used to determine what kind of buffers we want.
2122 static int grow_buffers(int size)
2124 struct page * page;
2125 struct buffer_head *bh, *tmp;
2126 struct buffer_head * insert_point;
2127 int isize;
2129 if ((size & 511) || (size > PAGE_SIZE)) {
2130 printk("VFS: grow_buffers: size = %d\n",size);
2131 return 0;
2134 page = alloc_page(GFP_BUFFER);
2135 if (!page)
2136 goto out;
2137 bh = create_buffers(page, size, 0);
2138 if (!bh)
2139 goto no_buffer_head;
2141 isize = BUFSIZE_INDEX(size);
2143 spin_lock(&free_list[isize].lock);
2144 insert_point = free_list[isize].list;
2145 tmp = bh;
2146 while (1) {
2147 if (insert_point) {
2148 tmp->b_next_free = insert_point->b_next_free;
2149 tmp->b_prev_free = insert_point;
2150 insert_point->b_next_free->b_prev_free = tmp;
2151 insert_point->b_next_free = tmp;
2152 } else {
2153 tmp->b_prev_free = tmp;
2154 tmp->b_next_free = tmp;
2156 insert_point = tmp;
2157 if (tmp->b_this_page)
2158 tmp = tmp->b_this_page;
2159 else
2160 break;
2162 tmp->b_this_page = bh;
2163 free_list[isize].list = bh;
2164 spin_unlock(&free_list[isize].lock);
2166 page->buffers = bh;
2167 page->flags &= ~(1 << PG_referenced);
2168 lru_cache_add(page);
2169 atomic_inc(&buffermem_pages);
2170 return 1;
2172 no_buffer_head:
2173 page_cache_release(page);
2174 out:
2175 return 0;
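/*
 * Sketch of a free-list refill loop built on grow_buffers() (illustration
 * only): "refill_free_list" is a hypothetical stand-in for the real
 * consumers of the free lists earlier in this file.
 */
#if 0
static void refill_free_list(int size)
{
	int isize = BUFSIZE_INDEX(size);

	for (;;) {
		spin_lock(&free_list[isize].lock);
		if (free_list[isize].list) {
			/* Something is available to hand out. */
			spin_unlock(&free_list[isize].lock);
			return;
		}
		spin_unlock(&free_list[isize].lock);

		/*
		 * Nothing free in this size: try to add a page worth of
		 * buffers.  grow_buffers() returns 0 on allocation failure,
		 * in which case we ask bdflush to push dirty data out and
		 * wait before trying again.
		 */
		if (!grow_buffers(size))
			wakeup_bdflush(1);
	}
}
#endif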
2179 * Sync all the buffers on one page..
2181 * If we have old buffers that are locked, we'll
2182 * wait on them, but we won't wait on the new ones
2183 * we're writing out now.
2185 * All of this is required so that we can free up memory
2186 * later.
2188 * Wait:
2189 * 0 - no wait (this does not get called - see try_to_free_buffers below)
2190 * 1 - start IO for dirty buffers
2191 * 2 - wait for completion of locked buffers
2193 static void sync_page_buffers(struct buffer_head *bh, int wait)
2195 struct buffer_head * tmp = bh;
2197 do {
2198 struct buffer_head *p = tmp;
2199 tmp = tmp->b_this_page;
2200 if (buffer_locked(p)) {
2201 if (wait > 1)
2202 __wait_on_buffer(p);
2203 } else if (buffer_dirty(p))
2204 ll_rw_block(WRITE, 1, &p);
2205 } while (tmp != bh);
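/*
 * Concretely: with wait == 1 the loop above only queues WRITEs for the
 * dirty, unlocked buffers on the page; with wait == 2 it additionally
 * blocks in __wait_on_buffer() for buffers already under I/O, so a later
 * try_to_free_buffers() call on the same page has a much better chance
 * of succeeding.
 */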
2209 * Can the buffer be thrown out?
2211 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
2212 #define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2215 * try_to_free_buffers() checks if all the buffers on this particular page
2216 * are unused, and frees the page if so.
2218 * Wake up bdflush() if this fails - if we're running low on memory due
2219 * to dirty buffers, we need to flush them out as quickly as possible.
2221 * NOTE: There are quite a number of ways that threads of control can
2222 * obtain a reference to a buffer head within a page. So we must
2223 * lock out all of these paths to cleanly toss the page.
2225 int try_to_free_buffers(struct page * page, int wait)
2227 struct buffer_head * tmp, * bh = page->buffers;
2228 int index = BUFSIZE_INDEX(bh->b_size);
2230 spin_lock(&lru_list_lock);
2231 write_lock(&hash_table_lock);
2232 spin_lock(&free_list[index].lock);
2233 tmp = bh;
2234 do {
2235 struct buffer_head *p = tmp;
2237 tmp = tmp->b_this_page;
2238 if (buffer_busy(p))
2239 goto busy_buffer_page;
2240 } while (tmp != bh);
2242 spin_lock(&unused_list_lock);
2243 tmp = bh;
2244 do {
2245 struct buffer_head * p = tmp;
2246 tmp = tmp->b_this_page;
2248 /* The buffer can be either on the regular
2249 * queues or on the free list..
2251 if (p->b_dev != B_FREE)
2252 __remove_from_queues(p);
2253 else
2254 __remove_from_free_list(p, index);
2255 __put_unused_buffer_head(p);
2256 } while (tmp != bh);
2257 spin_unlock(&unused_list_lock);
2259 /* Wake up anyone waiting for buffer heads */
2260 wake_up(&buffer_wait);
2262 /* And free the page */
2263 page->buffers = NULL;
2264 page_cache_release(page);
2265 spin_unlock(&free_list[index].lock);
2266 write_unlock(&hash_table_lock);
2267 spin_unlock(&lru_list_lock);
2268 return 1;
2270 busy_buffer_page:
2271 /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2272 spin_unlock(&free_list[index].lock);
2273 write_unlock(&hash_table_lock);
2274 spin_unlock(&lru_list_lock);
2275 if (wait)
2276 sync_page_buffers(bh, wait);
2277 return 0;
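/*
 * Sketch of a reclaim-style caller (illustration only): "shrink_one_page"
 * is a hypothetical name; what matters is the protocol it shows - the page
 * must already be locked by the caller, and a non-zero wait lets a failed
 * attempt start writeback so that a later attempt can succeed.
 */
#if 0
static int shrink_one_page(struct page *page, int can_wait)
{
	if (!PageLocked(page))
		BUG();			/* caller must hold the page lock */

	if (!page->buffers)
		return 0;		/* no buffer heads on this page */

	return try_to_free_buffers(page, can_wait ? 2 : 0);
}
#endif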
2280 /* ================== Debugging =================== */
2282 void show_buffers(void)
2284 #ifdef CONFIG_SMP
2285 struct buffer_head * bh;
2286 int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2287 int protected = 0;
2288 int nlist;
2289 static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
2290 #endif
2292 printk("Buffer memory: %6dkB\n",
2293 atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2295 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2296 if (!spin_trylock(&lru_list_lock))
2297 return;
2298 for(nlist = 0; nlist < NR_LIST; nlist++) {
2299 found = locked = dirty = used = lastused = protected = 0;
2300 bh = lru_list[nlist];
2301 if(!bh) continue;
2303 do {
2304 found++;
2305 if (buffer_locked(bh))
2306 locked++;
2307 if (buffer_protected(bh))
2308 protected++;
2309 if (buffer_dirty(bh))
2310 dirty++;
2311 if (atomic_read(&bh->b_count))
2312 used++, lastused = found;
2313 bh = bh->b_next_free;
2314 } while (bh != lru_list[nlist]);
2316 int tmp = nr_buffers_type[nlist];
2317 if (found != tmp)
2318 printk("%9s: BUG -> found %d, reported %d\n",
2319 buf_types[nlist], found, tmp);
2321 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2322 "%d locked, %d protected, %d dirty\n",
2323 buf_types[nlist], found, size_buffers_type[nlist]>>10,
2324 used, lastused, locked, protected, dirty);
2326 spin_unlock(&lru_list_lock);
2327 #endif
2330 /* ===================== Init ======================= */
2333 * allocate the hash table and init the free list
2334 * Use gfp() for the hash table to decrease TLB misses, use
2335 * SLAB cache for buffer heads.
2337 void __init buffer_init(unsigned long mempages)
2339 int order, i;
2340 unsigned int nr_hash;
2342 /* The buffer cache hash table is less important these days,
2343 * trim it a bit.
2345 mempages >>= 14;
2347 mempages *= sizeof(struct buffer_head *);
2349 for (order = 0; (1 << order) < mempages; order++)
2352 /* keep trying smaller allocations until one succeeds or the
2353 request gets too small to be worth trying */
2355 do {
2356 unsigned long tmp;
2358 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2359 bh_hash_mask = (nr_hash - 1);
2361 tmp = nr_hash;
2362 bh_hash_shift = 0;
2363 while((tmp >>= 1UL) != 0UL)
2364 bh_hash_shift++;
2366 hash_table = (struct buffer_head **)
2367 __get_free_pages(GFP_ATOMIC, order);
2368 } while (hash_table == NULL && --order > 0);
2369 printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2370 nr_hash, order, (PAGE_SIZE << order));
2372 if (!hash_table)
2373 panic("Failed to allocate buffer hash table\n");
2375 /* Setup hash chains. */
2376 for(i = 0; i < nr_hash; i++)
2377 hash_table[i] = NULL;
2379 /* Setup free lists. */
2380 for(i = 0; i < NR_SIZES; i++) {
2381 free_list[i].list = NULL;
2382 free_list[i].lock = SPIN_LOCK_UNLOCKED;
2385 /* Setup lru lists. */
2386 for(i = 0; i < NR_LIST; i++)
2387 lru_list[i] = NULL;
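/*
 * Worked example of the sizing above, assuming 4 kB pages and 32-bit
 * pointers: a 64 MB machine has mempages = 16384, so 16384 >> 14 = 1 and
 * 1 * sizeof(struct buffer_head *) = 4, giving order = 2.  The first pass
 * of the allocation loop then asks for a single 16 kB __get_free_pages()
 * area holding nr_hash = (4096 << 2) / 4 = 4096 hash buckets, with
 * bh_hash_mask = 4095 and bh_hash_shift = 12.
 */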
2392 /* ====================== bdflush support =================== */
2394 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2395 * response to dirty buffers. Once this process is activated, we write back
2396 * a limited number of buffers to the disks and then go back to sleep again.
2398 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
2399 struct task_struct *bdflush_tsk = 0;
2401 void wakeup_bdflush(int block)
2403 DECLARE_WAITQUEUE(wait, current);
2405 if (current == bdflush_tsk)
2406 return;
2408 if (!block) {
2409 wake_up_process(bdflush_tsk);
2410 return;
2413 /* kflushd can wake us up before we have a chance to
2414 go to sleep, so we must be careful in handling
2415 this wakeup event from kflushd to avoid deadlocking on SMP
2416 (we are not holding any locks anymore in these two paths). */
2417 __set_current_state(TASK_UNINTERRUPTIBLE);
2418 add_wait_queue(&bdflush_done, &wait);
2420 wake_up_process(bdflush_tsk);
2421 schedule();
2423 remove_wait_queue(&bdflush_done, &wait);
2424 __set_current_state(TASK_RUNNING);
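/*
 * In the blocking case the handshake is: queue ourselves on bdflush_done,
 * wake kflushd, and sleep; bdflush() below does one flush_dirty_buffers()
 * pass and its wake_up(&bdflush_done) releases us.  The non-blocking case
 * is just wake_up_process() and return, which is what callers that merely
 * want writeback started should use.
 */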
2427 /* This is the _only_ function that deals with flushing async writes
2428 to disk.
2429 NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
2430 as all dirty buffers live _only_ in the DIRTY lru list.
2431 As we never browse the LOCKED and CLEAN lru lists they are in fact
2432 completely useless. */
2433 static int flush_dirty_buffers(int check_flushtime)
2435 struct buffer_head * bh, *next;
2436 int flushed = 0, i;
2438 restart:
2439 spin_lock(&lru_list_lock);
2440 bh = lru_list[BUF_DIRTY];
2441 if (!bh)
2442 goto out_unlock;
2443 for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next) {
2444 next = bh->b_next_free;
2446 if (!buffer_dirty(bh)) {
2447 __refile_buffer(bh);
2448 continue;
2450 if (buffer_locked(bh))
2451 continue;
2453 if (check_flushtime) {
2454 /* The dirty lru list is chronologically ordered, so
2455 if the current bh has not yet timed out,
2456 all the following bhs
2457 will be too young as well. */
2458 if (time_before(jiffies, bh->b_flushtime))
2459 goto out_unlock;
2460 } else {
2461 if (++flushed > bdf_prm.b_un.ndirty)
2462 goto out_unlock;
2465 /* OK, now we are committed to write it out. */
2466 atomic_inc(&bh->b_count);
2467 spin_unlock(&lru_list_lock);
2468 ll_rw_block(WRITE, 1, &bh);
2469 atomic_dec(&bh->b_count);
2471 if (current->need_resched)
2472 schedule();
2473 goto restart;
2475 out_unlock:
2476 spin_unlock(&lru_list_lock);
2478 return flushed;
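/*
 * The two modes correspond to the two daemons below: bdflush() calls
 * flush_dirty_buffers(0) and writes at most bdf_prm.b_un.ndirty buffers
 * per pass, while sync_old_buffers() (run from kupdate()) calls
 * flush_dirty_buffers(1) and writes every buffer whose b_flushtime has
 * already expired, however many that is.
 */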
2482 * Here we attempt to write back old buffers. We also try to flush inodes
2483 * and supers, since this function is essentially "update", and
2484 * otherwise there would be no way of ensuring that these quantities ever
2485 * get written back. Ideally, we would have a timestamp on the inodes
2486 * and superblocks so that we could write back only the old ones as well
2489 static int sync_old_buffers(void)
2491 lock_kernel();
2492 sync_supers(0);
2493 sync_inodes(0);
2494 unlock_kernel();
2496 flush_dirty_buffers(1);
2497 /* must really sync all the active I/O request to disk here */
2498 run_task_queue(&tq_disk);
2499 return 0;
2502 int block_sync_page(struct page *page)
2504 run_task_queue(&tq_disk);
2505 return 0;
2508 /* This is the interface to bdflush. As we get more sophisticated, we can
2509 * pass tuning parameters to this "process", to adjust how it behaves.
2510 * We would want to verify each parameter, however, to make sure that it
2511 * is reasonable. */
2513 asmlinkage long sys_bdflush(int func, long data)
2515 if (!capable(CAP_SYS_ADMIN))
2516 return -EPERM;
2518 if (func == 1) {
2519 /* do_exit directly and let kupdate do its work alone. */
2520 do_exit(0);
2521 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2522 a syscall that doesn't care about the current mm context. */
2523 int error;
2524 struct mm_struct *user_mm;
2527 * bdflush will spend all of its time in kernel-space,
2528 * without touching user-space, so we can switch it into
2529 * 'lazy TLB mode' to reduce the cost of context-switches
2530 * to and from bdflush.
2532 user_mm = start_lazy_tlb();
2533 error = sync_old_buffers();
2534 end_lazy_tlb(user_mm);
2535 return error;
2536 #endif
2539 /* Basically func 2 reads param 0, func 3 writes param 0, func 4 reads param 1, etc: i = (func-2)/2, even func reads, odd func writes */
2540 if (func >= 2) {
2541 int i = (func-2) >> 1;
2542 if (i >= 0 && i < N_PARAM) {
2543 if ((func & 1) == 0)
2544 return put_user(bdf_prm.data[i], (int*)data);
2546 if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
2547 bdf_prm.data[i] = data;
2548 return 0;
2551 return -EINVAL;
2554 /* Func 0 used to launch the actual bdflush and then never
2555 * return (unless explicitly killed). We return zero here to
2556 * remain semi-compatible with present update(8) programs.
2558 return 0;
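/*
 * Worked example of the func encoding: func = 2 reads bdf_prm.data[0]
 * (nfract) into the int pointed to by data, func = 3 sets it from data,
 * func = 4/5 read/write data[1] (ndirty), and so on for all N_PARAM
 * parameters; writes are range-checked against bdflush_min[] and
 * bdflush_max[].
 */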
2562 * This is the actual bdflush daemon itself. It used to be started from
2563 * the syscall above, but now we launch it ourselves internally with
2564 * kernel_thread(...) directly after the first thread in init/main.c
2566 int bdflush(void *sem)
2568 struct task_struct *tsk = current;
2569 int flushed;
2571 * We have a bare-bones task_struct, and really should fill
2572 * in a few more things so "top" and /proc/2/{exe,root,cwd}
2573 * display semi-sane things. Not real crucial though...
2576 tsk->session = 1;
2577 tsk->pgrp = 1;
2578 strcpy(tsk->comm, "kflushd");
2579 bdflush_tsk = tsk;
2581 /* avoid getting signals */
2582 spin_lock_irq(&tsk->sigmask_lock);
2583 flush_signals(tsk);
2584 sigfillset(&tsk->blocked);
2585 recalc_sigpending(tsk);
2586 spin_unlock_irq(&tsk->sigmask_lock);
2588 up((struct semaphore *)sem);
2590 for (;;) {
2591 CHECK_EMERGENCY_SYNC
2593 flushed = flush_dirty_buffers(0);
2595 /* If wakeup_bdflush() wakes us up
2596 after our bdflush_done wakeup, then
2597 we must make sure not to sleep
2598 in the schedule() below, otherwise
2599 wakeup_bdflush() may wait for a
2600 bdflush_done wakeup that would never arrive
2601 (as we would be sleeping) and so it would
2602 deadlock on SMP. */
2603 __set_current_state(TASK_INTERRUPTIBLE);
2604 wake_up(&bdflush_done);
2606 * If there are still a lot of dirty buffers around,
2607 * skip the sleep and flush some more. Otherwise, we
2608 * go to sleep waiting for a wakeup.
2610 if (!flushed || balance_dirty_state(NODEV) < 0)
2611 schedule();
2612 /* Remember to mark us as running otherwise
2613 the next schedule will block. */
2614 __set_current_state(TASK_RUNNING);
2619 * This is the kernel update daemon. It used to live in userspace
2620 * but since it needs to run safely we don't want it killable by mistake.
2621 * You don't need to change your userspace configuration since
2622 * the userspace `update` will do_exit(0) at the first sys_bdflush().
2624 int kupdate(void *sem)
2626 struct task_struct * tsk = current;
2627 int interval;
2629 tsk->session = 1;
2630 tsk->pgrp = 1;
2631 strcpy(tsk->comm, "kupdate");
2633 /* SIGSTOP and SIGCONT will stop and wake up kupdate */
2634 spin_lock_irq(&tsk->sigmask_lock);
2635 sigfillset(&tsk->blocked);
2636 siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
2637 recalc_sigpending(tsk);
2638 spin_unlock_irq(&tsk->sigmask_lock);
2640 up((struct semaphore *)sem);
2642 for (;;) {
2643 /* update interval */
2644 interval = bdf_prm.b_un.interval;
2645 if (interval) {
2646 tsk->state = TASK_INTERRUPTIBLE;
2647 schedule_timeout(interval);
2648 } else {
2649 stop_kupdate:
2650 tsk->state = TASK_STOPPED;
2651 schedule(); /* wait for SIGCONT */
2653 /* check for sigstop */
2654 if (signal_pending(tsk)) {
2655 int stopped = 0;
2656 spin_lock_irq(&tsk->sigmask_lock);
2657 if (sigismember(&tsk->pending.signal, SIGSTOP)) {
2658 sigdelset(&tsk->pending.signal, SIGSTOP);
2659 stopped = 1;
2661 recalc_sigpending(tsk);
2662 spin_unlock_irq(&tsk->sigmask_lock);
2663 if (stopped)
2664 goto stop_kupdate;
2666 #ifdef DEBUG
2667 printk("kupdate() activated...\n");
2668 #endif
2669 sync_old_buffers();
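/*
 * In practice this means periodic flushing can be paused from userspace
 * with a SIGSTOP to the kupdate thread and resumed with SIGCONT; an
 * interval of 0 parks the thread in the same TASK_STOPPED state until a
 * SIGCONT arrives.
 */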
2673 static int __init bdflush_init(void)
2675 DECLARE_MUTEX_LOCKED(sem);
2676 kernel_thread(bdflush, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
2677 down(&sem);
2678 kernel_thread(kupdate, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
2679 down(&sem);
2680 return 0;
2683 module_init(bdflush_init)