fs/buffer.c

   1 /*
   2  *  linux/fs/buffer.c
   3  *
   4  *  Copyright (C) 1991, 1992  Linus Torvalds
   5  */
   6
   7 /*
   8  *  'buffer.c' implements the buffer-cache functions. Race-conditions have
   9  * been avoided by NEVER letting an interrupt change a buffer (except for the
  10  * data, of course), but instead letting the caller do it.
  11  */
  12
  13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
  14
  15 /* Removed a lot of unnecessary code and simplified things now that
  16  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  17  */
  18
  19 /* Speed up hash, lru, and free list operations.  Use gfp() for allocating
  20  * hash table, use SLAB cache for buffer heads. -DaveM
  21  */
  22
  23 /* Added 32k buffer block sizes - these are required older ARM systems.
  24  * - RMK
  25  */
  26
  27 /* Thread it... -DaveM */
  28
  29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
  30
  31 #include <linux/config.h>
  32 #include <linux/sched.h>
  33 #include <linux/fs.h>
  34 #include <linux/malloc.h>
  35 #include <linux/locks.h>
  36 #include <linux/errno.h>
  37 #include <linux/swap.h>
  38 #include <linux/smp_lock.h>
  39 #include <linux/vmalloc.h>
  40 #include <linux/blkdev.h>
  41 #include <linux/sysrq.h>
  42 #include <linux/file.h>
  43 #include <linux/init.h>
  44 #include <linux/quotaops.h>
  45 #include <linux/iobuf.h>
  46 #include <linux/highmem.h>
  47
  48 #include <asm/uaccess.h>
  49 #include <asm/io.h>
  50 #include <asm/bitops.h>
  51 #include <asm/mmu_context.h>
  52
  53 #define NR_SIZES 7
  54 static char buffersize_index[65] =
  55 {-1,  0,  1, -1,  2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
  56   4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  57   5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  58  -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  59   6};
  60
  61 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
  62 #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
  63 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
  64 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
  65                                              number of unused buffer heads */
  66
  67 /* Anti-deadlock ordering:
  68  *      lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
  69  */
  70
  71 /*
  72  * Hash table gook..
  73  */
  74 static unsigned int bh_hash_mask;
  75 static unsigned int bh_hash_shift;
  76 static struct buffer_head **hash_table;
  77 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
  78
  79 static struct buffer_head *lru_list[NR_LIST];
  80 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
  81 static int nr_buffers_type[NR_LIST];
  82 static unsigned long size_buffers_type[NR_LIST];
  83
  84 static struct buffer_head * unused_list;
  85 static int nr_unused_buffer_heads;
  86 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
  87 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
  88
  89 struct bh_free_head {
  90         struct buffer_head *list;
  91         spinlock_t lock;
  92 };
  93 static struct bh_free_head free_list[NR_SIZES];
  94
  95 static int grow_buffers(int size);
  96 static void __refile_buffer(struct buffer_head *);
  97
  98 /* This is used by some architectures to estimate available memory. */
  99 atomic_t buffermem_pages = ATOMIC_INIT(0);
 100
 101 /* Here is the parameter block for the bdflush process. If you add or
 102  * remove any of the parameters, make sure to update kernel/sysctl.c.
 103  */
 104
 105 #define N_PARAM 9
 106
 107 /* The dummy values in this structure are left in there for compatibility
 108  * with old programs that play with the /proc entries.
 109  */
 110 union bdflush_param {
 111         struct {
 112                 int nfract;  /* Percentage of buffer cache dirty to
 113                                 activate bdflush */
 114                 int ndirty;  /* Maximum number of dirty blocks to write out per
 115                                 wake-cycle */
 116                 int nrefill; /* Number of clean buffers to try to obtain
 117                                 each time we call refill */
 118                 int nref_dirt; /* Dirty buffer threshold for activating bdflush
 119                                   when trying to refill buffers. */
 120                 int interval; /* jiffies delay between kupdate flushes */
 121                 int age_buffer;  /* Time for normal buffer to age before we flush it */
 122                 int dummy1;    /* unused, was age_super */
 123                 int dummy2;    /* unused */
 124                 int dummy3;    /* unused */
 125         } b_un;
 126         unsigned int data[N_PARAM];
 127 } bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}};
 128
 129 /* These are the min and max parameter values that we will allow to be assigned */
 130 int bdflush_min[N_PARAM] = {  0,  10,    5,   25,  0,   1*HZ,   1*HZ, 1, 1};
 131 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 6000*HZ, 2047, 5};
 132
 133 /*
 134  * Rewrote the wait-routines to use the "new" wait-queue functionality,
 135  * and getting rid of the cli-sti pairs. The wait-queue routines still
 136  * need cli-sti, but now it's just a couple of 386 instructions or so.
 137  *
 138  * Note that the real wait_on_buffer() is an inline function that checks
 139  * if 'b_wait' is set before calling this, so that the queues aren't set
 140  * up unnecessarily.
 141  */
 142 void __wait_on_buffer(struct buffer_head * bh)
 143 {
 144         struct task_struct *tsk = current;
 145         DECLARE_WAITQUEUE(wait, tsk);
 146
 147         atomic_inc(&bh->b_count);
 148         add_wait_queue(&bh->b_wait, &wait);
 149         do {
 150                 run_task_queue(&tq_disk);
 151                 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 152                 if (!buffer_locked(bh))
 153                         break;
 154                 schedule();
 155         } while (buffer_locked(bh));
 156         tsk->state = TASK_RUNNING;
 157         remove_wait_queue(&bh->b_wait, &wait);
 158         atomic_dec(&bh->b_count);
 159 }
 160
 161 /* Call sync_buffers with wait!=0 to ensure that the call does not
 162  * return until all buffer writes have completed.  Sync() may return
 163  * before the writes have finished; fsync() may not.
 164  */
 165
 166 /* Godamity-damn.  Some buffers (bitmaps for filesystems)
 167  * spontaneously dirty themselves without ever brelse being called.
 168  * We will ultimately want to put these in a separate list, but for
 169  * now we search all of the lists for dirty buffers.
 170  */
 171 static int sync_buffers(kdev_t dev, int wait)
 172 {
 173         int i, retry, pass = 0, err = 0;
 174         struct buffer_head * bh, *next;
 175
 176         /* One pass for no-wait, three for wait:
 177          * 0) write out all dirty, unlocked buffers;
 178          * 1) write out all dirty buffers, waiting if locked;
 179          * 2) wait for completion by waiting for all buffers to unlock.
 180          */
 181         do {
 182                 retry = 0;
 183
 184                 /* We search all lists as a failsafe mechanism, not because we expect
 185                  * there to be dirty buffers on any of the other lists.
 186                  */
 187 repeat:
 188                 spin_lock(&lru_list_lock);
 189                 bh = lru_list[BUF_DIRTY];
 190                 if (!bh)
 191                         goto repeat2;
 192
 193                 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
 194                         next = bh->b_next_free;
 195
 196                         if (!lru_list[BUF_DIRTY])
 197                                 break;
 198                         if (dev && bh->b_dev != dev)
 199                                 continue;
 200                         if (buffer_locked(bh)) {
 201                                 /* Buffer is locked; skip it unless wait is
 202                                  * requested AND pass > 0.
 203                                  */
 204                                 if (!wait || !pass) {
 205                                         retry = 1;
 206                                         continue;
 207                                 }
 208                                 atomic_inc(&bh->b_count);
 209                                 spin_unlock(&lru_list_lock);
 210                                 wait_on_buffer (bh);
 211                                 atomic_dec(&bh->b_count);
 212                                 goto repeat;
 213                         }
 214
 215                         /* If an unlocked buffer is not uptodate, there has
 216                          * been an IO error. Skip it.
 217                          */
 218                         if (wait && buffer_req(bh) && !buffer_locked(bh) &&
 219                             !buffer_dirty(bh) && !buffer_uptodate(bh)) {
 220                                 err = -EIO;
 221                                 continue;
 222                         }
 223
 224                         /* Don't write clean buffers.  Don't write ANY buffers
 225                          * on the third pass.
 226                          */
 227                         if (!buffer_dirty(bh) || pass >= 2)
 228                                 continue;
 229
 230                         atomic_inc(&bh->b_count);
 231                         spin_unlock(&lru_list_lock);
 232                         ll_rw_block(WRITE, 1, &bh);
 233                         atomic_dec(&bh->b_count);
 234                         retry = 1;
 235                         goto repeat;
 236                 }
 237
 238     repeat2:
 239                 bh = lru_list[BUF_LOCKED];
 240                 if (!bh) {
 241                         spin_unlock(&lru_list_lock);
 242                         break;
 243                 }
 244                 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
 245                         next = bh->b_next_free;
 246
 247                         if (!lru_list[BUF_LOCKED])
 248                                 break;
 249                         if (dev && bh->b_dev != dev)
 250                                 continue;
 251                         if (buffer_locked(bh)) {
 252                                 /* Buffer is locked; skip it unless wait is
 253                                  * requested AND pass > 0.
 254                                  */
 255                                 if (!wait || !pass) {
 256                                         retry = 1;
 257                                         continue;
 258                                 }
 259                                 atomic_inc(&bh->b_count);
 260                                 spin_unlock(&lru_list_lock);
 261                                 wait_on_buffer (bh);
 262                                 spin_lock(&lru_list_lock);
 263                                 atomic_dec(&bh->b_count);
 264                                 goto repeat2;
 265                         }
 266                 }
 267                 spin_unlock(&lru_list_lock);
 268
 269                 /* If we are waiting for the sync to succeed, and if any dirty
 270                  * blocks were written, then repeat; on the second pass, only
 271                  * wait for buffers being written (do not pass to write any
 272                  * more buffers on the second pass).
 273                  */
 274         } while (wait && retry && ++pass<=2);
 275         return err;
 276 }
 277
 278 void sync_dev(kdev_t dev)
 279 {
 280         sync_supers(dev);
 281         sync_inodes(dev);
 282         DQUOT_SYNC(dev);
 283         /* sync all the dirty buffers out to disk only _after_ all the
 284            high level layers finished generated buffer dirty data
 285            (or we'll return with some buffer still dirty on the blockdevice
 286            so breaking the semantics of this call) */
 287         sync_buffers(dev, 0);
 288         /*
 289          * FIXME(eric) we need to sync the physical devices here.
 290          * This is because some (scsi) controllers have huge amounts of
 291          * cache onboard (hundreds of Mb), and we need to instruct
 292          * them to commit all of the dirty memory to disk, and we should
 293          * not return until this has happened.
 294          *
 295          * This would need to get implemented by going through the assorted
 296          * layers so that each block major number can be synced, and this
 297          * would call down into the upper and mid-layer scsi.
 298          */
 299 }
 300
 301 int fsync_dev(kdev_t dev)
 302 {
 303         sync_buffers(dev, 0);
 304
 305         lock_kernel();
 306         sync_supers(dev);
 307         sync_inodes(dev);
 308         DQUOT_SYNC(dev);
 309         unlock_kernel();
 310
 311         return sync_buffers(dev, 1);
 312 }
 313
 314 asmlinkage long sys_sync(void)
 315 {
 316         fsync_dev(0);
 317         return 0;
 318 }
 319
 320 /*
 321  *      filp may be NULL if called via the msync of a vma.
 322  */
 323
 324 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
 325 {
 326         struct inode * inode = dentry->d_inode;
 327         struct super_block * sb;
 328         kdev_t dev;
 329         int ret;
 330
 331         lock_kernel();
 332         /* sync the inode to buffers */
 333         write_inode_now(inode, 0);
 334
 335         /* sync the superblock to buffers */
 336         sb = inode->i_sb;
 337         wait_on_super(sb);
 338         if (sb->s_op && sb->s_op->write_super)
 339                 sb->s_op->write_super(sb);
 340
 341         /* .. finally sync the buffers to disk */
 342         dev = inode->i_dev;
 343         ret = sync_buffers(dev, 1);
 344         unlock_kernel();
 345         return ret;
 346 }
 347
 348 asmlinkage long sys_fsync(unsigned int fd)
 349 {
 350         struct file * file;
 351         struct dentry * dentry;
 352         struct inode * inode;
 353         int err;
 354
 355         err = -EBADF;
 356         file = fget(fd);
 357         if (!file)
 358                 goto out;
 359
 360         dentry = file->f_dentry;
 361         inode = dentry->d_inode;
 362
 363         err = -EINVAL;
 364         if (!file->f_op || !file->f_op->fsync)
 365                 goto out_putf;
 366
 367         /* We need to protect against concurrent writers.. */
 368         down(&inode->i_sem);
 369         err = file->f_op->fsync(file, dentry, 0);
 370         up(&inode->i_sem);
 371
 372 out_putf:
 373         fput(file);
 374 out:
 375         return err;
 376 }
 377
 378 asmlinkage long sys_fdatasync(unsigned int fd)
 379 {
 380         struct file * file;
 381         struct dentry * dentry;
 382         struct inode * inode;
 383         int err;
 384
 385         err = -EBADF;
 386         file = fget(fd);
 387         if (!file)
 388                 goto out;
 389
 390         dentry = file->f_dentry;
 391         inode = dentry->d_inode;
 392
 393         err = -EINVAL;
 394         if (!file->f_op || !file->f_op->fsync)
 395                 goto out_putf;
 396
 397         down(&inode->i_sem);
 398         err = file->f_op->fsync(file, dentry, 1);
 399         up(&inode->i_sem);
 400
 401 out_putf:
 402         fput(file);
 403 out:
 404         return err;
 405 }
 406
 407 /* After several hours of tedious analysis, the following hash
 408  * function won.  Do not mess with it... -DaveM
 409  */
 410 #define _hashfn(dev,block)      \
 411         ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
 412          (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
 413 #define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
 414
 415 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
 416 {
 417         if ((bh->b_next = *head) != NULL)
 418                 bh->b_next->b_pprev = &bh->b_next;
 419         *head = bh;
 420         bh->b_pprev = head;
 421 }
 422
 423 static __inline__ void __hash_unlink(struct buffer_head *bh)
 424 {
 425         if (bh->b_pprev) {
 426                 if (bh->b_next)
 427                         bh->b_next->b_pprev = bh->b_pprev;
 428                 *(bh->b_pprev) = bh->b_next;
 429                 bh->b_pprev = NULL;
 430         }
 431 }
 432
 433 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
 434 {
 435         struct buffer_head **bhp = &lru_list[blist];
 436
 437         if(!*bhp) {
 438                 *bhp = bh;
 439                 bh->b_prev_free = bh;
 440         }
 441         bh->b_next_free = *bhp;
 442         bh->b_prev_free = (*bhp)->b_prev_free;
 443         (*bhp)->b_prev_free->b_next_free = bh;
 444         (*bhp)->b_prev_free = bh;
 445         nr_buffers_type[blist]++;
 446         size_buffers_type[blist] += bh->b_size;
 447 }
 448
 449 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
 450 {
 451         if (bh->b_prev_free || bh->b_next_free) {
 452                 bh->b_prev_free->b_next_free = bh->b_next_free;
 453                 bh->b_next_free->b_prev_free = bh->b_prev_free;
 454                 if (lru_list[blist] == bh)
 455                         lru_list[blist] = bh->b_next_free;
 456                 if (lru_list[blist] == bh)
 457                         lru_list[blist] = NULL;
 458                 bh->b_next_free = bh->b_prev_free = NULL;
 459                 nr_buffers_type[blist]--;
 460                 size_buffers_type[blist] -= bh->b_size;
 461         }
 462 }
 463
 464 static void __remove_from_free_list(struct buffer_head * bh, int index)
 465 {
 466         if(bh->b_next_free == bh)
 467                  free_list[index].list = NULL;
 468         else {
 469                 bh->b_prev_free->b_next_free = bh->b_next_free;
 470                 bh->b_next_free->b_prev_free = bh->b_prev_free;
 471                 if (free_list[index].list == bh)
 472                          free_list[index].list = bh->b_next_free;
 473         }
 474         bh->b_next_free = bh->b_prev_free = NULL;
 475 }
 476
 477 /* must be called with both the hash_table_lock and the lru_list_lock
 478    held */
 479 static void __remove_from_queues(struct buffer_head *bh)
 480 {
 481         __hash_unlink(bh);
 482         __remove_from_lru_list(bh, bh->b_list);
 483 }
 484
 485 static void __insert_into_queues(struct buffer_head *bh)
 486 {
 487         struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
 488
 489         __hash_link(bh, head);
 490         __insert_into_lru_list(bh, bh->b_list);
 491 }
 492
 493 /* This function must only run if there are no other
 494  * references _anywhere_ to this buffer head.
 495  */
 496 static void put_last_free(struct buffer_head * bh)
 497 {
 498         struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
 499         struct buffer_head **bhp = &head->list;
 500
 501         bh->b_state = 0;
 502
 503         spin_lock(&head->lock);
 504         bh->b_dev = B_FREE;
 505         if(!*bhp) {
 506                 *bhp = bh;
 507                 bh->b_prev_free = bh;
 508         }
 509         bh->b_next_free = *bhp;
 510         bh->b_prev_free = (*bhp)->b_prev_free;
 511         (*bhp)->b_prev_free->b_next_free = bh;
 512         (*bhp)->b_prev_free = bh;
 513         spin_unlock(&head->lock);
 514 }
 515
 516 /*
 517  * Why like this, I hear you say... The reason is race-conditions.
 518  * As we don't lock buffers (unless we are reading them, that is),
 519  * something might happen to it while we sleep (ie a read-error
 520  * will force it bad). This shouldn't really happen currently, but
 521  * the code is ready.
 522  */
 523 static inline struct buffer_head * __get_hash_table(kdev_t dev, int block, int size)
 524 {
 525         struct buffer_head *bh = hash(dev, block);
 526
 527         for (; bh; bh = bh->b_next)
 528                 if (bh->b_blocknr == block      &&
 529                     bh->b_size    == size       &&
 530                     bh->b_dev     == dev)
 531                         break;
 532         if (bh)
 533                 atomic_inc(&bh->b_count);
 534
 535         return bh;
 536 }
 537
 538 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
 539 {
 540         struct buffer_head *bh;
 541
 542         read_lock(&hash_table_lock);
 543         bh = __get_hash_table(dev, block, size);
 544         read_unlock(&hash_table_lock);
 545
 546         return bh;
 547 }
 548
 549 unsigned int get_hardblocksize(kdev_t dev)
 550 {
 551         /*
 552          * Get the hard sector size for the given device.  If we don't know
 553          * what it is, return 0.
 554          */
 555         if (hardsect_size[MAJOR(dev)] != NULL) {
 556                 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
 557                 if (blksize != 0)
 558                         return blksize;
 559         }
 560
 561         /*
 562          * We don't know what the hardware sector size for this device is.
 563          * Return 0 indicating that we don't know.
 564          */
 565         return 0;
 566 }
 567
 568 /* If invalidate_buffers() will trash dirty buffers, it means some kind
 569    of fs corruption is going on. Trashing dirty data always imply losing
 570    information that was supposed to be just stored on the physical layer
 571    by the user.
 572
 573    Thus invalidate_buffers in general usage is not allwowed to trash dirty
 574    buffers. For example ioctl(FLSBLKBUF) expects dirty data to be preserved.
 575
 576    NOTE: In the case where the user removed a removable-media-disk even if
 577    there's still dirty data not synced on disk (due a bug in the device driver
 578    or due an error of the user), by not destroying the dirty buffers we could
 579    generate corruption also on the next media inserted, thus a parameter is
 580    necessary to handle this case in the most safe way possible (trying
 581    to not corrupt also the new disk inserted with the data belonging to
 582    the old now corrupted disk). Also for the ramdisk the natural thing
 583    to do in order to release the ramdisk memory is to destroy dirty buffers.
 584
 585    These are two special cases. Normal usage imply the device driver
 586    to issue a sync on the device (without waiting I/O completation) and
 587    then an invalidate_buffers call that doesn't trashes dirty buffers. */
 588 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
 589 {
 590         int i, nlist, slept;
 591         struct buffer_head * bh, * bh_next;
 592
 593  retry:
 594         slept = 0;
 595         spin_lock(&lru_list_lock);
 596         for(nlist = 0; nlist < NR_LIST; nlist++) {
 597                 bh = lru_list[nlist];
 598                 if (!bh)
 599                         continue;
 600                 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
 601                         bh_next = bh->b_next_free;
 602                         if (bh->b_dev != dev)
 603                                 continue;
 604                         if (buffer_locked(bh)) {
 605                                 atomic_inc(&bh->b_count);
 606                                 spin_unlock(&lru_list_lock);
 607                                 wait_on_buffer(bh);
 608                                 slept = 1;
 609                                 spin_lock(&lru_list_lock);
 610                                 atomic_dec(&bh->b_count);
 611                         }
 612
 613                         write_lock(&hash_table_lock);
 614                         if (!atomic_read(&bh->b_count) &&
 615                             (destroy_dirty_buffers || !buffer_dirty(bh))) {
 616                                 __remove_from_queues(bh);
 617                                 put_last_free(bh);
 618                         }
 619                         write_unlock(&hash_table_lock);
 620                         if (slept)
 621                                 goto out;
 622                 }
 623         }
 624 out:
 625         spin_unlock(&lru_list_lock);
 626         if (slept)
 627                 goto retry;
 628 }
 629
 630 void set_blocksize(kdev_t dev, int size)
 631 {
 632         extern int *blksize_size[];
 633         int i, nlist, slept;
 634         struct buffer_head * bh, * bh_next;
 635
 636         if (!blksize_size[MAJOR(dev)])
 637                 return;
 638
 639         /* Size must be a power of two, and between 512 and PAGE_SIZE */
 640         if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
 641                 panic("Invalid blocksize passed to set_blocksize");
 642
 643         if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
 644                 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
 645                 return;
 646         }
 647         if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
 648                 return;
 649         sync_buffers(dev, 2);
 650         blksize_size[MAJOR(dev)][MINOR(dev)] = size;
 651
 652  retry:
 653         slept = 0;
 654         spin_lock(&lru_list_lock);
 655         for(nlist = 0; nlist < NR_LIST; nlist++) {
 656                 bh = lru_list[nlist];
 657                 if (!bh)
 658                         continue;
 659                 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
 660                         bh_next = bh->b_next_free;
 661                         if (bh->b_dev != dev || bh->b_size == size)
 662                                 continue;
 663                         if (buffer_locked(bh)) {
 664                                 atomic_inc(&bh->b_count);
 665                                 spin_unlock(&lru_list_lock);
 666                                 wait_on_buffer(bh);
 667                                 slept = 1;
 668                                 spin_lock(&lru_list_lock);
 669                                 atomic_dec(&bh->b_count);
 670                         }
 671
 672                         write_lock(&hash_table_lock);
 673                         if (!atomic_read(&bh->b_count)) {
 674                                 if (buffer_dirty(bh))
 675                                         printk(KERN_WARNING
 676                                                "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
 677                                                kdevname(dev), bh->b_blocknr, bh->b_size);
 678                                 __remove_from_queues(bh);
 679                                 put_last_free(bh);
 680                         } else {
 681                                 if (atomic_set_buffer_clean(bh))
 682                                         __refile_buffer(bh);
 683                                 clear_bit(BH_Uptodate, &bh->b_state);
 684                                 printk(KERN_WARNING
 685                                        "set_blocksize: "
 686                                        "b_count %d, dev %s, block %lu, from %p\n",
 687                                        atomic_read(&bh->b_count), bdevname(bh->b_dev),
 688                                        bh->b_blocknr, __builtin_return_address(0));
 689                         }
 690                         write_unlock(&hash_table_lock);
 691                         if (slept)
 692                                 goto out;
 693                 }
 694         }
 695  out:
 696         spin_unlock(&lru_list_lock);
 697         if (slept)
 698                 goto retry;
 699 }
 700
 701 /*
 702  * We used to try various strange things. Let's not.
 703  */
 704 static void refill_freelist(int size)
 705 {
 706         if (!grow_buffers(size)) {
 707                 wakeup_bdflush(1);
 708                 current->policy |= SCHED_YIELD;
 709                 schedule();
 710         }
 711 }
 712
 713 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
 714 {
 715         bh->b_list = BUF_CLEAN;
 716         bh->b_end_io = handler;
 717         bh->b_private = private;
 718 }
 719
 720 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 721 {
 722         mark_buffer_uptodate(bh, uptodate);
 723         unlock_buffer(bh);
 724 }
 725
 726 static void end_buffer_io_bad(struct buffer_head *bh, int uptodate)
 727 {
 728         mark_buffer_uptodate(bh, uptodate);
 729         unlock_buffer(bh);
 730         BUG();
 731 }
 732
 733 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
 734 {
 735         static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
 736         unsigned long flags;
 737         struct buffer_head *tmp;
 738         struct page *page;
 739
 740         mark_buffer_uptodate(bh, uptodate);
 741
 742         /* This is a temporary buffer used for page I/O. */
 743         page = bh->b_page;
 744
 745         if (!uptodate)
 746                 SetPageError(page);
 747
 748         /*
 749          * Be _very_ careful from here on. Bad things can happen if
 750          * two buffer heads end IO at almost the same time and both
 751          * decide that the page is now completely done.
 752          *
 753          * Async buffer_heads are here only as labels for IO, and get
 754          * thrown away once the IO for this page is complete.  IO is
 755          * deemed complete once all buffers have been visited
 756          * (b_count==0) and are now unlocked. We must make sure that
 757          * only the _last_ buffer that decrements its count is the one
 758          * that unlock the page..
 759          */
 760         spin_lock_irqsave(&page_uptodate_lock, flags);
 761         unlock_buffer(bh);
 762         atomic_dec(&bh->b_count);
 763         tmp = bh->b_this_page;
 764         while (tmp != bh) {
 765                 if (tmp->b_end_io == end_buffer_io_async && buffer_locked(tmp))
 766                         goto still_busy;
 767                 tmp = tmp->b_this_page;
 768         }
 769
 770         /* OK, the async IO on this page is complete. */
 771         spin_unlock_irqrestore(&page_uptodate_lock, flags);
 772
 773         /*
 774          * if none of the buffers had errors then we can set the
 775          * page uptodate:
 776          */
 777         if (!PageError(page))
 778                 SetPageUptodate(page);
 779
 780         /*
 781          * Run the hooks that have to be done when a page I/O has completed.
 782          */
 783         if (PageTestandClearDecrAfter(page))
 784                 atomic_dec(&nr_async_pages);
 785
 786         UnlockPage(page);
 787
 788         return;
 789
 790 still_busy:
 791         spin_unlock_irqrestore(&page_uptodate_lock, flags);
 792         return;
 793 }
 794
 795 /*
 796  * Ok, this is getblk, and it isn't very clear, again to hinder
 797  * race-conditions. Most of the code is seldom used, (ie repeating),
 798  * so it should be much more efficient than it looks.
 799  *
 800  * The algorithm is changed: hopefully better, and an elusive bug removed.
 801  *
 802  * 14.02.92: changed it to sync dirty buffers a bit: better performance
 803  * when the filesystem starts to get full of dirty blocks (I hope).
 804  */
 805 struct buffer_head * getblk(kdev_t dev, int block, int size)
 806 {
 807         struct buffer_head * bh;
 808         int isize;
 809
 810 repeat:
 811         spin_lock(&lru_list_lock);
 812         write_lock(&hash_table_lock);
 813         bh = __get_hash_table(dev, block, size);
 814         if (bh)
 815                 goto out;
 816
 817         isize = BUFSIZE_INDEX(size);
 818         spin_lock(&free_list[isize].lock);
 819         bh = free_list[isize].list;
 820         if (bh) {
 821                 __remove_from_free_list(bh, isize);
 822                 atomic_set(&bh->b_count, 1);
 823         }
 824         spin_unlock(&free_list[isize].lock);
 825
 826         /*
 827          * OK, FINALLY we know that this buffer is the only one of
 828          * its kind, we hold a reference (b_count>0), it is unlocked,
 829          * and it is clean.
 830          */
 831         if (bh) {
 832                 init_buffer(bh, end_buffer_io_sync, NULL);
 833                 bh->b_dev = dev;
 834                 bh->b_blocknr = block;
 835                 bh->b_state = 1 << BH_Mapped;
 836
 837                 /* Insert the buffer into the regular lists */
 838                 __insert_into_queues(bh);
 839         out:
 840                 write_unlock(&hash_table_lock);
 841                 spin_unlock(&lru_list_lock);
 842                 touch_buffer(bh);
 843                 return bh;
 844         }
 845
 846         /*
 847          * If we block while refilling the free list, somebody may
 848          * create the buffer first ... search the hashes again.
 849          */
 850         write_unlock(&hash_table_lock);
 851         spin_unlock(&lru_list_lock);
 852         refill_freelist(size);
 853         goto repeat;
 854 }
 855
 856 /* -1 -> no need to flush
 857     0 -> async flush
 858     1 -> sync flush (wait for I/O completation) */
 859 static int balance_dirty_state(kdev_t dev)
 860 {
 861         unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
 862
 863         dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
 864         tot = nr_free_buffer_pages();
 865         tot -= size_buffers_type[BUF_PROTECTED] >> PAGE_SHIFT;
 866
 867         dirty *= 200;
 868         soft_dirty_limit = tot * bdf_prm.b_un.nfract;
 869         hard_dirty_limit = soft_dirty_limit * 2;
 870
 871         if (dirty > soft_dirty_limit) {
 872                 if (dirty > hard_dirty_limit)
 873                         return 1;
 874                 return 0;
 875         }
 876         return -1;
 877 }
 878
 879 /*
 880  * if a new dirty buffer is created we need to balance bdflush.
 881  *
 882  * in the future we might want to make bdflush aware of different
 883  * pressures on different devices - thus the (currently unused)
 884  * 'dev' parameter.
 885  */
 886 void balance_dirty(kdev_t dev)
 887 {
 888         int state = balance_dirty_state(dev);
 889
 890         if (state < 0)
 891                 return;
 892         wakeup_bdflush(state);
 893 }
 894
 895 static __inline__ void __mark_dirty(struct buffer_head *bh, int flag)
 896 {
 897         bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
 898         refile_buffer(bh);
 899 }
 900
 901 /* atomic version, the user must call balance_dirty() by hand
 902    as soon as it become possible to block */
 903 void __mark_buffer_dirty(struct buffer_head *bh, int flag)
 904 {
 905         if (!atomic_set_buffer_dirty(bh))
 906                 __mark_dirty(bh, flag);
 907 }
 908
 909 void mark_buffer_dirty(struct buffer_head *bh, int flag)
 910 {
 911         __mark_buffer_dirty(bh, flag);
 912         balance_dirty(bh->b_dev);
 913 }
 914
 915 /*
 916  * A buffer may need to be moved from one buffer list to another
 917  * (e.g. in case it is not shared any more). Handle this.
 918  */
 919 static void __refile_buffer(struct buffer_head *bh)
 920 {
 921         int dispose = BUF_CLEAN;
 922         if (buffer_locked(bh))
 923                 dispose = BUF_LOCKED;
 924         if (buffer_dirty(bh))
 925                 dispose = BUF_DIRTY;
 926         if (buffer_protected(bh))
 927                 dispose = BUF_PROTECTED;
 928         if (dispose != bh->b_list) {
 929                 __remove_from_lru_list(bh, bh->b_list);
 930                 bh->b_list = dispose;
 931                 __insert_into_lru_list(bh, dispose);
 932         }
 933 }
 934
 935 void refile_buffer(struct buffer_head *bh)
 936 {
 937         spin_lock(&lru_list_lock);
 938         __refile_buffer(bh);
 939         spin_unlock(&lru_list_lock);
 940 }
 941
 942 /*
 943  * Release a buffer head
 944  */
 945 void __brelse(struct buffer_head * buf)
 946 {
 947         if (atomic_read(&buf->b_count)) {
 948                 atomic_dec(&buf->b_count);
 949                 return;
 950         }
 951         printk("VFS: brelse: Trying to free free buffer\n");
 952 }
 953
 954 /*
 955  * bforget() is like brelse(), except it puts the buffer on the
 956  * free list if it can.. We can NOT free the buffer if:
 957  *  - there are other users of it
 958  *  - it is locked and thus can have active IO
 959  */
 960 void __bforget(struct buffer_head * buf)
 961 {
 962         /* grab the lru lock here to block bdflush. */
 963         spin_lock(&lru_list_lock);
 964         write_lock(&hash_table_lock);
 965         if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf))
 966                 goto in_use;
 967         __hash_unlink(buf);
 968         write_unlock(&hash_table_lock);
 969         __remove_from_lru_list(buf, buf->b_list);
 970         spin_unlock(&lru_list_lock);
 971         put_last_free(buf);
 972         return;
 973
 974  in_use:
 975         write_unlock(&hash_table_lock);
 976         spin_unlock(&lru_list_lock);
 977 }
 978
 979 /*
 980  * bread() reads a specified block and returns the buffer that contains
 981  * it. It returns NULL if the block was unreadable.
 982  */
 983 struct buffer_head * bread(kdev_t dev, int block, int size)
 984 {
 985         struct buffer_head * bh;
 986
 987         bh = getblk(dev, block, size);
 988         if (buffer_uptodate(bh))
 989                 return bh;
 990         ll_rw_block(READ, 1, &bh);
 991         wait_on_buffer(bh);
 992         if (buffer_uptodate(bh))
 993                 return bh;
 994         brelse(bh);
 995         return NULL;
 996 }
 997
 998 /*
 999  * Ok, breada can be used as bread, but additionally to mark other
1000  * blocks for reading as well. End the argument list with a negative
1001  * number.
1002  */
1003
1004 #define NBUF 16
1005
1006 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
1007         unsigned int pos, unsigned int filesize)
1008 {
1009         struct buffer_head * bhlist[NBUF];
1010         unsigned int blocks;
1011         struct buffer_head * bh;
1012         int index;
1013         int i, j;
1014
1015         if (pos >= filesize)
1016                 return NULL;
1017
1018         if (block < 0)
1019                 return NULL;
1020
1021         bh = getblk(dev, block, bufsize);
1022         index = BUFSIZE_INDEX(bh->b_size);
1023
1024         if (buffer_uptodate(bh))
1025                 return(bh);
1026         else ll_rw_block(READ, 1, &bh);
1027
1028         blocks = (filesize - pos) >> (9+index);
1029
1030         if (blocks < (read_ahead[MAJOR(dev)] >> index))
1031                 blocks = read_ahead[MAJOR(dev)] >> index;
1032         if (blocks > NBUF)
1033                 blocks = NBUF;
1034
1035 /*      if (blocks) printk("breada (new) %d blocks\n",blocks); */
1036
1037         bhlist[0] = bh;
1038         j = 1;
1039         for(i=1; i<blocks; i++) {
1040                 bh = getblk(dev,block+i,bufsize);
1041                 if (buffer_uptodate(bh)) {
1042                         brelse(bh);
1043                         break;
1044                 }
1045                 else bhlist[j++] = bh;
1046         }
1047
1048         /* Request the read for these buffers, and then release them. */
1049         if (j>1)
1050                 ll_rw_block(READA, (j-1), bhlist+1);
1051         for(i=1; i<j; i++)
1052                 brelse(bhlist[i]);
1053
1054         /* Wait for this buffer, and then continue on. */
1055         bh = bhlist[0];
1056         wait_on_buffer(bh);
1057         if (buffer_uptodate(bh))
1058                 return bh;
1059         brelse(bh);
1060         return NULL;
1061 }
1062
1063 /*
1064  * Note: the caller should wake up the buffer_wait list if needed.
1065  */
1066 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
1067 {
1068         if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1069                 kmem_cache_free(bh_cachep, bh);
1070         } else {
1071                 bh->b_blocknr = -1;
1072                 init_waitqueue_head(&bh->b_wait);
1073                 nr_unused_buffer_heads++;
1074                 bh->b_next_free = unused_list;
1075                 bh->b_this_page = NULL;
1076                 unused_list = bh;
1077         }
1078 }
1079
1080 /*
1081  * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1082  * no-buffer-head deadlock.  Return NULL on failure; waiting for
1083  * buffer heads is now handled in create_buffers().
1084  */
1085 static struct buffer_head * get_unused_buffer_head(int async)
1086 {
1087         struct buffer_head * bh;
1088
1089         spin_lock(&unused_list_lock);
1090         if (nr_unused_buffer_heads > NR_RESERVED) {
1091                 bh = unused_list;
1092                 unused_list = bh->b_next_free;
1093                 nr_unused_buffer_heads--;
1094                 spin_unlock(&unused_list_lock);
1095                 return bh;
1096         }
1097         spin_unlock(&unused_list_lock);
1098
1099         /* This is critical.  We can't swap out pages to get
1100          * more buffer heads, because the swap-out may need
1101          * more buffer-heads itself.  Thus SLAB_BUFFER.
1102          */
1103         if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
1104                 memset(bh, 0, sizeof(*bh));
1105                 init_waitqueue_head(&bh->b_wait);
1106                 return bh;
1107         }
1108
1109         /*
1110          * If we need an async buffer, use the reserved buffer heads.
1111          */
1112         if (async) {
1113                 spin_lock(&unused_list_lock);
1114                 if (unused_list) {
1115                         bh = unused_list;
1116                         unused_list = bh->b_next_free;
1117                         nr_unused_buffer_heads--;
1118                         spin_unlock(&unused_list_lock);
1119                         return bh;
1120                 }
1121                 spin_unlock(&unused_list_lock);
1122         }
1123 #if 0
1124         /*
1125          * (Pending further analysis ...)
1126          * Ordinary (non-async) requests can use a different memory priority
1127          * to free up pages. Any swapping thus generated will use async
1128          * buffer heads.
1129          */
1130         if(!async &&
1131            (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1132                 memset(bh, 0, sizeof(*bh));
1133                 init_waitqueue_head(&bh->b_wait);
1134                 return bh;
1135         }
1136 #endif
1137
1138         return NULL;
1139 }
1140
1141 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1142 {
1143         bh->b_page = page;
1144         if (offset >= PAGE_SIZE)
1145                 BUG();
1146         if (PageHighMem(page))
1147                 /*
1148                  * This catches illegal uses and preserves the offset:
1149                  */
1150                 bh->b_data = (char *)(0 + offset);
1151         else
1152                 bh->b_data = page_address(page) + offset;
1153 }
1154
1155 /*
1156  * Create the appropriate buffers when given a page for data area and
1157  * the size of each buffer.. Use the bh->b_this_page linked list to
1158  * follow the buffers created.  Return NULL if unable to create more
1159  * buffers.
1160  * The async flag is used to differentiate async IO (paging, swapping)
1161  * from ordinary buffer allocations, and only async requests are allowed
1162  * to sleep waiting for buffer heads.
1163  */
1164 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1165 {
1166         struct buffer_head *bh, *head;
1167         long offset;
1168
1169 try_again:
1170         head = NULL;
1171         offset = PAGE_SIZE;
1172         while ((offset -= size) >= 0) {
1173                 bh = get_unused_buffer_head(async);
1174                 if (!bh)
1175                         goto no_grow;
1176
1177                 bh->b_dev = B_FREE;  /* Flag as unused */
1178                 bh->b_this_page = head;
1179                 head = bh;
1180
1181                 bh->b_state = 0;
1182                 bh->b_next_free = NULL;
1183                 bh->b_pprev = NULL;
1184                 atomic_set(&bh->b_count, 0);
1185                 bh->b_size = size;
1186
1187                 set_bh_page(bh, page, offset);
1188
1189                 bh->b_list = BUF_CLEAN;
1190                 bh->b_end_io = end_buffer_io_bad;
1191         }
1192         return head;
1193 /*
1194  * In case anything failed, we just free everything we got.
1195  */
1196 no_grow:
1197         if (head) {
1198                 spin_lock(&unused_list_lock);
1199                 do {
1200                         bh = head;
1201                         head = head->b_this_page;
1202                         __put_unused_buffer_head(bh);
1203                 } while (head);
1204                 spin_unlock(&unused_list_lock);
1205
1206                 /* Wake up any waiters ... */
1207                 wake_up(&buffer_wait);
1208         }
1209
1210         /*
1211          * Return failure for non-async IO requests.  Async IO requests
1212          * are not allowed to fail, so we have to wait until buffer heads
1213          * become available.  But we don't want tasks sleeping with
1214          * partially complete buffers, so all were released above.
1215          */
1216         if (!async)
1217                 return NULL;
1218
1219         /* We're _really_ low on memory. Now we just
1220          * wait for old buffer heads to become free due to
1221          * finishing IO.  Since this is an async request and
1222          * the reserve list is empty, we're sure there are
1223          * async buffer heads in use.
1224          */
1225         run_task_queue(&tq_disk);
1226
1227         /*
1228          * Set our state for sleeping, then check again for buffer heads.
1229          * This ensures we won't miss a wake_up from an interrupt.
1230          */
1231         wait_event(buffer_wait, nr_unused_buffer_heads >= MAX_BUF_PER_PAGE);
1232         goto try_again;
1233 }
1234
1235 static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size)
1236 {
1237         struct buffer_head *head, *bh, *tail;
1238         int block;
1239
1240         if (!PageLocked(page))
1241                 BUG();
1242         /*
1243          * Allocate async buffer heads pointing to this page, just for I/O.
1244          * They don't show up in the buffer hash table, but they *are*
1245          * registered in page->buffers.
1246          */
1247         head = create_buffers(page, size, 1);
1248         if (page->buffers)
1249                 BUG();
1250         if (!head)
1251                 BUG();
1252         tail = head;
1253         for (bh = head; bh; bh = bh->b_this_page) {
1254                 block = *(b++);
1255
1256                 tail = bh;
1257                 init_buffer(bh, end_buffer_io_async, NULL);
1258                 bh->b_dev = dev;
1259                 bh->b_blocknr = block;
1260
1261                 set_bit(BH_Mapped, &bh->b_state);
1262         }
1263         tail->b_this_page = head;
1264         page_cache_get(page);
1265         page->buffers = head;
1266         return 0;
1267 }
1268
1269 static void unmap_buffer(struct buffer_head * bh)
1270 {
1271         if (buffer_mapped(bh)) {
1272                 mark_buffer_clean(bh);
1273                 wait_on_buffer(bh);
1274                 clear_bit(BH_Uptodate, &bh->b_state);
1275                 clear_bit(BH_Mapped, &bh->b_state);
1276                 clear_bit(BH_Req, &bh->b_state);
1277                 clear_bit(BH_New, &bh->b_state);
1278         }
1279 }
1280
1281 /*
1282  * We don't have to release all buffers here, but
1283  * we have to be sure that no dirty buffer is left
1284  * and no IO is going on (no buffer is locked), because
1285  * we have truncated the file and are going to free the
1286  * blocks on-disk..
1287  */
1288 int block_flushpage(struct page *page, unsigned long offset)
1289 {
1290         struct buffer_head *head, *bh, *next;
1291         unsigned int curr_off = 0;
1292
1293         if (!PageLocked(page))
1294                 BUG();
1295         if (!page->buffers)
1296                 return 1;
1297
1298         head = page->buffers;
1299         bh = head;
1300         do {
1301                 unsigned int next_off = curr_off + bh->b_size;
1302                 next = bh->b_this_page;
1303
1304                 /*
1305                  * is this block fully flushed?
1306                  */
1307                 if (offset <= curr_off)
1308                         unmap_buffer(bh);
1309                 curr_off = next_off;
1310                 bh = next;
1311         } while (bh != head);
1312
1313         /*
1314          * subtle. We release buffer-heads only if this is
1315          * the 'final' flushpage. We have invalidated the get_block
1316          * cached value unconditionally, so real IO is not
1317          * possible anymore.
1318          *
1319          * If the free doesn't work out, the buffers can be
1320          * left around - they just turn into anonymous buffers
1321          * instead.
1322          */
1323         if (!offset) {
1324                 if (!try_to_free_buffers(page, 0)) {
1325                         atomic_inc(&buffermem_pages);
1326                         return 0;
1327                 }
1328         }
1329
1330         return 1;
1331 }
1332
1333 static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
1334 {
1335         struct buffer_head *bh, *head, *tail;
1336
1337         head = create_buffers(page, blocksize, 1);
1338         if (page->buffers)
1339                 BUG();
1340
1341         bh = head;
1342         do {
1343                 bh->b_dev = inode->i_dev;
1344                 bh->b_blocknr = 0;
1345                 bh->b_end_io = end_buffer_io_bad;
1346                 tail = bh;
1347                 bh = bh->b_this_page;
1348         } while (bh);
1349         tail->b_this_page = head;
1350         page->buffers = head;
1351         page_cache_get(page);
1352 }
1353
1354 /*
1355  * We are taking a block for data and we don't want any output from any
1356  * buffer-cache aliases starting from return from that function and
1357  * until the moment when something will explicitly mark the buffer
1358  * dirty (hopefully that will not happen until we will free that block ;-)
1359  * We don't even need to mark it not-uptodate - nobody can expect
1360  * anything from a newly allocated buffer anyway. We used to used
1361  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1362  * don't want to mark the alias unmapped, for example - it would confuse
1363  * anyone who might pick it with bread() afterwards...
1364  */
1365
1366 static void unmap_underlying_metadata(struct buffer_head * bh)
1367 {
1368         struct buffer_head *old_bh;
1369
1370         old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1371         if (old_bh) {
1372                 mark_buffer_clean(old_bh);
1373                 wait_on_buffer(old_bh);
1374                 clear_bit(BH_Req, &old_bh->b_state);
1375                 /* Here we could run brelse or bforget. We use
1376                    bforget because it will try to put the buffer
1377                    in the freelist. */
1378                 __bforget(old_bh);
1379         }
1380 }
1381
1382 /*
1383  * block_write_full_page() is SMP-safe - currently it's still
1384  * being called with the kernel lock held, but the code is ready.
1385  */
1386 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1387 {
1388         int err, i, need_balance_dirty = 0;
1389         unsigned long block;
1390         struct buffer_head *bh, *head;
1391
1392         if (!PageLocked(page))
1393                 BUG();
1394
1395         if (!page->buffers)
1396                 create_empty_buffers(page, inode, inode->i_sb->s_blocksize);
1397         head = page->buffers;
1398
1399         block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1400
1401         bh = head;
1402         i = 0;
1403         do {
1404                 /*
1405                  * If the buffer isn't up-to-date, we can't be sure
1406                  * that the buffer has been initialized with the proper
1407                  * block number information etc..
1408                  *
1409                  * Leave it to the low-level FS to make all those
1410                  * decisions (block #0 may actually be a valid block)
1411                  */
1412                 bh->b_end_io = end_buffer_io_sync;
1413                 if (!buffer_mapped(bh)) {
1414                         err = get_block(inode, block, bh, 1);
1415                         if (err)
1416                                 goto out;
1417                         if (buffer_new(bh))
1418                                 unmap_underlying_metadata(bh);
1419                 }
1420                 set_bit(BH_Uptodate, &bh->b_state);
1421                 if (!atomic_set_buffer_dirty(bh)) {
1422                         __mark_dirty(bh, 0);
1423                         need_balance_dirty = 1;
1424                 }
1425
1426                 bh = bh->b_this_page;
1427                 block++;
1428         } while (bh != head);
1429
1430         if (need_balance_dirty)
1431                 balance_dirty(bh->b_dev);
1432
1433         SetPageUptodate(page);
1434         return 0;
1435 out:
1436         ClearPageUptodate(page);
1437         return err;
1438 }
1439
1440 static int __block_prepare_write(struct inode *inode, struct page *page,
1441                 unsigned from, unsigned to, get_block_t *get_block)
1442 {
1443         unsigned block_start, block_end;
1444         unsigned long block;
1445         int err = 0;
1446         unsigned blocksize, bbits;
1447         struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1448         char *kaddr = (char *)kmap(page);
1449
1450         blocksize = inode->i_sb->s_blocksize;
1451         if (!page->buffers)
1452                 create_empty_buffers(page, inode, blocksize);
1453         head = page->buffers;
1454
1455         bbits = inode->i_sb->s_blocksize_bits;
1456         block = page->index << (PAGE_CACHE_SHIFT - bbits);
1457
1458         for(bh = head, block_start = 0; bh != head || !block_start;
1459             block++, block_start=block_end, bh = bh->b_this_page) {
1460                 if (!bh)
1461                         BUG();
1462                 block_end = block_start+blocksize;
1463                 if (block_end <= from)
1464                         continue;
1465                 if (block_start >= to)
1466                         break;
1467                 bh->b_end_io = end_buffer_io_sync;
1468                 if (!buffer_mapped(bh)) {
1469                         err = get_block(inode, block, bh, 1);
1470                         if (err)
1471                                 goto out;
1472                         if (buffer_new(bh)) {
1473                                 unmap_underlying_metadata(bh);
1474                                 if (block_end > to)
1475                                         memset(kaddr+to, 0, block_end-to);
1476                                 if (block_start < from)
1477                                         memset(kaddr+block_start, 0, from-block_start);
1478                                 if (block_end > to || block_start < from)
1479                                         flush_dcache_page(page);
1480                                 continue;
1481                         }
1482                 }
1483                 if (!buffer_uptodate(bh) &&
1484                      (block_start < from || block_end > to)) {
1485                         ll_rw_block(READ, 1, &bh);
1486                         *wait_bh++=bh;
1487                 }
1488         }
1489         /*
1490          * If we issued read requests - let them complete.
1491          */
1492         while(wait_bh > wait) {
1493                 wait_on_buffer(*--wait_bh);
1494                 err = -EIO;
1495                 if (!buffer_uptodate(*wait_bh))
1496                         goto out;
1497         }
1498         return 0;
1499 out:
1500         return err;
1501 }
1502
1503 static int __block_commit_write(struct inode *inode, struct page *page,
1504                 unsigned from, unsigned to)
1505 {
1506         unsigned block_start, block_end;
1507         int partial = 0, need_balance_dirty = 0;
1508         unsigned blocksize;
1509         struct buffer_head *bh, *head;
1510
1511         blocksize = inode->i_sb->s_blocksize;
1512
1513         for(bh = head = page->buffers, block_start = 0;
1514             bh != head || !block_start;
1515             block_start=block_end, bh = bh->b_this_page) {
1516                 block_end = block_start + blocksize;
1517                 /* This can happen for the truncate case */
1518                 if (!buffer_mapped(bh))
1519                         continue;
1520                 if (block_end <= from || block_start >= to) {
1521                         if (!buffer_uptodate(bh))
1522                                 partial = 1;
1523                 } else {
1524                         set_bit(BH_Uptodate, &bh->b_state);
1525                         if (!atomic_set_buffer_dirty(bh)) {
1526                                 __mark_dirty(bh, 0);
1527                                 need_balance_dirty = 1;
1528                         }
1529                 }
1530         }
1531
1532         if (need_balance_dirty)
1533                 balance_dirty(bh->b_dev);
1534         /*
1535          * is this a partial write that happened to make all buffers
1536          * uptodate then we can optimize away a bogus readpage() for
1537          * the next read(). Here we 'discover' wether the page went
1538          * uptodate as a result of this (potentially partial) write.
1539          */
1540         if (!partial)
1541                 SetPageUptodate(page);
1542         return 0;
1543 }
1544
1545 /*
1546  * Generic "read page" function for block devices that have the normal
1547  * get_block functionality. This is most of the block device filesystems.
1548  * Reads the page asynchronously --- the unlock_buffer() and
1549  * mark_buffer_uptodate() functions propagate buffer state into the
1550  * page struct once IO has completed.
1551  */
1552 int block_read_full_page(struct page *page, get_block_t *get_block)
1553 {
1554         struct inode *inode = (struct inode*)page->mapping->host;
1555         unsigned long iblock, lblock;
1556         struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1557         unsigned int blocksize, blocks;
1558         unsigned long kaddr = 0;
1559         int nr, i;
1560
1561         if (!PageLocked(page))
1562                 PAGE_BUG(page);
1563         blocksize = inode->i_sb->s_blocksize;
1564         if (!page->buffers)
1565                 create_empty_buffers(page, inode, blocksize);
1566         head = page->buffers;
1567
1568         blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits;
1569         iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1570         lblock = (inode->i_size+blocksize-1) >> inode->i_sb->s_blocksize_bits;
1571         bh = head;
1572         nr = 0;
1573         i = 0;
1574
1575         do {
1576                 if (buffer_uptodate(bh))
1577                         continue;
1578
1579                 if (!buffer_mapped(bh)) {
1580                         if (iblock < lblock)
1581                                 get_block(inode, iblock, bh, 0);
1582                         if (!buffer_mapped(bh)) {
1583                                 if (!kaddr)
1584                                         kaddr = kmap(page);
1585                                 memset((char *)(kaddr + i*blocksize), 0, blocksize);
1586                                 flush_dcache_page(page);
1587                                 set_bit(BH_Uptodate, &bh->b_state);
1588                                 continue;
1589                         }
1590                 }
1591
1592                 init_buffer(bh, end_buffer_io_async, NULL);
1593                 atomic_inc(&bh->b_count);
1594                 arr[nr] = bh;
1595                 nr++;
1596         } while (i++, iblock++, (bh = bh->b_this_page) != head);
1597
1598         if (nr) {
1599                 if (Page_Uptodate(page))
1600                         BUG();
1601                 ll_rw_block(READ, nr, arr);
1602         } else {
1603                 /*
1604                  * all buffers are uptodate - we can set the page
1605                  * uptodate as well.
1606                  */
1607                 SetPageUptodate(page);
1608                 UnlockPage(page);
1609         }
1610         if (kaddr)
1611                 kunmap(page);
1612         return 0;
1613 }
1614
1615 /*
1616  * For moronic filesystems that do not allow holes in file.
1617  * We may have to extend the file.
1618  */
1619
1620 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1621 {
1622         struct address_space *mapping = page->mapping;
1623         struct inode *inode = (struct inode*)mapping->host;
1624         struct page *new_page;
1625         unsigned long pgpos;
1626         long status;
1627         unsigned zerofrom;
1628         unsigned blocksize = inode->i_sb->s_blocksize;
1629         char *kaddr;
1630
1631         while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1632                 status = -ENOMEM;
1633                 new_page = grab_cache_page(mapping, pgpos);
1634                 if (!new_page)
1635                         goto out;
1636                 /* we might sleep */
1637                 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1638                         UnlockPage(new_page);
1639                         page_cache_release(new_page);
1640                         continue;
1641                 }
1642                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1643                 if (zerofrom & (blocksize-1)) {
1644                         *bytes |= (blocksize-1);
1645                         (*bytes)++;
1646                 }
1647                 status = __block_prepare_write(inode, new_page, zerofrom,
1648                                                 PAGE_CACHE_SIZE, get_block);
1649                 if (status)
1650                         goto out_unmap;
1651                 kaddr = page_address(new_page);
1652                 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1653                 flush_dcache_page(new_page);
1654                 __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
1655                 kunmap(new_page);
1656                 UnlockPage(new_page);
1657                 page_cache_release(new_page);
1658         }
1659
1660         if (page->index < pgpos) {
1661                 /* completely inside the area */
1662                 zerofrom = offset;
1663         } else {
1664                 /* page covers the boundary, find the boundary offset */
1665                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1666
1667                 /* if we will expand the thing last block will be filled */
1668                 if (to > zerofrom && (zerofrom & (blocksize-1))) {
1669                         *bytes |= (blocksize-1);
1670                         (*bytes)++;
1671                 }
1672
1673                 /* starting below the boundary? Nothing to zero out */
1674                 if (offset <= zerofrom)
1675                         zerofrom = offset;
1676         }
1677         status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1678         if (status)
1679                 goto out1;
1680         kaddr = page_address(page);
1681         if (zerofrom < offset) {
1682                 memset(kaddr+zerofrom, 0, offset-zerofrom);
1683                 flush_dcache_page(page);
1684                 __block_commit_write(inode, page, zerofrom, offset);
1685         }
1686         return 0;
1687 out1:
1688         ClearPageUptodate(page);
1689         kunmap(page);
1690         return status;
1691
1692 out_unmap:
1693         ClearPageUptodate(new_page);
1694         kunmap(new_page);
1695         UnlockPage(new_page);
1696         page_cache_release(new_page);
1697 out:
1698         return status;
1699 }
1700
1701 int block_prepare_write(struct page *page, unsigned from, unsigned to,
1702                         get_block_t *get_block)
1703 {
1704         struct inode *inode = (struct inode*)page->mapping->host;
1705         int err = __block_prepare_write(inode, page, from, to, get_block);
1706         if (err) {
1707                 ClearPageUptodate(page);
1708                 kunmap(page);
1709         }
1710         return err;
1711 }
1712
1713 int generic_commit_write(struct file *file, struct page *page,
1714                 unsigned from, unsigned to)
1715 {
1716         struct inode *inode = (struct inode*)page->mapping->host;
1717         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1718         __block_commit_write(inode,page,from,to);
1719         kunmap(page);
1720         if (pos > inode->i_size) {
1721                 inode->i_size = pos;
1722                 mark_inode_dirty(inode);
1723         }
1724         return 0;
1725 }
1726
1727 int block_zero_page(struct address_space *mapping, loff_t from, unsigned length)
1728 {
1729         unsigned long index = from >> PAGE_CACHE_SHIFT;
1730         unsigned offset = from & (PAGE_CACHE_SIZE-1);
1731         struct inode *inode = (struct inode *)mapping->host;
1732         struct page *page;
1733         int err;
1734
1735         if (!length)
1736                 return 0;
1737
1738         page = read_cache_page(mapping, index,
1739                                 (filler_t *)mapping->a_ops->readpage, NULL);
1740         err = PTR_ERR(page);
1741         if (IS_ERR(page))
1742                 goto out;
1743         lock_page(page);
1744         err = -EIO;
1745         if (!Page_Uptodate(page))
1746                 goto unlock;
1747
1748         memset((char *) kmap(page) + offset, 0, length);
1749         flush_dcache_page(page);
1750         __block_commit_write(inode, page, offset, offset+length);
1751         kunmap(page);
1752         err = 0;
1753
1754 unlock:
1755         UnlockPage(page);
1756         page_cache_release(page);
1757 out:
1758         return err;
1759 }
1760
1761 int block_write_full_page(struct page *page, get_block_t *get_block)
1762 {
1763         struct inode *inode = (struct inode*)page->mapping->host;
1764         unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1765         unsigned offset;
1766         int err;
1767
1768         /* easy case */
1769         if (page->index < end_index)
1770                 return __block_write_full_page(inode, page, get_block);
1771
1772         /* things got complicated... */
1773         offset = inode->i_size & (PAGE_CACHE_SIZE-1);
1774         /* OK, are we completely out? */
1775         if (page->index >= end_index+1 || !offset)
1776                 return -EIO;
1777         /* Sigh... will have to work, then... */
1778         err = __block_prepare_write(inode, page, 0, offset, get_block);
1779         if (!err) {
1780                 memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset);
1781                 flush_dcache_page(page);
1782                 __block_commit_write(inode,page,0,offset);
1783 done:
1784                 kunmap(page);
1785                 return err;
1786         }
1787         ClearPageUptodate(page);
1788         goto done;
1789 }
1790
1791 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
1792 {
1793         struct buffer_head tmp;
1794         struct inode *inode = (struct inode*)mapping->host;
1795         tmp.b_state = 0;
1796         tmp.b_blocknr = 0;
1797         get_block(inode, block, &tmp, 0);
1798         return tmp.b_blocknr;
1799 }
1800
1801 /*
1802  * IO completion routine for a buffer_head being used for kiobuf IO: we
1803  * can't dispatch the kiobuf callback until io_count reaches 0.
1804  */
1805
1806 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
1807 {
1808         struct kiobuf *kiobuf;
1809
1810         mark_buffer_uptodate(bh, uptodate);
1811
1812         kiobuf = bh->b_private;
1813         unlock_buffer(bh);
1814         end_kio_request(kiobuf, uptodate);
1815 }
1816
1817
1818 /*
1819  * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
1820  * for them to complete.  Clean up the buffer_heads afterwards.
1821  */
1822
1823 static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size)
1824 {
1825         int iosize;
1826         int i;
1827         struct buffer_head *tmp;
1828
1829
1830         iosize = 0;
1831         spin_lock(&unused_list_lock);
1832
1833         for (i = nr; --i >= 0; ) {
1834                 iosize += size;
1835                 tmp = bh[i];
1836                 if (buffer_locked(tmp)) {
1837                         spin_unlock(&unused_list_lock);
1838                         wait_on_buffer(tmp);
1839                         spin_lock(&unused_list_lock);
1840                 }
1841
1842                 if (!buffer_uptodate(tmp)) {
1843                         /* We are traversing bh'es in reverse order so
1844                            clearing iosize on error calculates the
1845                            amount of IO before the first error. */
1846                         iosize = 0;
1847                 }
1848                 __put_unused_buffer_head(tmp);
1849         }
1850
1851         spin_unlock(&unused_list_lock);
1852
1853         return iosize;
1854 }
1855
1856 /*
1857  * Start I/O on a physical range of kernel memory, defined by a vector
1858  * of kiobuf structs (much like a user-space iovec list).
1859  *
1860  * The kiobuf must already be locked for IO.  IO is submitted
1861  * asynchronously: you need to check page->locked, page->uptodate, and
1862  * maybe wait on page->wait.
1863  *
1864  * It is up to the caller to make sure that there are enough blocks
1865  * passed in to completely map the iobufs to disk.
1866  */
1867
1868 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
1869                kdev_t dev, unsigned long b[], int size)
1870 {
1871         int             err;
1872         int             length;
1873         int             transferred;
1874         int             i;
1875         int             bufind;
1876         int             pageind;
1877         int             bhind;
1878         int             offset;
1879         int             sectors = size>>9;
1880         unsigned long   blocknr;
1881         struct kiobuf * iobuf = NULL;
1882         struct page *   map;
1883         struct buffer_head *tmp, *bh[KIO_MAX_SECTORS];
1884
1885         if (!nr)
1886                 return 0;
1887
1888         /*
1889          * First, do some alignment and validity checks
1890          */
1891         for (i = 0; i < nr; i++) {
1892                 iobuf = iovec[i];
1893                 if ((iobuf->offset & (size-1)) ||
1894                     (iobuf->length & (size-1)))
1895                         return -EINVAL;
1896                 if (!iobuf->nr_pages)
1897                         panic("brw_kiovec: iobuf not initialised");
1898         }
1899
1900         /*
1901          * OK to walk down the iovec doing page IO on each page we find.
1902          */
1903         bufind = bhind = transferred = err = 0;
1904         for (i = 0; i < nr; i++) {
1905                 iobuf = iovec[i];
1906                 offset = iobuf->offset;
1907                 length = iobuf->length;
1908                 iobuf->errno = 0;
1909
1910                 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
1911                         map  = iobuf->maplist[pageind];
1912                         if (!map) {
1913                                 err = -EFAULT;
1914                                 goto error;
1915                         }
1916
1917                         while (length > 0) {
1918                                 blocknr = b[bufind++];
1919                                 tmp = get_unused_buffer_head(0);
1920                                 if (!tmp) {
1921                                         err = -ENOMEM;
1922                                         goto error;
1923                                 }
1924
1925                                 tmp->b_dev = B_FREE;
1926                                 tmp->b_size = size;
1927                                 set_bh_page(tmp, map, offset);
1928                                 tmp->b_this_page = tmp;
1929
1930                                 init_buffer(tmp, end_buffer_io_kiobuf, iobuf);
1931                                 tmp->b_rdev = tmp->b_dev = dev;
1932                                 tmp->b_blocknr = blocknr;
1933                                 tmp->b_rsector = blocknr*sectors;
1934                                 tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req);
1935
1936                                 if (rw == WRITE) {
1937                                         set_bit(BH_Uptodate, &tmp->b_state);
1938                                         set_bit(BH_Dirty, &tmp->b_state);
1939                                 }
1940
1941                                 bh[bhind++] = tmp;
1942                                 length -= size;
1943                                 offset += size;
1944
1945                                 atomic_inc(&iobuf->io_count);
1946
1947                                 generic_make_request(rw, tmp);
1948                                 /*
1949                                  * Wait for IO if we have got too much
1950                                  */
1951                                 if (bhind >= KIO_MAX_SECTORS) {
1952                                         err = wait_kio(rw, bhind, bh, size);
1953                                         if (err >= 0)
1954                                                 transferred += err;
1955                                         else
1956                                                 goto finished;
1957                                         bhind = 0;
1958                                 }
1959
1960                                 if (offset >= PAGE_SIZE) {
1961                                         offset = 0;
1962                                         break;
1963                                 }
1964                         } /* End of block loop */
1965                 } /* End of page loop */
1966         } /* End of iovec loop */
1967
1968         /* Is there any IO still left to submit? */
1969         if (bhind) {
1970                 err = wait_kio(rw, bhind, bh, size);
1971                 if (err >= 0)
1972                         transferred += err;
1973                 else
1974                         goto finished;
1975         }
1976
1977  finished:
1978         if (transferred)
1979                 return transferred;
1980         return err;
1981
1982  error:
1983         /* We got an error allocating the bh'es.  Just free the current
1984            buffer_heads and exit. */
1985         spin_lock(&unused_list_lock);
1986         for (i = bhind; --i >= 0; ) {
1987                 __put_unused_buffer_head(bh[bhind]);
1988         }
1989         spin_unlock(&unused_list_lock);
1990         goto finished;
1991 }
1992
1993 /*
1994  * Start I/O on a page.
1995  * This function expects the page to be locked and may return
1996  * before I/O is complete. You then have to check page->locked,
1997  * page->uptodate, and maybe wait on page->wait.
1998  *
1999  * brw_page() is SMP-safe, although it's being called with the
2000  * kernel lock held - but the code is ready.
2001  *
2002  * FIXME: we need a swapper_inode->get_block function to remove
2003  *        some of the bmap kludges and interface ugliness here.
2004  */
2005 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
2006 {
2007         struct buffer_head *head, *bh, *arr[MAX_BUF_PER_PAGE];
2008         int nr, fresh /* temporary debugging flag */, block;
2009
2010         if (!PageLocked(page))
2011                 panic("brw_page: page not locked for I/O");
2012 //      ClearPageError(page);
2013         /*
2014          * We pretty much rely on the page lock for this, because
2015          * create_page_buffers() might sleep.
2016          */
2017         fresh = 0;
2018         if (!page->buffers) {
2019                 create_page_buffers(rw, page, dev, b, size);
2020                 fresh = 1;
2021         }
2022         if (!page->buffers)
2023                 BUG();
2024
2025         head = page->buffers;
2026         bh = head;
2027         nr = 0;
2028         do {
2029                 block = *(b++);
2030
2031                 if (fresh && (atomic_read(&bh->b_count) != 0))
2032                         BUG();
2033                 if (rw == READ) {
2034                         if (!fresh)
2035                                 BUG();
2036                         if (!buffer_uptodate(bh)) {
2037                                 arr[nr++] = bh;
2038                                 atomic_inc(&bh->b_count);
2039                         }
2040                 } else { /* WRITE */
2041                         if (!bh->b_blocknr) {
2042                                 if (!block)
2043                                         BUG();
2044                                 bh->b_blocknr = block;
2045                         } else {
2046                                 if (!block)
2047                                         BUG();
2048                         }
2049                         set_bit(BH_Uptodate, &bh->b_state);
2050                         set_bit(BH_Dirty, &bh->b_state);
2051                         arr[nr++] = bh;
2052                         atomic_inc(&bh->b_count);
2053                 }
2054                 bh = bh->b_this_page;
2055         } while (bh != head);
2056         if ((rw == READ) && nr) {
2057                 if (Page_Uptodate(page))
2058                         BUG();
2059                 ll_rw_block(rw, nr, arr);
2060         } else {
2061                 if (!nr && rw == READ) {
2062                         SetPageUptodate(page);
2063                         UnlockPage(page);
2064                 }
2065                 if (nr && (rw == WRITE))
2066                         ll_rw_block(rw, nr, arr);
2067         }
2068         return 0;
2069 }
2070
2071 int block_symlink(struct inode *inode, const char *symname, int len)
2072 {
2073         struct address_space *mapping = inode->i_mapping;
2074         struct page *page = grab_cache_page(mapping, 0);
2075         int err = -ENOMEM;
2076         char *kaddr;
2077
2078         if (!page)
2079                 goto fail;
2080         err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2081         if (err)
2082                 goto fail_map;
2083         kaddr = page_address(page);
2084         memcpy(kaddr, symname, len-1);
2085         mapping->a_ops->commit_write(NULL, page, 0, len-1);
2086         /*
2087          * Notice that we are _not_ going to block here - end of page is
2088          * unmapped, so this will only try to map the rest of page, see
2089          * that it is unmapped (typically even will not look into inode -
2090          * ->i_size will be enough for everything) and zero it out.
2091          * OTOH it's obviously correct and should make the page up-to-date.
2092          */
2093         err = mapping->a_ops->readpage(NULL, page);
2094         wait_on_page(page);
2095         page_cache_release(page);
2096         if (err < 0)
2097                 goto fail;
2098         mark_inode_dirty(inode);
2099         return 0;
2100 fail_map:
2101         UnlockPage(page);
2102         page_cache_release(page);
2103 fail:
2104         return err;
2105 }
2106
2107 /*
2108  * Try to increase the number of buffers available: the size argument
2109  * is used to determine what kind of buffers we want.
2110  */
2111 static int grow_buffers(int size)
2112 {
2113         struct page * page;
2114         struct buffer_head *bh, *tmp;
2115         struct buffer_head * insert_point;
2116         int isize;
2117
2118         if ((size & 511) || (size > PAGE_SIZE)) {
2119                 printk("VFS: grow_buffers: size = %d\n",size);
2120                 return 0;
2121         }
2122
2123         page = alloc_page(GFP_BUFFER);
2124         if (!page)
2125                 goto out;
2126         bh = create_buffers(page, size, 0);
2127         if (!bh)
2128                 goto no_buffer_head;
2129
2130         isize = BUFSIZE_INDEX(size);
2131
2132         spin_lock(&free_list[isize].lock);
2133         insert_point = free_list[isize].list;
2134         tmp = bh;
2135         while (1) {
2136                 if (insert_point) {
2137                         tmp->b_next_free = insert_point->b_next_free;
2138                         tmp->b_prev_free = insert_point;
2139                         insert_point->b_next_free->b_prev_free = tmp;
2140                         insert_point->b_next_free = tmp;
2141                 } else {
2142                         tmp->b_prev_free = tmp;
2143                         tmp->b_next_free = tmp;
2144                 }
2145                 insert_point = tmp;
2146                 if (tmp->b_this_page)
2147                         tmp = tmp->b_this_page;
2148                 else
2149                         break;
2150         }
2151         tmp->b_this_page = bh;
2152         free_list[isize].list = bh;
2153         spin_unlock(&free_list[isize].lock);
2154
2155         page->buffers = bh;
2156         page->flags &= ~(1 << PG_referenced);
2157         lru_cache_add(page);
2158         atomic_inc(&buffermem_pages);
2159         return 1;
2160
2161 no_buffer_head:
2162         page_cache_release(page);
2163 out:
2164         return 0;
2165 }
2166
2167 /*
2168  * Sync all the buffers on one page..
2169  *
2170  * If we have old buffers that are locked, we'll
2171  * wait on them, but we won't wait on the new ones
2172  * we're writing out now.
2173  *
2174  * This all is required so that we can free up memory
2175  * later.
2176  *
2177  * Wait:
2178  *      0 - no wait (this does not get called - see try_to_free_buffers below)
2179  *      1 - start IO for dirty buffers
2180  *      2 - wait for completion of locked buffers
2181  */
2182 static void sync_page_buffers(struct buffer_head *bh, int wait)
2183 {
2184         struct buffer_head * tmp = bh;
2185
2186         do {
2187                 struct buffer_head *p = tmp;
2188                 tmp = tmp->b_this_page;
2189                 if (buffer_locked(p)) {
2190                         if (wait > 1)
2191                                 __wait_on_buffer(p);
2192                 } else if (buffer_dirty(p))
2193                         ll_rw_block(WRITE, 1, &p);
2194         } while (tmp != bh);
2195 }
2196
2197 /*
2198  * Can the buffer be thrown out?
2199  */
2200 #define BUFFER_BUSY_BITS        ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
2201 #define buffer_busy(bh)         (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2202
2203 /*
2204  * try_to_free_buffers() checks if all the buffers on this particular page
2205  * are unused, and free's the page if so.
2206  *
2207  * Wake up bdflush() if this fails - if we're running low on memory due
2208  * to dirty buffers, we need to flush them out as quickly as possible.
2209  *
2210  * NOTE: There are quite a number of ways that threads of control can
2211  *       obtain a reference to a buffer head within a page.  So we must
2212  *       lock out all of these paths to cleanly toss the page.
2213  */
2214 int try_to_free_buffers(struct page * page, int wait)
2215 {
2216         struct buffer_head * tmp, * bh = page->buffers;
2217         int index = BUFSIZE_INDEX(bh->b_size);
2218
2219         spin_lock(&lru_list_lock);
2220         write_lock(&hash_table_lock);
2221         spin_lock(&free_list[index].lock);
2222         tmp = bh;
2223         do {
2224                 struct buffer_head *p = tmp;
2225
2226                 tmp = tmp->b_this_page;
2227                 if (buffer_busy(p))
2228                         goto busy_buffer_page;
2229         } while (tmp != bh);
2230
2231         spin_lock(&unused_list_lock);
2232         tmp = bh;
2233         do {
2234                 struct buffer_head * p = tmp;
2235                 tmp = tmp->b_this_page;
2236
2237                 /* The buffer can be either on the regular
2238                  * queues or on the free list..
2239                  */
2240                 if (p->b_dev != B_FREE)
2241                         __remove_from_queues(p);
2242                 else
2243                         __remove_from_free_list(p, index);
2244                 __put_unused_buffer_head(p);
2245         } while (tmp != bh);
2246         spin_unlock(&unused_list_lock);
2247
2248         /* Wake up anyone waiting for buffer heads */
2249         wake_up(&buffer_wait);
2250
2251         /* And free the page */
2252         page->buffers = NULL;
2253         page_cache_release(page);
2254         spin_unlock(&free_list[index].lock);
2255         write_unlock(&hash_table_lock);
2256         spin_unlock(&lru_list_lock);
2257         return 1;
2258
2259 busy_buffer_page:
2260         /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2261         spin_unlock(&free_list[index].lock);
2262         write_unlock(&hash_table_lock);
2263         spin_unlock(&lru_list_lock);
2264         if (wait)
2265                 sync_page_buffers(bh, wait);
2266         return 0;
2267 }
2268
2269 /* ================== Debugging =================== */
2270
2271 void show_buffers(void)
2272 {
2273 #ifdef CONFIG_SMP
2274         struct buffer_head * bh;
2275         int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2276         int protected = 0;
2277         int nlist;
2278         static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
2279 #endif
2280
2281         printk("Buffer memory:   %6dkB\n",
2282                         atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2283
2284 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2285         if (!spin_trylock(&lru_list_lock))
2286                 return;
2287         for(nlist = 0; nlist < NR_LIST; nlist++) {
2288                 found = locked = dirty = used = lastused = protected = 0;
2289                 bh = lru_list[nlist];
2290                 if(!bh) continue;
2291
2292                 do {
2293                         found++;
2294                         if (buffer_locked(bh))
2295                                 locked++;
2296                         if (buffer_protected(bh))
2297                                 protected++;
2298                         if (buffer_dirty(bh))
2299                                 dirty++;
2300                         if (atomic_read(&bh->b_count))
2301                                 used++, lastused = found;
2302                         bh = bh->b_next_free;
2303                 } while (bh != lru_list[nlist]);
2304                 {
2305                         int tmp = nr_buffers_type[nlist];
2306                         if (found != tmp)
2307                                 printk("%9s: BUG -> found %d, reported %d\n",
2308                                        buf_types[nlist], found, tmp);
2309                 }
2310                 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2311                        "%d locked, %d protected, %d dirty\n",
2312                        buf_types[nlist], found, size_buffers_type[nlist]>>10,
2313                        used, lastused, locked, protected, dirty);
2314         }
2315         spin_unlock(&lru_list_lock);
2316 #endif
2317 }
2318
2319 /* ===================== Init ======================= */
2320
2321 /*
2322  * allocate the hash table and init the free list
2323  * Use gfp() for the hash table to decrease TLB misses, use
2324  * SLAB cache for buffer heads.
2325  */
2326 void __init buffer_init(unsigned long mempages)
2327 {
2328         int order, i;
2329         unsigned int nr_hash;
2330
2331         /* The buffer cache hash table is less important these days,
2332          * trim it a bit.
2333          */
2334         mempages >>= 14;
2335
2336         mempages *= sizeof(struct buffer_head *);
2337
2338         for (order = 0; (1 << order) < mempages; order++)
2339                 ;
2340
2341         /* try to allocate something until we get it or we're asking
2342            for something that is really too small */
2343
2344         do {
2345                 unsigned long tmp;
2346
2347                 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2348                 bh_hash_mask = (nr_hash - 1);
2349
2350                 tmp = nr_hash;
2351                 bh_hash_shift = 0;
2352                 while((tmp >>= 1UL) != 0UL)
2353                         bh_hash_shift++;
2354
2355                 hash_table = (struct buffer_head **)
2356                     __get_free_pages(GFP_ATOMIC, order);
2357         } while (hash_table == NULL && --order > 0);
2358         printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2359                nr_hash, order, (PAGE_SIZE << order));
2360
2361         if (!hash_table)
2362                 panic("Failed to allocate buffer hash table\n");
2363
2364         /* Setup hash chains. */
2365         for(i = 0; i < nr_hash; i++)
2366                 hash_table[i] = NULL;
2367
2368         /* Setup free lists. */
2369         for(i = 0; i < NR_SIZES; i++) {
2370                 free_list[i].list = NULL;
2371                 free_list[i].lock = SPIN_LOCK_UNLOCKED;
2372         }
2373
2374         /* Setup lru lists. */
2375         for(i = 0; i < NR_LIST; i++)
2376                 lru_list[i] = NULL;
2377
2378 }
2379
2380
2381 /* ====================== bdflush support =================== */
2382
2383 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2384  * response to dirty buffers.  Once this process is activated, we write back
2385  * a limited number of buffers to the disks and then go back to sleep again.
2386  */
2387 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
2388 struct task_struct *bdflush_tsk = 0;
2389
2390 void wakeup_bdflush(int block)
2391 {
2392         DECLARE_WAITQUEUE(wait, current);
2393
2394         if (current == bdflush_tsk)
2395                 return;
2396
2397         if (!block) {
2398                 wake_up_process(bdflush_tsk);
2399                 return;
2400         }
2401
2402         /* kflushd can wakeup us before we have a chance to
2403            go to sleep so we must be smart in handling
2404            this wakeup event from kflushd to avoid deadlocking in SMP
2405            (we are not holding any lock anymore in these two paths). */
2406         __set_current_state(TASK_UNINTERRUPTIBLE);
2407         add_wait_queue(&bdflush_done, &wait);
2408
2409         wake_up_process(bdflush_tsk);
2410         schedule();
2411
2412         remove_wait_queue(&bdflush_done, &wait);
2413         __set_current_state(TASK_RUNNING);
2414 }
2415
2416 /* This is the _only_ function that deals with flushing async writes
2417    to disk.
2418    NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
2419    as all dirty buffers lives _only_ in the DIRTY lru list.
2420    As we never browse the LOCKED and CLEAN lru lists they are infact
2421    completly useless. */
2422 static int flush_dirty_buffers(int check_flushtime)
2423 {
2424         struct buffer_head * bh, *next;
2425         int flushed = 0, i;
2426
2427  restart:
2428         spin_lock(&lru_list_lock);
2429         bh = lru_list[BUF_DIRTY];
2430         if (!bh)
2431                 goto out_unlock;
2432         for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next) {
2433                 next = bh->b_next_free;
2434
2435                 if (!buffer_dirty(bh)) {
2436                         __refile_buffer(bh);
2437                         continue;
2438                 }
2439                 if (buffer_locked(bh))
2440                         continue;
2441
2442                 if (check_flushtime) {
2443                         /* The dirty lru list is chronologically ordered so
2444                            if the current bh is not yet timed out,
2445                            then also all the following bhs
2446                            will be too young. */
2447                         if (time_before(jiffies, bh->b_flushtime))
2448                                 goto out_unlock;
2449                 } else {
2450                         if (++flushed > bdf_prm.b_un.ndirty)
2451                                 goto out_unlock;
2452                 }
2453
2454                 /* OK, now we are committed to write it out. */
2455                 atomic_inc(&bh->b_count);
2456                 spin_unlock(&lru_list_lock);
2457                 ll_rw_block(WRITE, 1, &bh);
2458                 atomic_dec(&bh->b_count);
2459
2460                 if (current->need_resched)
2461                         schedule();
2462                 goto restart;
2463         }
2464  out_unlock:
2465         spin_unlock(&lru_list_lock);
2466
2467         return flushed;
2468 }
2469
2470 /*
2471  * Here we attempt to write back old buffers.  We also try to flush inodes
2472  * and supers as well, since this function is essentially "update", and
2473  * otherwise there would be no way of ensuring that these quantities ever
2474  * get written back.  Ideally, we would have a timestamp on the inodes
2475  * and superblocks so that we could write back only the old ones as well
2476  */
2477
2478 static int sync_old_buffers(void)
2479 {
2480         lock_kernel();
2481         sync_supers(0);
2482         sync_inodes(0);
2483         unlock_kernel();
2484
2485         flush_dirty_buffers(1);
2486         /* must really sync all the active I/O request to disk here */
2487         run_task_queue(&tq_disk);
2488         return 0;
2489 }
2490
2491 int block_sync_page(struct page *page)
2492 {
2493         run_task_queue(&tq_disk);
2494         return 0;
2495 }
2496
2497 /* This is the interface to bdflush.  As we get more sophisticated, we can
2498  * pass tuning parameters to this "process", to adjust how it behaves.
2499  * We would want to verify each parameter, however, to make sure that it
2500  * is reasonable. */
2501
2502 asmlinkage long sys_bdflush(int func, long data)
2503 {
2504         if (!capable(CAP_SYS_ADMIN))
2505                 return -EPERM;
2506
2507         if (func == 1) {
2508                 /* do_exit directly and let kupdate to do its work alone. */
2509                 do_exit(0);
2510 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2511          a syscall that doesn't care about the current mm context. */
2512                 int error;
2513                 struct mm_struct *user_mm;
2514
2515                 /*
2516                  * bdflush will spend all of it's time in kernel-space,
2517                  * without touching user-space, so we can switch it into
2518                  * 'lazy TLB mode' to reduce the cost of context-switches
2519                  * to and from bdflush.
2520                  */
2521                 user_mm = start_lazy_tlb();
2522                 error = sync_old_buffers();
2523                 end_lazy_tlb(user_mm);
2524                 return error;
2525 #endif
2526         }
2527
2528         /* Basically func 1 means read param 1, 2 means write param 1, etc */
2529         if (func >= 2) {
2530                 int i = (func-2) >> 1;
2531                 if (i >= 0 && i < N_PARAM) {
2532                         if ((func & 1) == 0)
2533                                 return put_user(bdf_prm.data[i], (int*)data);
2534
2535                         if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
2536                                 bdf_prm.data[i] = data;
2537                                 return 0;
2538                         }
2539                 }
2540                 return -EINVAL;
2541         }
2542
2543         /* Having func 0 used to launch the actual bdflush and then never
2544          * return (unless explicitly killed). We return zero here to
2545          * remain semi-compatible with present update(8) programs.
2546          */
2547         return 0;
2548 }
2549
2550 /*
2551  * This is the actual bdflush daemon itself. It used to be started from
2552  * the syscall above, but now we launch it ourselves internally with
2553  * kernel_thread(...)  directly after the first thread in init/main.c
2554  */
2555 int bdflush(void *sem)
2556 {
2557         struct task_struct *tsk = current;
2558         int flushed;
2559         /*
2560          *      We have a bare-bones task_struct, and really should fill
2561          *      in a few more things so "top" and /proc/2/{exe,root,cwd}
2562          *      display semi-sane things. Not real crucial though...
2563          */
2564
2565         tsk->session = 1;
2566         tsk->pgrp = 1;
2567         strcpy(tsk->comm, "kflushd");
2568         bdflush_tsk = tsk;
2569
2570         /* avoid getting signals */
2571         spin_lock_irq(&tsk->sigmask_lock);
2572         flush_signals(tsk);
2573         sigfillset(&tsk->blocked);
2574         recalc_sigpending(tsk);
2575         spin_unlock_irq(&tsk->sigmask_lock);
2576
2577         up((struct semaphore *)sem);
2578
2579         for (;;) {
2580                 CHECK_EMERGENCY_SYNC
2581
2582                 flushed = flush_dirty_buffers(0);
2583
2584                 /* If wakeup_bdflush will wakeup us
2585                    after our bdflush_done wakeup, then
2586                    we must make sure to not sleep
2587                    in schedule_timeout otherwise
2588                    wakeup_bdflush may wait for our
2589                    bdflush_done wakeup that would never arrive
2590                    (as we would be sleeping) and so it would
2591                    deadlock in SMP. */
2592                 __set_current_state(TASK_INTERRUPTIBLE);
2593                 wake_up(&bdflush_done);
2594                 /*
2595                  * If there are still a lot of dirty buffers around,
2596                  * skip the sleep and flush some more. Otherwise, we
2597                  * go to sleep waiting a wakeup.
2598                  */
2599                 if (!flushed || balance_dirty_state(NODEV) < 0)
2600                         schedule();
2601                 /* Remember to mark us as running otherwise
2602                    the next schedule will block. */
2603                 __set_current_state(TASK_RUNNING);
2604         }
2605 }
2606
2607 /*
2608  * This is the kernel update daemon. It was used to live in userspace
2609  * but since it's need to run safely we want it unkillable by mistake.
2610  * You don't need to change your userspace configuration since
2611  * the userspace `update` will do_exit(0) at the first sys_bdflush().
2612  */
2613 int kupdate(void *sem)
2614 {
2615         struct task_struct * tsk = current;
2616         int interval;
2617
2618         tsk->session = 1;
2619         tsk->pgrp = 1;
2620         strcpy(tsk->comm, "kupdate");
2621
2622         /* sigstop and sigcont will stop and wakeup kupdate */
2623         spin_lock_irq(&tsk->sigmask_lock);
2624         sigfillset(&tsk->blocked);
2625         siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
2626         recalc_sigpending(tsk);
2627         spin_unlock_irq(&tsk->sigmask_lock);
2628
2629         up((struct semaphore *)sem);
2630
2631         for (;;) {
2632                 /* update interval */
2633                 interval = bdf_prm.b_un.interval;
2634                 if (interval) {
2635                         tsk->state = TASK_INTERRUPTIBLE;
2636                         schedule_timeout(interval);
2637                 } else {
2638                 stop_kupdate:
2639                         tsk->state = TASK_STOPPED;
2640                         schedule(); /* wait for SIGCONT */
2641                 }
2642                 /* check for sigstop */
2643                 if (signal_pending(tsk)) {
2644                         int stopped = 0;
2645                         spin_lock_irq(&tsk->sigmask_lock);
2646                         if (sigismember(&tsk->pending.signal, SIGSTOP)) {
2647                                 sigdelset(&tsk->pending.signal, SIGSTOP);
2648                                 stopped = 1;
2649                         }
2650                         recalc_sigpending(tsk);
2651                         spin_unlock_irq(&tsk->sigmask_lock);
2652                         if (stopped)
2653                                 goto stop_kupdate;
2654                 }
2655 #ifdef DEBUG
2656                 printk("kupdate() activated...\n");
2657 #endif
2658                 sync_old_buffers();
2659         }
2660 }
2661
2662 static int __init bdflush_init(void)
2663 {
2664         DECLARE_MUTEX_LOCKED(sem);
2665         kernel_thread(bdflush, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
2666         down(&sem);
2667         kernel_thread(kupdate, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
2668         down(&sem);
2669         return 0;
2670 }
2671
2672 module_init(bdflush_init)
2673