1 /*
2 * linux/mm/filemap.c
4 * Copyright (C) 1994-1999 Linus Torvalds
5 */
7 /*
8 * This file handles the generic file mmap semantics used by
9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem used to do this differently, for example)
12 #include <linux/malloc.h>
13 #include <linux/shm.h>
14 #include <linux/mman.h>
15 #include <linux/locks.h>
16 #include <linux/pagemap.h>
17 #include <linux/swap.h>
18 #include <linux/smp_lock.h>
19 #include <linux/blkdev.h>
20 #include <linux/file.h>
21 #include <linux/swapctl.h>
22 #include <linux/slab.h>
23 #include <linux/init.h>
24 #include <linux/highmem.h>
26 #include <asm/pgtable.h>
27 #include <asm/uaccess.h>
30 * Shared mappings implemented 30.11.1994. It's not fully working yet,
31 * though.
33 * Shared mappings now work. 15.8.1995 Bruno.
35 * finished 'unifying' the page and buffer cache and SMP-threaded the
36 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
38 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
41 atomic_t page_cache_size = ATOMIC_INIT(0);
42 unsigned int page_hash_bits;
43 struct page **page_hash_table;
45 spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
47 * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
48 * the pagemap_lru_lock held.
50 spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;
52 #define CLUSTER_PAGES (1 << page_cluster)
53 #define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster)
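/*
 * Illustrative example (assuming page_cluster == 4, which matches the 64k
 * clusters mentioned in read_cluster_nonblocking() below, and 4K pages):
 * CLUSTER_PAGES is then 16 and CLUSTER_OFFSET(offset) rounds a page index
 * down to the start of its cluster, e.g. CLUSTER_OFFSET(37) == 32.
 */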
55 void __add_page_to_hash_queue(struct page * page, struct page **p)
57 atomic_inc(&page_cache_size);
58 if((page->next_hash = *p) != NULL)
59 (*p)->pprev_hash = &page->next_hash;
60 *p = page;
61 page->pprev_hash = p;
62 if (page->buffers)
63 PAGE_BUG(page);
66 static void remove_page_from_hash_queue(struct page * page)
68 if(page->pprev_hash) {
69 if(page->next_hash)
70 page->next_hash->pprev_hash = page->pprev_hash;
71 *page->pprev_hash = page->next_hash;
72 page->pprev_hash = NULL;
74 atomic_dec(&page_cache_size);
78 * Remove a page from the page cache and free it. Caller has to make
79 * sure the page is locked and that nobody else uses it - or that usage
80 * is safe.
82 void remove_inode_page(struct page *page)
84 if (!PageLocked(page))
85 PAGE_BUG(page);
87 spin_lock(&pagecache_lock);
88 remove_page_from_inode_queue(page);
89 remove_page_from_hash_queue(page);
90 page->mapping = NULL;
91 spin_unlock(&pagecache_lock);
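/*
 * Invalidate the pages of an inode: walk inode->i_data.pages, skip any page
 * that is currently locked, and drop the rest from the LRU, from the inode
 * queue and from the hash, releasing the page cache reference.
 */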
94 void invalidate_inode_pages(struct inode * inode)
96 struct list_head *head, *curr;
97 struct page * page;
99 head = &inode->i_data.pages;
100 spin_lock(&pagecache_lock);
101 curr = head->next;
103 while (curr != head) {
104 page = list_entry(curr, struct page, list);
105 curr = curr->next;
107 /* We cannot invalidate a locked page */
108 if (PageLocked(page))
109 continue;
111 lru_cache_del(page);
113 remove_page_from_inode_queue(page);
114 remove_page_from_hash_queue(page);
115 page->mapping = NULL;
116 page_cache_release(page);
118 spin_unlock(&pagecache_lock);
122 * Truncate the page cache at a set offset, removing the pages
123 * that are beyond that offset (and zeroing out partial pages).
125 void truncate_inode_pages(struct inode * inode, unsigned long start)
127 struct list_head *head, *curr;
128 struct page * page;
129 unsigned partial = start & (PAGE_CACHE_SIZE - 1);
131 start = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
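/*
 * At this point 'start' has been converted from a byte offset into the
 * index of the first wholly truncated page, while 'partial' holds the byte
 * offset within the last page that is kept (zero if the truncation point
 * is page aligned, i.e. there is no partial page to clear).
 */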
133 repeat:
134 head = &inode->i_data.pages;
135 spin_lock(&pagecache_lock);
136 curr = head->next;
137 while (curr != head) {
138 unsigned long offset;
140 page = list_entry(curr, struct page, list);
141 curr = curr->next;
143 offset = page->pg_offset;
145 /* page wholly truncated - free it */
146 if (offset >= start) {
147 get_page(page);
148 spin_unlock(&pagecache_lock);
150 lock_page(page);
152 if (!inode->i_op->flushpage ||
153 inode->i_op->flushpage(inode, page, 0))
154 lru_cache_del(page);
157 * We remove the page from the page cache
158 * _after_ we have destroyed all buffer-cache
159 * references to it. Otherwise some other process
160 * might think this inode page is not in the
161 * page cache and create a buffer-cache alias
162 * to it causing all sorts of fun problems ...
164 remove_inode_page(page);
166 UnlockPage(page);
167 page_cache_release(page);
168 page_cache_release(page);
171 * We have done things without the pagecache lock,
172 * so we'll have to repeat the scan.
173 * It's not possible to deadlock here because
174 * we are guaranteed to make progress. (ie. we have
175 * just removed a page)
177 goto repeat;
180 * there is only one partial page possible.
182 if (!partial)
183 continue;
185 /* and it's the one preceding the first wholly truncated page */
186 if ((offset + 1) != start)
187 continue;
189 /* partial truncate, clear end of page */
190 get_page(page);
191 spin_unlock(&pagecache_lock);
193 lock_page(page);
195 memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
196 if (inode->i_op->flushpage)
197 inode->i_op->flushpage(inode, page, partial);
199 partial = 0;
202 * we have dropped the spinlock so we have to
203 * restart.
205 UnlockPage(page);
206 page_cache_release(page);
207 goto repeat;
209 spin_unlock(&pagecache_lock);
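/*
 * shrink_mmap() is the page reclaim pass over the LRU list: starting from
 * the oldest end it skips recently referenced pages and pages that do not
 * match the gfp_mask zone constraints, then tries to free buffer heads,
 * swap-cache entries and unused page-cache pages.  'priority' bounds how
 * much of the LRU is scanned in one call; the return value is 1 as soon as
 * some progress has been made, 0 otherwise.
 */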
212 int shrink_mmap(int priority, int gfp_mask)
214 int ret = 0, count;
215 LIST_HEAD(young);
216 LIST_HEAD(old);
217 LIST_HEAD(forget);
218 struct list_head * page_lru, * dispose;
219 struct page * page;
221 count = nr_lru_pages / (priority+1);
223 spin_lock(&pagemap_lru_lock);
225 while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
226 page = list_entry(page_lru, struct page, lru);
227 list_del(page_lru);
229 dispose = &lru_cache;
230 if (test_and_clear_bit(PG_referenced, &page->flags))
231 /* Roll the page to the top of the lru list;
232 * we could also be more aggressive and put
233 * the page in the young-dispose-list, so
234 * that we avoid freeing young pages in each pass.
236 goto dispose_continue;
238 dispose = &old;
239 /* don't count passes over non-DMA pages */
240 if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
241 goto dispose_continue;
242 if (!(gfp_mask & __GFP_HIGHMEM) && PageHighMem(page))
243 goto dispose_continue;
245 count--;
247 dispose = &young;
248 if (TryLockPage(page))
249 goto dispose_continue;
251 /* Release the pagemap_lru lock even if the page is not yet
252 queued in any lru queue, since we have just locked down
253 the page so nobody else may SMP race with us running
254 a lru_cache_del() (lru_cache_del() always runs with the
255 page locked down ;). */
256 spin_unlock(&pagemap_lru_lock);
258 /* avoid unscalable SMP locking */
259 if (!page->buffers && page_count(page) > 1)
260 goto unlock_noput_continue;
262 /* Hold the pagecache_lock spinlock to prevent
263 other tasks from noticing the page while we are looking at its
264 page count. If it's a pagecache page we'll free it
265 in one atomic transaction after checking its page count. */
266 spin_lock(&pagecache_lock);
268 /* avoid freeing the page while it's locked */
269 get_page(page);
271 /* Is it a buffer page? */
272 if (page->buffers) {
273 spin_unlock(&pagecache_lock);
274 if (!try_to_free_buffers(page))
275 goto unlock_continue;
276 /* page was locked, inode can't go away under us */
277 if (!page->mapping) {
278 atomic_dec(&buffermem_pages);
279 goto made_buffer_progress;
281 spin_lock(&pagecache_lock);
285 * We can't free pages unless there's just one user
286 * (count == 2 because we added one ourselves above).
288 if (page_count(page) != 2)
289 goto cache_unlock_continue;
292 * Is it a swap-cache page? If so, we want to
293 * drop it if it is no longer used, even if it
294 * were to be marked referenced..
296 if (PageSwapCache(page)) {
297 spin_unlock(&pagecache_lock);
298 __delete_from_swap_cache(page);
299 goto made_inode_progress;
302 /* is it a page-cache page? */
303 if (page->mapping) {
304 dispose = &old;
305 if (!pgcache_under_min())
307 remove_page_from_inode_queue(page);
308 remove_page_from_hash_queue(page);
309 page->mapping = NULL;
310 spin_unlock(&pagecache_lock);
311 goto made_inode_progress;
313 goto cache_unlock_continue;
316 dispose = &forget;
317 printk(KERN_ERR "shrink_mmap: unknown LRU page!\n");
319 cache_unlock_continue:
320 spin_unlock(&pagecache_lock);
321 unlock_continue:
322 UnlockPage(page);
323 put_page(page);
324 dispose_relock_continue:
325 /* even if the dispose list is local, a truncate_inode_pages()
326 may remove a page from its queue, so always
327 synchronize with the lru lock while accessing the
328 page->lru field */
329 spin_lock(&pagemap_lru_lock);
330 list_add(page_lru, dispose);
331 continue;
333 unlock_noput_continue:
334 UnlockPage(page);
335 goto dispose_relock_continue;
337 dispose_continue:
338 list_add(page_lru, dispose);
340 goto out;
342 made_inode_progress:
343 page_cache_release(page);
344 made_buffer_progress:
345 UnlockPage(page);
346 put_page(page);
347 ret = 1;
348 spin_lock(&pagemap_lru_lock);
349 /* nr_lru_pages needs the spinlock */
350 nr_lru_pages--;
352 out:
353 list_splice(&young, &lru_cache);
354 list_splice(&old, lru_cache.prev);
356 spin_unlock(&pagemap_lru_lock);
358 return ret;
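/*
 * Walk one page-hash chain looking for the page with the given mapping and
 * offset; 'page' is the head of the chain (*hash) and the pagecache_lock
 * must be held.  A hit is marked PG_referenced so the LRU scan keeps it
 * around a little longer.  Returns the page, or NULL if it is not hashed.
 */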
361 static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
363 goto inside;
365 for (;;) {
366 page = page->next_hash;
367 inside:
368 if (!page)
369 goto not_found;
370 if (page->mapping != mapping)
371 continue;
372 if (page->pg_offset == offset)
373 break;
375 set_bit(PG_referenced, &page->flags);
376 not_found:
377 return page;
381 * By the time this is called, the page is locked and
382 * we don't have to worry about any races any more.
384 * Start the IO..
386 static int writeout_one_page(struct page *page)
388 struct buffer_head *bh, *head = page->buffers;
390 bh = head;
391 do {
392 if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
393 continue;
395 bh->b_flushtime = 0;
396 ll_rw_block(WRITE, 1, &bh);
397 } while ((bh = bh->b_this_page) != head);
398 return 0;
401 static int waitfor_one_page(struct page *page)
403 int error = 0;
404 struct buffer_head *bh, *head = page->buffers;
406 bh = head;
407 do {
408 wait_on_buffer(bh);
409 if (buffer_req(bh) && !buffer_uptodate(bh))
410 error = -EIO;
411 } while ((bh = bh->b_this_page) != head);
412 return error;
415 static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end, int (*fn)(struct page *))
417 struct list_head *head, *curr;
418 struct page *page;
419 int retval = 0;
421 head = &inode->i_data.pages;
423 spin_lock(&pagecache_lock);
424 curr = head->next;
425 while (curr != head) {
426 page = list_entry(curr, struct page, list);
427 curr = curr->next;
428 if (!page->buffers)
429 continue;
430 if (page->pg_offset >= end)
431 continue;
432 if (page->pg_offset < start)
433 continue;
435 get_page(page);
436 spin_unlock(&pagecache_lock);
437 lock_page(page);
439 /* The buffers could have been freed while we waited for the page lock */
440 if (page->buffers)
441 retval |= fn(page);
443 UnlockPage(page);
444 spin_lock(&pagecache_lock);
445 curr = page->list.next;
446 page_cache_release(page);
448 spin_unlock(&pagecache_lock);
450 return retval;
454 * Two-stage data sync: first start the IO, then go back and
455 * collect the information..
457 int generic_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end)
459 unsigned long start_idx = start >> PAGE_CACHE_SHIFT;
460 unsigned long end_idx = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
461 int retval;
463 retval = do_buffer_fdatasync(inode, start_idx, end_idx, writeout_one_page);
464 retval |= do_buffer_fdatasync(inode, start_idx, end_idx, waitfor_one_page);
465 return retval;
469 * This adds a page to the page cache, starting out as locked,
470 * owned by us, referenced, but not uptodate and with no errors.
472 static inline void __add_to_page_cache(struct page * page,
473 struct address_space *mapping, unsigned long offset,
474 struct page **hash)
476 struct page *alias;
477 unsigned long flags;
479 flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced));
480 page->flags = flags | (1 << PG_locked);
481 get_page(page);
482 page->pg_offset = offset;
483 add_page_to_inode_queue(mapping, page);
484 __add_page_to_hash_queue(page, hash);
485 lru_cache_add(page);
486 alias = __find_page_nolock(mapping, offset, *hash);
487 if (alias != page)
488 BUG();
491 void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
493 spin_lock(&pagecache_lock);
494 __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
495 spin_unlock(&pagecache_lock);
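/*
 * Like add_to_page_cache(), but only inserts the page if no page is already
 * hashed at that offset.  Returns 0 if the page was added and 1 if another
 * page was already present, in which case the caller keeps ownership of
 * 'page' and has to free it itself (see page_cache_read() below).
 */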
498 int add_to_page_cache_unique(struct page * page,
499 struct address_space *mapping, unsigned long offset,
500 struct page **hash)
502 int err;
503 struct page *alias;
505 spin_lock(&pagecache_lock);
506 alias = __find_page_nolock(mapping, offset, *hash);
508 err = 1;
509 if (!alias) {
510 __add_to_page_cache(page,mapping,offset,hash);
511 err = 0;
514 spin_unlock(&pagecache_lock);
515 return err;
519 * This adds the requested page to the page cache if it isn't already there,
520 * and schedules an I/O to read in its contents from disk.
522 static inline void page_cache_read(struct file * file, unsigned long offset)
524 struct inode *inode = file->f_dentry->d_inode;
525 struct page **hash = page_hash(&inode->i_data, offset);
526 struct page *page;
528 spin_lock(&pagecache_lock);
529 page = __find_page_nolock(&inode->i_data, offset, *hash);
530 spin_unlock(&pagecache_lock);
531 if (page)
532 return;
534 page = page_cache_alloc();
535 if (!page)
536 return;
538 if (!add_to_page_cache_unique(page, &inode->i_data, offset, hash)) {
539 inode->i_op->readpage(file, page);
540 page_cache_release(page);
541 return;
544 * We arrive here in the unlikely event that someone
545 * raced with us and added our page to the cache first.
547 page_cache_free(page);
548 return;
552 * Read in an entire cluster at once. A cluster is usually a 64k-
553 * aligned block that includes the address requested in "offset."
555 static void read_cluster_nonblocking(struct file * file, unsigned long offset)
557 unsigned long filesize = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
558 unsigned long pages = CLUSTER_PAGES;
560 offset = CLUSTER_OFFSET(offset);
561 while ((pages-- > 0) && (offset < filesize)) {
562 page_cache_read(file, offset);
563 offset ++;
566 return;
570 * Wait for a page to get unlocked.
572 * This must be called with the caller "holding" the page,
573 * ie with increased "page->count" so that the page won't
574 * go away during the wait..
576 void ___wait_on_page(struct page *page)
578 struct task_struct *tsk = current;
579 DECLARE_WAITQUEUE(wait, tsk);
581 add_wait_queue(&page->wait, &wait);
582 do {
583 run_task_queue(&tq_disk);
584 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
585 if (!PageLocked(page))
586 break;
587 schedule();
588 } while (PageLocked(page));
589 tsk->state = TASK_RUNNING;
590 remove_wait_queue(&page->wait, &wait);
594 * Get an exclusive lock on the page..
596 void lock_page(struct page *page)
598 while (TryLockPage(page))
599 ___wait_on_page(page);
604 * a rather lightweight function, finding and getting a reference to a
605 * hashed page atomically, waiting for it if it's locked.
607 struct page * __find_get_page (struct address_space *mapping,
608 unsigned long offset, struct page **hash)
610 struct page *page;
613 * We scan the hash list read-only. Addition to and removal from
614 * the hash-list need a held write-lock.
616 repeat:
617 spin_lock(&pagecache_lock);
618 page = __find_page_nolock(mapping, offset, *hash);
619 if (page)
620 get_page(page);
621 spin_unlock(&pagecache_lock);
623 /* Found the page, sleep if locked. */
624 if (page && PageLocked(page)) {
625 struct task_struct *tsk = current;
626 DECLARE_WAITQUEUE(wait, tsk);
628 run_task_queue(&tq_disk);
630 __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
631 add_wait_queue(&page->wait, &wait);
633 if (PageLocked(page))
634 schedule();
635 __set_task_state(tsk, TASK_RUNNING);
636 remove_wait_queue(&page->wait, &wait);
639 * The page might have been unhashed meanwhile. It's
640 * not freed though because we hold a reference to it.
641 * If this is the case then it will be freed _here_,
642 * and we recheck the hash anyway.
644 page_cache_release(page);
645 goto repeat;
648 * It's not locked so we can return the page and we hold
649 * a reference to it.
651 return page;
655 * Get the lock to a page atomically.
657 struct page * __find_lock_page (struct address_space *mapping,
658 unsigned long offset, struct page **hash)
660 struct page *page;
663 * We scan the hash list read-only. Addition to and removal from
664 * the hash-list need a held write-lock.
666 repeat:
667 spin_lock(&pagecache_lock);
668 page = __find_page_nolock(mapping, offset, *hash);
669 if (page)
670 get_page(page);
671 spin_unlock(&pagecache_lock);
673 /* Found the page, sleep if locked. */
674 if (page && TryLockPage(page)) {
675 struct task_struct *tsk = current;
676 DECLARE_WAITQUEUE(wait, tsk);
678 run_task_queue(&tq_disk);
680 __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
681 add_wait_queue(&page->wait, &wait);
683 if (PageLocked(page))
684 schedule();
685 __set_task_state(tsk, TASK_RUNNING);
686 remove_wait_queue(&page->wait, &wait);
689 * The page might have been unhashed meanwhile. It's
690 * not freed though because we hold a reference to it.
691 * If this is the case then it will be freed _here_,
692 * and we recheck the hash anyway.
694 page_cache_release(page);
695 goto repeat;
698 * It's not locked so we can return the page and we hold
699 * a reference to it.
701 return page;
704 #if 0
705 #define PROFILE_READAHEAD
706 #define DEBUG_READAHEAD
707 #endif
710 * Read-ahead profiling information
711 * --------------------------------
712 * Every PROFILE_MAXREADCOUNT reads, the following information is written
713 * to the syslog:
714 * Percentage of asynchronous read-ahead.
715 * Average value of the read-ahead context fields.
716 * If DEBUG_READAHEAD is defined, a snapshot of these fields is also written
717 * to the syslog.
720 #ifdef PROFILE_READAHEAD
722 #define PROFILE_MAXREADCOUNT 1000
724 static unsigned long total_reada;
725 static unsigned long total_async;
726 static unsigned long total_ramax;
727 static unsigned long total_ralen;
728 static unsigned long total_rawin;
730 static void profile_readahead(int async, struct file *filp)
732 unsigned long flags;
734 ++total_reada;
735 if (async)
736 ++total_async;
738 total_ramax += filp->f_ramax;
739 total_ralen += filp->f_ralen;
740 total_rawin += filp->f_rawin;
742 if (total_reada > PROFILE_MAXREADCOUNT) {
743 save_flags(flags);
744 cli();
745 if (!(total_reada > PROFILE_MAXREADCOUNT)) {
746 restore_flags(flags);
747 return;
750 printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
751 total_ramax/total_reada,
752 total_ralen/total_reada,
753 total_rawin/total_reada,
754 (total_async*100)/total_reada);
755 #ifdef DEBUG_READAHEAD
756 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
757 filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
758 #endif
760 total_reada = 0;
761 total_async = 0;
762 total_ramax = 0;
763 total_ralen = 0;
764 total_rawin = 0;
766 restore_flags(flags);
769 #endif /* defined PROFILE_READAHEAD */
772 * Read-ahead context:
773 * -------------------
774 * The read ahead context fields of the "struct file" are the following:
775 * - f_raend : position of the first byte after the last page we tried to
776 * read ahead.
777 * - f_ramax : current read-ahead maximum size.
778 * - f_ralen : length of the current IO read block we tried to read-ahead.
779 * - f_rawin : length of the current read-ahead window.
780 * if last read-ahead was synchronous then
781 * f_rawin = f_ralen
782 * otherwise (was asynchronous)
783 * f_rawin = previous value of f_ralen + f_ralen
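 *
 * A purely illustrative example of the definitions above: after a
 * synchronous read-ahead of 16k starting at file position 64k we have
 * f_ralen = 16k, f_rawin = 16k and f_raend = 80k.  If the next,
 * asynchronous read-ahead then covers a further 32k, f_ralen becomes 32k,
 * f_rawin becomes the previous f_ralen plus the new one (16k + 32k = 48k)
 * and f_raend moves on to 112k.
 *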
785 * Read-ahead limits:
786 * ------------------
787 * MIN_READAHEAD : minimum read-ahead size when reading ahead.
788 * MAX_READAHEAD : maximum read-ahead size when reading ahead.
790 * Synchronous read-ahead benefits:
791 * --------------------------------
792 * Using a reasonable IO transfer length from peripheral devices increases system
793 * performance.
794 * Reasonable means, in this context, neither too large nor too small.
795 * The actual maximum value is:
796 * MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
797 * and 32K if defined (4K page size assumed).
799 * Asynchronous read-ahead benefits:
800 * ---------------------------------
801 * Overlapping the next read request with user process execution increases system
802 * performance.
804 * Read-ahead risks:
805 * -----------------
806 * We have to guess which further data will be needed by the user process.
807 * If these data are often not really needed, it's bad for system
808 * performance.
809 * However, we know that files are often accessed sequentially by
810 * application programs, and it seems possible to have a reasonably good
811 * strategy for that guessing.
812 * We only try to read ahead in files that seem to be read sequentially.
814 * Asynchronous read-ahead risks:
815 * ------------------------------
816 * In order to maximize overlapping, we must start some asynchronous read
817 * request from the device, as soon as possible.
818 * We must be very careful about:
819 * - The number of effective pending IO read requests.
820 * ONE seems to be the only reasonable value.
821 * - The total memory pool usage for the file access stream.
822 * This maximum memory usage is implicitly 2 IO read chunks:
823 * 2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
824 * 64k if defined (4K page size assumed).
827 static inline int get_max_readahead(struct inode * inode)
829 if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
830 return MAX_READAHEAD;
831 return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
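/*
 * generic_file_readahead() decides, for the page at 'ppos', whether to read
 * ahead synchronously (the current page is still locked, i.e. its IO has not
 * completed yet), asynchronously (we are inside the previous read-ahead
 * window), or not at all.  It then issues page_cache_read() calls for the
 * pages beyond the current window, up to max_ahead, and updates the
 * f_ralen/f_rawin/f_raend/f_ramax context in the struct file.
 */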
834 static void generic_file_readahead(int reada_ok,
835 struct file * filp, struct inode * inode,
836 unsigned long ppos, struct page * page)
838 unsigned long max_ahead, ahead;
839 unsigned long raend;
840 int max_readahead = get_max_readahead(inode);
842 raend = filp->f_raend & PAGE_CACHE_MASK;
843 max_ahead = 0;
846 * The current page is locked.
847 * If the current position is inside the previous read IO request, do not
848 * try to reread previously read ahead pages.
849 * Otherwise decide whether or not to read ahead some pages synchronously.
850 * If we are not going to read ahead, set the read ahead context for this
851 * page only.
853 if (PageLocked(page)) {
854 if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
855 raend = ppos;
856 if (raend < inode->i_size)
857 max_ahead = filp->f_ramax;
858 filp->f_rawin = 0;
859 filp->f_ralen = PAGE_CACHE_SIZE;
860 if (!max_ahead) {
861 filp->f_raend = ppos + filp->f_ralen;
862 filp->f_rawin += filp->f_ralen;
867 * The current page is not locked.
868 * If we were reading ahead,
869 * if the current max read-ahead size is not zero, and
870 * if the current position is inside the last read-ahead IO request,
871 * then this is the moment to try to read ahead asynchronously.
872 * We will later force an unplug of the device in order to start the asynchronous read IO.
874 else if (reada_ok && filp->f_ramax && raend >= PAGE_CACHE_SIZE &&
875 ppos <= raend && ppos + filp->f_ralen >= raend) {
877 * Add ONE page to max_ahead in order to try to have about the same IO max size
878 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
879 * Compute the position of the last page we have tried to read in order to
880 * begin to read ahead just at the next page.
882 raend -= PAGE_CACHE_SIZE;
883 if (raend < inode->i_size)
884 max_ahead = filp->f_ramax + PAGE_CACHE_SIZE;
886 if (max_ahead) {
887 filp->f_rawin = filp->f_ralen;
888 filp->f_ralen = 0;
889 reada_ok = 2;
893 * Try to read ahead pages.
894 * We hope that ll_rw_blk() plug/unplug, coalescing, request sorting and the
895 * scheduler will work well enough to let us avoid issuing too many poor actual IO requests.
897 ahead = 0;
898 while (ahead < max_ahead) {
899 ahead += PAGE_CACHE_SIZE;
900 if ((raend + ahead) >= inode->i_size)
901 break;
902 page_cache_read(filp, (raend + ahead) >> PAGE_CACHE_SHIFT);
905 * If we tried to read ahead some pages:
906 * If we tried to read ahead asynchronously,
907 * try to force an unplug of the device in order to start the asynchronous
908 * read IO request.
909 * Update the read-ahead context.
910 * Store the length of the current read-ahead window.
911 * Double the current max read-ahead size.
912 * This heuristic avoids doing large IO for files that are not really
913 * accessed sequentially.
915 if (ahead) {
916 if (reada_ok == 2) {
917 run_task_queue(&tq_disk);
920 filp->f_ralen += ahead;
921 filp->f_rawin += filp->f_ralen;
922 filp->f_raend = raend + ahead + PAGE_CACHE_SIZE;
924 filp->f_ramax += filp->f_ramax;
926 if (filp->f_ramax > max_readahead)
927 filp->f_ramax = max_readahead;
929 #ifdef PROFILE_READAHEAD
930 profile_readahead((reada_ok == 2), filp);
931 #endif
934 return;
939 * This is a generic file read routine, and uses the
940 * inode->i_op->readpage() function for the actual low-level
941 * stuff.
943 * This is really ugly. But the goto's actually try to clarify some
944 * of the logic when it comes to error handling etc.
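 *
 * The read_descriptor_t/actor split lets the same loop serve both read(2),
 * where file_read_actor() copies each chunk to the user buffer, and
 * sendfile(2), where file_send_actor() writes each chunk to another file.
 * The actor returns how many bytes it actually consumed and records any
 * error in desc->error.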
946 void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
948 struct dentry *dentry = filp->f_dentry;
949 struct inode *inode = dentry->d_inode;
950 unsigned long pos, pgpos;
951 struct page *cached_page;
952 int reada_ok;
953 int error;
954 int max_readahead = get_max_readahead(inode);
955 unsigned long pgoff;
957 cached_page = NULL;
958 pos = *ppos;
959 pgpos = pos & PAGE_CACHE_MASK;
960 pgoff = pos >> PAGE_CACHE_SHIFT;
962 * If the current position is outside the previous read-ahead window,
963 * we reset the current read-ahead context and set read ahead max to zero
964 * (it will be set to just the needed value later),
965 * otherwise, we assume that the file accesses are sequential enough to
966 * continue read-ahead.
968 if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) {
969 reada_ok = 0;
970 filp->f_raend = 0;
971 filp->f_ralen = 0;
972 filp->f_ramax = 0;
973 filp->f_rawin = 0;
974 } else {
975 reada_ok = 1;
979 * Adjust the current value of read-ahead max.
980 * If the read operation stays within the first half page, force no readahead.
981 * Otherwise try to increase read-ahead max just enough to do the read request.
982 * Then, use at least MIN_READAHEAD if read-ahead is ok,
983 * and at most MAX_READAHEAD in all cases.
984 if (pos + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
985 filp->f_ramax = 0;
986 } else {
987 unsigned long needed;
989 needed = ((pos + desc->count) & PAGE_CACHE_MASK) - pgpos;
991 if (filp->f_ramax < needed)
992 filp->f_ramax = needed;
994 if (reada_ok && filp->f_ramax < MIN_READAHEAD)
995 filp->f_ramax = MIN_READAHEAD;
996 if (filp->f_ramax > max_readahead)
997 filp->f_ramax = max_readahead;
1000 for (;;) {
1001 struct page *page, **hash;
1003 if (pos >= inode->i_size)
1004 break;
1007 * Try to find the data in the page cache..
1009 hash = page_hash(&inode->i_data, pgoff);
1011 spin_lock(&pagecache_lock);
1012 page = __find_page_nolock(&inode->i_data, pgoff, *hash);
1013 if (!page)
1014 goto no_cached_page;
1015 found_page:
1016 get_page(page);
1017 spin_unlock(&pagecache_lock);
1019 if (!Page_Uptodate(page))
1020 goto page_not_up_to_date;
1021 page_ok:
1023 * Ok, we have the page, and it's up-to-date, so
1024 * now we can copy it to user space...
1027 unsigned long offset, nr;
1029 offset = pos & ~PAGE_CACHE_MASK;
1030 nr = PAGE_CACHE_SIZE - offset;
1031 if (nr > inode->i_size - pos)
1032 nr = inode->i_size - pos;
1035 * The actor routine returns how many bytes were actually used..
1036 * NOTE! This may not be the same as how much of a user buffer
1037 * we filled up (we may be padding etc), so we can only update
1038 * "pos" here (the actor routine has to update the user buffer
1039 * pointers and the remaining count).
1041 nr = actor(desc, page, offset, nr);
1042 pos += nr;
1043 pgoff = pos >> PAGE_CACHE_SHIFT;
1044 page_cache_release(page);
1045 if (nr && desc->count)
1046 continue;
1047 break;
1051 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
1053 page_not_up_to_date:
1054 generic_file_readahead(reada_ok, filp, inode,
1055 pos & PAGE_CACHE_MASK, page);
1057 if (Page_Uptodate(page))
1058 goto page_ok;
1060 /* Get exclusive access to the page ... */
1061 lock_page(page);
1062 if (Page_Uptodate(page)) {
1063 UnlockPage(page);
1064 goto page_ok;
1067 readpage:
1068 /* ... and start the actual read. The read will unlock the page. */
1069 error = inode->i_op->readpage(filp, page);
1071 if (!error) {
1072 if (Page_Uptodate(page))
1073 goto page_ok;
1075 /* Again, try some read-ahead while waiting for the page to finish.. */
1076 generic_file_readahead(reada_ok, filp, inode,
1077 pos & PAGE_CACHE_MASK, page);
1078 wait_on_page(page);
1079 if (Page_Uptodate(page))
1080 goto page_ok;
1081 error = -EIO;
1084 /* UHHUH! A synchronous read error occurred. Report it */
1085 desc->error = error;
1086 page_cache_release(page);
1087 break;
1089 no_cached_page:
1091 * Ok, it wasn't cached, so we need to create a new
1092 * page..
1094 * We get here with the page cache lock held.
1096 if (!cached_page) {
1097 spin_unlock(&pagecache_lock);
1098 cached_page = page_cache_alloc();
1099 if (!cached_page) {
1100 desc->error = -ENOMEM;
1101 break;
1105 * Somebody may have added the page while we
1106 * dropped the page cache lock. Check for that.
1108 spin_lock(&pagecache_lock);
1109 page = __find_page_nolock(&inode->i_data, pgoff, *hash);
1110 if (page)
1111 goto found_page;
1115 * Ok, add the new page to the hash-queues...
1117 page = cached_page;
1118 __add_to_page_cache(page, &inode->i_data, pgoff, hash);
1119 spin_unlock(&pagecache_lock);
1120 cached_page = NULL;
1122 goto readpage;
1125 *ppos = pos;
1126 filp->f_reada = 1;
1127 if (cached_page)
1128 page_cache_free(cached_page);
1129 UPDATE_ATIME(inode);
1132 static int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1134 unsigned long kaddr;
1135 unsigned long left, count = desc->count;
1137 if (size > count)
1138 size = count;
1140 * FIXME: We cannot yet sleep with kmaps held.
1142 kaddr = kmap(page, KM_READ);
1143 left = __copy_to_user(desc->buf, (void *)(kaddr+offset), size);
1144 kunmap(kaddr, KM_READ);
1146 if (left) {
1147 size -= left;
1148 desc->error = -EFAULT;
1150 desc->count = count - size;
1151 desc->written += size;
1152 desc->buf += size;
1153 return size;
1157 * This is the "read()" routine for all filesystems
1158 * that can use the page cache directly.
1160 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1162 ssize_t retval;
1164 retval = -EFAULT;
1165 if (access_ok(VERIFY_WRITE, buf, count)) {
1166 retval = 0;
1168 if (count) {
1169 read_descriptor_t desc;
1171 desc.written = 0;
1172 desc.count = count;
1173 desc.buf = buf;
1174 desc.error = 0;
1175 do_generic_file_read(filp, ppos, &desc, file_read_actor);
1177 retval = desc.written;
1178 if (!retval)
1179 retval = desc.error;
1182 return retval;
1185 static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
1187 unsigned long kaddr;
1188 ssize_t written;
1189 unsigned long count = desc->count;
1190 struct file *file = (struct file *) desc->buf;
1191 mm_segment_t old_fs;
1193 if (size > count)
1194 size = count;
1195 old_fs = get_fs();
1196 set_fs(KERNEL_DS);
1197 kaddr = kmap(page, KM_READ);
1198 written = file->f_op->write(file, (char *)kaddr + offset, size, &file->f_pos);
1199 kunmap(kaddr, KM_READ);
1200 set_fs(old_fs);
1201 if (written < 0) {
1202 desc->error = written;
1203 written = 0;
1205 desc->count = count - written;
1206 desc->written += written;
1207 return written;
1210 asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
1212 ssize_t retval;
1213 struct file * in_file, * out_file;
1214 struct inode * in_inode, * out_inode;
1217 * Get input file, and verify that it is ok..
1219 retval = -EBADF;
1220 in_file = fget(in_fd);
1221 if (!in_file)
1222 goto out;
1223 if (!(in_file->f_mode & FMODE_READ))
1224 goto fput_in;
1225 retval = -EINVAL;
1226 in_inode = in_file->f_dentry->d_inode;
1227 if (!in_inode)
1228 goto fput_in;
1229 if (!in_inode->i_op || !in_inode->i_op->readpage)
1230 goto fput_in;
1231 retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
1232 if (retval)
1233 goto fput_in;
1236 * Get output file, and verify that it is ok..
1238 retval = -EBADF;
1239 out_file = fget(out_fd);
1240 if (!out_file)
1241 goto fput_in;
1242 if (!(out_file->f_mode & FMODE_WRITE))
1243 goto fput_out;
1244 retval = -EINVAL;
1245 if (!out_file->f_op || !out_file->f_op->write)
1246 goto fput_out;
1247 out_inode = out_file->f_dentry->d_inode;
1248 if (!out_inode)
1249 goto fput_out;
1250 retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
1251 if (retval)
1252 goto fput_out;
1254 retval = 0;
1255 if (count) {
1256 read_descriptor_t desc;
1257 loff_t pos = 0, *ppos;
1259 retval = -EFAULT;
1260 ppos = &in_file->f_pos;
1261 if (offset) {
1262 if (get_user(pos, offset))
1263 goto fput_out;
1264 ppos = &pos;
1267 desc.written = 0;
1268 desc.count = count;
1269 desc.buf = (char *) out_file;
1270 desc.error = 0;
1271 do_generic_file_read(in_file, ppos, &desc, file_send_actor);
1273 retval = desc.written;
1274 if (!retval)
1275 retval = desc.error;
1276 if (offset)
1277 put_user(pos, offset);
1280 fput_out:
1281 fput(out_file);
1282 fput_in:
1283 fput(in_file);
1284 out:
1285 return retval;
1289 * filemap_nopage() is invoked via the vma operations vector for a
1290 * mapped memory region to read in file data during a page fault.
1292 * The goto's are kind of ugly, but this streamlines the normal case of having
1293 * it in the page cache, and handles the special cases reasonably without
1294 * having a lot of duplicated code.
1296 * XXX - at some point, this should return unique values to indicate to
1297 * the caller whether this is EIO, OOM, or SIGBUS.
1299 static struct page * filemap_nopage(struct vm_area_struct * area,
1300 unsigned long address, int no_share)
1302 struct file *file = area->vm_file;
1303 struct dentry *dentry = file->f_dentry;
1304 struct inode *inode = dentry->d_inode;
1305 struct page *page, **hash, *old_page;
1306 unsigned long size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1308 unsigned long pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1311 * Semantics for shared and private memory areas are different
1312 * past the end of the file. A shared mapping past the last page
1313 * of the file is an error and results in a SIGBUS, while a
1314 * private mapping just maps in a zero page.
1316 if ((pgoff >= size) &&
1317 (area->vm_flags & VM_SHARED) && (area->vm_mm == current->mm))
1318 return NULL;
1321 * Do we have something in the page cache already?
1323 hash = page_hash(&inode->i_data, pgoff);
1324 retry_find:
1325 page = __find_get_page(&inode->i_data, pgoff, hash);
1326 if (!page)
1327 goto no_cached_page;
1330 * Ok, found a page in the page cache, now we need to check
1331 * that it's up-to-date.
1333 if (!Page_Uptodate(page))
1334 goto page_not_uptodate;
1336 success:
1338 * Found the page and have a reference on it, need to check sharing
1339 * and possibly copy it over to another page..
1341 old_page = page;
1342 if (no_share) {
1343 struct page *new_page = page_cache_alloc();
1345 if (new_page) {
1346 if (PageHighMem(new_page) || PageHighMem(old_page))
1347 BUG();
1348 copy_highpage(new_page, old_page);
1349 flush_page_to_ram(new_page);
1351 page_cache_release(page);
1352 return new_page;
1355 flush_page_to_ram(old_page);
1356 return old_page;
1358 no_cached_page:
1360 * If the requested offset is within our file, try to read a whole
1361 * cluster of pages at once.
1363 * Otherwise, we're off the end of a privately mapped file,
1364 * so we need to map a zero page.
1366 if (pgoff < size)
1367 read_cluster_nonblocking(file, pgoff);
1368 else
1369 page_cache_read(file, pgoff);
1372 * The page we want has now been added to the page cache.
1373 * In the unlikely event that someone removed it in the
1374 * meantime, we'll just come back here and read it again.
1376 goto retry_find;
1378 page_not_uptodate:
1379 lock_page(page);
1380 if (Page_Uptodate(page)) {
1381 UnlockPage(page);
1382 goto success;
1385 if (!inode->i_op->readpage(file, page)) {
1386 wait_on_page(page);
1387 if (Page_Uptodate(page))
1388 goto success;
1392 * Umm, take care of errors if the page isn't up-to-date.
1393 * Try to re-read it _once_. We do this synchronously,
1394 * because there really aren't any performance issues here
1395 * and we need to check for errors.
1397 lock_page(page);
1398 if (Page_Uptodate(page)) {
1399 UnlockPage(page);
1400 goto success;
1402 ClearPageError(page);
1403 if (!inode->i_op->readpage(file, page)) {
1404 wait_on_page(page);
1405 if (Page_Uptodate(page))
1406 goto success;
1410 * Things didn't work out. Return zero to tell the
1411 * mm layer so, possibly freeing the page cache page first.
1413 page_cache_release(page);
1414 return NULL;
1418 * Tries to write a shared mapped page to its backing store. May return -EIO
1419 * if the disk is full.
1421 static inline int do_write_page(struct inode * inode, struct file * file,
1422 struct page * page, unsigned long offset)
1424 int retval;
1425 unsigned long size;
1426 int (*writepage) (struct file *, struct page *);
1428 size = (offset << PAGE_CACHE_SHIFT) + PAGE_CACHE_SIZE;
1429 /* refuse to extend file size.. */
1430 if (S_ISREG(inode->i_mode)) {
1431 if (size > inode->i_size)
1432 size = inode->i_size;
1433 /* Ho humm.. We should have tested for this earlier */
1434 if (size < offset)
1435 return -EIO;
1437 retval = -EIO;
1438 writepage = inode->i_op->writepage;
1439 lock_page(page);
1441 retval = writepage(file, page);
1443 UnlockPage(page);
1444 return retval;
1447 static int filemap_write_page(struct file *file,
1448 unsigned long offset,
1449 struct page * page,
1450 int wait)
1452 int result;
1453 struct dentry * dentry;
1454 struct inode * inode;
1456 dentry = file->f_dentry;
1457 inode = dentry->d_inode;
1460 * If a task terminates while we're swapping the page, the vma
1461 * and file could be released: try_to_swap_out has done a get_file.
1462 * vma/file is guaranteed to exist in the unmap/sync cases because
1463 * mmap_sem is held.
1465 result = do_write_page(inode, file, page, offset);
1466 return result;
1471 * The page cache takes care of races between somebody
1472 * trying to swap something out and swap something in
1473 * at the same time..
1475 extern void wakeup_bdflush(int);
1476 int filemap_swapout(struct page * page, struct file * file)
1478 int retval = filemap_write_page(file, page->pg_offset, page, 0);
1479 wakeup_bdflush(0);
1480 return retval;
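/*
 * Handle one pte for msync()/unmap.  Without MS_INVALIDATE a present, dirty
 * pte is cleaned and its page written back via filemap_write_page().  With
 * MS_INVALIDATE the pte is cleared as well: swap entries are released,
 * clean pages are simply dropped, and dirty pages are written back unless
 * MS_INVALIDATE was the only flag given.
 */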
1483 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
1484 unsigned long address, unsigned int flags)
1486 unsigned long pgoff;
1487 pte_t pte = *ptep;
1488 struct page *page;
1489 int error;
1491 if (!(flags & MS_INVALIDATE)) {
1492 if (!pte_present(pte))
1493 return 0;
1494 if (!pte_dirty(pte))
1495 return 0;
1496 flush_page_to_ram(pte_page(pte));
1497 flush_cache_page(vma, address);
1498 set_pte(ptep, pte_mkclean(pte));
1499 flush_tlb_page(vma, address);
1500 page = pte_page(pte);
1501 get_page(page);
1502 } else {
1503 if (pte_none(pte))
1504 return 0;
1505 flush_cache_page(vma, address);
1506 pte_clear(ptep);
1507 flush_tlb_page(vma, address);
1508 if (!pte_present(pte)) {
1509 swap_free(pte_to_swp_entry(pte));
1510 return 0;
1512 page = pte_page(pte);
1513 if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
1514 page_cache_free(page);
1515 return 0;
1518 if (PageHighMem(page))
1519 BUG();
1520 pgoff = (address - vma->vm_start) >> PAGE_CACHE_SHIFT;
1521 pgoff += vma->vm_pgoff;
1522 if (page->pg_offset != pgoff) {
1523 printk("weirdness: pgoff=%lu pg_offset=%lu address=%lu vm_start=%lu vm_pgoff=%lu\n",
1524 pgoff, page->pg_offset, address, vma->vm_start, vma->vm_pgoff);
1526 error = filemap_write_page(vma->vm_file, pgoff, page, 1);
1527 page_cache_free(page);
1528 return error;
1531 static inline int filemap_sync_pte_range(pmd_t * pmd,
1532 unsigned long address, unsigned long size,
1533 struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
1535 pte_t * pte;
1536 unsigned long end;
1537 int error;
1539 if (pmd_none(*pmd))
1540 return 0;
1541 if (pmd_bad(*pmd)) {
1542 pmd_ERROR(*pmd);
1543 pmd_clear(pmd);
1544 return 0;
1546 pte = pte_offset(pmd, address);
1547 offset += address & PMD_MASK;
1548 address &= ~PMD_MASK;
1549 end = address + size;
1550 if (end > PMD_SIZE)
1551 end = PMD_SIZE;
1552 error = 0;
1553 do {
1554 error |= filemap_sync_pte(pte, vma, address + offset, flags);
1555 address += PAGE_SIZE;
1556 pte++;
1557 } while (address && (address < end));
1558 return error;
1561 static inline int filemap_sync_pmd_range(pgd_t * pgd,
1562 unsigned long address, unsigned long size,
1563 struct vm_area_struct *vma, unsigned int flags)
1565 pmd_t * pmd;
1566 unsigned long offset, end;
1567 int error;
1569 if (pgd_none(*pgd))
1570 return 0;
1571 if (pgd_bad(*pgd)) {
1572 pgd_ERROR(*pgd);
1573 pgd_clear(pgd);
1574 return 0;
1576 pmd = pmd_offset(pgd, address);
1577 offset = address & PGDIR_MASK;
1578 address &= ~PGDIR_MASK;
1579 end = address + size;
1580 if (end > PGDIR_SIZE)
1581 end = PGDIR_SIZE;
1582 error = 0;
1583 do {
1584 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
1585 address = (address + PMD_SIZE) & PMD_MASK;
1586 pmd++;
1587 } while (address && (address < end));
1588 return error;
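/*
 * filemap_sync() drives the page table walk (pgd -> pmd -> pte) over
 * [address, address + size) of the vma, applying filemap_sync_pte() to each
 * entry and flushing the caches and TLB for the range around the walk.  It
 * is used as the vma ->sync operation for msync() and by filemap_unmap()
 * below.
 */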
1591 static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
1592 size_t size, unsigned int flags)
1594 pgd_t * dir;
1595 unsigned long end = address + size;
1596 int error = 0;
1598 dir = pgd_offset(vma->vm_mm, address);
1599 flush_cache_range(vma->vm_mm, end - size, end);
1600 if (address >= end)
1601 BUG();
1602 do {
1603 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
1604 address = (address + PGDIR_SIZE) & PGDIR_MASK;
1605 dir++;
1606 } while (address && (address < end));
1607 flush_tlb_range(vma->vm_mm, end - size, end);
1608 return error;
1612 * This handles (potentially partial) area unmaps..
1614 static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
1616 filemap_sync(vma, start, len, MS_ASYNC);
1620 * Shared mappings need to be able to do the right thing at
1621 * close/unmap/sync. They will also use the private file as
1622 * backing-store for swapping..
1624 static struct vm_operations_struct file_shared_mmap = {
1625 NULL, /* no special open */
1626 NULL, /* no special close */
1627 filemap_unmap, /* unmap - we need to sync the pages */
1628 NULL, /* no special protect */
1629 filemap_sync, /* sync */
1630 NULL, /* advise */
1631 filemap_nopage, /* nopage */
1632 NULL, /* wppage */
1633 filemap_swapout /* swapout */
1637 * Private mappings just need to be able to load in the map.
1639 * (This is actually used for shared mappings as well, if we
1640 * know they can't ever get write permissions..)
1642 static struct vm_operations_struct file_private_mmap = {
1643 NULL, /* open */
1644 NULL, /* close */
1645 NULL, /* unmap */
1646 NULL, /* protect */
1647 NULL, /* sync */
1648 NULL, /* advise */
1649 filemap_nopage, /* nopage */
1650 NULL, /* wppage */
1651 NULL /* swapout */
1654 /* This is used for a general mmap of a disk file */
1656 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1658 struct vm_operations_struct * ops;
1659 struct inode *inode = file->f_dentry->d_inode;
1661 ops = &file_private_mmap;
1662 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1663 if (!inode->i_op || !inode->i_op->writepage)
1664 return -EINVAL;
1665 ops = &file_shared_mmap;
1667 if (!inode->i_sb || !S_ISREG(inode->i_mode))
1668 return -EACCES;
1669 if (!inode->i_op || !inode->i_op->readpage)
1670 return -ENOEXEC;
1671 UPDATE_ATIME(inode);
1672 vma->vm_ops = ops;
1673 return 0;
1678 * The msync() system call.
1681 static int msync_interval(struct vm_area_struct * vma,
1682 unsigned long start, unsigned long end, int flags)
1684 if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
1685 int error;
1686 error = vma->vm_ops->sync(vma, start, end-start, flags);
1687 if (!error && (flags & MS_SYNC)) {
1688 struct file * file = vma->vm_file;
1689 if (file) {
1690 struct dentry * dentry = file->f_dentry;
1691 error = file_fsync(file, dentry);
1694 return error;
1696 return 0;
1699 asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
1701 unsigned long end;
1702 struct vm_area_struct * vma;
1703 int unmapped_error, error = -EINVAL;
1705 down(&current->mm->mmap_sem);
1706 lock_kernel();
1707 if (start & ~PAGE_MASK)
1708 goto out;
1709 len = (len + ~PAGE_MASK) & PAGE_MASK;
1710 end = start + len;
1711 if (end < start)
1712 goto out;
1713 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1714 goto out;
1715 error = 0;
1716 if (end == start)
1717 goto out;
1719 * If the interval [start,end) covers some unmapped address ranges,
1720 * just ignore them, but return -EFAULT at the end.
1722 vma = find_vma(current->mm, start);
1723 unmapped_error = 0;
1724 for (;;) {
1725 /* Still start < end. */
1726 error = -EFAULT;
1727 if (!vma)
1728 goto out;
1729 /* Here start < vma->vm_end. */
1730 if (start < vma->vm_start) {
1731 unmapped_error = -EFAULT;
1732 start = vma->vm_start;
1734 /* Here vma->vm_start <= start < vma->vm_end. */
1735 if (end <= vma->vm_end) {
1736 if (start < end) {
1737 error = msync_interval(vma, start, end, flags);
1738 if (error)
1739 goto out;
1741 error = unmapped_error;
1742 goto out;
1744 /* Here vma->vm_start <= start < vma->vm_end < end. */
1745 error = msync_interval(vma, start, vma->vm_end, flags);
1746 if (error)
1747 goto out;
1748 start = vma->vm_end;
1749 vma = vma->vm_next;
1751 out:
1752 unlock_kernel();
1753 up(&current->mm->mmap_sem);
1754 return error;
1758 * Write to a file through the page cache. This is mainly for the
1759 * benefit of NFS and possibly other network-based file systems.
1761 * We currently put everything into the page cache prior to writing it.
1762 * This is not a problem when writing full pages. With partial pages,
1763 * however, we first have to read the data into the cache, then
1764 * dirty the page, and finally schedule it for writing. Alternatively, we
1765 * could write-through just the portion of data that would go into that
1766 * page, but that would kill performance for applications that write data
1767 * line by line, and it's prone to race conditions.
1769 * Note that this routine doesn't try to keep track of dirty pages. Each
1770 * file system has to do this all by itself, unfortunately.
1771 * okir@monad.swb.de
1773 ssize_t
1774 generic_file_write(struct file *file, const char *buf,
1775 size_t count, loff_t *ppos,
1776 writepage_t write_one_page)
1778 struct dentry *dentry = file->f_dentry;
1779 struct inode *inode = dentry->d_inode;
1780 unsigned long pos = *ppos;
1781 unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
1782 struct page *page, **hash, *cached_page;
1783 unsigned long written;
1784 long status;
1785 int err;
1787 cached_page = NULL;
1789 down(&inode->i_sem);
1790 err = file->f_error;
1791 if (err) {
1792 file->f_error = 0;
1793 goto out;
1796 written = 0;
1798 if (file->f_flags & O_APPEND)
1799 pos = inode->i_size;
1802 * Check whether we've reached the file size limit.
1804 err = -EFBIG;
1805 if (pos >= limit) {
1806 send_sig(SIGXFSZ, current, 0);
1807 goto out;
1810 status = 0;
1812 * Check whether to truncate the write,
1813 * and send the signal if we do.
1815 if (count > limit - pos) {
1816 send_sig(SIGXFSZ, current, 0);
1817 count = limit - pos;
1820 while (count) {
1821 unsigned long bytes, pgoff, offset;
1824 * Try to find the page in the cache. If it isn't there,
1825 * allocate a free page.
1827 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
1828 pgoff = pos >> PAGE_CACHE_SHIFT;
1829 bytes = PAGE_CACHE_SIZE - offset;
1830 if (bytes > count)
1831 bytes = count;
1833 hash = page_hash(&inode->i_data, pgoff);
1834 repeat_find:
1835 page = __find_lock_page(&inode->i_data, pgoff, hash);
1836 if (!page) {
1837 if (!cached_page) {
1838 cached_page = page_cache_alloc();
1839 if (cached_page)
1840 goto repeat_find;
1841 status = -ENOMEM;
1842 break;
1844 page = cached_page;
1845 if (add_to_page_cache_unique(page,&inode->i_data,pgoff,hash))
1846 goto repeat_find;
1848 cached_page = NULL;
1851 /* We have exclusive IO access to the page.. */
1852 if (!PageLocked(page)) {
1853 PAGE_BUG(page);
1856 status = write_one_page(file, page, offset, bytes, buf);
1858 if (status >= 0) {
1859 written += status;
1860 count -= status;
1861 pos += status;
1862 buf += status;
1863 if (pos > inode->i_size)
1864 inode->i_size = pos;
1866 /* Mark it unlocked again and drop the page.. */
1867 UnlockPage(page);
1868 page_cache_release(page);
1870 if (status < 0)
1871 break;
1873 *ppos = pos;
1875 if (cached_page)
1876 page_cache_free(cached_page);
1878 err = written ? written : status;
1879 out:
1880 up(&inode->i_sem);
1881 return err;
1885 * Support routines for directory caching using the page cache.
1889 * Unlock and free a page.
1891 void put_cached_page(unsigned long addr)
1893 struct page * page = page_cache_entry(addr);
1895 UnlockPage(page);
1896 if (page_count(page) != 2)
1897 panic("put_cached_page: page count=%d\n",
1898 page_count(page));
1899 page_cache_release(page);
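/*
 * Size the page-cache hash table at boot time: aim for roughly one bucket
 * pointer per page of memory, rounded up to a power-of-two number of pages
 * and shrunk if the allocation fails.  Illustrative example (assuming
 * 4-byte pointers, 4K pages and mempages = 32768, i.e. 128MB):
 * htable_size = 32768 * 4 = 128K, so order = 5 and page_hash_bits = 15,
 * giving 32768 hash buckets.
 */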
1902 void __init page_cache_init(unsigned long mempages)
1904 unsigned long htable_size, order;
1906 htable_size = mempages;
1907 htable_size *= sizeof(struct page *);
1908 for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
1911 do {
1912 unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
1914 page_hash_bits = 0;
1915 while((tmp >>= 1UL) != 0UL)
1916 page_hash_bits++;
1918 page_hash_table = (struct page **)
1919 __get_free_pages(GFP_ATOMIC, order);
1920 } while(page_hash_table == NULL && --order > 0);
1922 printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
1923 (1 << page_hash_bits), order, (PAGE_SIZE << order));
1924 if (!page_hash_table)
1925 panic("Failed to allocate page hash table\n");
1926 memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));