/*
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/malloc.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/locks.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/swapctl.h>
#include <linux/slab.h>
#include <linux/init.h>

#include <asm/pgalloc.h>
#include <asm/uaccess.h>

#include <linux/highmem.h>
/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 */

atomic_t page_cache_size = ATOMIC_INIT(0);
unsigned int page_hash_bits;
struct page **page_hash_table;

spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
/*
 * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
 *       the pagemap_lru_lock held.
 */
spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;
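/*
 * Illustrative sketch, not part of the original source: the ordering rule
 * above allows pagemap_lru_lock to nest inside pagecache_lock, never the
 * other way around.  A hypothetical helper obeying the rule looks like:
 *
 *	spin_lock(&pagecache_lock);	// outer lock first
 *	spin_lock(&pagemap_lru_lock);	// inner lock second
 *	... touch the hash queues and the LRU list ...
 *	spin_unlock(&pagemap_lru_lock);
 *	spin_unlock(&pagecache_lock);
 *
 * Taking pagecache_lock while already holding pagemap_lru_lock can deadlock
 * against a CPU doing the nesting shown here.
 */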
#define CLUSTER_PAGES		(1 << page_cluster)
#define CLUSTER_OFFSET(x)	(((x) >> page_cluster) << page_cluster)
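/*
 * Worked example (illustrative assumption, not from the original file):
 * with page_cluster == 4, CLUSTER_PAGES is 16 pages and CLUSTER_OFFSET()
 * rounds a page index down to a 16-page boundary, e.g.
 *
 *	CLUSTER_OFFSET(19) == 16,  CLUSTER_OFFSET(15) == 0
 *
 * so a cluster read always starts on an aligned page index.
 */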
/* Insert "page" into the hash chain whose head is pointed to by "p". */
void __add_page_to_hash_queue(struct page * page, struct page **p)
{
	atomic_inc(&page_cache_size);
	if ((page->next_hash = *p) != NULL)
		(*p)->pprev_hash = &page->next_hash;
	*p = page;
	page->pprev_hash = p;
}
static void remove_page_from_hash_queue(struct page * page)
{
	if (page->pprev_hash) {
		if (page->next_hash)
			page->next_hash->pprev_hash = page->pprev_hash;
		*page->pprev_hash = page->next_hash;
		page->pprev_hash = NULL;
	}
	atomic_dec(&page_cache_size);
}
/*
 * Remove a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
 * is safe.
 */
void remove_inode_page(struct page *page)
{
	if (!PageLocked(page))
		PAGE_BUG(page);

	spin_lock(&pagecache_lock);
	remove_page_from_inode_queue(page);
	remove_page_from_hash_queue(page);
	spin_unlock(&pagecache_lock);
}
void invalidate_inode_pages(struct inode * inode)
{
	struct list_head *head, *curr;
	struct page * page;

	head = &inode->i_data.pages;
	spin_lock(&pagecache_lock);
	curr = head->next;

	while (curr != head) {
		page = list_entry(curr, struct page, list);
		curr = curr->next;

		/* We cannot invalidate a locked page */
		if (PageLocked(page))
			continue;

		remove_page_from_inode_queue(page);
		remove_page_from_hash_queue(page);
		page->mapping = NULL;
		page_cache_release(page);
	}
	spin_unlock(&pagecache_lock);
}
/*
 * Truncate the page cache at a set offset, removing the pages
 * that are beyond that offset (and zeroing out partial pages).
 */
void truncate_inode_pages(struct inode * inode, loff_t lstart)
{
	struct list_head *head, *curr;
	struct page * page;
	unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
	unsigned long start;

	start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

repeat:
	head = &inode->i_data.pages;
	spin_lock(&pagecache_lock);
	curr = head->next;
	while (curr != head) {
		unsigned long offset;

		page = list_entry(curr, struct page, list);
		curr = curr->next;
		offset = page->index;

		/* page wholly truncated - free it */
		if (offset >= start) {
			get_page(page);
			spin_unlock(&pagecache_lock);

			lock_page(page);

			if (!page->buffers || block_flushpage(page, 0))
				lru_cache_del(page);

			/*
			 * We remove the page from the page cache
			 * _after_ we have destroyed all buffer-cache
			 * references to it. Otherwise some other process
			 * might think this inode page is not in the
			 * page cache and creates a buffer-cache alias
			 * to it causing all sorts of fun problems ...
			 */
			remove_inode_page(page);

			UnlockPage(page);
			page_cache_release(page);
			page_cache_release(page);

			/*
			 * We have done things without the pagecache lock,
			 * so we'll have to repeat the scan.
			 * It's not possible to deadlock here because
			 * we are guaranteed to make progress. (ie. we have
			 * just removed a page)
			 */
			goto repeat;
		}
		/*
		 * there is only one partial page possible.
		 */
		if (!partial)
			continue;

		/* and it's the one preceding the first wholly truncated page */
		if ((offset + 1) != start)
			continue;

		/* partial truncate, clear end of page */
		get_page(page);
		spin_unlock(&pagecache_lock);

		lock_page(page);

		memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
		if (page->buffers)
			block_flushpage(page, partial);

		partial = 0;

		/*
		 * we have dropped the spinlock so we have to
		 * restart.
		 */
		UnlockPage(page);
		page_cache_release(page);
		goto repeat;
	}
	spin_unlock(&pagecache_lock);
}
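/*
 * Worked example (illustrative, not from the original source): truncating
 * to lstart == 10000 with PAGE_CACHE_SIZE == 4096 gives
 *
 *	partial = 10000 & 4095         = 1808
 *	start   = (10000 + 4095) >> 12 = 3
 *
 * so pages with index >= 3 are dropped entirely, and the tail of page 2
 * (bytes 1808..4095 within that page) is cleared by
 * memclear_highpage_flush().
 */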
int shrink_mmap(int priority, int gfp_mask, zone_t *zone)
{
	int ret = 0, count;
	LIST_HEAD(young);
	LIST_HEAD(old);
	struct list_head * page_lru, * dispose;
	struct page * page;

	count = nr_lru_pages / (priority+1);

	spin_lock(&pagemap_lru_lock);

	while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
		page = list_entry(page_lru, struct page, lru);
		list_del(page_lru);

		dispose = &lru_cache;
		if (test_and_clear_bit(PG_referenced, &page->flags))
			/* Roll the page at the top of the lru list,
			 * we could also be more aggressive putting
			 * the page in the young-dispose-list, so
			 * avoiding to free young pages in each pass.
			 */
			goto dispose_continue;

		dispose = &old;
		/* don't account passes over not DMA pages */
		if (zone && (!memclass(page->zone, zone)))
			goto dispose_continue;

		count--;

		dispose = &young;
		if (TryLockPage(page))
			goto dispose_continue;

		/* Release the pagemap_lru lock even if the page is not yet
		   queued in any lru queue since we have just locked down
		   the page so nobody else may SMP race with us running
		   a lru_cache_del() (lru_cache_del() always run with the
		   page locked down ;). */
		spin_unlock(&pagemap_lru_lock);

		/* avoid unscalable SMP locking */
		if (!page->buffers && page_count(page) > 1)
			goto unlock_noput_continue;

		/* Take the pagecache_lock spinlock held to avoid
		   other tasks to notice the page while we are looking at its
		   page count. If it's a pagecache-page we'll free it
		   in one atomic transaction after checking its page count. */
		spin_lock(&pagecache_lock);

		/* avoid freeing the page while it's locked */
		get_page(page);

		/* Is it a buffer page? */
		if (page->buffers) {
			spin_unlock(&pagecache_lock);
			if (!try_to_free_buffers(page))
				goto unlock_continue;
			/* page was locked, inode can't go away under us */
			if (!page->mapping) {
				atomic_dec(&buffermem_pages);
				goto made_buffer_progress;
			}
			spin_lock(&pagecache_lock);
		}

		/*
		 * We can't free pages unless there's just one user
		 * (count == 2 because we added one ourselves above).
		 */
		if (page_count(page) != 2)
			goto cache_unlock_continue;

		/*
		 * Is it a page swap page? If so, we want to
		 * drop it if it is no longer used, even if it
		 * were to be marked referenced..
		 */
		if (PageSwapCache(page)) {
			spin_unlock(&pagecache_lock);
			__delete_from_swap_cache(page);
			goto made_inode_progress;
		}

		/* is it a page-cache page? */
		if (page->mapping) {
			if (!pgcache_under_min()) {
				remove_page_from_inode_queue(page);
				remove_page_from_hash_queue(page);
				page->mapping = NULL;
				spin_unlock(&pagecache_lock);
				goto made_inode_progress;
			}
			goto cache_unlock_continue;
		}

		printk(KERN_ERR "shrink_mmap: unknown LRU page!\n");

cache_unlock_continue:
		spin_unlock(&pagecache_lock);
unlock_continue:
		UnlockPage(page);
		page_cache_release(page);
dispose_relock_continue:
		/* even if the dispose list is local, a truncate_inode_page()
		   may remove a page from its queue so always
		   synchronize with the lru lock while accessing the
		   page->lru field */
		spin_lock(&pagemap_lru_lock);
		list_add(page_lru, dispose);
		continue;

unlock_noput_continue:
		UnlockPage(page);
		goto dispose_relock_continue;

dispose_continue:
		list_add(page_lru, dispose);
	}
	goto out;

made_inode_progress:
	page_cache_release(page);
made_buffer_progress:
	UnlockPage(page);
	page_cache_release(page);
	ret = 1;
	spin_lock(&pagemap_lru_lock);
	/* nr_lru_pages needs the spinlock */
	nr_lru_pages--;
out:
	list_splice(&young, &lru_cache);
	list_splice(&old, lru_cache.prev);

	spin_unlock(&pagemap_lru_lock);

	return ret;
}
static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
{
	goto inside;

	for (;;) {
		page = page->next_hash;
inside:
		if (!page)
			goto not_found;
		if (page->mapping != mapping)
			continue;
		if (page->index == offset)
			break;
	}
	/* Mark the page referenced so shrink_mmap() keeps it around longer. */
	set_bit(PG_referenced, &page->flags);
not_found:
	return page;
}
/*
 * By the time this is called, the page is locked and
 * we don't have to worry about any races any more.
 *
 * Start the IO..
 */
static int writeout_one_page(struct page *page)
{
	struct buffer_head *bh, *head = page->buffers;

	bh = head;
	do {
		if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
			continue;

		bh->b_flushtime = 0;
		ll_rw_block(WRITE, 1, &bh);
	} while ((bh = bh->b_this_page) != head);
	return 0;
}
static int waitfor_one_page(struct page *page)
{
	int error = 0;
	struct buffer_head *bh, *head = page->buffers;

	bh = head;
	do {
		wait_on_buffer(bh);
		if (buffer_req(bh) && !buffer_uptodate(bh))
			error = -EIO;
	} while ((bh = bh->b_this_page) != head);
	return error;
}
static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end, int (*fn)(struct page *))
{
	struct list_head *head, *curr;
	struct page *page;
	int retval = 0;

	head = &inode->i_data.pages;

	spin_lock(&pagecache_lock);
	curr = head->next;
	while (curr != head) {
		page = list_entry(curr, struct page, list);
		curr = curr->next;
		if (!page->buffers)
			continue;
		if (page->index >= end)
			continue;
		if (page->index < start)
			continue;

		get_page(page);
		spin_unlock(&pagecache_lock);
		lock_page(page);

		/* The buffers could have been free'd while we waited for the page lock */
		if (page->buffers)
			retval |= fn(page);

		UnlockPage(page);
		spin_lock(&pagecache_lock);
		curr = page->list.next;
		page_cache_release(page);
	}
	spin_unlock(&pagecache_lock);

	return retval;
}
/*
 * Two-stage data sync: first start the IO, then go back and
 * collect the information..
 */
int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
{
	int retval;

	retval = do_buffer_fdatasync(inode, start_idx, end_idx, writeout_one_page);
	retval |= do_buffer_fdatasync(inode, start_idx, end_idx, waitfor_one_page);

	return retval;
}
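/*
 * Illustrative usage sketch (an assumption, not from the original file):
 * a filesystem that wants to push every cached page of an inode to disk
 * and then wait for the IO could call
 *
 *	int err = generic_buffer_fdatasync(inode, 0, ~0UL);
 *
 * The two passes matter: the first pass queues all the writes so the block
 * layer can merge and sort them, and the second pass only sleeps, instead
 * of issuing and waiting for one page at a time.
 */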
/*
 * This adds a page to the page cache, starting out as locked,
 * owned by us, referenced, but not uptodate and with no errors.
 */
static inline void __add_to_page_cache(struct page * page,
	struct address_space *mapping, unsigned long offset,
	struct page **hash)
{
	unsigned long flags;
	struct page *alias;

	flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced));
	page->flags = flags | (1 << PG_locked);
	get_page(page);
	page->index = offset;
	add_page_to_inode_queue(mapping, page);
	__add_page_to_hash_queue(page, hash);
	lru_cache_add(page);
	alias = __find_page_nolock(mapping, offset, *hash);
	if (alias != page)
		PAGE_BUG(page);
}
void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
{
	spin_lock(&pagecache_lock);
	__add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
	spin_unlock(&pagecache_lock);
}
static int add_to_page_cache_unique(struct page * page,
	struct address_space *mapping, unsigned long offset,
	struct page **hash)
{
	int err;
	struct page *alias;

	spin_lock(&pagecache_lock);
	alias = __find_page_nolock(mapping, offset, *hash);

	err = 1;
	if (!alias) {
		__add_to_page_cache(page,mapping,offset,hash);
		err = 0;
	}

	spin_unlock(&pagecache_lock);
	return err;
}
/*
 * This adds the requested page to the page cache if it isn't already there,
 * and schedules an I/O to read in its contents from disk.
 */
static inline int page_cache_read(struct file * file, unsigned long offset)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct page **hash = page_hash(&inode->i_data, offset);
	struct page *page;

	spin_lock(&pagecache_lock);
	page = __find_page_nolock(&inode->i_data, offset, *hash);
	spin_unlock(&pagecache_lock);
	if (page)
		return 0;

	page = page_cache_alloc();
	if (!page)
		return -ENOMEM;

	if (!add_to_page_cache_unique(page, &inode->i_data, offset, hash)) {
		int error = inode->i_op->readpage(file->f_dentry, page);
		page_cache_release(page);
		return error;
	}
	/*
	 * We arrive here in the unlikely event that someone
	 * raced with us and added our page to the cache first.
	 */
	page_cache_free(page);
	return 0;
}
/*
 * Read in an entire cluster at once.  A cluster is usually a 64k-
 * aligned block that includes the page requested in "offset."
 */
static int read_cluster_nonblocking(struct file * file, unsigned long offset,
	unsigned long filesize)
{
	unsigned long pages = CLUSTER_PAGES;

	offset = CLUSTER_OFFSET(offset);
	while ((pages-- > 0) && (offset < filesize)) {
		int error = page_cache_read(file, offset);
		if (error < 0)
			return error;
		offset++;
	}

	return 0;
}
/*
 * Wait for a page to get unlocked.
 *
 * This must be called with the caller "holding" the page,
 * ie with increased "page->count" so that the page won't
 * go away during the wait..
 */
void ___wait_on_page(struct page *page)
{
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	add_wait_queue(&page->wait, &wait);
	do {
		run_task_queue(&tq_disk);
		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		if (!PageLocked(page))
			break;
		schedule();
	} while (PageLocked(page));
	tsk->state = TASK_RUNNING;
	remove_wait_queue(&page->wait, &wait);
}
/*
 * Get an exclusive lock on the page..
 */
void lock_page(struct page *page)
{
	while (TryLockPage(page))
		___wait_on_page(page);
}
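/*
 * Illustrative sketch (an assumption, not from the original source): the
 * usual calling pattern around page IO in this file is
 *
 *	get_page(page);			// keep the page from being freed
 *	lock_page(page);		// sleep until we own PG_locked
 *	... look at or modify page->buffers ...
 *	UnlockPage(page);		// wakes up ___wait_on_page() sleepers
 *	page_cache_release(page);	// drop our reference
 *
 * lock_page() may sleep, so it must not be called while holding either
 * pagecache_lock or pagemap_lru_lock.
 */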
/*
 * a rather lightweight function, finding and getting a reference to a
 * hashed page atomically, waiting for it if it's locked.
 */
struct page * __find_get_page (struct address_space *mapping,
				unsigned long offset, struct page **hash)
{
	struct page *page;

	/*
	 * We scan the hash list read-only. Addition to and removal from
	 * the hash-list needs a held write-lock.
	 */
repeat:
	spin_lock(&pagecache_lock);
	page = __find_page_nolock(mapping, offset, *hash);
	if (page)
		get_page(page);
	spin_unlock(&pagecache_lock);

	/* Found the page, sleep if locked. */
	if (page && PageLocked(page)) {
		struct task_struct *tsk = current;
		DECLARE_WAITQUEUE(wait, tsk);

		run_task_queue(&tq_disk);

		__set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		add_wait_queue(&page->wait, &wait);

		if (PageLocked(page))
			schedule();
		__set_task_state(tsk, TASK_RUNNING);
		remove_wait_queue(&page->wait, &wait);

		/*
		 * The page might have been unhashed meanwhile. It's
		 * not freed though because we hold a reference to it.
		 * If this is the case then it will be freed _here_,
		 * and we recheck the hash anyway.
		 */
		page_cache_release(page);
		goto repeat;
	}
	/*
	 * It's not locked so we can return the page and we hold
	 * a reference to it.
	 */
	return page;
}
/*
 * Get the lock to a page atomically.
 */
struct page * __find_lock_page (struct address_space *mapping,
				unsigned long offset, struct page **hash)
{
	struct page *page;

	/*
	 * We scan the hash list read-only. Addition to and removal from
	 * the hash-list needs a held write-lock.
	 */
repeat:
	spin_lock(&pagecache_lock);
	page = __find_page_nolock(mapping, offset, *hash);
	if (page)
		get_page(page);
	spin_unlock(&pagecache_lock);

	/* Found the page, sleep if locked. */
	if (page && TryLockPage(page)) {
		struct task_struct *tsk = current;
		DECLARE_WAITQUEUE(wait, tsk);

		run_task_queue(&tq_disk);

		__set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		add_wait_queue(&page->wait, &wait);

		if (PageLocked(page))
			schedule();
		__set_task_state(tsk, TASK_RUNNING);
		remove_wait_queue(&page->wait, &wait);

		/*
		 * The page might have been unhashed meanwhile. It's
		 * not freed though because we hold a reference to it.
		 * If this is the case then it will be freed _here_,
		 * and we recheck the hash anyway.
		 */
		page_cache_release(page);
		goto repeat;
	}
	/*
	 * It's not locked so we can return the page and we hold
	 * a reference to it.
	 */
	return page;
}
#define PROFILE_READAHEAD
#define DEBUG_READAHEAD

/*
 * Read-ahead profiling information
 * --------------------------------
 * Every PROFILE_MAXREADCOUNT, the following information is written
 * to the syslog:
 *   Percentage of asynchronous read-ahead.
 *   Average of read-ahead fields context value.
 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
 * to the syslog as well.
 */

#ifdef PROFILE_READAHEAD

#define PROFILE_MAXREADCOUNT 1000

static unsigned long total_reada;
static unsigned long total_async;
static unsigned long total_ramax;
static unsigned long total_ralen;
static unsigned long total_rawin;

static void profile_readahead(int async, struct file *filp)
{
	unsigned long flags;

	++total_reada;
	if (async)
		++total_async;

	total_ramax += filp->f_ramax;
	total_ralen += filp->f_ralen;
	total_rawin += filp->f_rawin;

	if (total_reada > PROFILE_MAXREADCOUNT) {
		save_flags(flags);
		cli();
		if (!(total_reada > PROFILE_MAXREADCOUNT)) {
			restore_flags(flags);
			return;
		}

		printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
			total_ramax/total_reada,
			total_ralen/total_reada,
			total_rawin/total_reada,
			(total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
		printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
			filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
#endif

		total_reada = 0;
		total_async = 0;
		total_ramax = 0;
		total_ralen = 0;
		total_rawin = 0;

		restore_flags(flags);
	}
}
#endif  /* defined PROFILE_READAHEAD */
/*
 * Read-ahead context:
 * -------------------
 * The read ahead context fields of the "struct file" are the following:
 * - f_raend : position of the first byte after the last page we tried to
 *	       read ahead.
 * - f_ramax : current read-ahead maximum size.
 * - f_ralen : length of the current IO read block we tried to read-ahead.
 * - f_rawin : length of the current read-ahead window.
 *		if last read-ahead was synchronous then
 *			f_rawin = f_ralen
 *		otherwise (was asynchronous)
 *			f_rawin = previous value of f_ralen + f_ralen
 *
 * Read-ahead limits:
 * ------------------
 * MIN_READAHEAD   : minimum read-ahead size when read-ahead.
 * MAX_READAHEAD   : maximum read-ahead size when read-ahead.
 *
 * Synchronous read-ahead benefits:
 * --------------------------------
 * Using reasonable IO xfer lengths from peripheral devices increases system
 * performance.
 * Reasonable means, in this context, not too large but not too small.
 * The actual maximum value is:
 *	MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
 *	and 32K if defined (4K page size assumed).
 *
 * Asynchronous read-ahead benefits:
 * ---------------------------------
 * Overlapping the next read request with user process execution increases
 * system performance.
 *
 * Read-ahead risks:
 * -----------------
 * We have to guess which further data are needed by the user process.
 * If these data are often not really needed, it's bad for system
 * performance.
 * However, we know that files are often accessed sequentially by
 * application programs and it seems that it is possible to have some good
 * strategy in that guessing.
 * We only try to read-ahead files that seem to be read sequentially.
 *
 * Asynchronous read-ahead risks:
 * ------------------------------
 * In order to maximize overlapping, we must start some asynchronous read
 * request from the device, as soon as possible.
 * We must be very careful about:
 * - The number of effective pending IO read requests.
 *   ONE seems to be the only reasonable value.
 * - The total memory pool usage for the file access stream.
 *   This maximum memory usage is implicitly 2 IO read chunks:
 *   2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
 *   64k if defined (4K page size assumed).
 */
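/*
 * Worked example (illustrative, not from the original source), using the
 * page-index units the code below actually stores in these fields: suppose
 * the last synchronous read-ahead block covered page indexes 24..31.  Then
 *
 *	f_raend = 32	(first index after the read-ahead block)
 *	f_ralen = 8	(length of that block, in pages)
 *	f_rawin = 8	(window == f_ralen after a synchronous pass)
 *
 * If the next asynchronous pass reads indexes 32..39, f_raend becomes 40
 * and f_rawin becomes 16 (previous f_ralen + new f_ralen); that window is
 * what do_generic_file_read() tests against to decide whether accesses
 * still look sequential enough to keep reading ahead.
 */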
static inline int get_max_readahead(struct inode * inode)
{
	if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
		return MAX_READAHEAD;
	return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
}
static void generic_file_readahead(int reada_ok,
	struct file * filp, struct inode * inode,
	struct page * page)
{
	unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
	unsigned long index = page->index;
	unsigned long max_ahead, ahead;
	unsigned long raend;
	int max_readahead = get_max_readahead(inode);

	raend = filp->f_raend;
	max_ahead = 0;

/*
 * The current page is locked.
 * If the current position is inside the previous read IO request, do not
 * try to reread previously read ahead pages.
 * Otherwise decide or not to read ahead some pages synchronously.
 * If we are not going to read ahead, set the read ahead context for this
 * page only.
 */
	if (PageLocked(page)) {
		if (!filp->f_ralen || index >= raend || index + filp->f_ralen < raend) {
			raend = index;
			if (raend < end_index)
				max_ahead = filp->f_ramax;
			filp->f_rawin = 0;
			filp->f_ralen = 1;
			if (!max_ahead) {
				filp->f_raend = index + filp->f_ralen;
				filp->f_rawin += filp->f_ralen;
			}
		}
	}
/*
 * The current page is not locked.
 * If we were reading ahead and,
 * if the current max read ahead size is not zero and,
 * if the current position is inside the last read-ahead IO request,
 *   it is the moment to try to read ahead asynchronously.
 * We will later force unplug device in order to force asynchronous read IO.
 */
	else if (reada_ok && filp->f_ramax && raend >= 1 &&
		 index <= raend && index + filp->f_ralen >= raend) {
/*
 * Add ONE page to max_ahead in order to try to have about the same IO max size
 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
 * Compute the position of the last page we have tried to read in order to
 * begin to read ahead just at the next page.
 */
		raend -= 1;
		if (raend < end_index)
			max_ahead = filp->f_ramax + 1;

		if (max_ahead) {
			filp->f_rawin = filp->f_ralen;
			filp->f_ralen = 0;
			reada_ok = 2;
		}
	}
/*
 * Try to read ahead pages.
 * We hope that ll_rw_blk() plug/unplug, coalescence, request sorting and the
 * scheduler will work well enough for us to avoid too many bad actual IO
 * requests.
 */
	ahead = 0;
	while (ahead < max_ahead) {
		ahead++;
		if ((raend + ahead) >= end_index)
			break;
		if (page_cache_read(filp, raend + ahead) < 0)
			break;
	}
/*
 * If we tried to read ahead some pages,
 * If we tried to read ahead asynchronously,
 *   Try to force unplug of the device in order to start an asynchronous
 *   read IO request.
 * Update the read-ahead context.
 * Store the length of the current read-ahead window.
 * Double the current max read ahead size.
 *   That heuristic avoids doing large IO for files that are not really
 *   accessed sequentially.
 */
	if (ahead) {
		if (reada_ok == 2) {
			run_task_queue(&tq_disk);
		}

		filp->f_ralen += ahead;
		filp->f_rawin += filp->f_ralen;
		filp->f_raend = raend + ahead + 1;

		filp->f_ramax += filp->f_ramax;

		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;

#ifdef PROFILE_READAHEAD
		profile_readahead((reada_ok == 2), filp);
#endif
	}

	return;
}
/*
 * This is a generic file read routine, and uses the
 * inode->i_op->readpage() function for the actual low-level
 * stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 */
void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
{
	struct dentry *dentry = filp->f_dentry;
	struct inode *inode = dentry->d_inode;
	unsigned long index, offset;
	struct page *cached_page;
	int reada_ok;
	int error;
	int max_readahead = get_max_readahead(inode);

	cached_page = NULL;
	index = *ppos >> PAGE_CACHE_SHIFT;
	offset = *ppos & ~PAGE_CACHE_MASK;

/*
 * If the current position is outside the previous read-ahead window,
 * we reset the current read-ahead context and set read ahead max to zero
 * (will be set to just needed value later),
 * otherwise, we assume that the file accesses are sequential enough to
 * continue read-ahead.
 */
	if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {
		reada_ok = 0;
		filp->f_raend = 0;
		filp->f_ralen = 0;
		filp->f_ramax = 0;
		filp->f_rawin = 0;
	} else {
		reada_ok = 1;
	}
/*
 * Adjust the current value of read-ahead max.
 * If the read operation stays in the first half of the page, force no
 * readahead. Otherwise try to increase read ahead max just enough to do
 * the read request.
 * Then, at least MIN_READAHEAD if read ahead is ok,
 * and at most MAX_READAHEAD in all cases.
 */
	if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
		filp->f_ramax = 0;
	} else {
		unsigned long needed;

		needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;

		if (filp->f_ramax < needed)
			filp->f_ramax = needed;

		if (reada_ok && filp->f_ramax < MIN_READAHEAD)
			filp->f_ramax = MIN_READAHEAD;
		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;
	}

	for (;;) {
		struct page *page, **hash;
		unsigned long end_index, nr;

		end_index = inode->i_size >> PAGE_CACHE_SHIFT;
		if (index > end_index)
			break;
		nr = PAGE_CACHE_SIZE;
		if (index == end_index) {
			nr = inode->i_size & ~PAGE_CACHE_MASK;
			if (nr <= offset)
				break;
		}

		nr = nr - offset;

		/*
		 * Try to find the data in the page cache..
		 */
		hash = page_hash(&inode->i_data, index);

		spin_lock(&pagecache_lock);
		page = __find_page_nolock(&inode->i_data, index, *hash);
		if (!page)
			goto no_cached_page;
found_page:
		get_page(page);
		spin_unlock(&pagecache_lock);

		if (!Page_Uptodate(page))
			goto page_not_up_to_date;
page_ok:
		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 *
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		nr = actor(desc, page, offset, nr);
		offset += nr;
		index += offset >> PAGE_CACHE_SHIFT;
		offset &= ~PAGE_CACHE_MASK;

		page_cache_release(page);
		if (nr && desc->count)
			continue;
		break;

/*
 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
 */
page_not_up_to_date:
		generic_file_readahead(reada_ok, filp, inode, page);

		if (Page_Uptodate(page))
			goto page_ok;

		/* Get exclusive access to the page ... */
		lock_page(page);
		if (Page_Uptodate(page)) {
			UnlockPage(page);
			goto page_ok;
		}

readpage:
		/* ... and start the actual read. The read will unlock the page. */
		error = inode->i_op->readpage(filp->f_dentry, page);

		if (!error) {
			if (Page_Uptodate(page))
				goto page_ok;

			/* Again, try some read-ahead while waiting for the page to finish.. */
			generic_file_readahead(reada_ok, filp, inode, page);
			wait_on_page(page);
			if (Page_Uptodate(page))
				goto page_ok;
			error = -EIO;
		}

		/* UHHUH! A synchronous read error occurred. Report it */
		desc->error = error;
		page_cache_release(page);
		break;

no_cached_page:
		/*
		 * Ok, it wasn't cached, so we need to create a new
		 * page..
		 *
		 * We get here with the page cache lock held.
		 */
		if (!cached_page) {
			spin_unlock(&pagecache_lock);
			cached_page = page_cache_alloc();
			if (!cached_page) {
				desc->error = -ENOMEM;
				break;
			}

			/*
			 * Somebody may have added the page while we
			 * dropped the page cache lock. Check for that.
			 */
			spin_lock(&pagecache_lock);
			page = __find_page_nolock(&inode->i_data, index, *hash);
			if (page)
				goto found_page;
		}

		/*
		 * Ok, add the new page to the hash-queues...
		 */
		page = cached_page;
		__add_to_page_cache(page, &inode->i_data, index, hash);
		spin_unlock(&pagecache_lock);
		cached_page = NULL;

		goto readpage;
	}

	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
	filp->f_reada = 1;
	if (cached_page)
		page_cache_free(cached_page);
	UPDATE_ATIME(inode);
}
static int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
{
	unsigned long kaddr;
	unsigned long left, count = desc->count;

	if (size > count)
		size = count;

	kaddr = kmap(page);
	left = __copy_to_user(desc->buf, (void *)(kaddr + offset), size);
	kunmap(page);

	if (left) {
		size -= left;
		desc->error = -EFAULT;
	}
	desc->count = count - size;
	desc->written += size;
	desc->buf += size;
	return size;
}
/*
 * This is the "read()" routine for all filesystems
 * that can use the page cache directly.
 */
ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
{
	ssize_t retval;

	retval = -EFAULT;
	if (access_ok(VERIFY_WRITE, buf, count)) {
		retval = 0;
		if (count) {
			read_descriptor_t desc;

			desc.written = 0;
			desc.count = count;
			desc.buf = buf;
			desc.error = 0;
			do_generic_file_read(filp, ppos, &desc, file_read_actor);

			retval = desc.written;
			if (!retval)
				retval = desc.error;
		}
	}
	return retval;
}
static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
{
	unsigned long kaddr;
	ssize_t written;
	unsigned long count = desc->count;
	struct file *file = (struct file *) desc->buf;
	mm_segment_t old_fs;

	if (size > count)
		size = count;
	old_fs = get_fs();
	set_fs(KERNEL_DS);

	kaddr = kmap(page);
	written = file->f_op->write(file, (char *)kaddr + offset,
				    size, &file->f_pos);
	kunmap(page);
	set_fs(old_fs);

	if (written < 0) {
		desc->error = written;
		written = 0;
	}
	desc->count = count - written;
	desc->written += written;
	return written;
}
asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
{
	ssize_t retval;
	struct file * in_file, * out_file;
	struct inode * in_inode, * out_inode;

	/*
	 * Get input file, and verify that it is ok..
	 */
	retval = -EBADF;
	in_file = fget(in_fd);
	if (!in_file)
		goto out;
	if (!(in_file->f_mode & FMODE_READ))
		goto fput_in;
	retval = -EINVAL;
	in_inode = in_file->f_dentry->d_inode;
	if (!in_inode)
		goto fput_in;
	if (!in_inode->i_op || !in_inode->i_op->readpage)
		goto fput_in;
	retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
	if (retval)
		goto fput_in;

	/*
	 * Get output file, and verify that it is ok..
	 */
	retval = -EBADF;
	out_file = fget(out_fd);
	if (!out_file)
		goto fput_in;
	if (!(out_file->f_mode & FMODE_WRITE))
		goto fput_out;
	retval = -EINVAL;
	if (!out_file->f_op || !out_file->f_op->write)
		goto fput_out;
	out_inode = out_file->f_dentry->d_inode;
	if (!out_inode)
		goto fput_out;
	retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
	if (retval)
		goto fput_out;

	retval = 0;
	if (count) {
		read_descriptor_t desc;
		loff_t pos = 0, *ppos;

		retval = -EFAULT;
		ppos = &in_file->f_pos;
		if (offset) {
			if (get_user(pos, offset))
				goto fput_out;
			ppos = &pos;
		}

		desc.written = 0;
		desc.count = count;
		desc.buf = (char *) out_file;
		desc.error = 0;
		do_generic_file_read(in_file, ppos, &desc, file_send_actor);

		retval = desc.written;
		if (!retval)
			retval = desc.error;
		if (offset)
			put_user(pos, offset);
	}

fput_out:
	fput(out_file);
fput_in:
	fput(in_file);
out:
	return retval;
}
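/*
 * Illustrative userspace sketch (an assumption, not part of this file):
 * copying the first 64KB of one file into another through the syscall
 * implemented above.  Note that in this kernel the output side goes
 * through out_file->f_op->write(), so out_fd may be a socket or a file.
 */
#if 0	/* example only, built as a normal user program */
#include <sys/sendfile.h>
#include <fcntl.h>
#include <unistd.h>

int example_copy_64k(const char *from, const char *to)
{
	int in_fd = open(from, O_RDONLY);
	int out_fd = open(to, O_WRONLY | O_CREAT | O_TRUNC, 0644);
	off_t offset = 0;	/* read position within the input file */
	ssize_t n = -1;

	if (in_fd >= 0 && out_fd >= 0)
		n = sendfile(out_fd, in_fd, &offset, 64 * 1024);
	if (in_fd >= 0)
		close(in_fd);
	if (out_fd >= 0)
		close(out_fd);
	return n < 0 ? -1 : 0;
}
#endif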
/*
 * filemap_nopage() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 */
struct page * filemap_nopage(struct vm_area_struct * area,
	unsigned long address, int no_share)
{
	int error;
	struct file *file = area->vm_file;
	struct dentry *dentry = file->f_dentry;
	struct inode *inode = dentry->d_inode;
	struct page *page, **hash, *old_page;
	unsigned long size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	unsigned long pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;

	/*
	 * Semantics for shared and private memory areas are different
	 * past the end of the file. A shared mapping past the last page
	 * of the file is an error and results in a SIGBUS, while a
	 * private mapping just maps in a zero page.
	 */
	if ((pgoff >= size) &&
		(area->vm_flags & VM_SHARED) && (area->vm_mm == current->mm))
		return NULL;

	/*
	 * Do we have something in the page cache already?
	 */
	hash = page_hash(&inode->i_data, pgoff);
retry_find:
	page = __find_get_page(&inode->i_data, pgoff, hash);
	if (!page)
		goto no_cached_page;

	/*
	 * Ok, found a page in the page cache, now we need to check
	 * that it's up-to-date.
	 */
	if (!Page_Uptodate(page))
		goto page_not_uptodate;

success:
	/*
	 * Found the page and have a reference on it, need to check sharing
	 * and possibly copy it over to another page..
	 */
	old_page = page;
	if (no_share) {
		struct page *new_page = page_cache_alloc();

		if (new_page) {
			copy_highpage(new_page, old_page);
			flush_page_to_ram(new_page);
		} else
			new_page = NOPAGE_OOM;
		page_cache_release(page);
		return new_page;
	}

	flush_page_to_ram(old_page);
	return old_page;

no_cached_page:
	/*
	 * If the requested offset is within our file, try to read a whole
	 * cluster of pages at once.
	 *
	 * Otherwise, we're off the end of a privately mapped file,
	 * so we need to map a zero page.
	 */
	if (pgoff < size)
		error = read_cluster_nonblocking(file, pgoff, size);
	else
		error = page_cache_read(file, pgoff);

	/*
	 * The page we want has now been added to the page cache.
	 * In the unlikely event that someone removed it in the
	 * meantime, we'll just come back here and read it again.
	 */
	if (error >= 0)
		goto retry_find;

	/*
	 * An error return from page_cache_read can result if the
	 * system is low on memory, or a problem occurs while trying
	 * to schedule the I/O.
	 */
	if (error == -ENOMEM)
		return NOPAGE_OOM;
	return NULL;

page_not_uptodate:
	lock_page(page);
	if (Page_Uptodate(page)) {
		UnlockPage(page);
		goto success;
	}

	if (!inode->i_op->readpage(file->f_dentry, page)) {
		wait_on_page(page);
		if (Page_Uptodate(page))
			goto success;
	}

	/*
	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	 */
	lock_page(page);
	if (Page_Uptodate(page)) {
		UnlockPage(page);
		goto success;
	}
	ClearPageError(page);
	if (!inode->i_op->readpage(file->f_dentry, page)) {
		wait_on_page(page);
		if (Page_Uptodate(page))
			goto success;
	}

	/*
	 * Things didn't work out. Return zero to tell the
	 * mm layer so, possibly freeing the page cache page first.
	 */
	page_cache_release(page);
	return NULL;
}
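/*
 * Worked example (illustrative, not from the original source): for a file
 * of 5000 bytes and PAGE_CACHE_SIZE == 4096, "size" above is 2, so page
 * offsets 0 and 1 are valid.  Faulting a MAP_SHARED mapping at pgoff 2
 * makes this function return NULL, which the fault handler turns into a
 * SIGBUS, while a MAP_PRIVATE mapping at the same offset falls through to
 * the "off the end of a privately mapped file" path and gets a zero-filled
 * page instead.
 */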
/*
 * Tries to write a shared mapped page to its backing store. May return -EIO
 * if the disk is full.
 */
static inline int do_write_page(struct inode * inode, struct file * file,
	struct page * page, unsigned long index)
{
	int retval;
	int (*writepage) (struct dentry *, struct page *);

	/* refuse to extend file size.. */
	if (S_ISREG(inode->i_mode)) {
		unsigned long size_idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

		/* Ho humm.. We should have tested for this earlier */
		if (size_idx <= index)
			return -EIO;
	}
	writepage = inode->i_op->writepage;
	lock_page(page);

	retval = writepage(file->f_dentry, page);

	UnlockPage(page);
	return retval;
}
static int filemap_write_page(struct file *file,
			      unsigned long index,
			      struct page * page,
			      int wait)
{
	int result;
	struct dentry * dentry;
	struct inode * inode;

	dentry = file->f_dentry;
	inode = dentry->d_inode;

	/*
	 * If a task terminates while we're swapping the page, the vma
	 * and file could be released: try_to_swap_out has done a get_file.
	 * vma/file is guaranteed to exist in the unmap/sync cases because
	 * mmap_sem is held.
	 */
	result = do_write_page(inode, file, page, index);
	return result;
}
/*
 * The page cache takes care of races between somebody
 * trying to swap something out and swap something in
 * at the same time..
 */
extern void wakeup_bdflush(int);
int filemap_swapout(struct page * page, struct file * file)
{
	int retval = filemap_write_page(file, page->index, page, 0);
	wakeup_bdflush(0);
	return retval;
}
static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
	unsigned long address, unsigned int flags)
{
	unsigned long pgoff;
	pte_t pte = *ptep;
	struct page *page;
	int error;

	if (!(flags & MS_INVALIDATE)) {
		if (!pte_present(pte))
			return 0;
		if (!pte_dirty(pte))
			return 0;
		flush_page_to_ram(pte_page(pte));
		flush_cache_page(vma, address);
		set_pte(ptep, pte_mkclean(pte));
		flush_tlb_page(vma, address);
		page = pte_page(pte);
		get_page(page);
	} else {
		if (pte_none(pte))
			return 0;
		flush_cache_page(vma, address);
		pte_clear(ptep);
		flush_tlb_page(vma, address);
		if (!pte_present(pte)) {
			swap_free(pte_to_swp_entry(pte));
			return 0;
		}
		page = pte_page(pte);
		if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
			page_cache_free(page);
			return 0;
		}
	}
	pgoff = (address - vma->vm_start) >> PAGE_CACHE_SHIFT;
	pgoff += vma->vm_pgoff;
	if (page->index != pgoff) {
		printk("weirdness: pgoff=%lu index=%lu address=%lu vm_start=%lu vm_pgoff=%lu\n",
			pgoff, page->index, address, vma->vm_start, vma->vm_pgoff);
	}
	error = filemap_write_page(vma->vm_file, pgoff, page, 1);
	page_cache_free(page);
	return error;
}
static inline int filemap_sync_pte_range(pmd_t * pmd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
{
	pte_t * pte;
	unsigned long end;
	int error;

	if (pmd_none(*pmd))
		return 0;
	if (pmd_bad(*pmd)) {
		pmd_ERROR(*pmd);
		pmd_clear(pmd);
		return 0;
	}
	pte = pte_offset(pmd, address);
	offset += address & PMD_MASK;
	address &= ~PMD_MASK;
	end = address + size;
	if (end > PMD_SIZE)
		end = PMD_SIZE;
	error = 0;
	do {
		error |= filemap_sync_pte(pte, vma, address + offset, flags);
		address += PAGE_SIZE;
		pte++;
	} while (address && (address < end));
	return error;
}
static inline int filemap_sync_pmd_range(pgd_t * pgd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned int flags)
{
	pmd_t * pmd;
	unsigned long offset, end;
	int error;

	if (pgd_none(*pgd))
		return 0;
	if (pgd_bad(*pgd)) {
		pgd_ERROR(*pgd);
		pgd_clear(pgd);
		return 0;
	}
	pmd = pmd_offset(pgd, address);
	offset = address & PGDIR_MASK;
	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		end = PGDIR_SIZE;
	error = 0;
	do {
		error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address && (address < end));
	return error;
}
int filemap_sync(struct vm_area_struct * vma, unsigned long address,
	size_t size, unsigned int flags)
{
	pgd_t * dir;
	unsigned long end = address + size;
	int error = 0;

	dir = pgd_offset(vma->vm_mm, address);
	flush_cache_range(vma->vm_mm, end - size, end);
	do {
		error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		dir++;
	} while (address && (address < end));
	flush_tlb_range(vma->vm_mm, end - size, end);
	return error;
}
/*
 * This handles (potentially partial) area unmaps..
 */
static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
{
	filemap_sync(vma, start, len, MS_ASYNC);
}
/*
 * Shared mappings need to be able to do the right thing at
 * close/unmap/sync. They will also use the private file as
 * backing-store for swapping..
 */
static struct vm_operations_struct file_shared_mmap = {
	NULL,			/* no special open */
	NULL,			/* no special close */
	filemap_unmap,		/* unmap - we need to sync the pages */
	NULL,			/* no special protect */
	filemap_sync,		/* sync */
	NULL,			/* advise */
	filemap_nopage,		/* nopage */
	NULL,			/* wppage */
	filemap_swapout		/* swapout */
};

/*
 * Private mappings just need to be able to load in the map.
 *
 * (This is actually used for shared mappings as well, if we
 * know they can't ever get write permissions..)
 */
static struct vm_operations_struct file_private_mmap = {
	NULL,			/* no special open */
	NULL,			/* no special close */
	NULL,			/* no special unmap */
	NULL,			/* no special protect */
	NULL,			/* no special sync */
	NULL,			/* advise */
	filemap_nopage,		/* nopage */
};
/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	struct vm_operations_struct * ops;
	struct inode *inode = file->f_dentry->d_inode;

	ops = &file_private_mmap;
	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
		if (!inode->i_op || !inode->i_op->writepage)
			return -EINVAL;
		ops = &file_shared_mmap;
	}
	if (!inode->i_sb || !S_ISREG(inode->i_mode))
		return -EACCES;
	if (!inode->i_op || !inode->i_op->readpage)
		return -ENOEXEC;
	UPDATE_ATIME(inode);
	vma->vm_ops = ops;
	return 0;
}
/*
 * The msync() system call.
 */

static int msync_interval(struct vm_area_struct * vma,
	unsigned long start, unsigned long end, int flags)
{
	if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
		int error;
		error = vma->vm_ops->sync(vma, start, end-start, flags);
		if (!error && (flags & MS_SYNC)) {
			struct file * file = vma->vm_file;
			if (file) {
				struct dentry * dentry = file->f_dentry;
				error = file_fsync(file, dentry);
			}
		}
		return error;
	}
	return 0;
}
asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
{
	unsigned long end;
	struct vm_area_struct * vma;
	int unmapped_error, error = -EINVAL;

	down(&current->mm->mmap_sem);
	lock_kernel();
	if (start & ~PAGE_MASK)
		goto out;
	len = (len + ~PAGE_MASK) & PAGE_MASK;
	end = start + len;
	if (end < start)
		goto out;
	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
		goto out;
	error = 0;
	if (end == start)
		goto out;
	/*
	 * If the interval [start,end) covers some unmapped address ranges,
	 * just ignore them, but return -EFAULT at the end.
	 */
	vma = find_vma(current->mm, start);
	unmapped_error = 0;
	for (;;) {
		/* Still start < end. */
		error = -EFAULT;
		if (!vma)
			goto out;
		/* Here start < vma->vm_end. */
		if (start < vma->vm_start) {
			unmapped_error = -EFAULT;
			start = vma->vm_start;
		}
		/* Here vma->vm_start <= start < vma->vm_end. */
		if (end <= vma->vm_end) {
			if (start < end) {
				error = msync_interval(vma, start, end, flags);
				if (error)
					goto out;
			}
			error = unmapped_error;
			goto out;
		}
		/* Here vma->vm_start <= start < vma->vm_end < end. */
		error = msync_interval(vma, start, vma->vm_end, flags);
		if (error)
			goto out;
		start = vma->vm_end;
		vma = vma->vm_next;
	}
out:
	unlock_kernel();
	up(&current->mm->mmap_sem);
	return error;
}
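/*
 * Illustrative userspace sketch (an assumption, not part of this file):
 * flushing a modified shared mapping back to its file through the syscall
 * above.  MS_SYNC makes msync_interval() follow the vm_ops->sync() pass
 * with a file_fsync(), so the data is on disk when the call returns.
 */
#if 0	/* example only, built as a normal user program */
#include <sys/mman.h>

int example_flush(void *map, size_t len)
{
	/* map must be page aligned, as checked by sys_msync() above */
	return msync(map, len, MS_SYNC);
}
#endif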
struct page *read_cache_page(struct address_space *mapping,
				unsigned long index,
				int (*filler)(void *,struct page*),
				void *data)
{
	struct page **hash = page_hash(mapping, index);
	struct page *page, *cached_page = NULL;
	int err;
repeat:
	page = __find_get_page(mapping, index, hash);
	if (!page) {
		if (!cached_page) {
			cached_page = page_cache_alloc();
			if (!cached_page)
				return ERR_PTR(-ENOMEM);
		}
		page = cached_page;
		if (add_to_page_cache_unique(page, mapping, index, hash))
			goto repeat;
		cached_page = NULL;
		err = filler(data, page);
		if (err < 0) {
			page_cache_release(page);
			page = ERR_PTR(err);
		}
	}
	if (cached_page)
		page_cache_free(cached_page);
	return page;
}
static inline struct page * __grab_cache_page(struct address_space *mapping,
				unsigned long index, struct page **cached_page)
{
	struct page *page, **hash = page_hash(mapping, index);
repeat:
	page = __find_lock_page(mapping, index, hash);
	if (!page) {
		if (!*cached_page) {
			*cached_page = page_cache_alloc();
			if (!*cached_page)
				return NULL;
		}
		page = *cached_page;
		if (add_to_page_cache_unique(page, mapping, index, hash))
			goto repeat;
		*cached_page = NULL;
	}
	return page;
}
/*
 * Returns locked page at given index in given cache, creating it if needed.
 */
struct page *grab_cache_page(struct address_space *mapping, unsigned long index)
{
	struct page *cached_page = NULL;
	struct page *page = __grab_cache_page(mapping,index,&cached_page);
	if (cached_page)
		page_cache_free(cached_page);
	return page;
}
/*
 * Write to a file through the page cache. This is mainly for the
 * benefit of NFS and possibly other network-based file systems.
 *
 * We currently put everything into the page cache prior to writing it.
 * This is not a problem when writing full pages. With partial pages,
 * however, we first have to read the data into the cache, then
 * dirty the page, and finally schedule it for writing. Alternatively, we
 * could write-through just the portion of data that would go into that
 * page, but that would kill performance for applications that write data
 * line by line, and it's prone to race conditions.
 *
 * Note that this routine doesn't try to keep track of dirty pages. Each
 * file system has to do this all by itself, unfortunately.
 */
ssize_t
generic_file_write(struct file *file, const char *buf,
		   size_t count, loff_t *ppos,
		   writepage_t write_one_page)
{
	struct dentry	*dentry = file->f_dentry;
	struct inode	*inode = dentry->d_inode;
	unsigned long	limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
	loff_t		pos;
	struct page	*page, *cached_page;
	unsigned long	written;
	long		status;
	int		err;

	cached_page = NULL;

	down(&inode->i_sem);

	pos = *ppos;
	err = -EINVAL;
	if (pos < 0)
		goto out;

	err = file->f_error;
	if (err) {
		file->f_error = 0;
		goto out;
	}

	written = 0;

	if (file->f_flags & O_APPEND)
		pos = inode->i_size;

	/*
	 * Check whether we've reached the file size limit.
	 */
	err = -EFBIG;
	if (limit != RLIM_INFINITY) {
		if (pos >= limit) {
			send_sig(SIGXFSZ, current, 0);
			goto out;
		}
		if (count > limit - pos) {
			send_sig(SIGXFSZ, current, 0);
			count = limit - pos;
		}
	}

	status = 0;

	while (count) {
		unsigned long bytes, index, offset;

		/*
		 * Try to find the page in the cache. If it isn't there,
		 * allocate a free page.
		 */
		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
		index = pos >> PAGE_CACHE_SHIFT;
		bytes = PAGE_CACHE_SIZE - offset;
		if (bytes > count)
			bytes = count;

		status = -ENOMEM;	/* we'll assign it later anyway */
		page = __grab_cache_page(&inode->i_data, index, &cached_page);
		if (!page)
			break;

		/* We have exclusive IO access to the page.. */
		if (!PageLocked(page)) {
			PAGE_BUG(page);
		}

		status = write_one_page(file, page, offset, bytes, buf);

		if (status >= 0) {
			written += status;
			count -= status;
			pos += status;
			buf += status;
			if (pos > inode->i_size)
				inode->i_size = pos;
		}
		/* Mark it unlocked again and drop the page.. */
		UnlockPage(page);
		page_cache_release(page);

		if (status < 0)
			break;
	}
	*ppos = pos;

	if (cached_page)
		page_cache_free(cached_page);

	err = written ? written : status;
out:
	up(&inode->i_sem);
	return err;
}
void __init page_cache_init(unsigned long mempages)
{
	unsigned long htable_size, order;

	htable_size = mempages;
	htable_size *= sizeof(struct page *);
	for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
		;

	do {
		unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);

		page_hash_bits = 0;
		while((tmp >>= 1UL) != 0UL)
			page_hash_bits++;

		page_hash_table = (struct page **)
			__get_free_pages(GFP_ATOMIC, order);
	} while(page_hash_table == NULL && --order > 0);

	printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
	       (1 << page_hash_bits), order, (PAGE_SIZE << order));
	if (!page_hash_table)
		panic("Failed to allocate page hash table\n");
	memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
}