mm/filemap.c

   1 /*
   2  *      linux/mm/filemap.c
   3  *
   4  * Copyright (C) 1994-1999  Linus Torvalds
   5  */
   6
   7 /*
   8  * This file handles the generic file mmap semantics used by
   9  * most "normal" filesystems (but you don't /have/ to use this:
  10  * the NFS filesystem used to do this differently, for example)
  11  */
  12 #include <linux/malloc.h>
  13 #include <linux/shm.h>
  14 #include <linux/mman.h>
  15 #include <linux/locks.h>
  16 #include <linux/pagemap.h>
  17 #include <linux/swap.h>
  18 #include <linux/smp_lock.h>
  19 #include <linux/blkdev.h>
  20 #include <linux/file.h>
  21 #include <linux/swapctl.h>
  22 #include <linux/slab.h>
  23 #include <linux/init.h>
  24 #include <linux/mm.h>
  25
  26 #include <asm/pgalloc.h>
  27 #include <asm/uaccess.h>
  28 #include <asm/mman.h>
  29
  30 #include <linux/highmem.h>
  31
  32 /*
  33  * Shared mappings implemented 30.11.1994. It's not fully working yet,
  34  * though.
  35  *
  36  * Shared mappings now work. 15.8.1995  Bruno.
  37  *
  38  * finished 'unifying' the page and buffer cache and SMP-threaded the
  39  * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
  40  *
  41  * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
  42  */
  43
  44 atomic_t page_cache_size = ATOMIC_INIT(0); /* ++ in __add_page_to_hash_queue(), -- in remove_page_from_hash_queue() */
  45 unsigned int page_hash_bits;
  46 struct page **page_hash_table;
     /* LRU list: shrink_mmap() takes pages from the tail (->prev) and puts them back at the head. */
  47 struct list_head lru_cache;
  48
     /* Held in this file around page hash chain / inode page list lookups and updates. */
  49 static spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
  50 /*
  51  * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
  52  *       the pagemap_lru_lock held.
      *       (Where both are needed, as in invalidate_inode_pages(), the
      *       pagecache_lock is taken first and the pagemap_lru_lock second.)
  53  */
  54 spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;
  55
     /*
      * Cluster geometry used by read_cluster_nonblocking(): the number of
      * pages in a cluster, and a page index rounded down to the start of its
      * cluster.  page_cluster itself is defined outside this file.
      */
  56 #define CLUSTER_PAGES           (1 << page_cluster)
  57 #define CLUSTER_OFFSET(x)       (((x) >> page_cluster) << page_cluster)
  58
     /*
      * Link @page at the front of the hash chain whose head pointer is @p and
      * count it in page_cache_size.
      *
      * pprev_hash of a chained page points at whatever pointer references it:
      * the chain head for the first page, the previous page's next_hash for
      * the others.  A page that has buffers attached triggers PAGE_BUG().
      * No locking is done here.
      */
  59 void __add_page_to_hash_queue(struct page * page, struct page **p)
  60 {
  61         atomic_inc(&page_cache_size);
             /* the old first page (if any) is now referenced through our next_hash */
  62         if((page->next_hash = *p) != NULL)
  63                 (*p)->pprev_hash = &page->next_hash;
  64         *p = page;
  65         page->pprev_hash = p;
  66         if (page->buffers)
  67                 PAGE_BUG(page);
  68 }
  69
     /*
      * Unlink @page from its hash chain if it is on one (pprev_hash set) and
      * decrement page_cache_size.  Note that the counter is decremented
      * unconditionally, i.e. also when the page was not hashed.
      */
  70 static inline void remove_page_from_hash_queue(struct page * page)
  71 {
  72         if(page->pprev_hash) {
  73                 if(page->next_hash)
  74                         page->next_hash->pprev_hash = page->pprev_hash;
  75                 *page->pprev_hash = page->next_hash;
                     /* a NULL pprev_hash marks the page as unhashed */
  76                 page->pprev_hash = NULL;
  77         }
  78         atomic_dec(&page_cache_size);
  79 }
  80
     /*
      * Run the address space's sync_page operation on @page when the page has
      * a mapping that provides one.  Returns that operation's result, or 0
      * when there is nothing to call.
      */
  81 static inline int sync_page(struct page *page)
  82 {
  83         struct address_space *as = page->mapping;
  84
  85         if (!as || !as->a_ops || !as->a_ops->sync_page)
  86                 return 0;
  87         return as->a_ops->sync_page(page);
  88 }
  89
  90 /*
  91  * Remove a page from the page cache. Caller has to make
  92  * sure the page is locked and that nobody else uses it - or that usage
  93  * is safe.
      *
      * This helper only unlinks the page (inode page list, hash chain) and
      * clears page->mapping.  It takes no lock and drops no reference itself:
      * the callers in this file hold pagecache_lock around the call and
      * release the page afterwards.
  94  */
  95 static inline void __remove_inode_page(struct page *page)
  96 {
  97         remove_page_from_inode_queue(page);
  98         remove_page_from_hash_queue(page);
  99         page->mapping = NULL;
 100 }
 101
     /*
      * Locked variant of __remove_inode_page(): unlinks @page from the page
      * cache under pagecache_lock.  The caller must already hold the page
      * lock, otherwise PAGE_BUG() is raised.
      */
 102 void remove_inode_page(struct page *page)
 103 {
 104         if (!PageLocked(page))
 105                 PAGE_BUG(page);
 106
 107         spin_lock(&pagecache_lock);
 108         __remove_inode_page(page);
 109         spin_unlock(&pagecache_lock);
 110 }
 111
 112 /**
 113  * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
 114  * @inode: the inode whose pages we want to invalidate
 115  *
 116  * This function only removes the unlocked pages; if you want to
 117  * remove all the pages of one inode, you must call truncate_inode_pages.
      *
      * The whole walk runs with pagecache_lock and pagemap_lru_lock held
      * (taken in that order), so nothing in the loop may sleep.
 118  */
 119
 120 void invalidate_inode_pages(struct inode * inode)
 121 {
 122         struct list_head *head, *curr;
 123         struct page * page;
 124
 125         head = &inode->i_mapping->pages;
 126
 127         spin_lock(&pagecache_lock);
 128         spin_lock(&pagemap_lru_lock);
 129         curr = head->next;
 130
 131         while (curr != head) {
 132                 page = list_entry(curr, struct page, list);
                     /* step to the next entry first: the page may be unlinked below */
 133                 curr = curr->next;
 134
 135                 /* We cannot invalidate a locked page */
 136                 if (TryLockPage(page))
 137                         continue;
 138
                     /* off the LRU, out of the page cache, then drop a reference */
 139                 __lru_cache_del(page);
 140                 __remove_inode_page(page);
 141                 UnlockPage(page);
 142                 page_cache_release(page);
 143         }
 144
 145         spin_unlock(&pagemap_lru_lock);
 146         spin_unlock(&pagecache_lock);
 147 }
 148
 149 /*
 150  * Truncate the page cache at a set offset, removing the pages
 151  * that are beyond that offset (and zeroing out partial pages).
      *
      * @mapping: address space whose page list is scanned
      * @lstart:  byte offset of the new end.  Pages lying wholly at or beyond
      *           it are removed from the page cache; the tail of the page
      *           that straddles it (if any) is cleared.
      *
      * The list scan is restarted from the top every time pagecache_lock has
      * been dropped.  A page that is found locked is waited for, with a
      * reference held so that it cannot go away, before the scan is retried.
 152  */
 153 void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
 154 {
 155         struct list_head *head, *curr;
 156         struct page * page;
             /* bytes to keep in the partial page; 0 if lstart is page aligned */
 157         unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
             /* index of the first page that is truncated completely */
 158         unsigned long start;
 159
 160         start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 161
 162 repeat:
 163         head = &mapping->pages;
 164         spin_lock(&pagecache_lock);
 165         curr = head->next;
 166         while (curr != head) {
 167                 unsigned long offset;
 168
 169                 page = list_entry(curr, struct page, list);
 170                 curr = curr->next;
 171
 172                 offset = page->index;
 173
 174                 /* page wholly truncated - free it */
 175                 if (offset >= start) {
 176                         if (TryLockPage(page)) {
                                     /* somebody else holds the page lock: sleep until it is released */
 177                                 page_cache_get(page);
 178                                 spin_unlock(&pagecache_lock);
 179                                 wait_on_page(page);
 180                                 page_cache_release(page);
 181                                 goto repeat;
 182                         }
 183                         page_cache_get(page);
 184                         spin_unlock(&pagecache_lock);
 185
 186                         if (!page->buffers || block_flushpage(page, 0))
 187                                 lru_cache_del(page);
 188
 189                         /*
 190                          * We remove the page from the page cache
 191                          * _after_ we have destroyed all buffer-cache
 192                          * references to it. Otherwise some other process
 193                          * might think this inode page is not in the
 194                          * page cache and creates a buffer-cache alias
 195                          * to it causing all sorts of fun problems ...
 196                          */
 197                         remove_inode_page(page);
 198                         ClearPageDirty(page);
 199
 200                         UnlockPage(page);
                             /* one release for the reference taken above, one more to let the page go */
 201                         page_cache_release(page);
 202                         page_cache_release(page);
 203
 204                         /*
 205                          * We have done things without the pagecache lock,
 206                          * so we'll have to repeat the scan.
 207                          * It's not possible to deadlock here because
 208                          * we are guaranteed to make progress. (ie. we have
 209                          * just removed a page)
 210                          */
 211                         goto repeat;
 212                 }
 213                 /*
 214                  * there is only one partial page possible.
 215                  */
 216                 if (!partial)
 217                         continue;
 218
 219                 /* and it's the one preceding the first wholly truncated page */
 220                 if ((offset + 1) != start)
 221                         continue;
 222
 223                 /* partial truncate, clear end of page */
 224                 if (TryLockPage(page)) {
                             /*
                              * Wait for the current holder of the page lock
                              * instead of immediately rescanning: dropping the
                              * spinlock and jumping back would just spin on the
                              * list until the page gets unlocked.  Same
                              * sequence as for wholly truncated pages above.
                              */
                             page_cache_get(page);
 225                         spin_unlock(&pagecache_lock);
                             wait_on_page(page);
                             page_cache_release(page);
 226                         goto repeat;
 227                 }
 228                 page_cache_get(page);
 229                 spin_unlock(&pagecache_lock);
 230
 231                 memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
 232                 if (page->buffers)
 233                         block_flushpage(page, partial);
 234
                     /* the partial page has been handled; later passes skip it */
 235                 partial = 0;
 236
 237                 /*
 238                  * we have dropped the spinlock so we have to
 239                  * restart.
 240                  */
 241                 UnlockPage(page);
 242                 page_cache_release(page);
 243                 goto repeat;
 244         }
 245         spin_unlock(&pagecache_lock);
 246 }
 247
 248 /*
 249  * nr_dirty represents the number of dirty pages that we will write async
 250  * before doing sync writes.  We can only do sync writes if we can
 251  * wait for IO (__GFP_IO set).
      *
      * Walks the LRU list from its tail, examining up to
      * nr_lru_pages / (priority + 1) pages that were not marked referenced,
      * and stops at the first page it manages to free.
      *
      * Returns 1 if a page was freed, 0 otherwise.
 252  */
 253 int shrink_mmap(int priority, int gfp_mask)
 254 {
 255         int ret = 0, count, nr_dirty;
 256         struct list_head * page_lru;
 257         struct page * page = NULL;
 258
 259         count = nr_lru_pages / (priority + 1);
 260         nr_dirty = priority;
 261
 262         /* we need pagemap_lru_lock for list_del() ... subtle code below */
 263         spin_lock(&pagemap_lru_lock);
 264         while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
 265                 page = list_entry(page_lru, struct page, lru);
 266                 list_del(page_lru);
 267
                     /* referenced pages get another round and do not use up the scan budget */
 268                 if (PageTestandClearReferenced(page))
 269                         goto dispose_continue;
 270
 271                 count--;
 272                 /*
 273                  * Avoid unscalable SMP locking for pages we can
 274                  * immediate tell are untouchable..
 275                  */
 276                 if (!page->buffers && page_count(page) > 1)
 277                         goto dispose_continue;
 278
 279                 if (TryLockPage(page))
 280                         goto dispose_continue;
 281
 282                 /* Release the pagemap_lru lock even if the page is not yet
 283                    queued in any lru queue since we have just locked down
 284                    the page so nobody else may SMP race with us running
 285                    a lru_cache_del() (lru_cache_del() always run with the
 286                    page locked down ;). */
 287                 spin_unlock(&pagemap_lru_lock);
 288
 289                 /* avoid freeing the page while it's locked */
 290                 page_cache_get(page);
 291
 292                 /*
 293                  * Is it a buffer page? Try to clean it up regardless
 294                  * of zone - it's old.
 295                  */
 296                 if (page->buffers) {
 297                         int wait;
 298                         /*
 299                          * 0 - free it if can do so without IO
 300                          * 1 - start write-out of dirty buffers
 301                          * 2 - wait for locked buffers
 302                          */
 303                         wait = (gfp_mask & __GFP_IO) ? (nr_dirty-- < 0) ? 2 : 1 : 0;
 304                         if (!try_to_free_buffers(page, wait))
 305                                 goto unlock_continue;
 306                         /* page was locked, inode can't go away under us */
 307                         if (!page->mapping) {
 308                                 atomic_dec(&buffermem_pages);
 309                                 goto made_buffer_progress;
 310                         }
 311                 }
 312
 313                 /* Take the pagecache_lock spinlock held to avoid
 314                    other tasks to notice the page while we are looking at its
 315                    page count. If it's a pagecache-page we'll free it
 316                    in one atomic transaction after checking its page count. */
 317                 spin_lock(&pagecache_lock);
 318
 319                 /*
 320                  * We can't free pages unless there's just one user
 321                  * (count == 2 because we added one ourselves above).
 322                  */
 323                 if (page_count(page) != 2)
 324                         goto cache_unlock_continue;
 325
 326                 /*
 327                  * Is it a page swap page? If so, we want to
 328                  * drop it if it is no longer used, even if it
 329                  * were to be marked referenced..
 330                  */
 331                 if (PageSwapCache(page)) {
 332                         spin_unlock(&pagecache_lock);
 333                         __delete_from_swap_cache(page);
 334                         goto made_inode_progress;
 335                 }
 336
 337                 /*
 338                  * Page is from a zone we don't care about.
 339                  * Don't drop page cache entries in vain.
 340                  */
 341                 if (page->zone->free_pages > page->zone->pages_high)
 342                         goto cache_unlock_continue;
 343
 344                 /* is it a page-cache page? */
 345                 if (page->mapping) {
 346                         if (!PageDirty(page) && !pgcache_under_min()) {
 347                                 __remove_inode_page(page);
 348                                 spin_unlock(&pagecache_lock);
 349                                 goto made_inode_progress;
 350                         }
 351                         goto cache_unlock_continue;
 352                 }
 353
 354                 printk(KERN_ERR "shrink_mmap: unknown LRU page!\n");
 355
             /*
              * Failure exits, each undoing one more step than the next:
              * drop pagecache_lock, then retake the LRU lock and give up the
              * page lock and our extra reference, then put the page back at
              * the head of the LRU list.
              */
 356 cache_unlock_continue:
 357                 spin_unlock(&pagecache_lock);
 358 unlock_continue:
 359                 spin_lock(&pagemap_lru_lock);
 360                 UnlockPage(page);
 361                 page_cache_release(page);
 362 dispose_continue:
 363                 list_add(page_lru, &lru_cache);
 364         }
 365         goto out;
 366
     /*
      * Success exits: the page is not put back on the LRU list.  The
      * inode/swap-cache path drops one reference more than the buffer path.
      */
 367 made_inode_progress:
 368         page_cache_release(page);
 369 made_buffer_progress:
 370         UnlockPage(page);
 371         page_cache_release(page);
 372         ret = 1;
 373         spin_lock(&pagemap_lru_lock);
 374         /* nr_lru_pages needs the spinlock */
 375         nr_lru_pages--;
 376
 377 out:
 378         spin_unlock(&pagemap_lru_lock);
 379
 380         return ret;
 381 }
 382
     /*
      * Walk a hash chain, starting at @page, looking for the page that
      * belongs to @mapping at index @offset.  A hit is marked referenced and
      * returned; NULL is returned when the chain ends without a match.
      * No locking is done here.
      */
 383 static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
 384 {
 385         for (; page; page = page->next_hash) {
 386                 if (page->mapping != mapping || page->index != offset)
 387                         continue;
 388                 SetPageReferenced(page);
 389                 break;
 390         }
 391         return page;
 392 }
 401
 402 /*
 403  * Start write-out of a page's buffers.  The caller holds the page lock.
 404  *
 405  * Each buffer on the page's ring that is unlocked, dirty and uptodate has
 406  * its flush time cleared and is handed to ll_rw_block().  Always returns 0.
 407  */
 408 static int writeout_one_page(struct page *page)
 409 {
 410         struct buffer_head *head = page->buffers;
 411         struct buffer_head *bh = head;
 412
 413         do {
 414                 if (!buffer_locked(bh) && buffer_dirty(bh) && buffer_uptodate(bh)) {
 415                         bh->b_flushtime = 0;
 416                         ll_rw_block(WRITE, 1, &bh);
 417                 }
 418                 bh = bh->b_this_page;
 419         } while (bh != head);
 420         return 0;
 421 }
 422
     /*
      * Wait for IO on every buffer of @page to finish.  Returns -EIO if any
      * buffer was submitted for IO (buffer_req) but is not uptodate
      * afterwards, 0 otherwise.
      */
 423 static int waitfor_one_page(struct page *page)
 424 {
 425         struct buffer_head *head = page->buffers, *bh = head;
 426         int error = 0;
 427
 428         do {
 429                 wait_on_buffer(bh);
 430                 if (buffer_req(bh) && !buffer_uptodate(bh))
 431                         error = -EIO;
 432                 bh = bh->b_this_page;
 433         } while (bh != head);
 434         return error;
 435 }
 436
     /*
      * Call @fn on every page of @inode's mapping that has buffers and whose
      * index lies in [start, end).  Each page is locked around the call.
      *
      * Returns the bitwise OR of all values returned by @fn.
      */
 437 static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end, int (*fn)(struct page *))
 438 {
 439         struct list_head *head, *curr;
 440         struct page *page;
 441         int retval = 0;
 442
 443         head = &inode->i_mapping->pages;
 444
 445         spin_lock(&pagecache_lock);
 446         curr = head->next;
 447         while (curr != head) {
 448                 page = list_entry(curr, struct page, list);
 449                 curr = curr->next;
 450                 if (!page->buffers)
 451                         continue;
 452                 if (page->index >= end)
 453                         continue;
 454                 if (page->index < start)
 455                         continue;
 456
                     /* pin the page, then drop the spinlock: lock_page() may sleep */
 457                 page_cache_get(page);
 458                 spin_unlock(&pagecache_lock);
 459                 lock_page(page);
 460
 461                 /* The buffers could have been freed while we waited for the page lock */
 462                 if (page->buffers)
 463                         retval |= fn(page);
 464
 465                 UnlockPage(page);
 466                 spin_lock(&pagecache_lock);
                     /*
                      * Re-read the successor now that the lock is held again.
                      * NOTE(review): this appears to rely on the page still
                      * being on this mapping's list at this point - confirm.
                      */
 467                 curr = page->list.next;
 468                 page_cache_release(page);
 469         }
 470         spin_unlock(&pagecache_lock);
 471
 472         return retval;
 473 }
 474
 475 /*
 476  * Two-pass data sync of pages start_idx <= index < end_idx: start the
 477  * writes first, then wait for them; both results are OR-ed together.
 478  */
 479 int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
 480 {
 481         int started, waited;
 482
 483         started = do_buffer_fdatasync(inode, start_idx, end_idx, writeout_one_page);
 484         waited = do_buffer_fdatasync(inode, start_idx, end_idx, waitfor_one_page);
 485         return started | waited;
 486 }
 487
 488 /*
 489  * Add a page to the inode page cache.
 490  *
 491  * The caller must have locked the page and
 492  * set all the page flags correctly..
      *
      * An unlocked page is a BUG().  Takes an extra reference on the page,
      * then, under pagecache_lock, sets its index and links it into the
      * mapping's page list, the hash chain for (mapping, index) and the LRU.
 493  */
 494 void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
 495 {
 496         if (!PageLocked(page))
 497                 BUG();
 498
 499         page_cache_get(page);
 500         spin_lock(&pagecache_lock);
 501         page->index = index;
 502         add_page_to_inode_queue(mapping, page);
 503         __add_page_to_hash_queue(page, page_hash(mapping, index));
 504         lru_cache_add(page);
 505         spin_unlock(&pagecache_lock);
 506 }
 507
 508 /*
 509  * This adds a page to the page cache.  The page must come in unlocked
 510  * (a locked page is a BUG()); it leaves locked, owned by us, unreferenced,
      * not uptodate, not dirty and with no errors.
      *
      * @hash is the hash chain head for (mapping, offset).  No lock is taken
      * here; both callers in this file hold pagecache_lock.
 511  */
 512 static inline void __add_to_page_cache(struct page * page,
 513         struct address_space *mapping, unsigned long offset,
 514         struct page **hash)
 515 {
 516         struct page *alias;
 517         unsigned long flags;
 518
 519         if (PageLocked(page))
 520                 BUG();
 521
             /* clear the stale state bits and set PG_locked with a single store */
 522         flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced));
 523         page->flags = flags | (1 << PG_locked);
 524         page_cache_get(page);
 525         page->index = offset;
 526         add_page_to_inode_queue(mapping, page);
 527         __add_page_to_hash_queue(page, hash);
 528         lru_cache_add(page);
             /*
              * Sanity check: a lookup must now return the page just added.
              * (The lookup also marks the page referenced as a side effect.)
              */
 529         alias = __find_page_nolock(mapping, offset, *hash);
 530         if (alias != page)
 531                 BUG();
 532 }
 533
     /*
      * Locked wrapper around __add_to_page_cache(): adds @page to @mapping at
      * index @offset under pagecache_lock.  No check is made here for a page
      * that is already cached at that index.
      */
 534 void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
 535 {
 536         spin_lock(&pagecache_lock);
 537         __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
 538         spin_unlock(&pagecache_lock);
 539 }
 540
     /*
      * Add @page to the page cache unless a page for (mapping, offset) is
      * already hashed.  Lookup and insertion happen under one hold of
      * pagecache_lock.
      *
      * Returns 0 if the page was added, 1 if an existing page was found (in
      * which case @page is left untouched).
      */
 541 static int add_to_page_cache_unique(struct page * page,
 542         struct address_space *mapping, unsigned long offset,
 543         struct page **hash)
 544 {
 545         int err = 1;
 546
 547         spin_lock(&pagecache_lock);
 548         if (!__find_page_nolock(mapping, offset, *hash)) {
 549                 __add_to_page_cache(page,mapping,offset,hash);
 550                 err = 0;
 551         }
 552         spin_unlock(&pagecache_lock);
 553         return err;
 554 }
 560
 561 /*
 562  * Make sure the page at index @offset of @file's mapping is in the page
 563  * cache, starting a read for it if we are the one who added it.
      *
      * Returns 0 when the page was already cached (or somebody else added it
      * first), -ENOMEM when no page could be allocated, and otherwise
      * whatever the mapping's readpage operation returned.
 564  */
 565 static inline int page_cache_read(struct file * file, unsigned long offset)
 566 {
 567         struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
 568         struct page **bucket = page_hash(mapping, offset);
 569         struct page *page;
 570         int error;
 571
 572         spin_lock(&pagecache_lock);
 573         page = __find_page_nolock(mapping, offset, *bucket);
 574         spin_unlock(&pagecache_lock);
 575         if (page)
 576                 return 0;
 577
 578         page = page_cache_alloc();
 579         if (!page)
 580                 return -ENOMEM;
 581
 582         if (add_to_page_cache_unique(page, mapping, offset, bucket)) {
 583                 /* Lost the race: somebody else cached this index first. */
 584                 page_cache_free(page);
 585                 return 0;
 586         }
 587
 588         error = mapping->a_ops->readpage(file, page);
 589         page_cache_release(page);
 590         return error;
 591 }
 594
 595 /*
 596  * Queue reads for the whole cluster that contains page index @offset.
 597  * Stops early once the index reaches @filesize, or at the first negative
      * result from page_cache_read(), which is then returned.  Returns 0
      * otherwise.
 598  */
 599 static int read_cluster_nonblocking(struct file * file, unsigned long offset,
 600         unsigned long filesize)
 601 {
 602         unsigned long left = CLUSTER_PAGES;
 603
 604         for (offset = CLUSTER_OFFSET(offset);
 605              left > 0 && offset < filesize;
 606              left--, offset++) {
 607                 int error = page_cache_read(file, offset);
 608
 609                 if (error < 0)
 610                         return error;
 611         }
 612         return 0;
 613 }
 614
 615 /*
 616  * Wait for a page to get unlocked.
 617  *
 618  * This must be called with the caller "holding" the page,
 619  * ie with increased "page->count" so that the page won't
 620  * go away during the wait..
      *
      * The sleep is uninterruptible.  The task is put on the page's wait
      * queue before the lock bit is tested, and the task state is set before
      * each test, so that a wakeup arriving in between is not lost.
 621  */
 622 void ___wait_on_page(struct page *page)
 623 {
 624         struct task_struct *tsk = current;
 625         DECLARE_WAITQUEUE(wait, tsk);
 626
 627         add_wait_queue(&page->wait, &wait);
 628         do {
                     /* let the mapping's sync_page hook (if any) run before sleeping */
 629                 sync_page(page);
 630                 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 631                 if (!PageLocked(page))
 632                         break;
 633                 schedule();
 634         } while (PageLocked(page));
 635         tsk->state = TASK_RUNNING;
 636         remove_wait_queue(&page->wait, &wait);
 637 }
 638
 639 /*
 640  * Get an exclusive lock on the page..
      *
      * Sleeps in ___wait_on_page() for as long as TryLockPage() reports that
      * somebody else holds the lock, so this may block.
 641  */
 642 void lock_page(struct page *page)
 643 {
 644         while (TryLockPage(page))
 645                 ___wait_on_page(page);
 646 }
 647
 648
 649 /*
 650  * a rather lightweight function, finding and getting a reference to a
 651  * hashed page atomically, waiting for it if it's locked.
      *
      * @hash is the hash chain head for (mapping, offset).  Returns the page
      * with an extra reference held, or NULL if no such page is hashed.  The
      * page was seen unlocked, but it is not locked on behalf of the caller.
 652  */
 653 struct page * __find_get_page (struct address_space *mapping,
 654                                 unsigned long offset, struct page **hash)
 655 {
 656         struct page *page;
 657
 658         /*
 659          * We scan the hash list read-only. Addition to and removal from
 660          * the hash-list needs a held write-lock.
 661          */
 662 repeat:
 663         spin_lock(&pagecache_lock);
 664         page = __find_page_nolock(mapping, offset, *hash);
 665         if (page)
 666                 page_cache_get(page);
 667         spin_unlock(&pagecache_lock);
 668
 669         /* Found the page, sleep if locked. */
 670         if (page && PageLocked(page)) {
 671                 struct task_struct *tsk = current;
 672                 DECLARE_WAITQUEUE(wait, tsk);
 673
 674                 sync_page(page);
 675
 676                 __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 677                 add_wait_queue(&page->wait, &wait);
 678
                     /* re-test now that we are on the wait queue */
 679                 if (PageLocked(page))
 680                         schedule();
 681                 __set_task_state(tsk, TASK_RUNNING);
 682                 remove_wait_queue(&page->wait, &wait);
 683
 684                 /*
 685                  * The page might have been unhashed meanwhile. It's
 686                  * not freed though because we hold a reference to it.
 687                  * If this is the case then it will be freed _here_,
 688                  * and we recheck the hash anyway.
 689                  */
 690                 page_cache_release(page);
 691                 goto repeat;
 692         }
 693         /*
 694          * It's not locked so we can return the page and we hold
 695          * a reference to it.
 696          */
 697         return page;
 698 }
 699
 700 /*
 701  * Get the lock to a page atomically.
      *
      * Like __find_get_page(), but the page is returned locked by us as well
      * as referenced.  Returns NULL if no page is hashed for (mapping, offset).
 702  */
 703 struct page * __find_lock_page (struct address_space *mapping,
 704                                 unsigned long offset, struct page **hash)
 705 {
 706         struct page *page;
 707
 708         /*
 709          * We scan the hash list read-only. Addition to and removal from
 710          * the hash-list needs a held write-lock.
 711          */
 712 repeat:
 713         spin_lock(&pagecache_lock);
 714         page = __find_page_nolock(mapping, offset, *hash);
 715         if (page)
 716                 page_cache_get(page);
 717         spin_unlock(&pagecache_lock);
 718
 719         /* Found the page, sleep if somebody else holds its lock. */
 720         if (page && TryLockPage(page)) {
 721                 struct task_struct *tsk = current;
 722                 DECLARE_WAITQUEUE(wait, tsk);
 723
 724                 sync_page(page);
 725
 726                 __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 727                 add_wait_queue(&page->wait, &wait);
 728
                     /* re-test now that we are on the wait queue */
 729                 if (PageLocked(page))
 730                         schedule();
 731                 __set_task_state(tsk, TASK_RUNNING);
 732                 remove_wait_queue(&page->wait, &wait);
 733
 734                 /*
 735                  * The page might have been unhashed meanwhile. It's
 736                  * not freed though because we hold a reference to it.
 737                  * If this is the case then it will be freed _here_,
 738                  * and we recheck the hash anyway.
 739                  */
 740                 page_cache_release(page);
 741                 goto repeat;
 742         }
 743         /*
 744          * Either nothing was found (NULL), or TryLockPage() succeeded:
 745          * the page is now locked by us and we hold a reference to it.
 746          */
 747         return page;
 748 }
 749
 750 #if 0
 751 #define PROFILE_READAHEAD
 752 #define DEBUG_READAHEAD
 753 #endif
 754
 755 /*
 756  * Read-ahead profiling information
 757  * --------------------------------
 758  * Every PROFILE_MAXREADCOUNT, the following information is written
 759  * to the syslog:
 760  *   Percentage of asynchronous read-aheads.
 761  *   Average values of the read-ahead context fields.
 762  * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
 763  * to the syslog.
 764  */
 765
 766 #ifdef PROFILE_READAHEAD
 767
 768 #define PROFILE_MAXREADCOUNT 1000
 769
 770 static unsigned long total_reada;
 771 static unsigned long total_async;
 772 static unsigned long total_ramax;
 773 static unsigned long total_ralen;
 774 static unsigned long total_rawin;
 775
     /*
      * Accumulate the read-ahead context of @filp into the running totals
      * and, once more than PROFILE_MAXREADCOUNT samples have been collected,
      * print the averages and reset the totals.  @async is non-zero for an
      * asynchronous read-ahead.  Only built when PROFILE_READAHEAD is defined.
      */
 776 static void profile_readahead(int async, struct file *filp)
 777 {
 778         unsigned long flags;
 779
 780         ++total_reada;
 781         if (async)
 782                 ++total_async;
 783
 784         total_ramax     += filp->f_ramax;
 785         total_ralen     += filp->f_ralen;
 786         total_rawin     += filp->f_rawin;
 787
 788         if (total_reada > PROFILE_MAXREADCOUNT) {
 789                 save_flags(flags);
 790                 cli();
                     /* re-test with interrupts off: the totals may have been reset in the meantime */
 791                 if (!(total_reada > PROFILE_MAXREADCOUNT)) {
 792                         restore_flags(flags);
 793                         return;
 794                 }
 795
                     /*
                      * NOTE(review): the totals are unsigned long but are printed
                      * with %ld; the types of the filp->f_ra* fields printed below
                      * (%ld / %Ld) are declared elsewhere - confirm they match.
                      */
 796                 printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
 797                         total_ramax/total_reada,
 798                         total_ralen/total_reada,
 799                         total_rawin/total_reada,
 800                         (total_async*100)/total_reada);
 801 #ifdef DEBUG_READAHEAD
 802                 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
 803                         filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
 804 #endif
 805
 806                 total_reada     = 0;
 807                 total_async     = 0;
 808                 total_ramax     = 0;
 809                 total_ralen     = 0;
 810                 total_rawin     = 0;
 811
 812                 restore_flags(flags);
 813         }
 814 }
 815 #endif  /* defined PROFILE_READAHEAD */
 816
 817 /*
 818  * Read-ahead context:
 819  * -------------------
 820  * The read ahead context fields of the "struct file" are the following:
 821  * - f_raend : position of the first byte after the last page we tried to
 822  *             read ahead.
 823  * - f_ramax : current read-ahead maximum size.
 824  * - f_ralen : length of the current IO read block we tried to read-ahead.
 825  * - f_rawin : length of the current read-ahead window.
 826  *              if last read-ahead was synchronous then
 827  *                      f_rawin = f_ralen
 828  *              otherwise (was asynchronous)
 829  *                      f_rawin = previous value of f_ralen + f_ralen
 830  *
 831  * Read-ahead limits:
 832  * ------------------
 833  * MIN_READAHEAD   : minimum read-ahead size when reading ahead.
 834  * MAX_READAHEAD   : maximum read-ahead size when reading ahead.
 835  *
 836  * Synchronous read-ahead benefits:
 837  * --------------------------------
 838  * Using a reasonable IO transfer length from peripheral devices increases
 839  * system performance.
 840  * Reasonable means, in this context, not too large but not too small.
 841  * The actual maximum value is:
 842  *      MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
 843  *      and 32K if defined (4K page size assumed).
 844  *
 845  * Asynchronous read-ahead benefits:
 846  * ---------------------------------
 847  * Overlapping the next read request with user process execution increases
 848  * system performance.
 849  *
 850  * Read-ahead risks:
 851  * -----------------
 852  * We have to guess which further data are needed by the user process.
 853  * If this data is often not really needed, it's bad for system
 854  * performance.
 855  * However, we know that files are often accessed sequentially by
 856  * application programs and it seems that it is possible to have some good
 857  * strategy in that guessing.
 858  * We only try to read-ahead files that seem to be read sequentially.
 859  *
 860  * Asynchronous read-ahead risks:
 861  * ------------------------------
 862  * In order to maximize overlapping, we must start some asynchronous read
 863  * request from the device, as soon as possible.
 864  * We must be very careful about:
 865  * - The number of effective pending IO read requests.
 866  *   ONE seems to be the only reasonable value.
 867  * - The total memory pool usage for the file access stream.
 868  *   This maximum memory usage is implicitly 2 IO read chunks:
 *   2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 152K if CONFIG_READA_SMALL is undefined,
 870  *   64k if defined (4K page size assumed).
 871  */
 872
 873 static inline int get_max_readahead(struct inode * inode)
 874 {
 875         if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
 876                 return MAX_READAHEAD;
 877         return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
 878 }
 879
/*
 * Decide whether to read ahead around `page` and, if so, queue the reads
 * and update the per-file read-ahead context (f_raend, f_ralen, f_rawin,
 * f_ramax) stored in `filp`.
 *
 * reada_ok - non-zero when the caller considers asynchronous read-ahead
 *            acceptable; it is set to 2 locally to remember that the
 *            asynchronous path was taken.
 * filp     - open file whose read-ahead context is read and updated.
 * inode    - supplies the file size (i_size) and the device-specific
 *            read-ahead limit via get_max_readahead().
 * page     - the page currently being read; only its index and its locked
 *            state are examined.
 *
 * Note that the local `max_readahead` below is a per-call limit and shadows
 * the global table of the same name that get_max_readahead() indexes.
 */
static void generic_file_readahead(int reada_ok,
        struct file * filp, struct inode * inode,
        struct page * page)
{
        unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
        unsigned long index = page->index;
        unsigned long max_ahead, ahead;
        unsigned long raend;
        int max_readahead = get_max_readahead(inode);

        /* Start from the recorded end of the last read-ahead; no pages planned yet. */
        raend = filp->f_raend;
        max_ahead = 0;

/*
 * The current page is locked.
 * If the current position is inside the previous read IO request, do not
 * try to reread previously read ahead pages.
 * Otherwise decide whether or not to read ahead some pages synchronously.
 * If we are not going to read ahead, set the read ahead context for this
 * page only.
 */
        if (PageLocked(page)) {
                if (!filp->f_ralen || index >= raend || index + filp->f_ralen < raend) {
                        /* Restart the context at this page. */
                        raend = index;
                        if (raend < end_index)
                                max_ahead = filp->f_ramax;
                        filp->f_rawin = 0;
                        filp->f_ralen = 1;
                        if (!max_ahead) {
                                /*
                                 * Nothing will be read ahead, so the final
                                 * update at the bottom is skipped: record the
                                 * one-page context right here.
                                 */
                                filp->f_raend  = index + filp->f_ralen;
                                filp->f_rawin += filp->f_ralen;
                        }
                }
        }
/*
 * The current page is not locked.
 * If we were reading ahead and,
 * if the current max read ahead size is not zero and,
 * if the current position is inside the last read-ahead IO request,
 *   it is the moment to try to read ahead asynchronously.
 * We will later force unplug device in order to force asynchronous read IO.
 */
        else if (reada_ok && filp->f_ramax && raend >= 1 &&
                 index <= raend && index + filp->f_ralen >= raend) {
/*
 * Add ONE page to max_ahead in order to try to have about the same IO max size
 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
 * Compute the position of the last page we have tried to read in order to
 * begin to read ahead just at the next page.
 */
                raend -= 1;
                if (raend < end_index)
                        max_ahead = filp->f_ramax + 1;

                if (max_ahead) {
                        /*
                         * The previous request length becomes the base of the
                         * window; the new request length is accumulated below.
                         */
                        filp->f_rawin = filp->f_ralen;
                        filp->f_ralen = 0;
                        reada_ok      = 2;
                }
        }
/*
 * Try to read ahead pages.
 * We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the
 * scheduler, will work enough for us to avoid too bad actuals IO requests.
 *
 * `ahead` is incremented before the bounds check, so it also counts the
 * attempt that terminated the loop (end of file or page_cache_read failure).
 */
        ahead = 0;
        while (ahead < max_ahead) {
                ahead ++;
                if ((raend + ahead) >= end_index)
                        break;
                if (page_cache_read(filp, raend + ahead) < 0)
                        break;
        }
/*
 * If we tried to read ahead some pages,
 * If we tried to read ahead asynchronously,
 *   Try to force unplug of the device in order to start an asynchronous
 *   read IO request.
 * Update the read-ahead context.
 * Store the length of the current read-ahead window.
 * Double the current max read ahead size.
 *   That heuristic avoids doing large IO for files that are not really
 *   accessed sequentially.
 */
        if (ahead) {
                if (reada_ok == 2) {
                        run_task_queue(&tq_disk);
                }

                filp->f_ralen += ahead;
                filp->f_rawin += filp->f_ralen;
                filp->f_raend = raend + ahead + 1;

                filp->f_ramax += filp->f_ramax;

                /* Never let the doubled value exceed the per-device limit. */
                if (filp->f_ramax > max_readahead)
                        filp->f_ramax = max_readahead;

#ifdef PROFILE_READAHEAD
                profile_readahead((reada_ok == 2), filp);
#endif
        }

        return;
}
 985
 986
/*
 * This is a generic file read routine, and uses the
 * mapping->a_ops->readpage() function for the actual low-level
 * stuff.
 *
 * filp  - file being read; its read-ahead context (f_raend, f_ralen,
 *         f_ramax, f_rawin, f_reada) is consulted and updated.
 * ppos  - in/out byte position; rewritten on exit from the page index and
 *         in-page offset reached by the loop.
 * desc  - read descriptor; desc->count is the number of bytes still wanted
 *         and desc->error receives -ENOMEM or a read error.
 * actor - callback that consumes up to `nr` bytes of an up-to-date page
 *         starting at `offset` and returns how many bytes it used.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 */
void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
{
        struct inode *inode = filp->f_dentry->d_inode;
        struct address_space *mapping = inode->i_mapping;
        unsigned long index, offset;
        struct page *cached_page;
        int reada_ok;
        int error;
        int max_readahead = get_max_readahead(inode);

        /* Split the byte position into a page index and an offset within that page. */
        cached_page = NULL;
        index = *ppos >> PAGE_CACHE_SHIFT;
        offset = *ppos & ~PAGE_CACHE_MASK;

/*
 * If the current position is outside the previous read-ahead window,
 * we reset the current read-ahead context and set read ahead max to zero
 * (will be set to just needed value later),
 * otherwise, we assume that the file accesses are sequential enough to
 * continue read-ahead.
 */
        if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {
                reada_ok = 0;
                filp->f_raend = 0;
                filp->f_ralen = 0;
                filp->f_ramax = 0;
                filp->f_rawin = 0;
        } else {
                reada_ok = 1;
        }
/*
 * Adjust the current value of read-ahead max.
 * If the read operation stay in the first half page, force no readahead.
 * Otherwise try to increase read ahead max just enough to do the read request.
 * Then, at least MIN_READAHEAD if read ahead is ok,
 * and at most the per-device limit (max_readahead) in all cases.
 */
        if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
                filp->f_ramax = 0;
        } else {
                unsigned long needed;

                /* Number of pages touched by this request, counted from the current page. */
                needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;

                if (filp->f_ramax < needed)
                        filp->f_ramax = needed;

                if (reada_ok && filp->f_ramax < MIN_READAHEAD)
                                filp->f_ramax = MIN_READAHEAD;
                if (filp->f_ramax > max_readahead)
                        filp->f_ramax = max_readahead;
        }

        /* One iteration per page handed to the actor. */
        for (;;) {
                struct page *page, **hash;
                unsigned long end_index, nr;

                /*
                 * Recompute the end of file on every pass and stop once the
                 * position is at or past it.  `nr` is the number of valid
                 * bytes in this page, then the number available from `offset`.
                 */
                end_index = inode->i_size >> PAGE_CACHE_SHIFT;
                if (index > end_index)
                        break;
                nr = PAGE_CACHE_SIZE;
                if (index == end_index) {
                        nr = inode->i_size & ~PAGE_CACHE_MASK;
                        if (nr <= offset)
                                break;
                }

                nr = nr - offset;

                /*
                 * Try to find the data in the page cache..
                 */
                hash = page_hash(mapping, index);

                spin_lock(&pagecache_lock);
                page = __find_page_nolock(mapping, index, *hash);
                if (!page)
                        goto no_cached_page;
found_page:
                /* Entered with pagecache_lock held; take our reference before dropping it. */
                page_cache_get(page);
                spin_unlock(&pagecache_lock);

                if (!Page_Uptodate(page))
                        goto page_not_up_to_date;
page_ok:
                /*
                 * Ok, we have the page, and it's up-to-date, so
                 * now we can copy it to user space...
                 *
                 * The actor routine returns how many bytes were actually used..
                 * NOTE! This may not be the same as how much of a user buffer
                 * we filled up (we may be padding etc), so we can only update
                 * "pos" here (the actor routine has to update the user buffer
                 * pointers and the remaining count).
                 */
                nr = actor(desc, page, offset, nr);
                offset += nr;
                index += offset >> PAGE_CACHE_SHIFT;
                offset &= ~PAGE_CACHE_MASK;

                page_cache_release(page);
                /* Keep going only if the actor made progress and still wants more. */
                if (nr && desc->count)
                        continue;
                break;

/*
 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
 */
page_not_up_to_date:
                generic_file_readahead(reada_ok, filp, inode, page);

                if (Page_Uptodate(page))
                        goto page_ok;

                /* Get exclusive access to the page ... */
                lock_page(page);
                if (Page_Uptodate(page)) {
                        UnlockPage(page);
                        goto page_ok;
                }

readpage:
                /* ... and start the actual read. The read will unlock the page. */
                error = mapping->a_ops->readpage(filp, page);

                if (!error) {
                        if (Page_Uptodate(page))
                                goto page_ok;

                        /* Again, try some read-ahead while waiting for the page to finish.. */
                        generic_file_readahead(reada_ok, filp, inode, page);
                        wait_on_page(page);
                        if (Page_Uptodate(page))
                                goto page_ok;
                        /* The read completed but left the page not up to date. */
                        error = -EIO;
                }

                /* UHHUH! A synchronous read error occurred. Report it */
                desc->error = error;
                page_cache_release(page);
                break;

no_cached_page:
                /*
                 * Ok, it wasn't cached, so we need to create a new
                 * page..
                 *
                 * We get here with the page cache lock held.
                 */
                if (!cached_page) {
                        /* Drop the lock around the allocation. */
                        spin_unlock(&pagecache_lock);
                        cached_page = page_cache_alloc();
                        if (!cached_page) {
                                desc->error = -ENOMEM;
                                break;
                        }

                        /*
                         * Somebody may have added the page while we
                         * dropped the page cache lock. Check for that.
                         * If so, cached_page is kept for a later iteration
                         * or freed on exit.
                         */
                        spin_lock(&pagecache_lock);
                        page = __find_page_nolock(mapping, index, *hash);
                        if (page)
                                goto found_page;
                }

                /*
                 * Ok, add the new page to the hash-queues...
                 */
                page = cached_page;
                __add_to_page_cache(page, mapping, index, hash);
                spin_unlock(&pagecache_lock);
                cached_page = NULL;

                goto readpage;
        }

        /* Publish the position reached and release any unused preallocated page. */
        *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
        filp->f_reada = 1;
        if (cached_page)
                page_cache_free(cached_page);
        UPDATE_ATIME(inode);
}
1179
1180 static int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1181 {
1182         unsigned long kaddr;
1183         unsigned long left, count = desc->count;
1184
1185         if (size > count)
1186                 size = count;
1187
1188         kaddr = kmap(page);
1189         left = __copy_to_user(desc->buf, (void *)(kaddr + offset), size);
1190         kunmap(page);
1191
1192         if (left) {
1193                 size -= left;
1194                 desc->error = -EFAULT;
1195         }
1196         desc->count = count - size;
1197         desc->written += size;
1198         desc->buf += size;
1199         return size;
1200 }
1201
1202 /*
1203  * This is the "read()" routine for all filesystems
1204  * that can use the page cache directly.
1205  */
1206 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1207 {
1208         ssize_t retval;
1209
1210         retval = -EFAULT;
1211         if (access_ok(VERIFY_WRITE, buf, count)) {
1212                 retval = 0;
1213
1214                 if (count) {
1215                         read_descriptor_t desc;
1216
1217                         desc.written = 0;
1218                         desc.count = count;
1219                         desc.buf = buf;
1220                         desc.error = 0;
1221                         do_generic_file_read(filp, ppos, &desc, file_read_actor);
1222
1223                         retval = desc.written;
1224                         if (!retval)
1225                                 retval = desc.error;
1226                 }
1227         }
1228         return retval;
1229 }
1230
1231 static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
1232 {
1233         unsigned long kaddr;
1234         ssize_t written;
1235         unsigned long count = desc->count;
1236         struct file *file = (struct file *) desc->buf;
1237         mm_segment_t old_fs;
1238
1239         if (size > count)
1240                 size = count;
1241         old_fs = get_fs();
1242         set_fs(KERNEL_DS);
1243
1244         kaddr = kmap(page);
1245         written = file->f_op->write(file, (char *)kaddr + offset,
1246                                                  size, &file->f_pos);
1247         kunmap(page);
1248         set_fs(old_fs);
1249         if (written < 0) {
1250                 desc->error = written;
1251                 written = 0;
1252         }
1253         desc->count = count - written;
1254         desc->written += written;
1255         return written;
1256 }
1257
/*
 * The sendfile() system call: copy up to `count` bytes from in_fd to out_fd
 * through the page cache, without a user-space buffer.
 *
 * out_fd - descriptor opened for writing whose f_op provides write().
 * in_fd  - descriptor opened for reading whose mapping provides readpage().
 * offset - optional user pointer; when non-NULL the read starts at *offset
 *          and the final position is stored back there, leaving in_fd's
 *          f_pos untouched.  When NULL, in_fd's f_pos is used and advanced.
 * count  - maximum number of bytes to transfer.
 *
 * Returns the number of bytes written, or, if none were written, the error
 * recorded by the read loop; -EBADF, -EINVAL, -EFAULT or the
 * locks_verify_area() result are returned for the validation failures below.
 *
 * `retval` is preloaded with the error code before each group of checks so
 * that every `goto` leaves with the right value; the labels at the bottom
 * drop the file references in reverse order of acquisition.
 */
asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
{
        ssize_t retval;
        struct file * in_file, * out_file;
        struct inode * in_inode, * out_inode;

        /*
         * Get input file, and verify that it is ok..
         */
        retval = -EBADF;
        in_file = fget(in_fd);
        if (!in_file)
                goto out;
        if (!(in_file->f_mode & FMODE_READ))
                goto fput_in;
        retval = -EINVAL;
        in_inode = in_file->f_dentry->d_inode;
        if (!in_inode)
                goto fput_in;
        if (!in_inode->i_mapping->a_ops->readpage)
                goto fput_in;
        /*
         * NOTE(review): the range checked starts at in_file->f_pos even when
         * the caller supplies an explicit *offset, which is only fetched
         * further down -- confirm whether that is intended.
         */
        retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
        if (retval)
                goto fput_in;

        /*
         * Get output file, and verify that it is ok..
         */
        retval = -EBADF;
        out_file = fget(out_fd);
        if (!out_file)
                goto fput_in;
        if (!(out_file->f_mode & FMODE_WRITE))
                goto fput_out;
        retval = -EINVAL;
        if (!out_file->f_op || !out_file->f_op->write)
                goto fput_out;
        out_inode = out_file->f_dentry->d_inode;
        if (!out_inode)
                goto fput_out;
        retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
        if (retval)
                goto fput_out;

        /* A zero-length request succeeds without reading *offset at all. */
        retval = 0;
        if (count) {
                read_descriptor_t desc;
                loff_t pos = 0, *ppos;

                /* Read from the file's own position unless the caller gave one. */
                retval = -EFAULT;
                ppos = &in_file->f_pos;
                if (offset) {
                        if (get_user(pos, offset))
                                goto fput_out;
                        ppos = &pos;
                }

                /* desc.buf smuggles the output file to file_send_actor(). */
                desc.written = 0;
                desc.count = count;
                desc.buf = (char *) out_file;
                desc.error = 0;
                do_generic_file_read(in_file, ppos, &desc, file_send_actor);

                retval = desc.written;
                if (!retval)
                        retval = desc.error;
                /* NOTE(review): the put_user() result is ignored -- confirm that a fault here may be dropped. */
                if (offset)
                        put_user(pos, offset);
        }

fput_out:
        fput(out_file);
fput_in:
        fput(in_file);
out:
        return retval;
}
1335
1336 /*
1337  * Read-ahead and flush behind for MADV_SEQUENTIAL areas.  Since we are
1338  * sure this is sequential access, we don't need a flexible read-ahead
1339  * window size -- we can always use a large fixed size window.
1340  */
1341 static void nopage_sequential_readahead(struct vm_area_struct * vma,
1342         unsigned long pgoff, unsigned long filesize)
1343 {
1344         unsigned long ra_window;
1345
1346         ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
1347         ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);
1348
1349         /* vm_raend is zero if we haven't read ahead in this area yet.  */
1350         if (vma->vm_raend == 0)
1351                 vma->vm_raend = vma->vm_pgoff + ra_window;
1352
1353         /*
1354          * If we've just faulted the page half-way through our window,
1355          * then schedule reads for the next window, and release the
1356          * pages in the previous window.
1357          */
1358         if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
1359                 unsigned long start = vma->vm_pgoff + vma->vm_raend;
1360                 unsigned long end = start + ra_window;
1361
1362                 if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
1363                         end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;
1364                 if (start > end)
1365                         return;
1366
1367                 while ((start < end) && (start < filesize)) {
1368                         if (read_cluster_nonblocking(vma->vm_file,
1369                                                         start, filesize) < 0)
1370                                 break;
1371                         start += CLUSTER_PAGES;
1372                 }
1373                 run_task_queue(&tq_disk);
1374
1375                 /* if we're far enough past the beginning of this area,
1376                    recycle pages that are in the previous window. */
1377                 if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
1378                         unsigned long window = ra_window << PAGE_SHIFT;
1379
1380                         end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
1381                         end -= window + window;
1382                         filemap_sync(vma, end - window, window, MS_INVALIDATE);
1383                 }
1384
1385                 vma->vm_raend += ra_window;
1386         }
1387
1388         return;
1389 }
1390
/*
 * filemap_nopage() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * area     - the mapping that faulted; supplies the file, vm_start and vm_pgoff.
 * address  - faulting virtual address.
 * no_share - when non-zero the caller gets a freshly allocated private copy
 *            of the page instead of the page cache page itself.
 *
 * Returns the page to map (with a reference held), NOPAGE_OOM when an
 * allocation failed, or NULL when the page could not be provided.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 */
struct page * filemap_nopage(struct vm_area_struct * area,
        unsigned long address, int no_share)
{
        int error;
        struct file *file = area->vm_file;
        struct inode *inode = file->f_dentry->d_inode;
        struct address_space *mapping = inode->i_mapping;
        struct page *page, **hash, *old_page;
        /* File size in pages, rounded up. */
        unsigned long size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

        /* File page index of the faulting address. */
        unsigned long pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;

        /*
         * Semantics for shared and private memory areas are different
         * past the end of the file. A shared mapping past the last page
         * of the file is an error and results in a SIGBUS, while a
         * private mapping just maps in a zero page.
         *
         * The check below only fails the fault when the area belongs to
         * the current task's mm.
         */
        if ((pgoff >= size) && (area->vm_mm == current->mm))
                return NULL;

        /*
         * Do we have something in the page cache already?
         */
        hash = page_hash(mapping, pgoff);
retry_find:
        page = __find_get_page(mapping, pgoff, hash);
        if (!page)
                goto no_cached_page;

        /*
         * Ok, found a page in the page cache, now we need to check
         * that it's up-to-date.
         */
        if (!Page_Uptodate(page))
                goto page_not_uptodate;

success:
        /*
         * Try read-ahead for sequential areas.
         */
        if (VM_SequentialReadHint(area))
                nopage_sequential_readahead(area, pgoff, size);

        /*
         * Found the page and have a reference on it, need to check sharing
         * and possibly copy it over to another page..
         */
        old_page = page;
        if (no_share) {
                struct page *new_page = page_cache_alloc();

                if (new_page) {
                        copy_user_highpage(new_page, old_page, address);
                        flush_page_to_ram(new_page);
                } else
                        new_page = NOPAGE_OOM;
                /* The page cache page is no longer needed once copied (or on failure). */
                page_cache_release(page);
                return new_page;
        }

        flush_page_to_ram(old_page);
        return old_page;

no_cached_page:
        /*
         * If the requested offset is within our file, try to read a whole
         * cluster of pages at once.
         *
         * Otherwise, we're off the end of a privately mapped file,
         * so we need to map a zero page.
         */
        if ((pgoff < size) && !VM_RandomReadHint(area))
                error = read_cluster_nonblocking(file, pgoff, size);
        else
                error = page_cache_read(file, pgoff);

        /*
         * The page we want has now been added to the page cache.
         * In the unlikely event that someone removed it in the
         * meantime, we'll just come back here and read it again.
         */
        if (error >= 0)
                goto retry_find;

        /*
         * An error return from page_cache_read can result if the
         * system is low on memory, or a problem occurs while trying
         * to schedule I/O.
         */
        if (error == -ENOMEM)
                return NOPAGE_OOM;
        return NULL;

page_not_uptodate:
        /* Someone else may be reading the page; take the lock and look again. */
        lock_page(page);
        if (Page_Uptodate(page)) {
                UnlockPage(page);
                goto success;
        }

        /* First attempt: start the read and wait for it to finish. */
        if (!mapping->a_ops->readpage(file, page)) {
                wait_on_page(page);
                if (Page_Uptodate(page))
                        goto success;
        }

        /*
         * Umm, take care of errors if the page isn't up-to-date.
         * Try to re-read it _once_. We do this synchronously,
         * because there really aren't any performance issues here
         * and we need to check for errors.
         */
        lock_page(page);
        if (Page_Uptodate(page)) {
                UnlockPage(page);
                goto success;
        }
        ClearPageError(page);
        if (!mapping->a_ops->readpage(file, page)) {
                wait_on_page(page);
                if (Page_Uptodate(page))
                        goto success;
        }

        /*
         * Things didn't work out. Return zero to tell the
         * mm layer so, possibly freeing the page cache page first.
         */
        page_cache_release(page);
        return NULL;
}
1531
1532 static int filemap_write_page(struct file *file,
1533                               struct page * page,
1534                               int wait)
1535 {
1536         /*
1537          * If a task terminates while we're swapping the page, the vma and
1538          * and file could be released: try_to_swap_out has done a get_file.
1539          * vma/file is guaranteed to exist in the unmap/sync cases because
1540          * mmap_sem is held.
1541          */
1542         return page->mapping->a_ops->writepage(file, page);
1543 }
1544
1545
/*
 * Swap-out hook for shared file mappings: write the page back to its file,
 * then wake bdflush.  The result of the write-back is returned.
 *
 * The page cache takes care of races between somebody
 * trying to swap something out and swap something in
 * at the same time..
 */
extern void wakeup_bdflush(int);
int filemap_swapout(struct page * page, struct file * file)
{
        int status;

        status = filemap_write_page(file, page, 0);
        wakeup_bdflush(0);

        return status;
}
1558
/*
 * Sync a single pte of a file mapping.
 *
 * ptep    - the page table entry to process.
 * vma     - the mapping it belongs to.
 * address - virtual address the pte maps.
 * flags   - msync-style flags; only MS_INVALIDATE is examined here.
 *
 * Without MS_INVALIDATE: a present, dirty pte is marked clean and its page
 * is written back.  With MS_INVALIDATE: the pte is cleared; the page is
 * written back only if it was dirty and other flag bits are set as well.
 *
 * Returns 0 when nothing had to be written, otherwise the result of
 * filemap_write_page().
 */
static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
        unsigned long address, unsigned int flags)
{
        unsigned long pgoff;
        pte_t pte = *ptep;
        struct page *page;
        int error;

        if (!(flags & MS_INVALIDATE)) {
                /* Only present and dirty entries need any work. */
                if (!pte_present(pte))
                        return 0;
                if (!pte_dirty(pte))
                        return 0;
                flush_page_to_ram(pte_page(pte));
                flush_cache_page(vma, address);
                set_pte(ptep, pte_mkclean(pte));
                flush_tlb_page(vma, address);
                page = pte_page(pte);
                /* Extra reference for the write-back; released at the bottom. */
                page_cache_get(page);
        } else {
                if (pte_none(pte))
                        return 0;
                flush_cache_page(vma, address);
                pte_clear(ptep);
                flush_tlb_page(vma, address);
                /* A non-present, non-empty entry is a swap entry: release it. */
                if (!pte_present(pte)) {
                        swap_free(pte_to_swp_entry(pte));
                        return 0;
                }
                page = pte_page(pte);
                /*
                 * Clean page, or a pure invalidate request: nothing to write.
                 * page_cache_free() presumably drops the reference the
                 * cleared pte was holding -- TODO confirm.
                 */
                if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
                        page_cache_free(page);
                        return 0;
                }
        }
        /* Sanity check: the page's index should match the file offset of this address. */
        pgoff = (address - vma->vm_start) >> PAGE_CACHE_SHIFT;
        pgoff += vma->vm_pgoff;
        if (page->index != pgoff) {
                printk("weirdness: pgoff=%lu index=%lu address=%lu vm_start=%lu vm_pgoff=%lu\n",
                        pgoff, page->index, address, vma->vm_start, vma->vm_pgoff);
        }
        /* Write the page back while holding the page lock. */
        lock_page(page);
        error = filemap_write_page(vma->vm_file, page, 1);
        UnlockPage(page);
        page_cache_free(page);
        return error;
}
1606
1607 static inline int filemap_sync_pte_range(pmd_t * pmd,
1608         unsigned long address, unsigned long size,
1609         struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
1610 {
1611         pte_t * pte;
1612         unsigned long end;
1613         int error;
1614
1615         if (pmd_none(*pmd))
1616                 return 0;
1617         if (pmd_bad(*pmd)) {
1618                 pmd_ERROR(*pmd);
1619                 pmd_clear(pmd);
1620                 return 0;
1621         }
1622         pte = pte_offset(pmd, address);
1623         offset += address & PMD_MASK;
1624         address &= ~PMD_MASK;
1625         end = address + size;
1626         if (end > PMD_SIZE)
1627                 end = PMD_SIZE;
1628         error = 0;
1629         do {
1630                 error |= filemap_sync_pte(pte, vma, address + offset, flags);
1631                 address += PAGE_SIZE;
1632                 pte++;
1633         } while (address && (address < end));
1634         return error;
1635 }
1636
1637 static inline int filemap_sync_pmd_range(pgd_t * pgd,
1638         unsigned long address, unsigned long size,
1639         struct vm_area_struct *vma, unsigned int flags)
1640 {
1641         pmd_t * pmd;
1642         unsigned long offset, end;
1643         int error;
1644
1645         if (pgd_none(*pgd))
1646                 return 0;
1647         if (pgd_bad(*pgd)) {
1648                 pgd_ERROR(*pgd);
1649                 pgd_clear(pgd);
1650                 return 0;
1651         }
1652         pmd = pmd_offset(pgd, address);
1653         offset = address & PGDIR_MASK;
1654         address &= ~PGDIR_MASK;
1655         end = address + size;
1656         if (end > PGDIR_SIZE)
1657                 end = PGDIR_SIZE;
1658         error = 0;
1659         do {
1660                 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
1661                 address = (address + PMD_SIZE) & PMD_MASK;
1662                 pmd++;
1663         } while (address && (address < end));
1664         return error;
1665 }
1666
1667 int filemap_sync(struct vm_area_struct * vma, unsigned long address,
1668         size_t size, unsigned int flags)
1669 {
1670         pgd_t * dir;
1671         unsigned long end = address + size;
1672         int error = 0;
1673
1674         dir = pgd_offset(vma->vm_mm, address);
1675         flush_cache_range(vma->vm_mm, end - size, end);
1676         if (address >= end)
1677                 BUG();
1678         do {
1679                 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
1680                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
1681                 dir++;
1682         } while (address && (address < end));
1683         flush_tlb_range(vma->vm_mm, end - size, end);
1684         return error;
1685 }
1686
1687 /*
1688  * This handles (potentially partial) area unmaps..
1689  */
1690 static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
1691 {
1692         filemap_sync(vma, start, len, MS_ASYNC);
1693 }
1694
1695 /*
1696  * Shared mappings need to be able to do the right thing at
1697  * close/unmap/sync. They will also use the private file as
1698  * backing-store for swapping..
1699  */
1700 static struct vm_operations_struct file_shared_mmap = {
1701         unmap:          filemap_unmap,          /* unmap - we need to sync the pages */
1702         sync:           filemap_sync,
1703         nopage:         filemap_nopage,
1704         swapout:        filemap_swapout,
1705 };
1706
1707 /*
1708  * Private mappings just need to be able to load in the map.
1709  *
1710  * (This is actually used for shared mappings as well, if we
1711  * know they can't ever get write permissions..)
1712  */
1713 static struct vm_operations_struct file_private_mmap = {
1714         nopage:         filemap_nopage,
1715 };
1716
1717 /* This is used for a general mmap of a disk file */
1718
1719 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1720 {
1721         struct vm_operations_struct * ops;
1722         struct inode *inode = file->f_dentry->d_inode;
1723
1724         ops = &file_private_mmap;
1725         if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1726                 if (!inode->i_mapping->a_ops->writepage)
1727                         return -EINVAL;
1728                 ops = &file_shared_mmap;
1729         }
1730         if (!inode->i_sb || !S_ISREG(inode->i_mode))
1731                 return -EACCES;
1732         if (!inode->i_mapping->a_ops->readpage)
1733                 return -ENOEXEC;
1734         UPDATE_ATIME(inode);
1735         vma->vm_ops = ops;
1736         return 0;
1737 }
1738
1739 /*
1740  * The msync() system call.
1741  */
1742
1743 static int msync_interval(struct vm_area_struct * vma,
1744         unsigned long start, unsigned long end, int flags)
1745 {
1746         if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
1747                 int error;
1748                 error = vma->vm_ops->sync(vma, start, end-start, flags);
1749                 if (!error && (flags & MS_SYNC)) {
1750                         struct file * file = vma->vm_file;
1751                         if (file && file->f_op && file->f_op->fsync) {
1752                                 down(&file->f_dentry->d_inode->i_sem);
1753                                 error = file->f_op->fsync(file, file->f_dentry, 1);
1754                                 up(&file->f_dentry->d_inode->i_sem);
1755                         }
1756                 }
1757                 return error;
1758         }
1759         return 0;
1760 }
1761
1762 asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
1763 {
1764         unsigned long end;
1765         struct vm_area_struct * vma;
1766         int unmapped_error, error = -EINVAL;
1767
1768         down(&current->mm->mmap_sem);
1769         if (start & ~PAGE_MASK)
1770                 goto out;
1771         len = (len + ~PAGE_MASK) & PAGE_MASK;
1772         end = start + len;
1773         if (end < start)
1774                 goto out;
1775         if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1776                 goto out;
1777         error = 0;
1778         if (end == start)
1779                 goto out;
1780         /*
1781          * If the interval [start,end) covers some unmapped address ranges,
1782          * just ignore them, but return -EFAULT at the end.
1783          */
1784         vma = find_vma(current->mm, start);
1785         unmapped_error = 0;
1786         for (;;) {
1787                 /* Still start < end. */
1788                 error = -EFAULT;
1789                 if (!vma)
1790                         goto out;
1791                 /* Here start < vma->vm_end. */
1792                 if (start < vma->vm_start) {
1793                         unmapped_error = -EFAULT;
1794                         start = vma->vm_start;
1795                 }
1796                 /* Here vma->vm_start <= start < vma->vm_end. */
1797                 if (end <= vma->vm_end) {
1798                         if (start < end) {
1799                                 error = msync_interval(vma, start, end, flags);
1800                                 if (error)
1801                                         goto out;
1802                         }
1803                         error = unmapped_error;
1804                         goto out;
1805                 }
1806                 /* Here vma->vm_start <= start < vma->vm_end < end. */
1807                 error = msync_interval(vma, start, vma->vm_end, flags);
1808                 if (error)
1809                         goto out;
1810                 start = vma->vm_end;
1811                 vma = vma->vm_next;
1812         }
1813 out:
1814         up(&current->mm->mmap_sem);
1815         return error;
1816 }
1817
1818 static inline void setup_read_behavior(struct vm_area_struct * vma,
1819         int behavior)
1820 {
1821         VM_ClearReadHint(vma);
1822         switch(behavior) {
1823                 case MADV_SEQUENTIAL:
1824                         vma->vm_flags |= VM_SEQ_READ;
1825                         break;
1826                 case MADV_RANDOM:
1827                         vma->vm_flags |= VM_RAND_READ;
1828                         break;
1829                 default:
1830                         break;
1831         }
1832         return;
1833 }
1834
1835 static long madvise_fixup_start(struct vm_area_struct * vma,
1836         unsigned long end, int behavior)
1837 {
1838         struct vm_area_struct * n;
1839
1840         n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1841         if (!n)
1842                 return -EAGAIN;
1843         *n = *vma;
1844         n->vm_end = end;
1845         setup_read_behavior(n, behavior);
1846         n->vm_raend = 0;
1847         get_file(n->vm_file);
1848         if (n->vm_ops && n->vm_ops->open)
1849                 n->vm_ops->open(n);
1850         vmlist_modify_lock(vma->vm_mm);
1851         vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
1852         vma->vm_start = end;
1853         insert_vm_struct(current->mm, n);
1854         vmlist_modify_unlock(vma->vm_mm);
1855         return 0;
1856 }
1857
1858 static long madvise_fixup_end(struct vm_area_struct * vma,
1859         unsigned long start, int behavior)
1860 {
1861         struct vm_area_struct * n;
1862
1863         n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1864         if (!n)
1865                 return -EAGAIN;
1866         *n = *vma;
1867         n->vm_start = start;
1868         n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
1869         setup_read_behavior(n, behavior);
1870         n->vm_raend = 0;
1871         get_file(n->vm_file);
1872         if (n->vm_ops && n->vm_ops->open)
1873                 n->vm_ops->open(n);
1874         vmlist_modify_lock(vma->vm_mm);
1875         vma->vm_end = start;
1876         insert_vm_struct(current->mm, n);
1877         vmlist_modify_unlock(vma->vm_mm);
1878         return 0;
1879 }
1880
1881 static long madvise_fixup_middle(struct vm_area_struct * vma,
1882         unsigned long start, unsigned long end, int behavior)
1883 {
1884         struct vm_area_struct * left, * right;
1885
1886         left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1887         if (!left)
1888                 return -EAGAIN;
1889         right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1890         if (!right) {
1891                 kmem_cache_free(vm_area_cachep, left);
1892                 return -EAGAIN;
1893         }
1894         *left = *vma;
1895         *right = *vma;
1896         left->vm_end = start;
1897         right->vm_start = end;
1898         right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
1899         left->vm_raend = 0;
1900         right->vm_raend = 0;
1901         atomic_add(2, &vma->vm_file->f_count);
1902
1903         if (vma->vm_ops && vma->vm_ops->open) {
1904                 vma->vm_ops->open(left);
1905                 vma->vm_ops->open(right);
1906         }
1907         vmlist_modify_lock(vma->vm_mm);
1908         vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
1909         vma->vm_start = start;
1910         vma->vm_end = end;
1911         setup_read_behavior(vma, behavior);
1912         vma->vm_raend = 0;
1913         insert_vm_struct(current->mm, left);
1914         insert_vm_struct(current->mm, right);
1915         vmlist_modify_unlock(vma->vm_mm);
1916         return 0;
1917 }
1918
1919 /*
1920  * We can potentially split a vm area into separate
1921  * areas, each area with its own behavior.
1922  */
1923 static long madvise_behavior(struct vm_area_struct * vma,
1924         unsigned long start, unsigned long end, int behavior)
1925 {
1926         int error = 0;
1927
1928         /* This caps the number of vma's this process can own */
1929         if (vma->vm_mm->map_count > MAX_MAP_COUNT)
1930                 return -ENOMEM;
1931
1932         if (start == vma->vm_start) {
1933                 if (end == vma->vm_end) {
1934                         setup_read_behavior(vma, behavior);
1935                         vma->vm_raend = 0;
1936                 } else
1937                         error = madvise_fixup_start(vma, end, behavior);
1938         } else {
1939                 if (end == vma->vm_end)
1940                         error = madvise_fixup_end(vma, start, behavior);
1941                 else
1942                         error = madvise_fixup_middle(vma, start, end, behavior);
1943         }
1944
1945         return error;
1946 }
1947
1948 /*
1949  * Schedule all required I/O operations, then run the disk queue
1950  * to make sure they are started.  Do not wait for completion.
1951  */
1952 static long madvise_willneed(struct vm_area_struct * vma,
1953         unsigned long start, unsigned long end)
1954 {
1955         long error = -EBADF;
1956         struct file * file;
1957         unsigned long size, rlim_rss;
1958
1959         /* Doesn't work if there's no mapped file. */
1960         if (!vma->vm_file)
1961                 return error;
1962         file = vma->vm_file;
1963         size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
1964                                                         PAGE_CACHE_SHIFT;
1965
1966         start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1967         if (end > vma->vm_end)
1968                 end = vma->vm_end;
1969         end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1970
1971         /* Make sure this doesn't exceed the process's max rss. */
1972         error = -EIO;
1973         rlim_rss = current->rlim ?  current->rlim[RLIMIT_RSS].rlim_cur :
1974                                 LONG_MAX; /* default: see resource.h */
1975         if ((vma->vm_mm->rss + (end - start)) > rlim_rss)
1976                 return error;
1977
1978         /* round to cluster boundaries if this isn't a "random" area. */
1979         if (!VM_RandomReadHint(vma)) {
1980                 start = CLUSTER_OFFSET(start);
1981                 end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);
1982
1983                 while ((start < end) && (start < size)) {
1984                         error = read_cluster_nonblocking(file, start, size);
1985                         start += CLUSTER_PAGES;
1986                         if (error < 0)
1987                                 break;
1988                 }
1989         } else {
1990                 while ((start < end) && (start < size)) {
1991                         error = page_cache_read(file, start);
1992                         start++;
1993                         if (error < 0)
1994                                 break;
1995                 }
1996         }
1997
1998         /* Don't wait for someone else to push these requests. */
1999         run_task_queue(&tq_disk);
2000
2001         return error;
2002 }
2003
2004 /*
2005  * Application no longer needs these pages.  If the pages are dirty,
2006  * it's OK to just throw them away.  The app will be more careful about
2007  * data it wants to keep.  Be sure to free swap resources too.  The
2008  * zap_page_range call sets things up for shrink_mmap to actually free
2009  * these pages later if no one else has touched them in the meantime,
2010  * although we could add these pages to a global reuse list for
2011  * shrink_mmap to pick up before reclaiming other pages.
2012  *
2013  * NB: This interface discards data rather than pushes it out to swap,
2014  * as some implementations do.  This has performance implications for
2015  * applications like large transactional databases which want to discard
2016  * pages in anonymous maps after committing to backing store the data
2017  * that was kept in them.  There is no reason to write this data out to
2018  * the swap area if the application is discarding it.
2019  *
2020  * An interface that causes the system to free clean pages and flush
2021  * dirty pages is already available as msync(MS_INVALIDATE).
2022  */
2023 static long madvise_dontneed(struct vm_area_struct * vma,
2024         unsigned long start, unsigned long end)
2025 {
2026         if (vma->vm_flags & VM_LOCKED)
2027                 return -EINVAL;
2028
2029         flush_cache_range(vma->vm_mm, start, end);
2030         zap_page_range(vma->vm_mm, start, end - start);
2031         flush_tlb_range(vma->vm_mm, start, end);
2032         return 0;
2033 }
2034
2035 static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
2036         unsigned long end, int behavior)
2037 {
2038         long error = -EBADF;
2039
2040         switch (behavior) {
2041         case MADV_NORMAL:
2042         case MADV_SEQUENTIAL:
2043         case MADV_RANDOM:
2044                 error = madvise_behavior(vma, start, end, behavior);
2045                 break;
2046
2047         case MADV_WILLNEED:
2048                 error = madvise_willneed(vma, start, end);
2049                 break;
2050
2051         case MADV_DONTNEED:
2052                 error = madvise_dontneed(vma, start, end);
2053                 break;
2054
2055         default:
2056                 error = -EINVAL;
2057                 break;
2058         }
2059
2060         return error;
2061 }
2062
2063 /*
2064  * The madvise(2) system call.
2065  *
2066  * Applications can use madvise() to advise the kernel how it should
2067  * handle paging I/O in this VM area.  The idea is to help the kernel
2068  * use appropriate read-ahead and caching techniques.  The information
2069  * provided is advisory only, and can be safely disregarded by the
2070  * kernel without affecting the correct operation of the application.
2071  *
2072  * behavior values:
2073  *  MADV_NORMAL - the default behavior is to read clusters.  This
2074  *              results in some read-ahead and read-behind.
2075  *  MADV_RANDOM - the system should read the minimum amount of data
2076  *              on any access, since it is unlikely that the appli-
2077  *              cation will need more than what it asks for.
2078  *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
2079  *              once, so they can be aggressively read ahead, and
2080  *              can be freed soon after they are accessed.
2081  *  MADV_WILLNEED - the application is notifying the system to read
2082  *              some pages ahead.
2083  *  MADV_DONTNEED - the application is finished with the given range,
2084  *              so the kernel can free resources associated with it.
2085  *
2086  * return values:
2087  *  zero    - success
2088  *  -EINVAL - start + len < 0, start is not page-aligned,
2089  *              "behavior" is not a valid value, or application
2090  *              is attempting to release locked or shared pages.
2091  *  -ENOMEM - addresses in the specified range are not currently
2092  *              mapped, or are outside the AS of the process.
2093  *  -EIO    - an I/O error occurred while paging in data.
2094  *  -EBADF  - map exists, but area maps something that isn't a file.
2095  *  -EAGAIN - a kernel resource was temporarily unavailable.
2096  */
2097 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
2098 {
2099         unsigned long end;
2100         struct vm_area_struct * vma;
2101         int unmapped_error = 0;
2102         int error = -EINVAL;
2103
2104         down(&current->mm->mmap_sem);
2105
2106         if (start & ~PAGE_MASK)
2107                 goto out;
2108         len = (len + ~PAGE_MASK) & PAGE_MASK;
2109         end = start + len;
2110         if (end < start)
2111                 goto out;
2112
2113         error = 0;
2114         if (end == start)
2115                 goto out;
2116
2117         /*
2118          * If the interval [start,end) covers some unmapped address
2119          * ranges, just ignore them, but return -ENOMEM at the end.
2120          */
2121         vma = find_vma(current->mm, start);
2122         for (;;) {
2123                 /* Still start < end. */
2124                 error = -ENOMEM;
2125                 if (!vma)
2126                         goto out;
2127
2128                 /* Here start < vma->vm_end. */
2129                 if (start < vma->vm_start) {
2130                         unmapped_error = -ENOMEM;
2131                         start = vma->vm_start;
2132                 }
2133
2134                 /* Here vma->vm_start <= start < vma->vm_end. */
2135                 if (end <= vma->vm_end) {
2136                         if (start < end) {
2137                                 error = madvise_vma(vma, start, end,
2138                                                         behavior);
2139                                 if (error)
2140                                         goto out;
2141                         }
2142                         error = unmapped_error;
2143                         goto out;
2144                 }
2145
2146                 /* Here vma->vm_start <= start < vma->vm_end < end. */
2147                 error = madvise_vma(vma, start, vma->vm_end, behavior);
2148                 if (error)
2149                         goto out;
2150                 start = vma->vm_end;
2151                 vma = vma->vm_next;
2152         }
2153
2154 out:
2155         up(&current->mm->mmap_sem);
2156         return error;
2157 }
2158
2159 /*
2160  * Later we can get more picky about what "in core" means precisely.
2161  * For now, simply check to see if the page is in the page cache,
2162  * and is up to date; i.e. that no page-in operation would be required
2163  * at this time if an application were to map and access this page.
2164  */
2165 static unsigned char mincore_page(struct vm_area_struct * vma,
2166         unsigned long pgoff)
2167 {
2168         unsigned char present = 0;
2169         struct address_space * as = &vma->vm_file->f_dentry->d_inode->i_data;
2170         struct page * page, ** hash = page_hash(as, pgoff);
2171
2172         spin_lock(&pagecache_lock);
2173         page = __find_page_nolock(as, pgoff, *hash);
2174         if ((page) && (Page_Uptodate(page)))
2175                 present = 1;
2176         spin_unlock(&pagecache_lock);
2177
2178         return present;
2179 }
2180
2181 static long mincore_vma(struct vm_area_struct * vma,
2182         unsigned long start, unsigned long end, unsigned char * vec)
2183 {
2184         long error, i, remaining;
2185         unsigned char * tmp;
2186
2187         error = -ENOMEM;
2188         if (!vma->vm_file)
2189                 return error;
2190
2191         start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2192         if (end > vma->vm_end)
2193                 end = vma->vm_end;
2194         end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2195
2196         error = -EAGAIN;
2197         tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
2198         if (!tmp)
2199                 return error;
2200
2201         /* (end - start) is # of pages, and also # of bytes in "vec */
2202         remaining = (end - start),
2203
2204         error = 0;
2205         for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
2206                 int j = 0;
2207                 long thispiece = (remaining < PAGE_SIZE) ?
2208                                                 remaining : PAGE_SIZE;
2209
2210                 while (j < thispiece)
2211                         tmp[j++] = mincore_page(vma, start++);
2212
2213                 if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
2214                         error = -EFAULT;
2215                         break;
2216                 }
2217         }
2218
2219         free_page((unsigned long) tmp);
2220         return error;
2221 }
2222
2223 /*
2224  * The mincore(2) system call.
2225  *
2226  * mincore() returns the memory residency status of the pages in the
2227  * current process's address space specified by [addr, addr + len).
2228  * The status is returned in a vector of bytes.  The least significant
2229  * bit of each byte is 1 if the referenced page is in memory, otherwise
2230  * it is zero.
2231  *
2232  * Because the status of a page can change after mincore() checks it
2233  * but before it returns to the application, the returned vector may
2234  * contain stale information.  Only locked pages are guaranteed to
2235  * remain in memory.
2236  *
2237  * return values:
2238  *  zero    - success
2239  *  -EFAULT - vec points to an illegal address
2240  *  -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
2241  *              or len has a nonpositive value
2242  *  -ENOMEM - Addresses in the range [addr, addr + len] are
2243  *              invalid for the address space of this process, or
2244  *              specify one or more pages which are not currently
2245  *              mapped
2246  *  -EAGAIN - A kernel resource was temporarily unavailable.
2247  */
2248 asmlinkage long sys_mincore(unsigned long start, size_t len,
2249         unsigned char * vec)
2250 {
2251         int index = 0;
2252         unsigned long end;
2253         struct vm_area_struct * vma;
2254         int unmapped_error = 0;
2255         long error = -EINVAL;
2256
2257         down(&current->mm->mmap_sem);
2258
2259         if (start & ~PAGE_CACHE_MASK)
2260                 goto out;
2261         len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK;
2262         end = start + len;
2263         if (end < start)
2264                 goto out;
2265
2266         error = 0;
2267         if (end == start)
2268                 goto out;
2269
2270         /*
2271          * If the interval [start,end) covers some unmapped address
2272          * ranges, just ignore them, but return -ENOMEM at the end.
2273          */
2274         vma = find_vma(current->mm, start);
2275         for (;;) {
2276                 /* Still start < end. */
2277                 error = -ENOMEM;
2278                 if (!vma)
2279                         goto out;
2280
2281                 /* Here start < vma->vm_end. */
2282                 if (start < vma->vm_start) {
2283                         unmapped_error = -ENOMEM;
2284                         start = vma->vm_start;
2285                 }
2286
2287                 /* Here vma->vm_start <= start < vma->vm_end. */
2288                 if (end <= vma->vm_end) {
2289                         if (start < end) {
2290                                 error = mincore_vma(vma, start, end,
2291                                                         &vec[index]);
2292                                 if (error)
2293                                         goto out;
2294                         }
2295                         error = unmapped_error;
2296                         goto out;
2297                 }
2298
2299                 /* Here vma->vm_start <= start < vma->vm_end < end. */
2300                 error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
2301                 if (error)
2302                         goto out;
2303                 index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
2304                 start = vma->vm_end;
2305                 vma = vma->vm_next;
2306         }
2307
2308 out:
2309         up(&current->mm->mmap_sem);
2310         return error;
2311 }
2312
2313 static inline
2314 struct page *__read_cache_page(struct address_space *mapping,
2315                                 unsigned long index,
2316                                 int (*filler)(void *,struct page*),
2317                                 void *data)
2318 {
2319         struct page **hash = page_hash(mapping, index);
2320         struct page *page, *cached_page = NULL;
2321         int err;
2322 repeat:
2323         page = __find_get_page(mapping, index, hash);
2324         if (!page) {
2325                 if (!cached_page) {
2326                         cached_page = page_cache_alloc();
2327                         if (!cached_page)
2328                                 return ERR_PTR(-ENOMEM);
2329                 }
2330                 page = cached_page;
2331                 if (add_to_page_cache_unique(page, mapping, index, hash))
2332                         goto repeat;
2333                 cached_page = NULL;
2334                 err = filler(data, page);
2335                 if (err < 0) {
2336                         page_cache_release(page);
2337                         page = ERR_PTR(err);
2338                 }
2339         }
2340         if (cached_page)
2341                 page_cache_free(cached_page);
2342         return page;
2343 }
2344
/*
 * Read a page into the page cache.  If the page already exists but is
 * not Page_Uptodate(), lock it and, unless it became up to date in the
 * meantime, run "filler" on it.
 *
 * Returns the page, or an ERR_PTR() value on failure.  When this
 * second "filler" call fails the page reference is dropped.
 */
struct page *read_cache_page(struct address_space *mapping,
				unsigned long index,
				int (*filler)(void *,struct page*),
				void *data)
{
	struct page *page;
	int err;

	page = __read_cache_page(mapping, index, filler, data);
	if (IS_ERR(page))
		return page;
	if (Page_Uptodate(page))
		return page;

	lock_page(page);
	if (Page_Uptodate(page)) {
		/* Somebody else filled it while we waited for the lock. */
		UnlockPage(page);
		return page;
	}

	err = filler(data, page);
	if (err < 0) {
		page_cache_release(page);
		page = ERR_PTR(err);
	}
	return page;
}
2373
2374 static inline struct page * __grab_cache_page(struct address_space *mapping,
2375                                 unsigned long index, struct page **cached_page)
2376 {
2377         struct page *page, **hash = page_hash(mapping, index);
2378 repeat:
2379         page = __find_lock_page(mapping, index, hash);
2380         if (!page) {
2381                 if (!*cached_page) {
2382                         *cached_page = page_cache_alloc();
2383                         if (!*cached_page)
2384                                 return NULL;
2385                 }
2386                 page = *cached_page;
2387                 if (add_to_page_cache_unique(page, mapping, index, hash))
2388                         goto repeat;
2389                 *cached_page = NULL;
2390         }
2391         return page;
2392 }
2393
2394 /*
2395  * Returns locked page at given index in given cache, creating it if needed.
2396  */
2397
2398 struct page *grab_cache_page(struct address_space *mapping, unsigned long index)
2399 {
2400         struct page *cached_page = NULL;
2401         struct page *page = __grab_cache_page(mapping,index,&cached_page);
2402         if (cached_page)
2403                 page_cache_free(cached_page);
2404         return page;
2405 }
2406
2407 static inline void remove_suid(struct inode *inode)
2408 {
2409         unsigned int mode;
2410
2411         /* set S_IGID if S_IXGRP is set, and always set S_ISUID */
2412         mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
2413
2414         /* was any of the uid bits set? */
2415         mode &= inode->i_mode;
2416         if (mode && !capable(CAP_FSETID)) {
2417                 inode->i_mode &= ~mode;
2418                 mark_inode_dirty(inode);
2419         }
2420 }
2421
2422 /*
2423  * Write to a file through the page cache.
2424  *
2425  * We currently put everything into the page cache prior to writing it.
2426  * This is not a problem when writing full pages. With partial pages,
2427  * however, we first have to read the data into the cache, then
2428  * dirty the page, and finally schedule it for writing. Alternatively, we
2429  * could write-through just the portion of data that would go into that
2430  * page, but that would kill performance for applications that write data
2431  * line by line, and it's prone to race conditions.
2432  *
2433  * Note that this routine doesn't try to keep track of dirty pages. Each
2434  * file system has to do this all by itself, unfortunately.
2435  *                                                      okir@monad.swb.de
2436  */
2437 ssize_t
2438 generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
2439 {
2440         struct inode    *inode = file->f_dentry->d_inode;
2441         struct address_space *mapping = inode->i_mapping;
2442         unsigned long   limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
2443         loff_t          pos;
2444         struct page     *page, *cached_page;
2445         unsigned long   written;
2446         long            status;
2447         int             err;
2448
2449         cached_page = NULL;
2450
2451         down(&inode->i_sem);
2452
2453         pos = *ppos;
2454         err = -EINVAL;
2455         if (pos < 0)
2456                 goto out;
2457
2458         err = file->f_error;
2459         if (err) {
2460                 file->f_error = 0;
2461                 goto out;
2462         }
2463
2464         written = 0;
2465
2466         if (file->f_flags & O_APPEND)
2467                 pos = inode->i_size;
2468
2469         /*
2470          * Check whether we've reached the file size limit.
2471          */
2472         err = -EFBIG;
2473         if (limit != RLIM_INFINITY) {
2474                 if (pos >= limit) {
2475                         send_sig(SIGXFSZ, current, 0);
2476                         goto out;
2477                 }
2478                 if (count > limit - pos) {
2479                         send_sig(SIGXFSZ, current, 0);
2480                         count = limit - pos;
2481                 }
2482         }
2483
2484         status  = 0;
2485         if (count) {
2486                 remove_suid(inode);
2487                 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
2488                 mark_inode_dirty(inode);
2489         }
2490
2491         while (count) {
2492                 unsigned long bytes, index, offset;
2493                 char *kaddr;
2494
2495                 /*
2496                  * Try to find the page in the cache. If it isn't there,
2497                  * allocate a free page.
2498                  */
2499                 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
2500                 index = pos >> PAGE_CACHE_SHIFT;
2501                 bytes = PAGE_CACHE_SIZE - offset;
2502                 if (bytes > count)
2503                         bytes = count;
2504
2505                 status = -ENOMEM;       /* we'll assign it later anyway */
2506                 page = __grab_cache_page(mapping, index, &cached_page);
2507                 if (!page)
2508                         break;
2509
2510                 /* We have exclusive IO access to the page.. */
2511                 if (!PageLocked(page)) {
2512                         PAGE_BUG(page);
2513                 }
2514
2515                 status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
2516                 if (status)
2517                         goto unlock;
2518                 kaddr = page_address(page);
2519                 status = copy_from_user(kaddr+offset, buf, bytes);
2520                 flush_dcache_page(page);
2521                 if (status)
2522                         goto fail_write;
2523                 status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
2524                 if (!status)
2525                         status = bytes;
2526
2527                 if (status >= 0) {
2528                         written += status;
2529                         count -= status;
2530                         pos += status;
2531                         buf += status;
2532                 }
2533 unlock:
2534                 /* Mark it unlocked again and drop the page.. */
2535                 UnlockPage(page);
2536                 page_cache_release(page);
2537
2538                 if (status < 0)
2539                         break;
2540         }
2541         *ppos = pos;
2542
2543         if (cached_page)
2544                 page_cache_free(cached_page);
2545
2546         err = written ? written : status;
2547 out:
2548         up(&inode->i_sem);
2549         return err;
2550 fail_write:
2551         status = -EFAULT;
2552         ClearPageUptodate(page);
2553         kunmap(page);
2554         goto unlock;
2555 }
2556
2557 /*
2558  * Size, allocate and zero the page-cache hash table.
2559  *
2560  * The target size is mempages * sizeof(struct page *) bytes, rounded up
2561  * to a power-of-two number of pages (an allocation order).  If that
2562  * allocation fails, the order is lowered one step at a time, down to and
2563  * including order 0.  page_hash_bits is recomputed on every attempt, so
2564  * it always describes the size that was last requested.
2565  *
2566  * Panics if no table could be allocated at any order.
2567  */
2568 void __init page_cache_init(unsigned long mempages)
2569 {
2570         unsigned long htable_size, order;
2571
2572         htable_size = mempages;
2573         htable_size *= sizeof(struct page *);
2574         for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
2575                 ;
2576
2577         for (;;) {
2578                 unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
2579
2580                 /* page_hash_bits = floor(log2(number of table entries)) */
2581                 page_hash_bits = 0;
2582                 while((tmp >>= 1UL) != 0UL)
2583                         page_hash_bits++;
2584
2585                 page_hash_table = (struct page **)
2586                         __get_free_pages(GFP_ATOMIC, order);
2587
2588                 /*
2589                  * Stop on success, or once order 0 itself has failed.
2590                  * "order" is unsigned, so it must never be decremented
2591                  * below zero: it would wrap and be used as a shift count.
2592                  */
2593                 if (page_hash_table != NULL || order == 0)
2594                         break;
2595                 order--;
2596         }
2597
2598         printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
2599                (1 << page_hash_bits), order, (PAGE_SIZE << order));
2600         if (!page_hash_table)
2601                 panic("Failed to allocate page hash table\n");
2602         memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
2603 }