mm/filemap.c

   1 /*
   2  *      linux/mm/filemap.c
   3  *
   4  * Copyright (C) 1994-1999  Linus Torvalds
   5  */
   6
   7 /*
   8  * This file handles the generic file mmap semantics used by
   9  * most "normal" filesystems (but you don't /have/ to use this:
  10  * the NFS filesystem used to do this differently, for example)
  11  */
  12 #include <linux/malloc.h>
  13 #include <linux/shm.h>
  14 #include <linux/mman.h>
  15 #include <linux/locks.h>
  16 #include <linux/pagemap.h>
  17 #include <linux/swap.h>
  18 #include <linux/smp_lock.h>
  19 #include <linux/blkdev.h>
  20 #include <linux/file.h>
  21 #include <linux/swapctl.h>
  22 #include <linux/slab.h>
  23 #include <linux/init.h>
  24 #include <linux/mm.h>
  25
  26 #include <asm/pgalloc.h>
  27 #include <asm/uaccess.h>
  28 #include <asm/mman.h>
  29
  30 #include <linux/highmem.h>
  31
  32 /*
  33  * Shared mappings implemented 30.11.1994. It's not fully working yet,
  34  * though.
  35  *
  36  * Shared mappings now work. 15.8.1995  Bruno.
  37  *
  38  * finished 'unifying' the page and buffer cache and SMP-threaded the
  39  * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
  40  *
  41  * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
  42  */
  43
  44 atomic_t page_cache_size = ATOMIC_INIT(0);      /* pages on the hash queues; adjusted by the hash add/remove helpers below */
  45 unsigned int page_hash_bits;                    /* presumably log2 of the hash table size - TODO confirm */
  46 struct page **page_hash_table;                  /* hash bucket heads; presumably what page_hash() indexes - TODO confirm */
  47 struct list_head lru_cache;                     /* LRU list of pages; shrink_mmap() scans it from the tail (.prev) */
  48
     /* Held in this file around every hash-chain / inode-queue lookup and update. */
  49 static spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
  50 /*
  51  * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
  52  *       the pagemap_lru_lock held.
      *       (invalidate_inode_pages() shows the allowed nesting: pagecache_lock
      *       first, pagemap_lru_lock second.)
  53  */
  54 spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;
  55
     /*
      * Read clustering helpers: a cluster is 2^page_cluster pages
      * (page_cluster is defined outside this file), and CLUSTER_OFFSET()
      * rounds a page index down to the start of its cluster.
      */
  56 #define CLUSTER_PAGES           (1 << page_cluster)
  57 #define CLUSTER_OFFSET(x)       (((x) >> page_cluster) << page_cluster)
  58
  59 void __add_page_to_hash_queue(struct page * page, struct page **p)
  60 {
  61         atomic_inc(&page_cache_size);
  62         if((page->next_hash = *p) != NULL)
  63                 (*p)->pprev_hash = &page->next_hash;
  64         *p = page;
  65         page->pprev_hash = p;
  66         if (page->buffers)
  67                 PAGE_BUG(page);
  68 }
  69
  70 static inline void remove_page_from_hash_queue(struct page * page)
  71 {
  72         if(page->pprev_hash) {
  73                 if(page->next_hash)
  74                         page->next_hash->pprev_hash = page->pprev_hash;
  75                 *page->pprev_hash = page->next_hash;
  76                 page->pprev_hash = NULL;
  77         }
  78         atomic_dec(&page_cache_size);
  79 }
  80
  81 static inline int sync_page(struct page *page)
  82 {
  83         struct address_space *mapping = page->mapping;
  84
  85         if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
  86                 return mapping->a_ops->sync_page(page);
  87         return 0;
  88 }
  89
  90 /*
  91  * Unlink a page from the page cache: take it off its mapping's page
  92  * list and off the hash chain, then clear page->mapping.  The page itself
  93  * is not freed here; the callers in this file drop their references
  94  * afterwards with page_cache_release().
      *
      * Caller has to make sure the page is locked and that nobody else uses
      * it - or that usage is safe.  Every caller in this file also holds
      * pagecache_lock around the call.
      */
  95 static inline void __remove_inode_page(struct page *page)
  96 {
  97         remove_page_from_inode_queue(page);
  98         remove_page_from_hash_queue(page);
             /* cleared last: a NULL mapping is how shrink_mmap() tells cache pages apart */
  99         page->mapping = NULL;
 100 }
 101
 102 void remove_inode_page(struct page *page)
 103 {
             /*
              * Locked front end to __remove_inode_page(): the page must
              * already be locked by the caller (PAGE_BUG() otherwise), and
              * the unlinking itself runs under pagecache_lock.
              */
 104         if (!PageLocked(page))
 105                 PAGE_BUG(page);
 106
 107         spin_lock(&pagecache_lock);
 108         __remove_inode_page(page);
 109         spin_unlock(&pagecache_lock);
 110 }
 111
 112 /**
 113  * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
 114  * @inode: the inode which pages we want to invalidate
 115  *
 116  * This function only removes the unlocked pages, if you want to
 117  * remove all the pages of one inode, you must call truncate_inode_pages.
 118  */
 119
 120 void invalidate_inode_pages(struct inode * inode)
 121 {
 122         struct list_head *head, *curr;
 123         struct page * page;
 124
 125         head = &inode->i_mapping->pages;
 126
 127         spin_lock(&pagecache_lock);
 128         spin_lock(&pagemap_lru_lock);
 129         curr = head->next;
 130
 131         while (curr != head) {
 132                 page = list_entry(curr, struct page, list);
 133                 curr = curr->next;
 134
 135                 /* We cannot invalidate a locked page */
 136                 if (TryLockPage(page))
 137                         continue;
 138
 139                 __lru_cache_del(page);
 140                 __remove_inode_page(page);
 141                 UnlockPage(page);
 142                 page_cache_release(page);
 143         }
 144
 145         spin_unlock(&pagemap_lru_lock);
 146         spin_unlock(&pagecache_lock);
 147 }
 148
 149 /*
 150  * Truncate the page cache at a set offset, removing the pages
 151  * that are beyond that offset (and zeroing out partial pages).
 152  */
 153 void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
 154 {
 155         struct list_head *head, *curr;
 156         struct page * page;
 157         unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
 158         unsigned long start;
 159
 160         start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 161
 162 repeat:
 163         head = &mapping->pages;
 164         spin_lock(&pagecache_lock);
 165         curr = head->next;
 166         while (curr != head) {
 167                 unsigned long offset;
 168
 169                 page = list_entry(curr, struct page, list);
 170                 curr = curr->next;
 171
 172                 offset = page->index;
 173
 174                 /* page wholly truncated - free it */
 175                 if (offset >= start) {
 176                         if (TryLockPage(page)) {
 177                                 page_cache_get(page);
 178                                 spin_unlock(&pagecache_lock);
 179                                 wait_on_page(page);
 180                                 page_cache_release(page);
 181                                 goto repeat;
 182                         }
 183                         page_cache_get(page);
 184                         spin_unlock(&pagecache_lock);
 185
 186                         if (!page->buffers || block_flushpage(page, 0))
 187                                 lru_cache_del(page);
 188
 189                         /*
 190                          * We remove the page from the page cache
 191                          * _after_ we have destroyed all buffer-cache
 192                          * references to it. Otherwise some other process
 193                          * might think this inode page is not in the
 194                          * page cache and creates a buffer-cache alias
 195                          * to it causing all sorts of fun problems ...
 196                          */
 197                         remove_inode_page(page);
 198
 199                         UnlockPage(page);
 200                         page_cache_release(page);
 201                         page_cache_release(page);
 202
 203                         /*
 204                          * We have done things without the pagecache lock,
 205                          * so we'll have to repeat the scan.
 206                          * It's not possible to deadlock here because
 207                          * we are guaranteed to make progress. (ie. we have
 208                          * just removed a page)
 209                          */
 210                         goto repeat;
 211                 }
 212                 /*
 213                  * there is only one partial page possible.
 214                  */
 215                 if (!partial)
 216                         continue;
 217
 218                 /* and it's the one preceeding the first wholly truncated page */
 219                 if ((offset + 1) != start)
 220                         continue;
 221
 222                 /* partial truncate, clear end of page */
 223                 if (TryLockPage(page)) {
 224                         spin_unlock(&pagecache_lock);
 225                         goto repeat;
 226                 }
 227                 page_cache_get(page);
 228                 spin_unlock(&pagecache_lock);
 229
 230                 memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
 231                 if (page->buffers)
 232                         block_flushpage(page, partial);
 233
 234                 partial = 0;
 235
 236                 /*
 237                  * we have dropped the spinlock so we have to
 238                  * restart.
 239                  */
 240                 UnlockPage(page);
 241                 page_cache_release(page);
 242                 goto repeat;
 243         }
 244         spin_unlock(&pagecache_lock);
 245 }
 246
 247 /*
      * shrink_mmap - try to free one page taken from the tail of the LRU list.
      * @priority: scan effort; at most nr_lru_pages / (priority + 1) pages
      *            that were not recently referenced are examined.  It is
      *            also the initial value of nr_dirty.
      * @gfp_mask: allocation flags; only __GFP_IO is looked at here.
      *
      * Returns 1 as soon as one page has been released, 0 if the scan budget
      * ran out (or the list was empty) without freeing anything.
      *
 248  * nr_dirty represents the number of dirty pages that we will write async
 249  * before doing sync writes.  We can only do sync writes if we can
 250  * wait for IO (__GFP_IO set).
 251  */
 252 int shrink_mmap(int priority, int gfp_mask)
 253 {
 254         int ret = 0, count, nr_dirty;
 255         struct list_head * page_lru;
 256         struct page * page = NULL;
 257
 258         count = nr_lru_pages / (priority + 1);
 259         nr_dirty = priority;
 260
 261         /* we need pagemap_lru_lock for list_del() ... subtle code below */
 262         spin_lock(&pagemap_lru_lock);
 263         while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
 264                 page = list_entry(page_lru, struct page, lru);
                     /* off the list for now: either re-added at dispose_continue or freed */
 265                 list_del(page_lru);
 266
                     /* recently referenced: clear the bit and requeue without charging the budget */
 267                 if (PageTestandClearReferenced(page))
 268                         goto dispose_continue;
 269
 270                 count--;
 271                 /*
 272                  * Avoid unscalable SMP locking for pages we can
 273                  * immediately tell are untouchable..
 274                  */
 275                 if (!page->buffers && page_count(page) > 1)
 276                         goto dispose_continue;
 277
 278                 if (TryLockPage(page))
 279                         goto dispose_continue;
 280
 281                 /* Release the pagemap_lru lock even if the page is not yet
 282                    queued in any lru queue since we have just locked down
 283                    the page so nobody else may SMP race with us running
 284                    a lru_cache_del() (lru_cache_del() always run with the
 285                    page locked down ;). */
 286                 spin_unlock(&pagemap_lru_lock);
 287
 288                 /* avoid freeing the page while it's locked */
 289                 page_cache_get(page);
 290
 291                 /*
 292                  * Is it a buffer page? Try to clean it up regardless
 293                  * of zone - it's old.
 294                  */
 295                 if (page->buffers) {
                             /* nr_dirty is only consumed when __GFP_IO is set (short-circuit) */
 296                         int wait = ((gfp_mask & __GFP_IO) && (nr_dirty-- < 0));
 297                         if (!try_to_free_buffers(page, wait))
 298                                 goto unlock_continue;
 299                         /* page was locked, inode can't go away under us */
 300                         if (!page->mapping) {
 301                                 atomic_dec(&buffermem_pages);
 302                                 goto made_buffer_progress;
 303                         }
 304                 }
 305
 306                 /* Hold the pagecache_lock spinlock so that no other task
 307                    can find the page while we are looking at its
 308                    page count. If it's a pagecache-page we'll free it
 309                    in one atomic transaction after checking its page count. */
 310                 spin_lock(&pagecache_lock);
 311
 312                 /*
 313                  * We can't free pages unless there's just one user
 314                  * (count == 2 because we added one ourselves above).
 315                  */
 316                 if (page_count(page) != 2)
 317                         goto cache_unlock_continue;
 318
 319                 /*
 320                  * Is it a swap cache page? If so, we want to
 321                  * drop it if it is no longer used, even if it
 322                  * were to be marked referenced..
 323                  */
 324                 if (PageSwapCache(page)) {
 325                         spin_unlock(&pagecache_lock);
 326                         __delete_from_swap_cache(page);
 327                         goto made_inode_progress;
 328                 }
 329
 330                 /*
 331                  * Page is from a zone we don't care about.
 332                  * Don't drop page cache entries in vain.
 333                  */
 334                 if (page->zone->free_pages > page->zone->pages_high)
 335                         goto cache_unlock_continue;
 336
 337                 /* is it a page-cache page? */
 338                 if (page->mapping) {
 339                         if (!PageDirty(page) && !pgcache_under_min()) {
 340                                 __remove_inode_page(page);
 341                                 spin_unlock(&pagecache_lock);
 342                                 goto made_inode_progress;
 343                         }
 344                         goto cache_unlock_continue;
 345                 }
 346
 347                 printk(KERN_ERR "shrink_mmap: unknown LRU page!\n");
 348
     /*
      * Failure exits, in unwinding order: drop pagecache_lock, retake
      * pagemap_lru_lock, unlock the page and drop our reference, then put
      * the page back at the head of the LRU list and keep scanning.
      */
 349 cache_unlock_continue:
 350                 spin_unlock(&pagecache_lock);
 351 unlock_continue:
 352                 spin_lock(&pagemap_lru_lock);
 353                 UnlockPage(page);
 354                 page_cache_release(page);
 355 dispose_continue:
 356                 list_add(page_lru, &lru_cache);
 357         }
 358         goto out;
 359
     /*
      * Success exits; the page stays off the LRU list.  The inode/swap path
      * does one extra page_cache_release() before falling through to the
      * common tail, which unlocks the page and releases once more.
      */
 360 made_inode_progress:
 361         page_cache_release(page);
 362 made_buffer_progress:
 363         UnlockPage(page);
 364         page_cache_release(page);
 365         ret = 1;
 366         spin_lock(&pagemap_lru_lock);
 367         /* nr_lru_pages needs the spinlock */
 368         nr_lru_pages--;
 369
     /* both paths arrive here with pagemap_lru_lock held */
 370 out:
 371         spin_unlock(&pagemap_lru_lock);
 372
 373         return ret;
 374 }
 375
 376 static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
 377 {
 378         goto inside;
 379
 380         for (;;) {
 381                 page = page->next_hash;
 382 inside:
 383                 if (!page)
 384                         goto not_found;
 385                 if (page->mapping != mapping)
 386                         continue;
 387                 if (page->index == offset)
 388                         break;
 389         }
 390         SetPageReferenced(page);
 391 not_found:
 392         return page;
 393 }
 394
 395 /*
 396  * By the time this is called, the page is locked and
 397  * we don't have to worry about any races any more.
 398  *
 399  * Start the IO..
 400  */
 401 static int writeout_one_page(struct page *page)
 402 {
 403         struct buffer_head *bh, *head = page->buffers;
 404
 405         bh = head;
 406         do {
 407                 if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
 408                         continue;
 409
 410                 bh->b_flushtime = 0;
 411                 ll_rw_block(WRITE, 1, &bh);
 412         } while ((bh = bh->b_this_page) != head);
 413         return 0;
 414 }
 415
 416 static int waitfor_one_page(struct page *page)
 417 {
 418         int error = 0;
 419         struct buffer_head *bh, *head = page->buffers;
 420
 421         bh = head;
 422         do {
 423                 wait_on_buffer(bh);
 424                 if (buffer_req(bh) && !buffer_uptodate(bh))
 425                         error = -EIO;
 426         } while ((bh = bh->b_this_page) != head);
 427         return error;
 428 }
 429
 430 static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end, int (*fn)(struct page *))
 431 {
 432         struct list_head *head, *curr;
 433         struct page *page;
 434         int retval = 0;
 435
 436         head = &inode->i_mapping->pages;
 437
 438         spin_lock(&pagecache_lock);
 439         curr = head->next;
 440         while (curr != head) {
 441                 page = list_entry(curr, struct page, list);
 442                 curr = curr->next;
 443                 if (!page->buffers)
 444                         continue;
 445                 if (page->index >= end)
 446                         continue;
 447                 if (page->index < start)
 448                         continue;
 449
 450                 page_cache_get(page);
 451                 spin_unlock(&pagecache_lock);
 452                 lock_page(page);
 453
 454                 /* The buffers could have been free'd while we waited for the page lock */
 455                 if (page->buffers)
 456                         retval |= fn(page);
 457
 458                 UnlockPage(page);
 459                 spin_lock(&pagecache_lock);
 460                 curr = page->list.next;
 461                 page_cache_release(page);
 462         }
 463         spin_unlock(&pagecache_lock);
 464
 465         return retval;
 466 }
 467
 468 /*
 469  * Two-stage data sync: first start the IO, then go back and
 470  * collect the information..
 471  */
 472 int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
 473 {
 474         int retval;
 475
 476         retval = do_buffer_fdatasync(inode, start_idx, end_idx, writeout_one_page);
 477         retval |= do_buffer_fdatasync(inode, start_idx, end_idx, waitfor_one_page);
 478         return retval;
 479 }
 480
 481 /*
 482  * Add a page to the inode page cache.
 483  *
 484  * The caller must have locked the page and
 485  * set all the page flags correctly..
 486  */
 487 void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
 488 {
 489         if (!PageLocked(page))
 490                 BUG();
 491
 492         page_cache_get(page);
 493         spin_lock(&pagecache_lock);
 494         page->index = index;
 495         add_page_to_inode_queue(mapping, page);
 496         __add_page_to_hash_queue(page, page_hash(mapping, index));
 497         lru_cache_add(page);
 498         spin_unlock(&pagecache_lock);
 499 }
 500
 501 /*
 502  * This adds a page to the page cache, starting out as locked,
 503  * owned by us, referenced, but not uptodate and with no errors.
 504  */
 505 static inline void __add_to_page_cache(struct page * page,
 506         struct address_space *mapping, unsigned long offset,
 507         struct page **hash)
 508 {
 509         struct page *alias;
 510         unsigned long flags;
 511
 512         if (PageLocked(page))
 513                 BUG();
 514
 515         flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty));
 516         page->flags = flags | (1 << PG_locked) | (1 << PG_referenced);
 517         page_cache_get(page);
 518         page->index = offset;
 519         add_page_to_inode_queue(mapping, page);
 520         __add_page_to_hash_queue(page, hash);
 521         lru_cache_add(page);
 522         alias = __find_page_nolock(mapping, offset, *hash);
 523         if (alias != page)
 524                 BUG();
 525 }
 526
 527 void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
 528 {
 529         spin_lock(&pagecache_lock);
 530         __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
 531         spin_unlock(&pagecache_lock);
 532 }
 533
 534 static int add_to_page_cache_unique(struct page * page,
 535         struct address_space *mapping, unsigned long offset,
 536         struct page **hash)
 537 {
 538         int err;
 539         struct page *alias;
 540
 541         spin_lock(&pagecache_lock);
 542         alias = __find_page_nolock(mapping, offset, *hash);
 543
 544         err = 1;
 545         if (!alias) {
 546                 __add_to_page_cache(page,mapping,offset,hash);
 547                 err = 0;
 548         }
 549
 550         spin_unlock(&pagecache_lock);
 551         return err;
 552 }
 553
 554 /*
 555  * This adds the requested page to the page cache if it isn't already there,
 556  * and schedules an I/O to read in its contents from disk.
 557  */
 558 static inline int page_cache_read(struct file * file, unsigned long offset)
 559 {
 560         struct inode *inode = file->f_dentry->d_inode;
 561         struct address_space *mapping = inode->i_mapping;
 562         struct page **hash = page_hash(mapping, offset);
 563         struct page *page;
 564
 565         spin_lock(&pagecache_lock);
 566         page = __find_page_nolock(mapping, offset, *hash);
 567         spin_unlock(&pagecache_lock);
 568         if (page)
 569                 return 0;
 570
 571         page = page_cache_alloc();
 572         if (!page)
 573                 return -ENOMEM;
 574
 575         if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
 576                 int error = mapping->a_ops->readpage(file, page);
 577                 page_cache_release(page);
 578                 return error;
 579         }
 580         /*
 581          * We arrive here in the unlikely event that someone
 582          * raced with us and added our page to the cache first.
 583          */
 584         page_cache_free(page);
 585         return 0;
 586 }
 587
 588 /*
 589  * Read in an entire cluster at once.  A cluster is usually a 64k-
 590  * aligned block that includes the page requested in "offset."
 591  */
 592 static int read_cluster_nonblocking(struct file * file, unsigned long offset,
 593         unsigned long filesize)
 594 {
 595         unsigned long pages = CLUSTER_PAGES;
 596
 597         offset = CLUSTER_OFFSET(offset);
 598         while ((pages-- > 0) && (offset < filesize)) {
 599                 int error = page_cache_read(file, offset);
 600                 if (error < 0)
 601                         return error;
 602                 offset ++;
 603         }
 604
 605         return 0;
 606 }
 607
 608 /*
 609  * Wait for a page to get unlocked.
 610  *
 611  * This must be called with the caller "holding" the page,
 612  * ie with increased "page->count" so that the page won't
 613  * go away during the wait..
      *
      * Sleeps in TASK_UNINTERRUPTIBLE on page->wait and returns only once
      * PageLocked() has been seen clear.
 614  */
 615 void ___wait_on_page(struct page *page)
 616 {
 617         struct task_struct *tsk = current;
 618         DECLARE_WAITQUEUE(wait, tsk);
 619
             /* get on the wait queue before testing the lock bit (presumably so a wake-up cannot be missed) */
 620         add_wait_queue(&page->wait, &wait);
 621         do {
                     /* give the mapping's ->sync_page() hook, if any, a chance to run on each pass */
 622                 sync_page(page);
 623                 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
                     /* re-test after changing state, before actually giving up the CPU */
 624                 if (!PageLocked(page))
 625                         break;
 626                 schedule();
 627         } while (PageLocked(page));
 628         tsk->state = TASK_RUNNING;
 629         remove_wait_queue(&page->wait, &wait);
 630 }
 631
 632 /*
 633  * Get an exclusive lock on the page..
 634  */
 635 void lock_page(struct page *page)
 636 {
 637         while (TryLockPage(page))
 638                 ___wait_on_page(page);
 639 }
 640
 641
 642 /*
 643  * a rather lightweight function, finding and getting a reference to a
 644  * hashed page atomically, waiting for it if it's locked.
      *
      * @mapping/@offset identify the page, @hash is the head of its hash chain.
      * Returns the page with an extra reference held and observed unlocked,
      * or NULL if no such page is hashed.  May sleep.
 645  */
 646 struct page * __find_get_page (struct address_space *mapping,
 647                                 unsigned long offset, struct page **hash)
 648 {
 649         struct page *page;
 650
 651         /*
 652          * The hash chain is walked under pagecache_lock, the same spinlock
 653          * that additions to and removals from the hash-list take.
 654          */
 655 repeat:
 656         spin_lock(&pagecache_lock);
 657         page = __find_page_nolock(mapping, offset, *hash);
 658         if (page)
 659                 page_cache_get(page);
 660         spin_unlock(&pagecache_lock);
 661
 662         /* Found the page, sleep if locked. */
 663         if (page && PageLocked(page)) {
 664                 struct task_struct *tsk = current;
 665                 DECLARE_WAITQUEUE(wait, tsk);
 666
 667                 sync_page(page);
 668
 669                 __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 670                 add_wait_queue(&page->wait, &wait);
 671
                     /* re-test once queued; only sleep if it is still locked */
 672                 if (PageLocked(page))
 673                         schedule();
 674                 __set_task_state(tsk, TASK_RUNNING);
 675                 remove_wait_queue(&page->wait, &wait);
 676
 677                 /*
 678                  * The page might have been unhashed meanwhile. It's
 679                  * not freed though because we hold a reference to it.
 680                  * If this is the case then it will be freed _here_,
 681                  * and we recheck the hash anyway.
 682                  */
 683                 page_cache_release(page);
 684                 goto repeat;
 685         }
 686         /*
 687          * It's not locked so we can return the page and we hold
 688          * a reference to it.  (page is NULL here if nothing was found.)
 689          */
 690         return page;
 691 }
 692
 693 /*
 694  * Get the lock to a page atomically.
      *
      * @mapping/@offset identify the page, @hash is the head of its hash chain.
      * Returns the page with an extra reference held and locked by the
      * caller (TryLockPage() succeeded), or NULL if no such page is hashed.
      * May sleep while somebody else holds the page lock.
 695  */
 696 struct page * __find_lock_page (struct address_space *mapping,
 697                                 unsigned long offset, struct page **hash)
 698 {
 699         struct page *page;
 700
 701         /*
 702          * The hash chain is walked under pagecache_lock, the same spinlock
 703          * that additions to and removals from the hash-list take.
 704          */
 705 repeat:
 706         spin_lock(&pagecache_lock);
 707         page = __find_page_nolock(mapping, offset, *hash);
 708         if (page)
 709                 page_cache_get(page);
 710         spin_unlock(&pagecache_lock);
 711
 712         /* Found the page, sleep if somebody else holds its lock. */
 713         if (page && TryLockPage(page)) {
 714                 struct task_struct *tsk = current;
 715                 DECLARE_WAITQUEUE(wait, tsk);
 716
 717                 sync_page(page);
 718
 719                 __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 720                 add_wait_queue(&page->wait, &wait);
 721
                     /* re-test once queued; only sleep if it is still locked */
 722                 if (PageLocked(page))
 723                         schedule();
 724                 __set_task_state(tsk, TASK_RUNNING);
 725                 remove_wait_queue(&page->wait, &wait);
 726
 727                 /*
 728                  * The page might have been unhashed meanwhile. It's
 729                  * not freed though because we hold a reference to it.
 730                  * If this is the case then it will be freed _here_,
 731                  * and we recheck the hash anyway.
 732                  */
 733                 page_cache_release(page);
 734                 goto repeat;
 735         }
 736         /*
 737          * Either nothing was found (page is NULL) or TryLockPage()
 738          * succeeded: the page is now locked by us and we hold a reference to it.
 739          */
 740         return page;
 741 }
 742
 743 #if 0
 744 #define PROFILE_READAHEAD
 745 #define DEBUG_READAHEAD
 746 #endif
 747
 748 /*
 749  * Read-ahead profiling information
 750  * --------------------------------
 751  * Every PROFILE_MAXREADCOUNT, the following information is written
 752  * to the syslog:
 753  *   Percentage of asynchronous read-ahead.
 754  *   Average of read-ahead fields context value.
 755  * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
 756  * to the syslog.
 757  */
 758
 759 #ifdef PROFILE_READAHEAD
 760
 761 #define PROFILE_MAXREADCOUNT 1000
 762
 763 static unsigned long total_reada;
 764 static unsigned long total_async;
 765 static unsigned long total_ramax;
 766 static unsigned long total_ralen;
 767 static unsigned long total_rawin;
 768
 769 static void profile_readahead(int async, struct file *filp)
 770 {
 771         unsigned long flags;
 772
 773         ++total_reada;
 774         if (async)
 775                 ++total_async;
 776
 777         total_ramax     += filp->f_ramax;
 778         total_ralen     += filp->f_ralen;
 779         total_rawin     += filp->f_rawin;
 780
 781         if (total_reada > PROFILE_MAXREADCOUNT) {
 782                 save_flags(flags);
 783                 cli();
 784                 if (!(total_reada > PROFILE_MAXREADCOUNT)) {
 785                         restore_flags(flags);
 786                         return;
 787                 }
 788
 789                 printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
 790                         total_ramax/total_reada,
 791                         total_ralen/total_reada,
 792                         total_rawin/total_reada,
 793                         (total_async*100)/total_reada);
 794 #ifdef DEBUG_READAHEAD
 795                 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
 796                         filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
 797 #endif
 798
 799                 total_reada     = 0;
 800                 total_async     = 0;
 801                 total_ramax     = 0;
 802                 total_ralen     = 0;
 803                 total_rawin     = 0;
 804
 805                 restore_flags(flags);
 806         }
 807 }
 808 #endif  /* defined PROFILE_READAHEAD */
 809
 810 /*
 811  * Read-ahead context:
 812  * -------------------
 813  * The read ahead context fields of the "struct file" are the following:
 814  * - f_raend : position of the first byte after the last page we tried to
 815  *             read ahead.
 816  * - f_ramax : current read-ahead maximum size.
 817  * - f_ralen : length of the current IO read block we tried to read-ahead.
 818  * - f_rawin : length of the current read-ahead window.
 819  *              if last read-ahead was synchronous then
 820  *                      f_rawin = f_ralen
 821  *              otherwise (was asynchronous)
 822  *                      f_rawin = previous value of f_ralen + f_ralen
 823  *
 824  * Read-ahead limits:
 825  * ------------------
 826  * MIN_READAHEAD   : minimum read-ahead size when read-ahead.
 827  * MAX_READAHEAD   : maximum read-ahead size when read-ahead.
 828  *
 829  * Synchronous read-ahead benefits:
 830  * --------------------------------
 831  * Using reasonable IO xfer length from peripheral devices increase system
 832  * performances.
 833  * Reasonable means, in this context, not too large but not too small.
 834  * The actual maximum value is:
 835  *      MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
 836  *      and 32K if defined (4K page size assumed).
 837  *
 838  * Asynchronous read-ahead benefits:
 839  * ---------------------------------
 840  * Overlapping next read request and user process execution increase system
 841  * performance.
 842  *
 843  * Read-ahead risks:
 844  * -----------------
 845  * We have to guess which further data are needed by the user process.
 846  * If these data are often not really needed, it's bad for system
 847  * performances.
 848  * However, we know that files are often accessed sequentially by
 849  * application programs and it seems that it is possible to have some good
 850  * strategy in that guessing.
 851  * We only try to read-ahead files that seems to be read sequentially.
 852  *
 853  * Asynchronous read-ahead risks:
 854  * ------------------------------
 855  * In order to maximize overlapping, we must start some asynchronous read
 856  * request from the device, as soon as possible.
 857  * We must be very careful about:
 858  * - The number of effective pending IO read requests.
 859  *   ONE seems to be the only reasonable value.
 860  * - The total memory pool usage for the file access stream.
 861  *   This maximum memory usage is implicitly 2 IO read chunks:
 862  *   2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
 863  *   64k if defined (4K page size assumed).
 864  */
 865
 866 static inline int get_max_readahead(struct inode * inode)
 867 {
 868         if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
 869                 return MAX_READAHEAD;
 870         return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
 871 }
 872
 873 static void generic_file_readahead(int reada_ok,
 874         struct file * filp, struct inode * inode,
 875         struct page * page)
 876 {
 877         unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
 878         unsigned long index = page->index;
 879         unsigned long max_ahead, ahead;
 880         unsigned long raend;
 881         int max_readahead = get_max_readahead(inode);
 882
 883         raend = filp->f_raend;
 884         max_ahead = 0;
 885
 886 /*
 887  * The current page is locked.
 888  * If the current position is inside the previous read IO request, do not
 889  * try to reread previously read ahead pages.
 890  * Otherwise decide or not to read ahead some pages synchronously.
 891  * If we are not going to read ahead, set the read ahead context for this
 892  * page only.
 893  */
 894         if (PageLocked(page)) {
 895                 if (!filp->f_ralen || index >= raend || index + filp->f_ralen < raend) {
 896                         raend = index;
 897                         if (raend < end_index)
 898                                 max_ahead = filp->f_ramax;
 899                         filp->f_rawin = 0;
 900                         filp->f_ralen = 1;
 901                         if (!max_ahead) {
 902                                 filp->f_raend  = index + filp->f_ralen;
 903                                 filp->f_rawin += filp->f_ralen;
 904                         }
 905                 }
 906         }
 907 /*
 908  * The current page is not locked.
 909  * If we were reading ahead and,
 910  * if the current max read ahead size is not zero and,
 911  * if the current position is inside the last read-ahead IO request,
 912  *   it is the moment to try to read ahead asynchronously.
 913  * We will later force unplug device in order to force asynchronous read IO.
 914  */
 915         else if (reada_ok && filp->f_ramax && raend >= 1 &&
 916                  index <= raend && index + filp->f_ralen >= raend) {
 917 /*
 918  * Add ONE page to max_ahead in order to try to have about the same IO max size
 919  * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
 920  * Compute the position of the last page we have tried to read in order to
 921  * begin to read ahead just at the next page.
 922  */
 923                 raend -= 1;
 924                 if (raend < end_index)
 925                         max_ahead = filp->f_ramax + 1;
 926
 927                 if (max_ahead) {
 928                         filp->f_rawin = filp->f_ralen;
 929                         filp->f_ralen = 0;
 930                         reada_ok      = 2;
 931                 }
 932         }
 933 /*
 934  * Try to read ahead pages.
 935  * We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the
 936  * scheduler, will work enough for us to avoid too bad actuals IO requests.
 937  */
 938         ahead = 0;
 939         while (ahead < max_ahead) {
 940                 ahead ++;
 941                 if ((raend + ahead) >= end_index)
 942                         break;
 943                 if (page_cache_read(filp, raend + ahead) < 0)
 944                         break;
 945         }
 946 /*
 947  * If we tried to read ahead some pages,
 948  * If we tried to read ahead asynchronously,
 949  *   Try to force unplug of the device in order to start an asynchronous
 950  *   read IO request.
 951  * Update the read-ahead context.
 952  * Store the length of the current read-ahead window.
 953  * Double the current max read ahead size.
 954  *   That heuristic avoid to do some large IO for files that are not really
 955  *   accessed sequentially.
 956  */
 957         if (ahead) {
 958                 if (reada_ok == 2) {
 959                         run_task_queue(&tq_disk);
 960                 }
 961
 962                 filp->f_ralen += ahead;
 963                 filp->f_rawin += filp->f_ralen;
 964                 filp->f_raend = raend + ahead + 1;
 965
 966                 filp->f_ramax += filp->f_ramax;
 967
 968                 if (filp->f_ramax > max_readahead)
 969                         filp->f_ramax = max_readahead;
 970
 971 #ifdef PROFILE_READAHEAD
 972                 profile_readahead((reada_ok == 2), filp);
 973 #endif
 974         }
 975
 976         return;
 977 }
 978
 979
 980 /*
 981  * This is a generic file read routine, and uses the
 982  * inode->i_op->readpage() function for the actual low-level
 983  * stuff.
 984  *
 985  * This is really ugly. But the goto's actually try to clarify some
 986  * of the logic when it comes to error handling etc.
 987  */
 988 void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
 989 {
 990         struct inode *inode = filp->f_dentry->d_inode;
 991         struct address_space *mapping = inode->i_mapping;
 992         unsigned long index, offset;
 993         struct page *cached_page;
 994         int reada_ok;
 995         int error;
 996         int max_readahead = get_max_readahead(inode);
 997
 998         cached_page = NULL;
 999         index = *ppos >> PAGE_CACHE_SHIFT;
1000         offset = *ppos & ~PAGE_CACHE_MASK;
1001
1002 /*
1003  * If the current position is outside the previous read-ahead window,
1004  * we reset the current read-ahead context and set read ahead max to zero
1005  * (will be set to just needed value later),
1006  * otherwise, we assume that the file accesses are sequential enough to
1007  * continue read-ahead.
1008  */
1009         if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {
1010                 reada_ok = 0;
1011                 filp->f_raend = 0;
1012                 filp->f_ralen = 0;
1013                 filp->f_ramax = 0;
1014                 filp->f_rawin = 0;
1015         } else {
1016                 reada_ok = 1;
1017         }
1018 /*
1019  * Adjust the current value of read-ahead max.
1020  * If the read operation stay in the first half page, force no readahead.
1021  * Otherwise try to increase read ahead max just enough to do the read request.
1022  * Then, at least MIN_READAHEAD if read ahead is ok,
1023  * and at most MAX_READAHEAD in all cases.
1024  */
1025         if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
1026                 filp->f_ramax = 0;
1027         } else {
1028                 unsigned long needed;
1029
1030                 needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;
1031
1032                 if (filp->f_ramax < needed)
1033                         filp->f_ramax = needed;
1034
1035                 if (reada_ok && filp->f_ramax < MIN_READAHEAD)
1036                                 filp->f_ramax = MIN_READAHEAD;
1037                 if (filp->f_ramax > max_readahead)
1038                         filp->f_ramax = max_readahead;
1039         }
1040
1041         for (;;) {
1042                 struct page *page, **hash;
1043                 unsigned long end_index, nr;
1044
1045                 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1046                 if (index > end_index)
1047                         break;
1048                 nr = PAGE_CACHE_SIZE;
1049                 if (index == end_index) {
1050                         nr = inode->i_size & ~PAGE_CACHE_MASK;
1051                         if (nr <= offset)
1052                                 break;
1053                 }
1054
1055                 nr = nr - offset;
1056
1057                 /*
1058                  * Try to find the data in the page cache..
1059                  */
1060                 hash = page_hash(mapping, index);
1061
1062                 spin_lock(&pagecache_lock);
1063                 page = __find_page_nolock(mapping, index, *hash);
1064                 if (!page)
1065                         goto no_cached_page;
1066 found_page:
1067                 page_cache_get(page);
1068                 spin_unlock(&pagecache_lock);
1069
1070                 if (!Page_Uptodate(page))
1071                         goto page_not_up_to_date;
1072 page_ok:
1073                 /*
1074                  * Ok, we have the page, and it's up-to-date, so
1075                  * now we can copy it to user space...
1076                  *
1077                  * The actor routine returns how many bytes were actually used..
1078                  * NOTE! This may not be the same as how much of a user buffer
1079                  * we filled up (we may be padding etc), so we can only update
1080                  * "pos" here (the actor routine has to update the user buffer
1081                  * pointers and the remaining count).
1082                  */
1083                 nr = actor(desc, page, offset, nr);
1084                 offset += nr;
1085                 index += offset >> PAGE_CACHE_SHIFT;
1086                 offset &= ~PAGE_CACHE_MASK;
1087
1088                 page_cache_release(page);
1089                 if (nr && desc->count)
1090                         continue;
1091                 break;
1092
1093 /*
1094  * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
1095  */
1096 page_not_up_to_date:
1097                 generic_file_readahead(reada_ok, filp, inode, page);
1098
1099                 if (Page_Uptodate(page))
1100                         goto page_ok;
1101
1102                 /* Get exclusive access to the page ... */
1103                 lock_page(page);
1104                 if (Page_Uptodate(page)) {
1105                         UnlockPage(page);
1106                         goto page_ok;
1107                 }
1108
1109 readpage:
1110                 /* ... and start the actual read. The read will unlock the page. */
1111                 error = mapping->a_ops->readpage(filp, page);
1112
1113                 if (!error) {
1114                         if (Page_Uptodate(page))
1115                                 goto page_ok;
1116
1117                         /* Again, try some read-ahead while waiting for the page to finish.. */
1118                         generic_file_readahead(reada_ok, filp, inode, page);
1119                         wait_on_page(page);
1120                         if (Page_Uptodate(page))
1121                                 goto page_ok;
1122                         error = -EIO;
1123                 }
1124
1125                 /* UHHUH! A synchronous read error occurred. Report it */
1126                 desc->error = error;
1127                 page_cache_release(page);
1128                 break;
1129
1130 no_cached_page:
1131                 /*
1132                  * Ok, it wasn't cached, so we need to create a new
1133                  * page..
1134                  *
1135                  * We get here with the page cache lock held.
1136                  */
1137                 if (!cached_page) {
1138                         spin_unlock(&pagecache_lock);
1139                         cached_page = page_cache_alloc();
1140                         if (!cached_page) {
1141                                 desc->error = -ENOMEM;
1142                                 break;
1143                         }
1144
1145                         /*
1146                          * Somebody may have added the page while we
1147                          * dropped the page cache lock. Check for that.
1148                          */
1149                         spin_lock(&pagecache_lock);
1150                         page = __find_page_nolock(mapping, index, *hash);
1151                         if (page)
1152                                 goto found_page;
1153                 }
1154
1155                 /*
1156                  * Ok, add the new page to the hash-queues...
1157                  */
1158                 page = cached_page;
1159                 __add_to_page_cache(page, mapping, index, hash);
1160                 spin_unlock(&pagecache_lock);
1161                 cached_page = NULL;
1162
1163                 goto readpage;
1164         }
1165
1166         *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1167         filp->f_reada = 1;
1168         if (cached_page)
1169                 page_cache_free(cached_page);
1170         UPDATE_ATIME(inode);
1171 }
1172
1173 static int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1174 {
1175         unsigned long kaddr;
1176         unsigned long left, count = desc->count;
1177
1178         if (size > count)
1179                 size = count;
1180
1181         kaddr = kmap(page);
1182         left = __copy_to_user(desc->buf, (void *)(kaddr + offset), size);
1183         kunmap(page);
1184
1185         if (left) {
1186                 size -= left;
1187                 desc->error = -EFAULT;
1188         }
1189         desc->count = count - size;
1190         desc->written += size;
1191         desc->buf += size;
1192         return size;
1193 }
1194
1195 /*
1196  * This is the "read()" routine for all filesystems
1197  * that can use the page cache directly.
1198  */
1199 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1200 {
1201         ssize_t retval;
1202
1203         retval = -EFAULT;
1204         if (access_ok(VERIFY_WRITE, buf, count)) {
1205                 retval = 0;
1206
1207                 if (count) {
1208                         read_descriptor_t desc;
1209
1210                         desc.written = 0;
1211                         desc.count = count;
1212                         desc.buf = buf;
1213                         desc.error = 0;
1214                         do_generic_file_read(filp, ppos, &desc, file_read_actor);
1215
1216                         retval = desc.written;
1217                         if (!retval)
1218                                 retval = desc.error;
1219                 }
1220         }
1221         return retval;
1222 }
1223
1224 static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
1225 {
1226         unsigned long kaddr;
1227         ssize_t written;
1228         unsigned long count = desc->count;
1229         struct file *file = (struct file *) desc->buf;
1230         mm_segment_t old_fs;
1231
1232         if (size > count)
1233                 size = count;
1234         old_fs = get_fs();
1235         set_fs(KERNEL_DS);
1236
1237         kaddr = kmap(page);
1238         written = file->f_op->write(file, (char *)kaddr + offset,
1239                                                  size, &file->f_pos);
1240         kunmap(page);
1241         set_fs(old_fs);
1242         if (written < 0) {
1243                 desc->error = written;
1244                 written = 0;
1245         }
1246         desc->count = count - written;
1247         desc->written += written;
1248         return written;
1249 }
1250
1251 asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
1252 {
1253         ssize_t retval;
1254         struct file * in_file, * out_file;
1255         struct inode * in_inode, * out_inode;
1256
1257         /*
1258          * Get input file, and verify that it is ok..
1259          */
1260         retval = -EBADF;
1261         in_file = fget(in_fd);
1262         if (!in_file)
1263                 goto out;
1264         if (!(in_file->f_mode & FMODE_READ))
1265                 goto fput_in;
1266         retval = -EINVAL;
1267         in_inode = in_file->f_dentry->d_inode;
1268         if (!in_inode)
1269                 goto fput_in;
1270         if (!in_inode->i_mapping->a_ops->readpage)
1271                 goto fput_in;
1272         retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
1273         if (retval)
1274                 goto fput_in;
1275
1276         /*
1277          * Get output file, and verify that it is ok..
1278          */
1279         retval = -EBADF;
1280         out_file = fget(out_fd);
1281         if (!out_file)
1282                 goto fput_in;
1283         if (!(out_file->f_mode & FMODE_WRITE))
1284                 goto fput_out;
1285         retval = -EINVAL;
1286         if (!out_file->f_op || !out_file->f_op->write)
1287                 goto fput_out;
1288         out_inode = out_file->f_dentry->d_inode;
1289         if (!out_inode)
1290                 goto fput_out;
1291         retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
1292         if (retval)
1293                 goto fput_out;
1294
1295         retval = 0;
1296         if (count) {
1297                 read_descriptor_t desc;
1298                 loff_t pos = 0, *ppos;
1299
1300                 retval = -EFAULT;
1301                 ppos = &in_file->f_pos;
1302                 if (offset) {
1303                         if (get_user(pos, offset))
1304                                 goto fput_out;
1305                         ppos = &pos;
1306                 }
1307
1308                 desc.written = 0;
1309                 desc.count = count;
1310                 desc.buf = (char *) out_file;
1311                 desc.error = 0;
1312                 do_generic_file_read(in_file, ppos, &desc, file_send_actor);
1313
1314                 retval = desc.written;
1315                 if (!retval)
1316                         retval = desc.error;
1317                 if (offset)
1318                         put_user(pos, offset);
1319         }
1320
1321 fput_out:
1322         fput(out_file);
1323 fput_in:
1324         fput(in_file);
1325 out:
1326         return retval;
1327 }
1328
1329 /*
1330  * Read-ahead and flush behind for MADV_SEQUENTIAL areas.  Since we are
1331  * sure this is sequential access, we don't need a flexible read-ahead
1332  * window size -- we can always use a large fixed size window.
1333  */
1334 static void nopage_sequential_readahead(struct vm_area_struct * vma,
1335         unsigned long pgoff, unsigned long filesize)
1336 {
1337         unsigned long ra_window;
1338
1339         ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
1340         ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);
1341
1342         /* vm_raend is zero if we haven't read ahead in this area yet.  */
1343         if (vma->vm_raend == 0)
1344                 vma->vm_raend = vma->vm_pgoff + ra_window;
1345
1346         /*
1347          * If we've just faulted the page half-way through our window,
1348          * then schedule reads for the next window, and release the
1349          * pages in the previous window.
1350          */
1351         if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
1352                 unsigned long start = vma->vm_pgoff + vma->vm_raend;
1353                 unsigned long end = start + ra_window;
1354
1355                 if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
1356                         end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;
1357                 if (start > end)
1358                         return;
1359
1360                 while ((start < end) && (start < filesize)) {
1361                         if (read_cluster_nonblocking(vma->vm_file,
1362                                                         start, filesize) < 0)
1363                                 break;
1364                         start += CLUSTER_PAGES;
1365                 }
1366                 run_task_queue(&tq_disk);
1367
1368                 /* if we're far enough past the beginning of this area,
1369                    recycle pages that are in the previous window. */
1370                 if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
1371                         unsigned long window = ra_window << PAGE_SHIFT;
1372
1373                         end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
1374                         end -= window + window;
1375                         filemap_sync(vma, end - window, window, MS_INVALIDATE);
1376                 }
1377
1378                 vma->vm_raend += ra_window;
1379         }
1380
1381         return;
1382 }
1383
1384 /*
1385  * filemap_nopage() is invoked via the vma operations vector for a
1386  * mapped memory region to read in file data during a page fault.
1387  *
1388  * The goto's are kind of ugly, but this streamlines the normal case of having
1389  * it in the page cache, and handles the special cases reasonably without
1390  * having a lot of duplicated code.
1391  */
1392 struct page * filemap_nopage(struct vm_area_struct * area,
1393         unsigned long address, int no_share)
1394 {
1395         int error;
1396         struct file *file = area->vm_file;
1397         struct inode *inode = file->f_dentry->d_inode;
1398         struct address_space *mapping = inode->i_mapping;
1399         struct page *page, **hash, *old_page;
1400         unsigned long size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1401
1402         unsigned long pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1403
1404         /*
1405          * Semantics for shared and private memory areas are different
1406          * past the end of the file. A shared mapping past the last page
1407          * of the file is an error and results in a SIGBUS, while a
1408          * private mapping just maps in a zero page.
1409          */
1410         if ((pgoff >= size) && (area->vm_mm == current->mm))
1411                 return NULL;
1412
1413         /*
1414          * Do we have something in the page cache already?
1415          */
1416         hash = page_hash(mapping, pgoff);
1417 retry_find:
1418         page = __find_get_page(mapping, pgoff, hash);
1419         if (!page)
1420                 goto no_cached_page;
1421
1422         /*
1423          * Ok, found a page in the page cache, now we need to check
1424          * that it's up-to-date.
1425          */
1426         if (!Page_Uptodate(page))
1427                 goto page_not_uptodate;
1428
1429 success:
1430         /*
1431          * Try read-ahead for sequential areas.
1432          */
1433         if (VM_SequentialReadHint(area))
1434                 nopage_sequential_readahead(area, pgoff, size);
1435
1436         /*
1437          * Found the page and have a reference on it, need to check sharing
1438          * and possibly copy it over to another page..
1439          */
1440         old_page = page;
1441         if (no_share) {
1442                 struct page *new_page = page_cache_alloc();
1443
1444                 if (new_page) {
1445                         copy_user_highpage(new_page, old_page, address);
1446                         flush_page_to_ram(new_page);
1447                 } else
1448                         new_page = NOPAGE_OOM;
1449                 page_cache_release(page);
1450                 return new_page;
1451         }
1452
1453         flush_page_to_ram(old_page);
1454         return old_page;
1455
1456 no_cached_page:
1457         /*
1458          * If the requested offset is within our file, try to read a whole
1459          * cluster of pages at once.
1460          *
1461          * Otherwise, we're off the end of a privately mapped file,
1462          * so we need to map a zero page.
1463          */
1464         if ((pgoff < size) && !VM_RandomReadHint(area))
1465                 error = read_cluster_nonblocking(file, pgoff, size);
1466         else
1467                 error = page_cache_read(file, pgoff);
1468
1469         /*
1470          * The page we want has now been added to the page cache.
1471          * In the unlikely event that someone removed it in the
1472          * meantime, we'll just come back here and read it again.
1473          */
1474         if (error >= 0)
1475                 goto retry_find;
1476
1477         /*
1478          * An error return from page_cache_read can result if the
1479          * system is low on memory, or a problem occurs while trying
1480          * to schedule I/O.
1481          */
1482         if (error == -ENOMEM)
1483                 return NOPAGE_OOM;
1484         return NULL;
1485
1486 page_not_uptodate:
1487         lock_page(page);
1488         if (Page_Uptodate(page)) {
1489                 UnlockPage(page);
1490                 goto success;
1491         }
1492
1493         if (!mapping->a_ops->readpage(file, page)) {
1494                 wait_on_page(page);
1495                 if (Page_Uptodate(page))
1496                         goto success;
1497         }
1498
1499         /*
1500          * Umm, take care of errors if the page isn't up-to-date.
1501          * Try to re-read it _once_. We do this synchronously,
1502          * because there really aren't any performance issues here
1503          * and we need to check for errors.
1504          */
1505         lock_page(page);
1506         if (Page_Uptodate(page)) {
1507                 UnlockPage(page);
1508                 goto success;
1509         }
1510         ClearPageError(page);
1511         if (!mapping->a_ops->readpage(file, page)) {
1512                 wait_on_page(page);
1513                 if (Page_Uptodate(page))
1514                         goto success;
1515         }
1516
1517         /*
1518          * Things didn't work out. Return zero to tell the
1519          * mm layer so, possibly freeing the page cache page first.
1520          */
1521         page_cache_release(page);
1522         return NULL;
1523 }
1524
1525 static int filemap_write_page(struct file *file,
1526                               struct page * page,
1527                               int wait)
1528 {
1529         /*
1530          * If a task terminates while we're swapping the page, the vma and
1531          * and file could be released: try_to_swap_out has done a get_file.
1532          * vma/file is guaranteed to exist in the unmap/sync cases because
1533          * mmap_sem is held.
1534          */
1535         return page->mapping->a_ops->writepage(file, page);
1536 }
1537
1538
/*
 * swapout handler for shared file mappings: write @page back to @file
 * without waiting, then call wakeup_bdflush(0).  The result of the write
 * is returned.
 *
 * Races between swapping a page out and swapping it in at the same time
 * are left to the page cache.
 */
extern void wakeup_bdflush(int);
int filemap_swapout(struct page * page, struct file * file)
{
	int status;

	status = filemap_write_page(file, page, 0);
	wakeup_bdflush(0);

	return status;
}
1551
1552 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
1553         unsigned long address, unsigned int flags)
1554 {
1555         unsigned long pgoff;
1556         pte_t pte = *ptep;
1557         struct page *page;
1558         int error;
1559
1560         if (!(flags & MS_INVALIDATE)) {
1561                 if (!pte_present(pte))
1562                         return 0;
1563                 if (!pte_dirty(pte))
1564                         return 0;
1565                 flush_page_to_ram(pte_page(pte));
1566                 flush_cache_page(vma, address);
1567                 set_pte(ptep, pte_mkclean(pte));
1568                 flush_tlb_page(vma, address);
1569                 page = pte_page(pte);
1570                 page_cache_get(page);
1571         } else {
1572                 if (pte_none(pte))
1573                         return 0;
1574                 flush_cache_page(vma, address);
1575                 pte_clear(ptep);
1576                 flush_tlb_page(vma, address);
1577                 if (!pte_present(pte)) {
1578                         swap_free(pte_to_swp_entry(pte));
1579                         return 0;
1580                 }
1581                 page = pte_page(pte);
1582                 if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
1583                         page_cache_free(page);
1584                         return 0;
1585                 }
1586         }
1587         pgoff = (address - vma->vm_start) >> PAGE_CACHE_SHIFT;
1588         pgoff += vma->vm_pgoff;
1589         if (page->index != pgoff) {
1590                 printk("weirdness: pgoff=%lu index=%lu address=%lu vm_start=%lu vm_pgoff=%lu\n",
1591                         pgoff, page->index, address, vma->vm_start, vma->vm_pgoff);
1592         }
1593         lock_page(page);
1594         error = filemap_write_page(vma->vm_file, page, 1);
1595         UnlockPage(page);
1596         page_cache_free(page);
1597         return error;
1598 }
1599
1600 static inline int filemap_sync_pte_range(pmd_t * pmd,
1601         unsigned long address, unsigned long size,
1602         struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
1603 {
1604         pte_t * pte;
1605         unsigned long end;
1606         int error;
1607
1608         if (pmd_none(*pmd))
1609                 return 0;
1610         if (pmd_bad(*pmd)) {
1611                 pmd_ERROR(*pmd);
1612                 pmd_clear(pmd);
1613                 return 0;
1614         }
1615         pte = pte_offset(pmd, address);
1616         offset += address & PMD_MASK;
1617         address &= ~PMD_MASK;
1618         end = address + size;
1619         if (end > PMD_SIZE)
1620                 end = PMD_SIZE;
1621         error = 0;
1622         do {
1623                 error |= filemap_sync_pte(pte, vma, address + offset, flags);
1624                 address += PAGE_SIZE;
1625                 pte++;
1626         } while (address && (address < end));
1627         return error;
1628 }
1629
1630 static inline int filemap_sync_pmd_range(pgd_t * pgd,
1631         unsigned long address, unsigned long size,
1632         struct vm_area_struct *vma, unsigned int flags)
1633 {
1634         pmd_t * pmd;
1635         unsigned long offset, end;
1636         int error;
1637
1638         if (pgd_none(*pgd))
1639                 return 0;
1640         if (pgd_bad(*pgd)) {
1641                 pgd_ERROR(*pgd);
1642                 pgd_clear(pgd);
1643                 return 0;
1644         }
1645         pmd = pmd_offset(pgd, address);
1646         offset = address & PGDIR_MASK;
1647         address &= ~PGDIR_MASK;
1648         end = address + size;
1649         if (end > PGDIR_SIZE)
1650                 end = PGDIR_SIZE;
1651         error = 0;
1652         do {
1653                 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
1654                 address = (address + PMD_SIZE) & PMD_MASK;
1655                 pmd++;
1656         } while (address && (address < end));
1657         return error;
1658 }
1659
1660 int filemap_sync(struct vm_area_struct * vma, unsigned long address,
1661         size_t size, unsigned int flags)
1662 {
1663         pgd_t * dir;
1664         unsigned long end = address + size;
1665         int error = 0;
1666
1667         dir = pgd_offset(vma->vm_mm, address);
1668         flush_cache_range(vma->vm_mm, end - size, end);
1669         if (address >= end)
1670                 BUG();
1671         do {
1672                 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
1673                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
1674                 dir++;
1675         } while (address && (address < end));
1676         flush_tlb_range(vma->vm_mm, end - size, end);
1677         return error;
1678 }
1679
1680 /*
1681  * This handles (potentially partial) area unmaps..
1682  */
1683 static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
1684 {
1685         lock_kernel();
1686         filemap_sync(vma, start, len, MS_ASYNC);
1687         unlock_kernel();
1688 }
1689
1690 /*
1691  * Shared mappings need to be able to do the right thing at
1692  * close/unmap/sync. They will also use the private file as
1693  * backing-store for swapping..
1694  */
1695 static struct vm_operations_struct file_shared_mmap = {
1696         unmap:          filemap_unmap,          /* unmap - we need to sync the pages */
1697         sync:           filemap_sync,
1698         nopage:         filemap_nopage,
1699         swapout:        filemap_swapout,
1700 };
1701
1702 /*
1703  * Private mappings just need to be able to load in the map.
1704  *
1705  * (This is actually used for shared mappings as well, if we
1706  * know they can't ever get write permissions..)
1707  */
1708 static struct vm_operations_struct file_private_mmap = {
1709         nopage:         filemap_nopage,
1710 };
1711
1712 /* This is used for a general mmap of a disk file */
1713
1714 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1715 {
1716         struct vm_operations_struct * ops;
1717         struct inode *inode = file->f_dentry->d_inode;
1718
1719         ops = &file_private_mmap;
1720         if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1721                 if (!inode->i_mapping->a_ops->writepage)
1722                         return -EINVAL;
1723                 ops = &file_shared_mmap;
1724         }
1725         if (!inode->i_sb || !S_ISREG(inode->i_mode))
1726                 return -EACCES;
1727         if (!inode->i_mapping->a_ops->readpage)
1728                 return -ENOEXEC;
1729         UPDATE_ATIME(inode);
1730         vma->vm_ops = ops;
1731         return 0;
1732 }
1733
1734 /*
1735  * The msync() system call.
1736  */
1737
1738 static int msync_interval(struct vm_area_struct * vma,
1739         unsigned long start, unsigned long end, int flags)
1740 {
1741         if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
1742                 int error;
1743                 error = vma->vm_ops->sync(vma, start, end-start, flags);
1744                 if (!error && (flags & MS_SYNC)) {
1745                         struct file * file = vma->vm_file;
1746                         if (file && file->f_op && file->f_op->fsync)
1747                                 error = file->f_op->fsync(file, file->f_dentry);
1748                 }
1749                 return error;
1750         }
1751         return 0;
1752 }
1753
1754 asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
1755 {
1756         unsigned long end;
1757         struct vm_area_struct * vma;
1758         int unmapped_error, error = -EINVAL;
1759
1760         down(&current->mm->mmap_sem);
1761         lock_kernel();
1762         if (start & ~PAGE_MASK)
1763                 goto out;
1764         len = (len + ~PAGE_MASK) & PAGE_MASK;
1765         end = start + len;
1766         if (end < start)
1767                 goto out;
1768         if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1769                 goto out;
1770         error = 0;
1771         if (end == start)
1772                 goto out;
1773         /*
1774          * If the interval [start,end) covers some unmapped address ranges,
1775          * just ignore them, but return -EFAULT at the end.
1776          */
1777         vma = find_vma(current->mm, start);
1778         unmapped_error = 0;
1779         for (;;) {
1780                 /* Still start < end. */
1781                 error = -EFAULT;
1782                 if (!vma)
1783                         goto out;
1784                 /* Here start < vma->vm_end. */
1785                 if (start < vma->vm_start) {
1786                         unmapped_error = -EFAULT;
1787                         start = vma->vm_start;
1788                 }
1789                 /* Here vma->vm_start <= start < vma->vm_end. */
1790                 if (end <= vma->vm_end) {
1791                         if (start < end) {
1792                                 error = msync_interval(vma, start, end, flags);
1793                                 if (error)
1794                                         goto out;
1795                         }
1796                         error = unmapped_error;
1797                         goto out;
1798                 }
1799                 /* Here vma->vm_start <= start < vma->vm_end < end. */
1800                 error = msync_interval(vma, start, vma->vm_end, flags);
1801                 if (error)
1802                         goto out;
1803                 start = vma->vm_end;
1804                 vma = vma->vm_next;
1805         }
1806 out:
1807         unlock_kernel();
1808         up(&current->mm->mmap_sem);
1809         return error;
1810 }
1811
1812 static inline void setup_read_behavior(struct vm_area_struct * vma,
1813         int behavior)
1814 {
1815         VM_ClearReadHint(vma);
1816         switch(behavior) {
1817                 case MADV_SEQUENTIAL:
1818                         vma->vm_flags |= VM_SEQ_READ;
1819                         break;
1820                 case MADV_RANDOM:
1821                         vma->vm_flags |= VM_RAND_READ;
1822                         break;
1823                 default:
1824                         break;
1825         }
1826         return;
1827 }
1828
1829 static long madvise_fixup_start(struct vm_area_struct * vma,
1830         unsigned long end, int behavior)
1831 {
1832         struct vm_area_struct * n;
1833
1834         n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1835         if (!n)
1836                 return -EAGAIN;
1837         *n = *vma;
1838         n->vm_end = end;
1839         setup_read_behavior(n, behavior);
1840         n->vm_raend = 0;
1841         get_file(n->vm_file);
1842         if (n->vm_ops && n->vm_ops->open)
1843                 n->vm_ops->open(n);
1844         vmlist_modify_lock(vma->vm_mm);
1845         vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
1846         vma->vm_start = end;
1847         insert_vm_struct(current->mm, n);
1848         vmlist_modify_unlock(vma->vm_mm);
1849         return 0;
1850 }
1851
1852 static long madvise_fixup_end(struct vm_area_struct * vma,
1853         unsigned long start, int behavior)
1854 {
1855         struct vm_area_struct * n;
1856
1857         n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1858         if (!n)
1859                 return -EAGAIN;
1860         *n = *vma;
1861         n->vm_start = start;
1862         n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
1863         setup_read_behavior(n, behavior);
1864         n->vm_raend = 0;
1865         get_file(n->vm_file);
1866         if (n->vm_ops && n->vm_ops->open)
1867                 n->vm_ops->open(n);
1868         vmlist_modify_lock(vma->vm_mm);
1869         vma->vm_end = start;
1870         insert_vm_struct(current->mm, n);
1871         vmlist_modify_unlock(vma->vm_mm);
1872         return 0;
1873 }
1874
1875 static long madvise_fixup_middle(struct vm_area_struct * vma,
1876         unsigned long start, unsigned long end, int behavior)
1877 {
1878         struct vm_area_struct * left, * right;
1879
1880         left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1881         if (!left)
1882                 return -EAGAIN;
1883         right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1884         if (!right) {
1885                 kmem_cache_free(vm_area_cachep, left);
1886                 return -EAGAIN;
1887         }
1888         *left = *vma;
1889         *right = *vma;
1890         left->vm_end = start;
1891         right->vm_start = end;
1892         right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
1893         left->vm_raend = 0;
1894         right->vm_raend = 0;
1895         atomic_add(2, &vma->vm_file->f_count);
1896
1897         if (vma->vm_ops && vma->vm_ops->open) {
1898                 vma->vm_ops->open(left);
1899                 vma->vm_ops->open(right);
1900         }
1901         vmlist_modify_lock(vma->vm_mm);
1902         vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
1903         vma->vm_start = start;
1904         vma->vm_end = end;
1905         setup_read_behavior(vma, behavior);
1906         vma->vm_raend = 0;
1907         insert_vm_struct(current->mm, left);
1908         insert_vm_struct(current->mm, right);
1909         vmlist_modify_unlock(vma->vm_mm);
1910         return 0;
1911 }
1912
1913 /*
1914  * We can potentially split a vm area into separate
1915  * areas, each area with its own behavior.
1916  */
1917 static long madvise_behavior(struct vm_area_struct * vma,
1918         unsigned long start, unsigned long end, int behavior)
1919 {
1920         int error = 0;
1921
1922         /* This caps the number of vma's this process can own */
1923         if (vma->vm_mm->map_count > MAX_MAP_COUNT)
1924                 return -ENOMEM;
1925
1926         if (start == vma->vm_start) {
1927                 if (end == vma->vm_end) {
1928                         setup_read_behavior(vma, behavior);
1929                         vma->vm_raend = 0;
1930                 } else
1931                         error = madvise_fixup_start(vma, end, behavior);
1932         } else {
1933                 if (end == vma->vm_end)
1934                         error = madvise_fixup_end(vma, start, behavior);
1935                 else
1936                         error = madvise_fixup_middle(vma, start, end, behavior);
1937         }
1938
1939         return error;
1940 }
1941
1942 /*
1943  * Schedule all required I/O operations, then run the disk queue
1944  * to make sure they are started.  Do not wait for completion.
1945  */
1946 static long madvise_willneed(struct vm_area_struct * vma,
1947         unsigned long start, unsigned long end)
1948 {
1949         long error = -EBADF;
1950         struct file * file;
1951         unsigned long size, rlim_rss;
1952
1953         /* Doesn't work if there's no mapped file. */
1954         if (!vma->vm_file)
1955                 return error;
1956         file = vma->vm_file;
1957         size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
1958                                                         PAGE_CACHE_SHIFT;
1959
1960         start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1961         if (end > vma->vm_end)
1962                 end = vma->vm_end;
1963         end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1964
1965         /* Make sure this doesn't exceed the process's max rss. */
1966         error = -EIO;
1967         rlim_rss = current->rlim ?  current->rlim[RLIMIT_RSS].rlim_cur :
1968                                 LONG_MAX; /* default: see resource.h */
1969         if ((vma->vm_mm->rss + (end - start)) > rlim_rss)
1970                 return error;
1971
1972         /* round to cluster boundaries if this isn't a "random" area. */
1973         if (!VM_RandomReadHint(vma)) {
1974                 start = CLUSTER_OFFSET(start);
1975                 end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);
1976
1977                 while ((start < end) && (start < size)) {
1978                         error = read_cluster_nonblocking(file, start, size);
1979                         start += CLUSTER_PAGES;
1980                         if (error < 0)
1981                                 break;
1982                 }
1983         } else {
1984                 while ((start < end) && (start < size)) {
1985                         error = page_cache_read(file, start);
1986                         start++;
1987                         if (error < 0)
1988                                 break;
1989                 }
1990         }
1991
1992         /* Don't wait for someone else to push these requests. */
1993         run_task_queue(&tq_disk);
1994
1995         return error;
1996 }
1997
1998 /*
1999  * Application no longer needs these pages.  If the pages are dirty,
2000  * it's OK to just throw them away.  The app will be more careful about
2001  * data it wants to keep.  Be sure to free swap resources too.  The
2002  * zap_page_range call sets things up for shrink_mmap to actually free
2003  * these pages later if no one else has touched them in the meantime,
2004  * although we could add these pages to a global reuse list for
2005  * shrink_mmap to pick up before reclaiming other pages.
2006  *
2007  * NB: This interface discards data rather than pushes it out to swap,
2008  * as some implementations do.  This has performance implications for
2009  * applications like large transactional databases which want to discard
2010  * pages in anonymous maps after committing to backing store the data
2011  * that was kept in them.  There is no reason to write this data out to
2012  * the swap area if the application is discarding it.
2013  *
2014  * An interface that causes the system to free clean pages and flush
2015  * dirty pages is already available as msync(MS_INVALIDATE).
2016  */
2017 static long madvise_dontneed(struct vm_area_struct * vma,
2018         unsigned long start, unsigned long end)
2019 {
2020         if (vma->vm_flags & VM_LOCKED)
2021                 return -EINVAL;
2022
2023         lock_kernel();  /* is this really necessary? */
2024
2025         flush_cache_range(vma->vm_mm, start, end);
2026         zap_page_range(vma->vm_mm, start, end - start);
2027         flush_tlb_range(vma->vm_mm, start, end);
2028
2029         unlock_kernel();
2030         return 0;
2031 }
2032
2033 static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
2034         unsigned long end, int behavior)
2035 {
2036         long error = -EBADF;
2037
2038         switch (behavior) {
2039         case MADV_NORMAL:
2040         case MADV_SEQUENTIAL:
2041         case MADV_RANDOM:
2042                 error = madvise_behavior(vma, start, end, behavior);
2043                 break;
2044
2045         case MADV_WILLNEED:
2046                 error = madvise_willneed(vma, start, end);
2047                 break;
2048
2049         case MADV_DONTNEED:
2050                 error = madvise_dontneed(vma, start, end);
2051                 break;
2052
2053         default:
2054                 error = -EINVAL;
2055                 break;
2056         }
2057
2058         return error;
2059 }
2060
2061 /*
2062  * The madvise(2) system call.
2063  *
2064  * Applications can use madvise() to advise the kernel how it should
2065  * handle paging I/O in this VM area.  The idea is to help the kernel
2066  * use appropriate read-ahead and caching techniques.  The information
2067  * provided is advisory only, and can be safely disregarded by the
2068  * kernel without affecting the correct operation of the application.
2069  *
2070  * behavior values:
2071  *  MADV_NORMAL - the default behavior is to read clusters.  This
2072  *              results in some read-ahead and read-behind.
2073  *  MADV_RANDOM - the system should read the minimum amount of data
2074  *              on any access, since it is unlikely that the appli-
2075  *              cation will need more than what it asks for.
2076  *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
2077  *              once, so they can be aggressively read ahead, and
2078  *              can be freed soon after they are accessed.
2079  *  MADV_WILLNEED - the application is notifying the system to read
2080  *              some pages ahead.
2081  *  MADV_DONTNEED - the application is finished with the given range,
2082  *              so the kernel can free resources associated with it.
2083  *
2084  * return values:
2085  *  zero    - success
2086  *  -EINVAL - start + len < 0, start is not page-aligned,
2087  *              "behavior" is not a valid value, or application
2088  *              is attempting to release locked or shared pages.
2089  *  -ENOMEM - addresses in the specified range are not currently
2090  *              mapped, or are outside the AS of the process.
2091  *  -EIO    - an I/O error occurred while paging in data.
2092  *  -EBADF  - map exists, but area maps something that isn't a file.
2093  *  -EAGAIN - a kernel resource was temporarily unavailable.
2094  */
2095 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
2096 {
2097         unsigned long end;
2098         struct vm_area_struct * vma;
2099         int unmapped_error = 0;
2100         int error = -EINVAL;
2101
2102         down(&current->mm->mmap_sem);
2103
2104         if (start & ~PAGE_MASK)
2105                 goto out;
2106         len = (len + ~PAGE_MASK) & PAGE_MASK;
2107         end = start + len;
2108         if (end < start)
2109                 goto out;
2110
2111         error = 0;
2112         if (end == start)
2113                 goto out;
2114
2115         /*
2116          * If the interval [start,end) covers some unmapped address
2117          * ranges, just ignore them, but return -ENOMEM at the end.
2118          */
2119         vma = find_vma(current->mm, start);
2120         for (;;) {
2121                 /* Still start < end. */
2122                 error = -ENOMEM;
2123                 if (!vma)
2124                         goto out;
2125
2126                 /* Here start < vma->vm_end. */
2127                 if (start < vma->vm_start) {
2128                         unmapped_error = -ENOMEM;
2129                         start = vma->vm_start;
2130                 }
2131
2132                 /* Here vma->vm_start <= start < vma->vm_end. */
2133                 if (end <= vma->vm_end) {
2134                         if (start < end) {
2135                                 error = madvise_vma(vma, start, end,
2136                                                         behavior);
2137                                 if (error)
2138                                         goto out;
2139                         }
2140                         error = unmapped_error;
2141                         goto out;
2142                 }
2143
2144                 /* Here vma->vm_start <= start < vma->vm_end < end. */
2145                 error = madvise_vma(vma, start, vma->vm_end, behavior);
2146                 if (error)
2147                         goto out;
2148                 start = vma->vm_end;
2149                 vma = vma->vm_next;
2150         }
2151
2152 out:
2153         up(&current->mm->mmap_sem);
2154         return error;
2155 }
2156
2157 /*
2158  * Later we can get more picky about what "in core" means precisely.
2159  * For now, simply check to see if the page is in the page cache,
2160  * and is up to date; i.e. that no page-in operation would be required
2161  * at this time if an application were to map and access this page.
2162  */
2163 static unsigned char mincore_page(struct vm_area_struct * vma,
2164         unsigned long pgoff)
2165 {
2166         unsigned char present = 0;
2167         struct address_space * as = &vma->vm_file->f_dentry->d_inode->i_data;
2168         struct page * page, ** hash = page_hash(as, pgoff);
2169
2170         spin_lock(&pagecache_lock);
2171         page = __find_page_nolock(as, pgoff, *hash);
2172         if ((page) && (Page_Uptodate(page)))
2173                 present = 1;
2174         spin_unlock(&pagecache_lock);
2175
2176         return present;
2177 }
2178
2179 static long mincore_vma(struct vm_area_struct * vma,
2180         unsigned long start, unsigned long end, unsigned char * vec)
2181 {
2182         long error, i, remaining;
2183         unsigned char * tmp;
2184
2185         error = -ENOMEM;
2186         if (!vma->vm_file)
2187                 return error;
2188
2189         start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2190         if (end > vma->vm_end)
2191                 end = vma->vm_end;
2192         end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2193
2194         error = -EAGAIN;
2195         tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
2196         if (!tmp)
2197                 return error;
2198
2199         /* (end - start) is # of pages, and also # of bytes in "vec */
2200         remaining = (end - start),
2201
2202         error = 0;
2203         for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
2204                 int j = 0;
2205                 long thispiece = (remaining < PAGE_SIZE) ?
2206                                                 remaining : PAGE_SIZE;
2207
2208                 while (j < thispiece)
2209                         tmp[j++] = mincore_page(vma, start++);
2210
2211                 if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
2212                         error = -EFAULT;
2213                         break;
2214                 }
2215         }
2216
2217         free_page((unsigned long) tmp);
2218         return error;
2219 }
2220
2221 /*
2222  * The mincore(2) system call.
2223  *
2224  * mincore() returns the memory residency status of the pages in the
2225  * current process's address space specified by [addr, addr + len).
2226  * The status is returned in a vector of bytes.  The least significant
2227  * bit of each byte is 1 if the referenced page is in memory, otherwise
2228  * it is zero.
2229  *
2230  * Because the status of a page can change after mincore() checks it
2231  * but before it returns to the application, the returned vector may
2232  * contain stale information.  Only locked pages are guaranteed to
2233  * remain in memory.
2234  *
2235  * return values:
2236  *  zero    - success
2237  *  -EFAULT - vec points to an illegal address
2238  *  -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
2239  *              or len has a nonpositive value
2240  *  -ENOMEM - Addresses in the range [addr, addr + len] are
2241  *              invalid for the address space of this process, or
2242  *              specify one or more pages which are not currently
2243  *              mapped
2244  *  -EAGAIN - A kernel resource was temporarily unavailable.
2245  */
2246 asmlinkage long sys_mincore(unsigned long start, size_t len,
2247         unsigned char * vec)
2248 {
2249         int index = 0;
2250         unsigned long end;
2251         struct vm_area_struct * vma;
2252         int unmapped_error = 0;
2253         long error = -EINVAL;
2254
2255         down(&current->mm->mmap_sem);
2256
2257         if (start & ~PAGE_CACHE_MASK)
2258                 goto out;
2259         len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK;
2260         end = start + len;
2261         if (end < start)
2262                 goto out;
2263
2264         error = 0;
2265         if (end == start)
2266                 goto out;
2267
2268         /*
2269          * If the interval [start,end) covers some unmapped address
2270          * ranges, just ignore them, but return -ENOMEM at the end.
2271          */
2272         vma = find_vma(current->mm, start);
2273         for (;;) {
2274                 /* Still start < end. */
2275                 error = -ENOMEM;
2276                 if (!vma)
2277                         goto out;
2278
2279                 /* Here start < vma->vm_end. */
2280                 if (start < vma->vm_start) {
2281                         unmapped_error = -ENOMEM;
2282                         start = vma->vm_start;
2283                 }
2284
2285                 /* Here vma->vm_start <= start < vma->vm_end. */
2286                 if (end <= vma->vm_end) {
2287                         if (start < end) {
2288                                 error = mincore_vma(vma, start, end,
2289                                                         &vec[index]);
2290                                 if (error)
2291                                         goto out;
2292                         }
2293                         error = unmapped_error;
2294                         goto out;
2295                 }
2296
2297                 /* Here vma->vm_start <= start < vma->vm_end < end. */
2298                 error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
2299                 if (error)
2300                         goto out;
2301                 index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
2302                 start = vma->vm_end;
2303                 vma = vma->vm_next;
2304         }
2305
2306 out:
2307         up(&current->mm->mmap_sem);
2308         return error;
2309 }
2310
2311 static inline
2312 struct page *__read_cache_page(struct address_space *mapping,
2313                                 unsigned long index,
2314                                 int (*filler)(void *,struct page*),
2315                                 void *data)
2316 {
2317         struct page **hash = page_hash(mapping, index);
2318         struct page *page, *cached_page = NULL;
2319         int err;
2320 repeat:
2321         page = __find_get_page(mapping, index, hash);
2322         if (!page) {
2323                 if (!cached_page) {
2324                         cached_page = page_cache_alloc();
2325                         if (!cached_page)
2326                                 return ERR_PTR(-ENOMEM);
2327                 }
2328                 page = cached_page;
2329                 if (add_to_page_cache_unique(page, mapping, index, hash))
2330                         goto repeat;
2331                 cached_page = NULL;
2332                 err = filler(data, page);
2333                 if (err < 0) {
2334                         page_cache_release(page);
2335                         page = ERR_PTR(err);
2336                 }
2337         }
2338         if (cached_page)
2339                 page_cache_free(cached_page);
2340         return page;
2341 }
2342
/*
 * Read a page into the page cache.
 *
 * The page is obtained through __read_cache_page().  If it comes back
 * without Page_Uptodate() set, the page is locked and, unless it became
 * up to date in the meantime, the filler is run on it (again).
 *
 * Returns the page, or an ERR_PTR() value on failure.
 */
struct page *read_cache_page(struct address_space *mapping,
                                unsigned long index,
                                int (*filler)(void *,struct page*),
                                void *data)
{
        struct page *page = __read_cache_page(mapping, index, filler, data);
        int err;

        if (IS_ERR(page) || Page_Uptodate(page))
                return page;

        lock_page(page);
        if (Page_Uptodate(page)) {
                /* Became up to date before the lock was obtained. */
                UnlockPage(page);
                return page;
        }

        err = filler(data, page);
        if (err < 0) {
                page_cache_release(page);
                page = ERR_PTR(err);
        }
        return page;
}
2371
2372 static inline struct page * __grab_cache_page(struct address_space *mapping,
2373                                 unsigned long index, struct page **cached_page)
2374 {
2375         struct page *page, **hash = page_hash(mapping, index);
2376 repeat:
2377         page = __find_lock_page(mapping, index, hash);
2378         if (!page) {
2379                 if (!*cached_page) {
2380                         *cached_page = page_cache_alloc();
2381                         if (!*cached_page)
2382                                 return NULL;
2383                 }
2384                 page = *cached_page;
2385                 if (add_to_page_cache_unique(page, mapping, index, hash))
2386                         goto repeat;
2387                 *cached_page = NULL;
2388         }
2389         return page;
2390 }
2391
2392 /*
2393  * Returns locked page at given index in given cache, creating it if needed.
2394  */
2395
2396 struct page *grab_cache_page(struct address_space *mapping, unsigned long index)
2397 {
2398         struct page *cached_page = NULL;
2399         struct page *page = __grab_cache_page(mapping,index,&cached_page);
2400         if (cached_page)
2401                 page_cache_free(cached_page);
2402         return page;
2403 }
2404
2405 static inline void remove_suid(struct inode *inode)
2406 {
2407         unsigned int mode;
2408
2409         /* set S_IGID if S_IXGRP is set, and always set S_ISUID */
2410         mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
2411
2412         /* was any of the uid bits set? */
2413         mode &= inode->i_mode;
2414         if (mode && !capable(CAP_FSETID)) {
2415                 inode->i_mode &= ~mode;
2416                 mark_inode_dirty(inode);
2417         }
2418 }
2419
2420 /*
2421  * Write to a file through the page cache.
2422  *
2423  * We currently put everything into the page cache prior to writing it.
2424  * This is not a problem when writing full pages. With partial pages,
2425  * however, we first have to read the data into the cache, then
2426  * dirty the page, and finally schedule it for writing. Alternatively, we
2427  * could write-through just the portion of data that would go into that
2428  * page, but that would kill performance for applications that write data
2429  * line by line, and it's prone to race conditions.
2430  *
2431  * Note that this routine doesn't try to keep track of dirty pages. Each
2432  * file system has to do this all by itself, unfortunately.
2433  *                                                      okir@monad.swb.de
2434  */
2435 ssize_t
2436 generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
2437 {
2438         struct inode    *inode = file->f_dentry->d_inode;
2439         struct address_space *mapping = inode->i_mapping;
2440         unsigned long   limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
2441         loff_t          pos;
2442         struct page     *page, *cached_page;
2443         unsigned long   written;
2444         long            status;
2445         int             err;
2446
2447         cached_page = NULL;
2448
2449         down(&inode->i_sem);
2450
2451         pos = *ppos;
2452         err = -EINVAL;
2453         if (pos < 0)
2454                 goto out;
2455
2456         err = file->f_error;
2457         if (err) {
2458                 file->f_error = 0;
2459                 goto out;
2460         }
2461
2462         written = 0;
2463
2464         if (file->f_flags & O_APPEND)
2465                 pos = inode->i_size;
2466
2467         /*
2468          * Check whether we've reached the file size limit.
2469          */
2470         err = -EFBIG;
2471         if (limit != RLIM_INFINITY) {
2472                 if (pos >= limit) {
2473                         send_sig(SIGXFSZ, current, 0);
2474                         goto out;
2475                 }
2476                 if (count > limit - pos) {
2477                         send_sig(SIGXFSZ, current, 0);
2478                         count = limit - pos;
2479                 }
2480         }
2481
2482         status  = 0;
2483         if (count) {
2484                 remove_suid(inode);
2485                 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
2486                 mark_inode_dirty(inode);
2487         }
2488
2489         while (count) {
2490                 unsigned long bytes, index, offset;
2491                 char *kaddr;
2492
2493                 /*
2494                  * Try to find the page in the cache. If it isn't there,
2495                  * allocate a free page.
2496                  */
2497                 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
2498                 index = pos >> PAGE_CACHE_SHIFT;
2499                 bytes = PAGE_CACHE_SIZE - offset;
2500                 if (bytes > count)
2501                         bytes = count;
2502
2503                 status = -ENOMEM;       /* we'll assign it later anyway */
2504                 page = __grab_cache_page(mapping, index, &cached_page);
2505                 if (!page)
2506                         break;
2507
2508                 /* We have exclusive IO access to the page.. */
2509                 if (!PageLocked(page)) {
2510                         PAGE_BUG(page);
2511                 }
2512
2513                 status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
2514                 if (status)
2515                         goto unlock;
2516                 kaddr = (char*)page_address(page);
2517                 status = copy_from_user(kaddr+offset, buf, bytes);
2518                 if (status)
2519                         goto fail_write;
2520                 status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
2521                 if (!status)
2522                         status = bytes;
2523
2524                 if (status >= 0) {
2525                         written += status;
2526                         count -= status;
2527                         pos += status;
2528                         buf += status;
2529                 }
2530 unlock:
2531                 /* Mark it unlocked again and drop the page.. */
2532                 UnlockPage(page);
2533                 page_cache_release(page);
2534
2535                 if (status < 0)
2536                         break;
2537         }
2538         *ppos = pos;
2539
2540         if (cached_page)
2541                 page_cache_free(cached_page);
2542
2543         err = written ? written : status;
2544 out:
2545         up(&inode->i_sem);
2546         return err;
2547 fail_write:
2548         status = -EFAULT;
2549         ClearPageUptodate(page);
2550         kunmap(page);
2551         goto unlock;
2552 }
2553
2554 void __init page_cache_init(unsigned long mempages)
2555 {
2556         unsigned long htable_size, order;
2557
2558         htable_size = mempages;
2559         htable_size *= sizeof(struct page *);
2560         for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
2561                 ;
2562
2563         do {
2564                 unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
2565
2566                 page_hash_bits = 0;
2567                 while((tmp >>= 1UL) != 0UL)
2568                         page_hash_bits++;
2569
2570                 page_hash_table = (struct page **)
2571                         __get_free_pages(GFP_ATOMIC, order);
2572         } while(page_hash_table == NULL && --order > 0);
2573
2574         printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
2575                (1 << page_hash_bits), order, (PAGE_SIZE << order));
2576         if (!page_hash_table)
2577                 panic("Failed to allocate page hash table\n");
2578         memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
2579 }