mm/filemap.c

   1 /*
   2  *      linux/mm/filemap.c
   3  *
   4  * Copyright (C) 1994-1999  Linus Torvalds
   5  */
   6
   7 /*
   8  * This file handles the generic file mmap semantics used by
   9  * most "normal" filesystems (but you don't /have/ to use this:
  10  * the NFS filesystem used to do this differently, for example)
  11  */
  12 #include <linux/malloc.h>
  13 #include <linux/shm.h>
  14 #include <linux/mman.h>
  15 #include <linux/locks.h>
  16 #include <linux/pagemap.h>
  17 #include <linux/swap.h>
  18 #include <linux/smp_lock.h>
  19 #include <linux/blkdev.h>
  20 #include <linux/file.h>
  21 #include <linux/swapctl.h>
  22 #include <linux/slab.h>
  23 #include <linux/init.h>
  24 #include <linux/mm.h>
  25
  26 #include <asm/pgalloc.h>
  27 #include <asm/uaccess.h>
  28 #include <asm/mman.h>
  29
  30 #include <linux/highmem.h>
  31
  32 /*
  33  * Shared mappings implemented 30.11.1994. It's not fully working yet,
  34  * though.
  35  *
  36  * Shared mappings now work. 15.8.1995  Bruno.
  37  *
  38  * finished 'unifying' the page and buffer cache and SMP-threaded the
  39  * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
  40  *
  41  * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
  42  */
  43
  44 atomic_t page_cache_size = ATOMIC_INIT(0);
  45 unsigned int page_hash_bits;
  46 struct page **page_hash_table;
  47 struct list_head lru_cache;
  48
  49 static spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
  50 /*
  51  * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
  52  *       the pagemap_lru_lock held.
  53  */
  54 spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;
  55
  56 #define CLUSTER_PAGES           (1 << page_cluster)
  57 #define CLUSTER_OFFSET(x)       (((x) >> page_cluster) << page_cluster)
  58
  59 void __add_page_to_hash_queue(struct page * page, struct page **p)
  60 {
  61         atomic_inc(&page_cache_size);
  62         if((page->next_hash = *p) != NULL)
  63                 (*p)->pprev_hash = &page->next_hash;
  64         *p = page;
  65         page->pprev_hash = p;
  66         if (page->buffers)
  67                 PAGE_BUG(page);
  68 }
  69
  70 static inline void remove_page_from_hash_queue(struct page * page)
  71 {
  72         if(page->pprev_hash) {
  73                 if(page->next_hash)
  74                         page->next_hash->pprev_hash = page->pprev_hash;
  75                 *page->pprev_hash = page->next_hash;
  76                 page->pprev_hash = NULL;
  77         }
  78         atomic_dec(&page_cache_size);
  79 }
  80
  81 static inline int sync_page(struct page *page)
  82 {
  83         struct address_space *mapping = page->mapping;
  84
  85         if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
  86                 return mapping->a_ops->sync_page(page);
  87         return 0;
  88 }
  89
  90 /*
  91  * Remove a page from the page cache and free it. Caller has to make
  92  * sure the page is locked and that nobody else uses it - or that usage
  93  * is safe.
  94  */
  95 static inline void __remove_inode_page(struct page *page)
  96 {
  97         remove_page_from_inode_queue(page);
  98         remove_page_from_hash_queue(page);
  99         page->mapping = NULL;
 100 }
 101
 102 void remove_inode_page(struct page *page)
 103 {
 104         if (!PageLocked(page))
 105                 PAGE_BUG(page);
 106
 107         spin_lock(&pagecache_lock);
 108         __remove_inode_page(page);
 109         spin_unlock(&pagecache_lock);
 110 }
 111
 112 /**
 113  * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
 114  * @inode: the inode which pages we want to invalidate
 115  *
 116  * This function only removes the unlocked pages, if you want to
 117  * remove all the pages of one inode, you must call truncate_inode_pages.
 118  */
 119
 120 void invalidate_inode_pages(struct inode * inode)
 121 {
 122         struct list_head *head, *curr;
 123         struct page * page;
 124
 125         head = &inode->i_mapping->pages;
 126
 127         spin_lock(&pagecache_lock);
 128         spin_lock(&pagemap_lru_lock);
 129         curr = head->next;
 130
 131         while (curr != head) {
 132                 page = list_entry(curr, struct page, list);
 133                 curr = curr->next;
 134
 135                 /* We cannot invalidate a locked page */
 136                 if (TryLockPage(page))
 137                         continue;
 138
 139                 __lru_cache_del(page);
 140                 __remove_inode_page(page);
 141                 UnlockPage(page);
 142                 page_cache_release(page);
 143         }
 144
 145         spin_unlock(&pagemap_lru_lock);
 146         spin_unlock(&pagecache_lock);
 147 }
 148
 149 /*
 150  * Truncate the page cache at a set offset, removing the pages
 151  * that are beyond that offset (and zeroing out partial pages).
 152  */
 153 void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
 154 {
 155         struct list_head *head, *curr;
 156         struct page * page;
 157         unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
 158         unsigned long start;
 159
 160         start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 161
 162 repeat:
 163         head = &mapping->pages;
 164         spin_lock(&pagecache_lock);
 165         curr = head->next;
 166         while (curr != head) {
 167                 unsigned long offset;
 168
 169                 page = list_entry(curr, struct page, list);
 170                 curr = curr->next;
 171
 172                 offset = page->index;
 173
 174                 /* page wholly truncated - free it */
 175                 if (offset >= start) {
 176                         if (TryLockPage(page)) {
 177                                 page_cache_get(page);
 178                                 spin_unlock(&pagecache_lock);
 179                                 wait_on_page(page);
 180                                 page_cache_release(page);
 181                                 goto repeat;
 182                         }
 183                         page_cache_get(page);
 184                         spin_unlock(&pagecache_lock);
 185
 186                         if (!page->buffers || block_flushpage(page, 0))
 187                                 lru_cache_del(page);
 188
 189                         /*
 190                          * We remove the page from the page cache
 191                          * _after_ we have destroyed all buffer-cache
 192                          * references to it. Otherwise some other process
 193                          * might think this inode page is not in the
 194                          * page cache and creates a buffer-cache alias
 195                          * to it causing all sorts of fun problems ...
 196                          */
 197                         remove_inode_page(page);
 198                         ClearPageDirty(page);
 199
 200                         UnlockPage(page);
 201                         page_cache_release(page);
 202                         page_cache_release(page);
 203
 204                         /*
 205                          * We have done things without the pagecache lock,
 206                          * so we'll have to repeat the scan.
 207                          * It's not possible to deadlock here because
 208                          * we are guaranteed to make progress. (ie. we have
 209                          * just removed a page)
 210                          */
 211                         goto repeat;
 212                 }
 213                 /*
 214                  * there is only one partial page possible.
 215                  */
 216                 if (!partial)
 217                         continue;
 218
 219                 /* and it's the one preceeding the first wholly truncated page */
 220                 if ((offset + 1) != start)
 221                         continue;
 222
 223                 /* partial truncate, clear end of page */
 224                 if (TryLockPage(page)) {
 225                         spin_unlock(&pagecache_lock);
 226                         goto repeat;
 227                 }
 228                 page_cache_get(page);
 229                 spin_unlock(&pagecache_lock);
 230
 231                 memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
 232                 if (page->buffers)
 233                         block_flushpage(page, partial);
 234
 235                 partial = 0;
 236
 237                 /*
 238                  * we have dropped the spinlock so we have to
 239                  * restart.
 240                  */
 241                 UnlockPage(page);
 242                 page_cache_release(page);
 243                 goto repeat;
 244         }
 245         spin_unlock(&pagecache_lock);
 246 }
 247
 248 /*
 249  * nr_dirty represents the number of dirty pages that we will write async
 250  * before doing sync writes.  We can only do sync writes if we can
 251  * wait for IO (__GFP_IO set).
 252  */
 253 int shrink_mmap(int priority, int gfp_mask)
 254 {
 255         int ret = 0, count, nr_dirty;
 256         struct list_head * page_lru;
 257         struct page * page = NULL;
 258
 259         count = nr_lru_pages / (priority + 1);
 260         nr_dirty = priority;
 261
 262         /* we need pagemap_lru_lock for list_del() ... subtle code below */
 263         spin_lock(&pagemap_lru_lock);
 264         while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
 265                 page = list_entry(page_lru, struct page, lru);
 266                 list_del(page_lru);
 267
 268                 if (PageTestandClearReferenced(page))
 269                         goto dispose_continue;
 270
 271                 count--;
 272                 /*
 273                  * Avoid unscalable SMP locking for pages we can
 274                  * immediate tell are untouchable..
 275                  */
 276                 if (!page->buffers && page_count(page) > 1)
 277                         goto dispose_continue;
 278
 279                 if (TryLockPage(page))
 280                         goto dispose_continue;
 281
 282                 /* Release the pagemap_lru lock even if the page is not yet
 283                    queued in any lru queue since we have just locked down
 284                    the page so nobody else may SMP race with us running
 285                    a lru_cache_del() (lru_cache_del() always run with the
 286                    page locked down ;). */
 287                 spin_unlock(&pagemap_lru_lock);
 288
 289                 /* avoid freeing the page while it's locked */
 290                 page_cache_get(page);
 291
 292                 /*
 293                  * Is it a buffer page? Try to clean it up regardless
 294                  * of zone - it's old.
 295                  */
 296                 if (page->buffers) {
 297                         int wait = ((gfp_mask & __GFP_IO) && (nr_dirty-- < 0));
 298                         if (!try_to_free_buffers(page, wait))
 299                                 goto unlock_continue;
 300                         /* page was locked, inode can't go away under us */
 301                         if (!page->mapping) {
 302                                 atomic_dec(&buffermem_pages);
 303                                 goto made_buffer_progress;
 304                         }
 305                 }
 306
 307                 /* Take the pagecache_lock spinlock held to avoid
 308                    other tasks to notice the page while we are looking at its
 309                    page count. If it's a pagecache-page we'll free it
 310                    in one atomic transaction after checking its page count. */
 311                 spin_lock(&pagecache_lock);
 312
 313                 /*
 314                  * We can't free pages unless there's just one user
 315                  * (count == 2 because we added one ourselves above).
 316                  */
 317                 if (page_count(page) != 2)
 318                         goto cache_unlock_continue;
 319
 320                 /*
 321                  * Is it a page swap page? If so, we want to
 322                  * drop it if it is no longer used, even if it
 323                  * were to be marked referenced..
 324                  */
 325                 if (PageSwapCache(page)) {
 326                         spin_unlock(&pagecache_lock);
 327                         __delete_from_swap_cache(page);
 328                         goto made_inode_progress;
 329                 }
 330
 331                 /*
 332                  * Page is from a zone we don't care about.
 333                  * Don't drop page cache entries in vain.
 334                  */
 335                 if (page->zone->free_pages > page->zone->pages_high)
 336                         goto cache_unlock_continue;
 337
 338                 /* is it a page-cache page? */
 339                 if (page->mapping) {
 340                         if (!PageDirty(page) && !pgcache_under_min()) {
 341                                 __remove_inode_page(page);
 342                                 spin_unlock(&pagecache_lock);
 343                                 goto made_inode_progress;
 344                         }
 345                         goto cache_unlock_continue;
 346                 }
 347
 348                 printk(KERN_ERR "shrink_mmap: unknown LRU page!\n");
 349
 350 cache_unlock_continue:
 351                 spin_unlock(&pagecache_lock);
 352 unlock_continue:
 353                 spin_lock(&pagemap_lru_lock);
 354                 UnlockPage(page);
 355                 page_cache_release(page);
 356 dispose_continue:
 357                 list_add(page_lru, &lru_cache);
 358         }
 359         goto out;
 360
 361 made_inode_progress:
 362         page_cache_release(page);
 363 made_buffer_progress:
 364         UnlockPage(page);
 365         page_cache_release(page);
 366         ret = 1;
 367         spin_lock(&pagemap_lru_lock);
 368         /* nr_lru_pages needs the spinlock */
 369         nr_lru_pages--;
 370
 371 out:
 372         spin_unlock(&pagemap_lru_lock);
 373
 374         return ret;
 375 }
 376
 377 static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
 378 {
 379         goto inside;
 380
 381         for (;;) {
 382                 page = page->next_hash;
 383 inside:
 384                 if (!page)
 385                         goto not_found;
 386                 if (page->mapping != mapping)
 387                         continue;
 388                 if (page->index == offset)
 389                         break;
 390         }
 391         SetPageReferenced(page);
 392 not_found:
 393         return page;
 394 }
 395
 396 /*
 397  * By the time this is called, the page is locked and
 398  * we don't have to worry about any races any more.
 399  *
 400  * Start the IO..
 401  */
 402 static int writeout_one_page(struct page *page)
 403 {
 404         struct buffer_head *bh, *head = page->buffers;
 405
 406         bh = head;
 407         do {
 408                 if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
 409                         continue;
 410
 411                 bh->b_flushtime = 0;
 412                 ll_rw_block(WRITE, 1, &bh);
 413         } while ((bh = bh->b_this_page) != head);
 414         return 0;
 415 }
 416
 417 static int waitfor_one_page(struct page *page)
 418 {
 419         int error = 0;
 420         struct buffer_head *bh, *head = page->buffers;
 421
 422         bh = head;
 423         do {
 424                 wait_on_buffer(bh);
 425                 if (buffer_req(bh) && !buffer_uptodate(bh))
 426                         error = -EIO;
 427         } while ((bh = bh->b_this_page) != head);
 428         return error;
 429 }
 430
 431 static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end, int (*fn)(struct page *))
 432 {
 433         struct list_head *head, *curr;
 434         struct page *page;
 435         int retval = 0;
 436
 437         head = &inode->i_mapping->pages;
 438
 439         spin_lock(&pagecache_lock);
 440         curr = head->next;
 441         while (curr != head) {
 442                 page = list_entry(curr, struct page, list);
 443                 curr = curr->next;
 444                 if (!page->buffers)
 445                         continue;
 446                 if (page->index >= end)
 447                         continue;
 448                 if (page->index < start)
 449                         continue;
 450
 451                 page_cache_get(page);
 452                 spin_unlock(&pagecache_lock);
 453                 lock_page(page);
 454
 455                 /* The buffers could have been free'd while we waited for the page lock */
 456                 if (page->buffers)
 457                         retval |= fn(page);
 458
 459                 UnlockPage(page);
 460                 spin_lock(&pagecache_lock);
 461                 curr = page->list.next;
 462                 page_cache_release(page);
 463         }
 464         spin_unlock(&pagecache_lock);
 465
 466         return retval;
 467 }
 468
 469 /*
 470  * Two-stage data sync: first start the IO, then go back and
 471  * collect the information..
 472  */
 473 int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
 474 {
 475         int retval;
 476
 477         retval = do_buffer_fdatasync(inode, start_idx, end_idx, writeout_one_page);
 478         retval |= do_buffer_fdatasync(inode, start_idx, end_idx, waitfor_one_page);
 479         return retval;
 480 }
 481
 482 /*
 483  * Add a page to the inode page cache.
 484  *
 485  * The caller must have locked the page and
 486  * set all the page flags correctly..
 487  */
 488 void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
 489 {
 490         if (!PageLocked(page))
 491                 BUG();
 492
 493         page_cache_get(page);
 494         spin_lock(&pagecache_lock);
 495         page->index = index;
 496         add_page_to_inode_queue(mapping, page);
 497         __add_page_to_hash_queue(page, page_hash(mapping, index));
 498         lru_cache_add(page);
 499         spin_unlock(&pagecache_lock);
 500 }
 501
 502 /*
 503  * This adds a page to the page cache, starting out as locked,
 504  * owned by us, but unreferenced, not uptodate and with no errors.
 505  */
 506 static inline void __add_to_page_cache(struct page * page,
 507         struct address_space *mapping, unsigned long offset,
 508         struct page **hash)
 509 {
 510         struct page *alias;
 511         unsigned long flags;
 512
 513         if (PageLocked(page))
 514                 BUG();
 515
 516         flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced));
 517         page->flags = flags | (1 << PG_locked);
 518         page_cache_get(page);
 519         page->index = offset;
 520         add_page_to_inode_queue(mapping, page);
 521         __add_page_to_hash_queue(page, hash);
 522         lru_cache_add(page);
 523         alias = __find_page_nolock(mapping, offset, *hash);
 524         if (alias != page)
 525                 BUG();
 526 }
 527
 528 void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
 529 {
 530         spin_lock(&pagecache_lock);
 531         __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
 532         spin_unlock(&pagecache_lock);
 533 }
 534
 535 static int add_to_page_cache_unique(struct page * page,
 536         struct address_space *mapping, unsigned long offset,
 537         struct page **hash)
 538 {
 539         int err;
 540         struct page *alias;
 541
 542         spin_lock(&pagecache_lock);
 543         alias = __find_page_nolock(mapping, offset, *hash);
 544
 545         err = 1;
 546         if (!alias) {
 547                 __add_to_page_cache(page,mapping,offset,hash);
 548                 err = 0;
 549         }
 550
 551         spin_unlock(&pagecache_lock);
 552         return err;
 553 }
 554
 555 /*
 556  * This adds the requested page to the page cache if it isn't already there,
 557  * and schedules an I/O to read in its contents from disk.
 558  */
 559 static inline int page_cache_read(struct file * file, unsigned long offset)
 560 {
 561         struct inode *inode = file->f_dentry->d_inode;
 562         struct address_space *mapping = inode->i_mapping;
 563         struct page **hash = page_hash(mapping, offset);
 564         struct page *page;
 565
 566         spin_lock(&pagecache_lock);
 567         page = __find_page_nolock(mapping, offset, *hash);
 568         spin_unlock(&pagecache_lock);
 569         if (page)
 570                 return 0;
 571
 572         page = page_cache_alloc();
 573         if (!page)
 574                 return -ENOMEM;
 575
 576         if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
 577                 int error = mapping->a_ops->readpage(file, page);
 578                 page_cache_release(page);
 579                 return error;
 580         }
 581         /*
 582          * We arrive here in the unlikely event that someone
 583          * raced with us and added our page to the cache first.
 584          */
 585         page_cache_free(page);
 586         return 0;
 587 }
 588
 589 /*
 590  * Read in an entire cluster at once.  A cluster is usually a 64k-
 591  * aligned block that includes the page requested in "offset."
 592  */
 593 static int read_cluster_nonblocking(struct file * file, unsigned long offset,
 594         unsigned long filesize)
 595 {
 596         unsigned long pages = CLUSTER_PAGES;
 597
 598         offset = CLUSTER_OFFSET(offset);
 599         while ((pages-- > 0) && (offset < filesize)) {
 600                 int error = page_cache_read(file, offset);
 601                 if (error < 0)
 602                         return error;
 603                 offset ++;
 604         }
 605
 606         return 0;
 607 }
 608
 609 /*
 610  * Wait for a page to get unlocked.
 611  *
 612  * This must be called with the caller "holding" the page,
 613  * ie with increased "page->count" so that the page won't
 614  * go away during the wait..
 615  */
 616 void ___wait_on_page(struct page *page)
 617 {
 618         struct task_struct *tsk = current;
 619         DECLARE_WAITQUEUE(wait, tsk);
 620
 621         add_wait_queue(&page->wait, &wait);
 622         do {
 623                 sync_page(page);
 624                 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 625                 if (!PageLocked(page))
 626                         break;
 627                 schedule();
 628         } while (PageLocked(page));
 629         tsk->state = TASK_RUNNING;
 630         remove_wait_queue(&page->wait, &wait);
 631 }
 632
 633 /*
 634  * Get an exclusive lock on the page..
 635  */
 636 void lock_page(struct page *page)
 637 {
 638         while (TryLockPage(page))
 639                 ___wait_on_page(page);
 640 }
 641
 642
 643 /*
 644  * a rather lightweight function, finding and getting a reference to a
 645  * hashed page atomically, waiting for it if it's locked.
 646  */
 647 struct page * __find_get_page (struct address_space *mapping,
 648                                 unsigned long offset, struct page **hash)
 649 {
 650         struct page *page;
 651
 652         /*
 653          * We scan the hash list read-only. Addition to and removal from
 654          * the hash-list needs a held write-lock.
 655          */
 656 repeat:
 657         spin_lock(&pagecache_lock);
 658         page = __find_page_nolock(mapping, offset, *hash);
 659         if (page)
 660                 page_cache_get(page);
 661         spin_unlock(&pagecache_lock);
 662
 663         /* Found the page, sleep if locked. */
 664         if (page && PageLocked(page)) {
 665                 struct task_struct *tsk = current;
 666                 DECLARE_WAITQUEUE(wait, tsk);
 667
 668                 sync_page(page);
 669
 670                 __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 671                 add_wait_queue(&page->wait, &wait);
 672
 673                 if (PageLocked(page))
 674                         schedule();
 675                 __set_task_state(tsk, TASK_RUNNING);
 676                 remove_wait_queue(&page->wait, &wait);
 677
 678                 /*
 679                  * The page might have been unhashed meanwhile. It's
 680                  * not freed though because we hold a reference to it.
 681                  * If this is the case then it will be freed _here_,
 682                  * and we recheck the hash anyway.
 683                  */
 684                 page_cache_release(page);
 685                 goto repeat;
 686         }
 687         /*
 688          * It's not locked so we can return the page and we hold
 689          * a reference to it.
 690          */
 691         return page;
 692 }
 693
 694 /*
 695  * Get the lock to a page atomically.
 696  */
 697 struct page * __find_lock_page (struct address_space *mapping,
 698                                 unsigned long offset, struct page **hash)
 699 {
 700         struct page *page;
 701
 702         /*
 703          * We scan the hash list read-only. Addition to and removal from
 704          * the hash-list needs a held write-lock.
 705          */
 706 repeat:
 707         spin_lock(&pagecache_lock);
 708         page = __find_page_nolock(mapping, offset, *hash);
 709         if (page)
 710                 page_cache_get(page);
 711         spin_unlock(&pagecache_lock);
 712
 713         /* Found the page, sleep if locked. */
 714         if (page && TryLockPage(page)) {
 715                 struct task_struct *tsk = current;
 716                 DECLARE_WAITQUEUE(wait, tsk);
 717
 718                 sync_page(page);
 719
 720                 __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 721                 add_wait_queue(&page->wait, &wait);
 722
 723                 if (PageLocked(page))
 724                         schedule();
 725                 __set_task_state(tsk, TASK_RUNNING);
 726                 remove_wait_queue(&page->wait, &wait);
 727
 728                 /*
 729                  * The page might have been unhashed meanwhile. It's
 730                  * not freed though because we hold a reference to it.
 731                  * If this is the case then it will be freed _here_,
 732                  * and we recheck the hash anyway.
 733                  */
 734                 page_cache_release(page);
 735                 goto repeat;
 736         }
 737         /*
 738          * It's not locked so we can return the page and we hold
 739          * a reference to it.
 740          */
 741         return page;
 742 }
 743
 744 #if 0
 745 #define PROFILE_READAHEAD
 746 #define DEBUG_READAHEAD
 747 #endif
 748
 749 /*
 750  * Read-ahead profiling information
 751  * --------------------------------
 752  * Every PROFILE_MAXREADCOUNT, the following information is written
 753  * to the syslog:
 754  *   Percentage of asynchronous read-ahead.
 755  *   Average of read-ahead fields context value.
 756  * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
 757  * to the syslog.
 758  */
 759
 760 #ifdef PROFILE_READAHEAD
 761
 762 #define PROFILE_MAXREADCOUNT 1000
 763
 764 static unsigned long total_reada;
 765 static unsigned long total_async;
 766 static unsigned long total_ramax;
 767 static unsigned long total_ralen;
 768 static unsigned long total_rawin;
 769
 770 static void profile_readahead(int async, struct file *filp)
 771 {
 772         unsigned long flags;
 773
 774         ++total_reada;
 775         if (async)
 776                 ++total_async;
 777
 778         total_ramax     += filp->f_ramax;
 779         total_ralen     += filp->f_ralen;
 780         total_rawin     += filp->f_rawin;
 781
 782         if (total_reada > PROFILE_MAXREADCOUNT) {
 783                 save_flags(flags);
 784                 cli();
 785                 if (!(total_reada > PROFILE_MAXREADCOUNT)) {
 786                         restore_flags(flags);
 787                         return;
 788                 }
 789
 790                 printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
 791                         total_ramax/total_reada,
 792                         total_ralen/total_reada,
 793                         total_rawin/total_reada,
 794                         (total_async*100)/total_reada);
 795 #ifdef DEBUG_READAHEAD
 796                 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
 797                         filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
 798 #endif
 799
 800                 total_reada     = 0;
 801                 total_async     = 0;
 802                 total_ramax     = 0;
 803                 total_ralen     = 0;
 804                 total_rawin     = 0;
 805
 806                 restore_flags(flags);
 807         }
 808 }
 809 #endif  /* defined PROFILE_READAHEAD */
 810
 811 /*
 812  * Read-ahead context:
 813  * -------------------
 814  * The read ahead context fields of the "struct file" are the following:
 815  * - f_raend : position of the first byte after the last page we tried to
 816  *             read ahead.
 817  * - f_ramax : current read-ahead maximum size.
 818  * - f_ralen : length of the current IO read block we tried to read-ahead.
 819  * - f_rawin : length of the current read-ahead window.
 820  *              if last read-ahead was synchronous then
 821  *                      f_rawin = f_ralen
 822  *              otherwise (was asynchronous)
 823  *                      f_rawin = previous value of f_ralen + f_ralen
 824  *
 825  * Read-ahead limits:
 826  * ------------------
 827  * MIN_READAHEAD   : minimum read-ahead size when read-ahead.
 828  * MAX_READAHEAD   : maximum read-ahead size when read-ahead.
 829  *
 830  * Synchronous read-ahead benefits:
 831  * --------------------------------
 832  * Using reasonable IO xfer length from peripheral devices increase system
 833  * performances.
 834  * Reasonable means, in this context, not too large but not too small.
 835  * The actual maximum value is:
 836  *      MAX_READAHEAD + PAGE_CACHE_SIZE = 76k is CONFIG_READA_SMALL is undefined
 837  *      and 32K if defined (4K page size assumed).
 838  *
 839  * Asynchronous read-ahead benefits:
 840  * ---------------------------------
 841  * Overlapping next read request and user process execution increase system
 842  * performance.
 843  *
 844  * Read-ahead risks:
 845  * -----------------
 846  * We have to guess which further data are needed by the user process.
 847  * If these data are often not really needed, it's bad for system
 848  * performances.
 849  * However, we know that files are often accessed sequentially by
 850  * application programs and it seems that it is possible to have some good
 851  * strategy in that guessing.
 852  * We only try to read-ahead files that seems to be read sequentially.
 853  *
 854  * Asynchronous read-ahead risks:
 855  * ------------------------------
 856  * In order to maximize overlapping, we must start some asynchronous read
 857  * request from the device, as soon as possible.
 858  * We must be very careful about:
 859  * - The number of effective pending IO read requests.
 860  *   ONE seems to be the only reasonable value.
 861  * - The total memory pool usage for the file access stream.
 862  *   This maximum memory usage is implicitly 2 IO read chunks:
 863  *   2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
 864  *   64k if defined (4K page size assumed).
 865  */
 866
 867 static inline int get_max_readahead(struct inode * inode)
 868 {
 869         if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
 870                 return MAX_READAHEAD;
 871         return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
 872 }
 873
 874 static void generic_file_readahead(int reada_ok,
 875         struct file * filp, struct inode * inode,
 876         struct page * page)
 877 {
 878         unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
 879         unsigned long index = page->index;
 880         unsigned long max_ahead, ahead;
 881         unsigned long raend;
 882         int max_readahead = get_max_readahead(inode);
 883
 884         raend = filp->f_raend;
 885         max_ahead = 0;
 886
 887 /*
 888  * The current page is locked.
 889  * If the current position is inside the previous read IO request, do not
 890  * try to reread previously read ahead pages.
 891  * Otherwise decide or not to read ahead some pages synchronously.
 892  * If we are not going to read ahead, set the read ahead context for this
 893  * page only.
 894  */
 895         if (PageLocked(page)) {
 896                 if (!filp->f_ralen || index >= raend || index + filp->f_ralen < raend) {
 897                         raend = index;
 898                         if (raend < end_index)
 899                                 max_ahead = filp->f_ramax;
 900                         filp->f_rawin = 0;
 901                         filp->f_ralen = 1;
 902                         if (!max_ahead) {
 903                                 filp->f_raend  = index + filp->f_ralen;
 904                                 filp->f_rawin += filp->f_ralen;
 905                         }
 906                 }
 907         }
 908 /*
 909  * The current page is not locked.
 910  * If we were reading ahead and,
 911  * if the current max read ahead size is not zero and,
 912  * if the current position is inside the last read-ahead IO request,
 913  *   it is the moment to try to read ahead asynchronously.
 914  * We will later force unplug device in order to force asynchronous read IO.
 915  */
 916         else if (reada_ok && filp->f_ramax && raend >= 1 &&
 917                  index <= raend && index + filp->f_ralen >= raend) {
 918 /*
 919  * Add ONE page to max_ahead in order to try to have about the same IO max size
 920  * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
 921  * Compute the position of the last page we have tried to read in order to
 922  * begin to read ahead just at the next page.
 923  */
 924                 raend -= 1;
 925                 if (raend < end_index)
 926                         max_ahead = filp->f_ramax + 1;
 927
 928                 if (max_ahead) {
 929                         filp->f_rawin = filp->f_ralen;
 930                         filp->f_ralen = 0;
 931                         reada_ok      = 2;
 932                 }
 933         }
 934 /*
 935  * Try to read ahead pages.
 936  * We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the
 937  * scheduler, will work enough for us to avoid too bad actuals IO requests.
 938  */
 939         ahead = 0;
 940         while (ahead < max_ahead) {
 941                 ahead ++;
 942                 if ((raend + ahead) >= end_index)
 943                         break;
 944                 if (page_cache_read(filp, raend + ahead) < 0)
 945                         break;
 946         }
 947 /*
 948  * If we tried to read ahead some pages,
 949  * If we tried to read ahead asynchronously,
 950  *   Try to force unplug of the device in order to start an asynchronous
 951  *   read IO request.
 952  * Update the read-ahead context.
 953  * Store the length of the current read-ahead window.
 954  * Double the current max read ahead size.
 955  *   That heuristic avoid to do some large IO for files that are not really
 956  *   accessed sequentially.
 957  */
 958         if (ahead) {
 959                 if (reada_ok == 2) {
 960                         run_task_queue(&tq_disk);
 961                 }
 962
 963                 filp->f_ralen += ahead;
 964                 filp->f_rawin += filp->f_ralen;
 965                 filp->f_raend = raend + ahead + 1;
 966
 967                 filp->f_ramax += filp->f_ramax;
 968
 969                 if (filp->f_ramax > max_readahead)
 970                         filp->f_ramax = max_readahead;
 971
 972 #ifdef PROFILE_READAHEAD
 973                 profile_readahead((reada_ok == 2), filp);
 974 #endif
 975         }
 976
 977         return;
 978 }
 979
 980
 981 /*
 982  * This is a generic file read routine, and uses the
 983  * inode->i_op->readpage() function for the actual low-level
 984  * stuff.
 985  *
 986  * This is really ugly. But the goto's actually try to clarify some
 987  * of the logic when it comes to error handling etc.
 988  */
 989 void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
 990 {
 991         struct inode *inode = filp->f_dentry->d_inode;
 992         struct address_space *mapping = inode->i_mapping;
 993         unsigned long index, offset;
 994         struct page *cached_page;
 995         int reada_ok;
 996         int error;
 997         int max_readahead = get_max_readahead(inode);
 998
 999         cached_page = NULL;
1000         index = *ppos >> PAGE_CACHE_SHIFT;
1001         offset = *ppos & ~PAGE_CACHE_MASK;
1002
1003 /*
1004  * If the current position is outside the previous read-ahead window,
1005  * we reset the current read-ahead context and set read ahead max to zero
1006  * (will be set to just needed value later),
1007  * otherwise, we assume that the file accesses are sequential enough to
1008  * continue read-ahead.
1009  */
1010         if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {
1011                 reada_ok = 0;
1012                 filp->f_raend = 0;
1013                 filp->f_ralen = 0;
1014                 filp->f_ramax = 0;
1015                 filp->f_rawin = 0;
1016         } else {
1017                 reada_ok = 1;
1018         }
1019 /*
1020  * Adjust the current value of read-ahead max.
1021  * If the read operation stay in the first half page, force no readahead.
1022  * Otherwise try to increase read ahead max just enough to do the read request.
1023  * Then, at least MIN_READAHEAD if read ahead is ok,
1024  * and at most MAX_READAHEAD in all cases.
1025  */
1026         if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
1027                 filp->f_ramax = 0;
1028         } else {
1029                 unsigned long needed;
1030
1031                 needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;
1032
1033                 if (filp->f_ramax < needed)
1034                         filp->f_ramax = needed;
1035
1036                 if (reada_ok && filp->f_ramax < MIN_READAHEAD)
1037                                 filp->f_ramax = MIN_READAHEAD;
1038                 if (filp->f_ramax > max_readahead)
1039                         filp->f_ramax = max_readahead;
1040         }
1041
1042         for (;;) {
1043                 struct page *page, **hash;
1044                 unsigned long end_index, nr;
1045
1046                 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1047                 if (index > end_index)
1048                         break;
1049                 nr = PAGE_CACHE_SIZE;
1050                 if (index == end_index) {
1051                         nr = inode->i_size & ~PAGE_CACHE_MASK;
1052                         if (nr <= offset)
1053                                 break;
1054                 }
1055
1056                 nr = nr - offset;
1057
1058                 /*
1059                  * Try to find the data in the page cache..
1060                  */
1061                 hash = page_hash(mapping, index);
1062
1063                 spin_lock(&pagecache_lock);
1064                 page = __find_page_nolock(mapping, index, *hash);
1065                 if (!page)
1066                         goto no_cached_page;
1067 found_page:
1068                 page_cache_get(page);
1069                 spin_unlock(&pagecache_lock);
1070
1071                 if (!Page_Uptodate(page))
1072                         goto page_not_up_to_date;
1073 page_ok:
1074                 /*
1075                  * Ok, we have the page, and it's up-to-date, so
1076                  * now we can copy it to user space...
1077                  *
1078                  * The actor routine returns how many bytes were actually used..
1079                  * NOTE! This may not be the same as how much of a user buffer
1080                  * we filled up (we may be padding etc), so we can only update
1081                  * "pos" here (the actor routine has to update the user buffer
1082                  * pointers and the remaining count).
1083                  */
1084                 nr = actor(desc, page, offset, nr);
1085                 offset += nr;
1086                 index += offset >> PAGE_CACHE_SHIFT;
1087                 offset &= ~PAGE_CACHE_MASK;
1088
1089                 page_cache_release(page);
1090                 if (nr && desc->count)
1091                         continue;
1092                 break;
1093
1094 /*
1095  * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
1096  */
1097 page_not_up_to_date:
1098                 generic_file_readahead(reada_ok, filp, inode, page);
1099
1100                 if (Page_Uptodate(page))
1101                         goto page_ok;
1102
1103                 /* Get exclusive access to the page ... */
1104                 lock_page(page);
1105                 if (Page_Uptodate(page)) {
1106                         UnlockPage(page);
1107                         goto page_ok;
1108                 }
1109
1110 readpage:
1111                 /* ... and start the actual read. The read will unlock the page. */
1112                 error = mapping->a_ops->readpage(filp, page);
1113
1114                 if (!error) {
1115                         if (Page_Uptodate(page))
1116                                 goto page_ok;
1117
1118                         /* Again, try some read-ahead while waiting for the page to finish.. */
1119                         generic_file_readahead(reada_ok, filp, inode, page);
1120                         wait_on_page(page);
1121                         if (Page_Uptodate(page))
1122                                 goto page_ok;
1123                         error = -EIO;
1124                 }
1125
1126                 /* UHHUH! A synchronous read error occurred. Report it */
1127                 desc->error = error;
1128                 page_cache_release(page);
1129                 break;
1130
1131 no_cached_page:
1132                 /*
1133                  * Ok, it wasn't cached, so we need to create a new
1134                  * page..
1135                  *
1136                  * We get here with the page cache lock held.
1137                  */
1138                 if (!cached_page) {
1139                         spin_unlock(&pagecache_lock);
1140                         cached_page = page_cache_alloc();
1141                         if (!cached_page) {
1142                                 desc->error = -ENOMEM;
1143                                 break;
1144                         }
1145
1146                         /*
1147                          * Somebody may have added the page while we
1148                          * dropped the page cache lock. Check for that.
1149                          */
1150                         spin_lock(&pagecache_lock);
1151                         page = __find_page_nolock(mapping, index, *hash);
1152                         if (page)
1153                                 goto found_page;
1154                 }
1155
1156                 /*
1157                  * Ok, add the new page to the hash-queues...
1158                  */
1159                 page = cached_page;
1160                 __add_to_page_cache(page, mapping, index, hash);
1161                 spin_unlock(&pagecache_lock);
1162                 cached_page = NULL;
1163
1164                 goto readpage;
1165         }
1166
1167         *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1168         filp->f_reada = 1;
1169         if (cached_page)
1170                 page_cache_free(cached_page);
1171         UPDATE_ATIME(inode);
1172 }
1173
1174 static int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1175 {
1176         unsigned long kaddr;
1177         unsigned long left, count = desc->count;
1178
1179         if (size > count)
1180                 size = count;
1181
1182         kaddr = kmap(page);
1183         left = __copy_to_user(desc->buf, (void *)(kaddr + offset), size);
1184         kunmap(page);
1185
1186         if (left) {
1187                 size -= left;
1188                 desc->error = -EFAULT;
1189         }
1190         desc->count = count - size;
1191         desc->written += size;
1192         desc->buf += size;
1193         return size;
1194 }
1195
1196 /*
1197  * This is the "read()" routine for all filesystems
1198  * that can use the page cache directly.
1199  */
1200 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1201 {
1202         ssize_t retval;
1203
1204         retval = -EFAULT;
1205         if (access_ok(VERIFY_WRITE, buf, count)) {
1206                 retval = 0;
1207
1208                 if (count) {
1209                         read_descriptor_t desc;
1210
1211                         desc.written = 0;
1212                         desc.count = count;
1213                         desc.buf = buf;
1214                         desc.error = 0;
1215                         do_generic_file_read(filp, ppos, &desc, file_read_actor);
1216
1217                         retval = desc.written;
1218                         if (!retval)
1219                                 retval = desc.error;
1220                 }
1221         }
1222         return retval;
1223 }
1224
1225 static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
1226 {
1227         unsigned long kaddr;
1228         ssize_t written;
1229         unsigned long count = desc->count;
1230         struct file *file = (struct file *) desc->buf;
1231         mm_segment_t old_fs;
1232
1233         if (size > count)
1234                 size = count;
1235         old_fs = get_fs();
1236         set_fs(KERNEL_DS);
1237
1238         kaddr = kmap(page);
1239         written = file->f_op->write(file, (char *)kaddr + offset,
1240                                                  size, &file->f_pos);
1241         kunmap(page);
1242         set_fs(old_fs);
1243         if (written < 0) {
1244                 desc->error = written;
1245                 written = 0;
1246         }
1247         desc->count = count - written;
1248         desc->written += written;
1249         return written;
1250 }
1251
1252 asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
1253 {
1254         ssize_t retval;
1255         struct file * in_file, * out_file;
1256         struct inode * in_inode, * out_inode;
1257
1258         /*
1259          * Get input file, and verify that it is ok..
1260          */
1261         retval = -EBADF;
1262         in_file = fget(in_fd);
1263         if (!in_file)
1264                 goto out;
1265         if (!(in_file->f_mode & FMODE_READ))
1266                 goto fput_in;
1267         retval = -EINVAL;
1268         in_inode = in_file->f_dentry->d_inode;
1269         if (!in_inode)
1270                 goto fput_in;
1271         if (!in_inode->i_mapping->a_ops->readpage)
1272                 goto fput_in;
1273         retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
1274         if (retval)
1275                 goto fput_in;
1276
1277         /*
1278          * Get output file, and verify that it is ok..
1279          */
1280         retval = -EBADF;
1281         out_file = fget(out_fd);
1282         if (!out_file)
1283                 goto fput_in;
1284         if (!(out_file->f_mode & FMODE_WRITE))
1285                 goto fput_out;
1286         retval = -EINVAL;
1287         if (!out_file->f_op || !out_file->f_op->write)
1288                 goto fput_out;
1289         out_inode = out_file->f_dentry->d_inode;
1290         if (!out_inode)
1291                 goto fput_out;
1292         retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
1293         if (retval)
1294                 goto fput_out;
1295
1296         retval = 0;
1297         if (count) {
1298                 read_descriptor_t desc;
1299                 loff_t pos = 0, *ppos;
1300
1301                 retval = -EFAULT;
1302                 ppos = &in_file->f_pos;
1303                 if (offset) {
1304                         if (get_user(pos, offset))
1305                                 goto fput_out;
1306                         ppos = &pos;
1307                 }
1308
1309                 desc.written = 0;
1310                 desc.count = count;
1311                 desc.buf = (char *) out_file;
1312                 desc.error = 0;
1313                 do_generic_file_read(in_file, ppos, &desc, file_send_actor);
1314
1315                 retval = desc.written;
1316                 if (!retval)
1317                         retval = desc.error;
1318                 if (offset)
1319                         put_user(pos, offset);
1320         }
1321
1322 fput_out:
1323         fput(out_file);
1324 fput_in:
1325         fput(in_file);
1326 out:
1327         return retval;
1328 }
1329
1330 /*
1331  * Read-ahead and flush behind for MADV_SEQUENTIAL areas.  Since we are
1332  * sure this is sequential access, we don't need a flexible read-ahead
1333  * window size -- we can always use a large fixed size window.
1334  */
1335 static void nopage_sequential_readahead(struct vm_area_struct * vma,
1336         unsigned long pgoff, unsigned long filesize)
1337 {
1338         unsigned long ra_window;
1339
1340         ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
1341         ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);
1342
1343         /* vm_raend is zero if we haven't read ahead in this area yet.  */
1344         if (vma->vm_raend == 0)
1345                 vma->vm_raend = vma->vm_pgoff + ra_window;
1346
1347         /*
1348          * If we've just faulted the page half-way through our window,
1349          * then schedule reads for the next window, and release the
1350          * pages in the previous window.
1351          */
1352         if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
1353                 unsigned long start = vma->vm_pgoff + vma->vm_raend;
1354                 unsigned long end = start + ra_window;
1355
1356                 if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
1357                         end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;
1358                 if (start > end)
1359                         return;
1360
1361                 while ((start < end) && (start < filesize)) {
1362                         if (read_cluster_nonblocking(vma->vm_file,
1363                                                         start, filesize) < 0)
1364                                 break;
1365                         start += CLUSTER_PAGES;
1366                 }
1367                 run_task_queue(&tq_disk);
1368
1369                 /* if we're far enough past the beginning of this area,
1370                    recycle pages that are in the previous window. */
1371                 if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
1372                         unsigned long window = ra_window << PAGE_SHIFT;
1373
1374                         end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
1375                         end -= window + window;
1376                         filemap_sync(vma, end - window, window, MS_INVALIDATE);
1377                 }
1378
1379                 vma->vm_raend += ra_window;
1380         }
1381
1382         return;
1383 }
1384
1385 /*
1386  * filemap_nopage() is invoked via the vma operations vector for a
1387  * mapped memory region to read in file data during a page fault.
1388  *
1389  * The goto's are kind of ugly, but this streamlines the normal case of having
1390  * it in the page cache, and handles the special cases reasonably without
1391  * having a lot of duplicated code.
1392  */
1393 struct page * filemap_nopage(struct vm_area_struct * area,
1394         unsigned long address, int no_share)
1395 {
1396         int error;
1397         struct file *file = area->vm_file;
1398         struct inode *inode = file->f_dentry->d_inode;
1399         struct address_space *mapping = inode->i_mapping;
1400         struct page *page, **hash, *old_page;
1401         unsigned long size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1402
1403         unsigned long pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1404
1405         /*
1406          * Semantics for shared and private memory areas are different
1407          * past the end of the file. A shared mapping past the last page
1408          * of the file is an error and results in a SIGBUS, while a
1409          * private mapping just maps in a zero page.
1410          */
1411         if ((pgoff >= size) && (area->vm_mm == current->mm))
1412                 return NULL;
1413
1414         /*
1415          * Do we have something in the page cache already?
1416          */
1417         hash = page_hash(mapping, pgoff);
1418 retry_find:
1419         page = __find_get_page(mapping, pgoff, hash);
1420         if (!page)
1421                 goto no_cached_page;
1422
1423         /*
1424          * Ok, found a page in the page cache, now we need to check
1425          * that it's up-to-date.
1426          */
1427         if (!Page_Uptodate(page))
1428                 goto page_not_uptodate;
1429
1430 success:
1431         /*
1432          * Try read-ahead for sequential areas.
1433          */
1434         if (VM_SequentialReadHint(area))
1435                 nopage_sequential_readahead(area, pgoff, size);
1436
1437         /*
1438          * Found the page and have a reference on it, need to check sharing
1439          * and possibly copy it over to another page..
1440          */
1441         old_page = page;
1442         if (no_share) {
1443                 struct page *new_page = page_cache_alloc();
1444
1445                 if (new_page) {
1446                         copy_user_highpage(new_page, old_page, address);
1447                         flush_page_to_ram(new_page);
1448                 } else
1449                         new_page = NOPAGE_OOM;
1450                 page_cache_release(page);
1451                 return new_page;
1452         }
1453
1454         flush_page_to_ram(old_page);
1455         return old_page;
1456
1457 no_cached_page:
1458         /*
1459          * If the requested offset is within our file, try to read a whole
1460          * cluster of pages at once.
1461          *
1462          * Otherwise, we're off the end of a privately mapped file,
1463          * so we need to map a zero page.
1464          */
1465         if ((pgoff < size) && !VM_RandomReadHint(area))
1466                 error = read_cluster_nonblocking(file, pgoff, size);
1467         else
1468                 error = page_cache_read(file, pgoff);
1469
1470         /*
1471          * The page we want has now been added to the page cache.
1472          * In the unlikely event that someone removed it in the
1473          * meantime, we'll just come back here and read it again.
1474          */
1475         if (error >= 0)
1476                 goto retry_find;
1477
1478         /*
1479          * An error return from page_cache_read can result if the
1480          * system is low on memory, or a problem occurs while trying
1481          * to schedule I/O.
1482          */
1483         if (error == -ENOMEM)
1484                 return NOPAGE_OOM;
1485         return NULL;
1486
1487 page_not_uptodate:
1488         lock_page(page);
1489         if (Page_Uptodate(page)) {
1490                 UnlockPage(page);
1491                 goto success;
1492         }
1493
1494         if (!mapping->a_ops->readpage(file, page)) {
1495                 wait_on_page(page);
1496                 if (Page_Uptodate(page))
1497                         goto success;
1498         }
1499
1500         /*
1501          * Umm, take care of errors if the page isn't up-to-date.
1502          * Try to re-read it _once_. We do this synchronously,
1503          * because there really aren't any performance issues here
1504          * and we need to check for errors.
1505          */
1506         lock_page(page);
1507         if (Page_Uptodate(page)) {
1508                 UnlockPage(page);
1509                 goto success;
1510         }
1511         ClearPageError(page);
1512         if (!mapping->a_ops->readpage(file, page)) {
1513                 wait_on_page(page);
1514                 if (Page_Uptodate(page))
1515                         goto success;
1516         }
1517
1518         /*
1519          * Things didn't work out. Return zero to tell the
1520          * mm layer so, possibly freeing the page cache page first.
1521          */
1522         page_cache_release(page);
1523         return NULL;
1524 }
1525
1526 static int filemap_write_page(struct file *file,
1527                               struct page * page,
1528                               int wait)
1529 {
1530         /*
1531          * If a task terminates while we're swapping the page, the vma and
1532          * and file could be released: try_to_swap_out has done a get_file.
1533          * vma/file is guaranteed to exist in the unmap/sync cases because
1534          * mmap_sem is held.
1535          */
1536         return page->mapping->a_ops->writepage(file, page);
1537 }
1538
1539
1540 /*
1541  * The page cache takes care of races between somebody
1542  * trying to swap something out and swap something in
1543  * at the same time..
1544  */
1545 extern void wakeup_bdflush(int);
1546 int filemap_swapout(struct page * page, struct file * file)
1547 {
1548         int retval = filemap_write_page(file, page, 0);
1549         wakeup_bdflush(0);
1550         return retval;
1551 }
1552
1553 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
1554         unsigned long address, unsigned int flags)
1555 {
1556         unsigned long pgoff;
1557         pte_t pte = *ptep;
1558         struct page *page;
1559         int error;
1560
1561         if (!(flags & MS_INVALIDATE)) {
1562                 if (!pte_present(pte))
1563                         return 0;
1564                 if (!pte_dirty(pte))
1565                         return 0;
1566                 flush_page_to_ram(pte_page(pte));
1567                 flush_cache_page(vma, address);
1568                 set_pte(ptep, pte_mkclean(pte));
1569                 flush_tlb_page(vma, address);
1570                 page = pte_page(pte);
1571                 page_cache_get(page);
1572         } else {
1573                 if (pte_none(pte))
1574                         return 0;
1575                 flush_cache_page(vma, address);
1576                 pte_clear(ptep);
1577                 flush_tlb_page(vma, address);
1578                 if (!pte_present(pte)) {
1579                         swap_free(pte_to_swp_entry(pte));
1580                         return 0;
1581                 }
1582                 page = pte_page(pte);
1583                 if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
1584                         page_cache_free(page);
1585                         return 0;
1586                 }
1587         }
1588         pgoff = (address - vma->vm_start) >> PAGE_CACHE_SHIFT;
1589         pgoff += vma->vm_pgoff;
1590         if (page->index != pgoff) {
1591                 printk("weirdness: pgoff=%lu index=%lu address=%lu vm_start=%lu vm_pgoff=%lu\n",
1592                         pgoff, page->index, address, vma->vm_start, vma->vm_pgoff);
1593         }
1594         lock_page(page);
1595         error = filemap_write_page(vma->vm_file, page, 1);
1596         UnlockPage(page);
1597         page_cache_free(page);
1598         return error;
1599 }
1600
1601 static inline int filemap_sync_pte_range(pmd_t * pmd,
1602         unsigned long address, unsigned long size,
1603         struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
1604 {
1605         pte_t * pte;
1606         unsigned long end;
1607         int error;
1608
1609         if (pmd_none(*pmd))
1610                 return 0;
1611         if (pmd_bad(*pmd)) {
1612                 pmd_ERROR(*pmd);
1613                 pmd_clear(pmd);
1614                 return 0;
1615         }
1616         pte = pte_offset(pmd, address);
1617         offset += address & PMD_MASK;
1618         address &= ~PMD_MASK;
1619         end = address + size;
1620         if (end > PMD_SIZE)
1621                 end = PMD_SIZE;
1622         error = 0;
1623         do {
1624                 error |= filemap_sync_pte(pte, vma, address + offset, flags);
1625                 address += PAGE_SIZE;
1626                 pte++;
1627         } while (address && (address < end));
1628         return error;
1629 }
1630
1631 static inline int filemap_sync_pmd_range(pgd_t * pgd,
1632         unsigned long address, unsigned long size,
1633         struct vm_area_struct *vma, unsigned int flags)
1634 {
1635         pmd_t * pmd;
1636         unsigned long offset, end;
1637         int error;
1638
1639         if (pgd_none(*pgd))
1640                 return 0;
1641         if (pgd_bad(*pgd)) {
1642                 pgd_ERROR(*pgd);
1643                 pgd_clear(pgd);
1644                 return 0;
1645         }
1646         pmd = pmd_offset(pgd, address);
1647         offset = address & PGDIR_MASK;
1648         address &= ~PGDIR_MASK;
1649         end = address + size;
1650         if (end > PGDIR_SIZE)
1651                 end = PGDIR_SIZE;
1652         error = 0;
1653         do {
1654                 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
1655                 address = (address + PMD_SIZE) & PMD_MASK;
1656                 pmd++;
1657         } while (address && (address < end));
1658         return error;
1659 }
1660
1661 int filemap_sync(struct vm_area_struct * vma, unsigned long address,
1662         size_t size, unsigned int flags)
1663 {
1664         pgd_t * dir;
1665         unsigned long end = address + size;
1666         int error = 0;
1667
1668         dir = pgd_offset(vma->vm_mm, address);
1669         flush_cache_range(vma->vm_mm, end - size, end);
1670         if (address >= end)
1671                 BUG();
1672         do {
1673                 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
1674                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
1675                 dir++;
1676         } while (address && (address < end));
1677         flush_tlb_range(vma->vm_mm, end - size, end);
1678         return error;
1679 }
1680
1681 /*
1682  * This handles (potentially partial) area unmaps..
1683  */
1684 static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
1685 {
1686         filemap_sync(vma, start, len, MS_ASYNC);
1687 }
1688
1689 /*
1690  * Shared mappings need to be able to do the right thing at
1691  * close/unmap/sync. They will also use the private file as
1692  * backing-store for swapping..
1693  */
1694 static struct vm_operations_struct file_shared_mmap = {
1695         unmap:          filemap_unmap,          /* unmap - we need to sync the pages */
1696         sync:           filemap_sync,
1697         nopage:         filemap_nopage,
1698         swapout:        filemap_swapout,
1699 };
1700
1701 /*
1702  * Private mappings just need to be able to load in the map.
1703  *
1704  * (This is actually used for shared mappings as well, if we
1705  * know they can't ever get write permissions..)
1706  */
1707 static struct vm_operations_struct file_private_mmap = {
1708         nopage:         filemap_nopage,
1709 };
1710
1711 /* This is used for a general mmap of a disk file */
1712
1713 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1714 {
1715         struct vm_operations_struct * ops;
1716         struct inode *inode = file->f_dentry->d_inode;
1717
1718         ops = &file_private_mmap;
1719         if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1720                 if (!inode->i_mapping->a_ops->writepage)
1721                         return -EINVAL;
1722                 ops = &file_shared_mmap;
1723         }
1724         if (!inode->i_sb || !S_ISREG(inode->i_mode))
1725                 return -EACCES;
1726         if (!inode->i_mapping->a_ops->readpage)
1727                 return -ENOEXEC;
1728         UPDATE_ATIME(inode);
1729         vma->vm_ops = ops;
1730         return 0;
1731 }
1732
1733 /*
1734  * The msync() system call.
1735  */
1736
1737 static int msync_interval(struct vm_area_struct * vma,
1738         unsigned long start, unsigned long end, int flags)
1739 {
1740         if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
1741                 int error;
1742                 error = vma->vm_ops->sync(vma, start, end-start, flags);
1743                 if (!error && (flags & MS_SYNC)) {
1744                         struct file * file = vma->vm_file;
1745                         if (file && file->f_op && file->f_op->fsync) {
1746                                 down(&file->f_dentry->d_inode->i_sem);
1747                                 error = file->f_op->fsync(file, file->f_dentry, 1);
1748                                 up(&file->f_dentry->d_inode->i_sem);
1749                         }
1750                 }
1751                 return error;
1752         }
1753         return 0;
1754 }
1755
1756 asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
1757 {
1758         unsigned long end;
1759         struct vm_area_struct * vma;
1760         int unmapped_error, error = -EINVAL;
1761
1762         down(&current->mm->mmap_sem);
1763         if (start & ~PAGE_MASK)
1764                 goto out;
1765         len = (len + ~PAGE_MASK) & PAGE_MASK;
1766         end = start + len;
1767         if (end < start)
1768                 goto out;
1769         if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1770                 goto out;
1771         error = 0;
1772         if (end == start)
1773                 goto out;
1774         /*
1775          * If the interval [start,end) covers some unmapped address ranges,
1776          * just ignore them, but return -EFAULT at the end.
1777          */
1778         vma = find_vma(current->mm, start);
1779         unmapped_error = 0;
1780         for (;;) {
1781                 /* Still start < end. */
1782                 error = -EFAULT;
1783                 if (!vma)
1784                         goto out;
1785                 /* Here start < vma->vm_end. */
1786                 if (start < vma->vm_start) {
1787                         unmapped_error = -EFAULT;
1788                         start = vma->vm_start;
1789                 }
1790                 /* Here vma->vm_start <= start < vma->vm_end. */
1791                 if (end <= vma->vm_end) {
1792                         if (start < end) {
1793                                 error = msync_interval(vma, start, end, flags);
1794                                 if (error)
1795                                         goto out;
1796                         }
1797                         error = unmapped_error;
1798                         goto out;
1799                 }
1800                 /* Here vma->vm_start <= start < vma->vm_end < end. */
1801                 error = msync_interval(vma, start, vma->vm_end, flags);
1802                 if (error)
1803                         goto out;
1804                 start = vma->vm_end;
1805                 vma = vma->vm_next;
1806         }
1807 out:
1808         up(&current->mm->mmap_sem);
1809         return error;
1810 }
1811
1812 static inline void setup_read_behavior(struct vm_area_struct * vma,
1813         int behavior)
1814 {
1815         VM_ClearReadHint(vma);
1816         switch(behavior) {
1817                 case MADV_SEQUENTIAL:
1818                         vma->vm_flags |= VM_SEQ_READ;
1819                         break;
1820                 case MADV_RANDOM:
1821                         vma->vm_flags |= VM_RAND_READ;
1822                         break;
1823                 default:
1824                         break;
1825         }
1826         return;
1827 }
1828
1829 static long madvise_fixup_start(struct vm_area_struct * vma,
1830         unsigned long end, int behavior)
1831 {
1832         struct vm_area_struct * n;
1833
1834         n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1835         if (!n)
1836                 return -EAGAIN;
1837         *n = *vma;
1838         n->vm_end = end;
1839         setup_read_behavior(n, behavior);
1840         n->vm_raend = 0;
1841         get_file(n->vm_file);
1842         if (n->vm_ops && n->vm_ops->open)
1843                 n->vm_ops->open(n);
1844         vmlist_modify_lock(vma->vm_mm);
1845         vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
1846         vma->vm_start = end;
1847         insert_vm_struct(current->mm, n);
1848         vmlist_modify_unlock(vma->vm_mm);
1849         return 0;
1850 }
1851
1852 static long madvise_fixup_end(struct vm_area_struct * vma,
1853         unsigned long start, int behavior)
1854 {
1855         struct vm_area_struct * n;
1856
1857         n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1858         if (!n)
1859                 return -EAGAIN;
1860         *n = *vma;
1861         n->vm_start = start;
1862         n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
1863         setup_read_behavior(n, behavior);
1864         n->vm_raend = 0;
1865         get_file(n->vm_file);
1866         if (n->vm_ops && n->vm_ops->open)
1867                 n->vm_ops->open(n);
1868         vmlist_modify_lock(vma->vm_mm);
1869         vma->vm_end = start;
1870         insert_vm_struct(current->mm, n);
1871         vmlist_modify_unlock(vma->vm_mm);
1872         return 0;
1873 }
1874
1875 static long madvise_fixup_middle(struct vm_area_struct * vma,
1876         unsigned long start, unsigned long end, int behavior)
1877 {
1878         struct vm_area_struct * left, * right;
1879
1880         left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1881         if (!left)
1882                 return -EAGAIN;
1883         right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1884         if (!right) {
1885                 kmem_cache_free(vm_area_cachep, left);
1886                 return -EAGAIN;
1887         }
1888         *left = *vma;
1889         *right = *vma;
1890         left->vm_end = start;
1891         right->vm_start = end;
1892         right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
1893         left->vm_raend = 0;
1894         right->vm_raend = 0;
1895         atomic_add(2, &vma->vm_file->f_count);
1896
1897         if (vma->vm_ops && vma->vm_ops->open) {
1898                 vma->vm_ops->open(left);
1899                 vma->vm_ops->open(right);
1900         }
1901         vmlist_modify_lock(vma->vm_mm);
1902         vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
1903         vma->vm_start = start;
1904         vma->vm_end = end;
1905         setup_read_behavior(vma, behavior);
1906         vma->vm_raend = 0;
1907         insert_vm_struct(current->mm, left);
1908         insert_vm_struct(current->mm, right);
1909         vmlist_modify_unlock(vma->vm_mm);
1910         return 0;
1911 }
1912
1913 /*
1914  * We can potentially split a vm area into separate
1915  * areas, each area with its own behavior.
1916  */
1917 static long madvise_behavior(struct vm_area_struct * vma,
1918         unsigned long start, unsigned long end, int behavior)
1919 {
1920         int error = 0;
1921
1922         /* This caps the number of vma's this process can own */
1923         if (vma->vm_mm->map_count > MAX_MAP_COUNT)
1924                 return -ENOMEM;
1925
1926         if (start == vma->vm_start) {
1927                 if (end == vma->vm_end) {
1928                         setup_read_behavior(vma, behavior);
1929                         vma->vm_raend = 0;
1930                 } else
1931                         error = madvise_fixup_start(vma, end, behavior);
1932         } else {
1933                 if (end == vma->vm_end)
1934                         error = madvise_fixup_end(vma, start, behavior);
1935                 else
1936                         error = madvise_fixup_middle(vma, start, end, behavior);
1937         }
1938
1939         return error;
1940 }
1941
1942 /*
1943  * Schedule all required I/O operations, then run the disk queue
1944  * to make sure they are started.  Do not wait for completion.
1945  */
1946 static long madvise_willneed(struct vm_area_struct * vma,
1947         unsigned long start, unsigned long end)
1948 {
1949         long error = -EBADF;
1950         struct file * file;
1951         unsigned long size, rlim_rss;
1952
1953         /* Doesn't work if there's no mapped file. */
1954         if (!vma->vm_file)
1955                 return error;
1956         file = vma->vm_file;
1957         size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
1958                                                         PAGE_CACHE_SHIFT;
1959
1960         start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1961         if (end > vma->vm_end)
1962                 end = vma->vm_end;
1963         end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1964
1965         /* Make sure this doesn't exceed the process's max rss. */
1966         error = -EIO;
1967         rlim_rss = current->rlim ?  current->rlim[RLIMIT_RSS].rlim_cur :
1968                                 LONG_MAX; /* default: see resource.h */
1969         if ((vma->vm_mm->rss + (end - start)) > rlim_rss)
1970                 return error;
1971
1972         /* round to cluster boundaries if this isn't a "random" area. */
1973         if (!VM_RandomReadHint(vma)) {
1974                 start = CLUSTER_OFFSET(start);
1975                 end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);
1976
1977                 while ((start < end) && (start < size)) {
1978                         error = read_cluster_nonblocking(file, start, size);
1979                         start += CLUSTER_PAGES;
1980                         if (error < 0)
1981                                 break;
1982                 }
1983         } else {
1984                 while ((start < end) && (start < size)) {
1985                         error = page_cache_read(file, start);
1986                         start++;
1987                         if (error < 0)
1988                                 break;
1989                 }
1990         }
1991
1992         /* Don't wait for someone else to push these requests. */
1993         run_task_queue(&tq_disk);
1994
1995         return error;
1996 }
1997
1998 /*
1999  * Application no longer needs these pages.  If the pages are dirty,
2000  * it's OK to just throw them away.  The app will be more careful about
2001  * data it wants to keep.  Be sure to free swap resources too.  The
2002  * zap_page_range call sets things up for shrink_mmap to actually free
2003  * these pages later if no one else has touched them in the meantime,
2004  * although we could add these pages to a global reuse list for
2005  * shrink_mmap to pick up before reclaiming other pages.
2006  *
2007  * NB: This interface discards data rather than pushes it out to swap,
2008  * as some implementations do.  This has performance implications for
2009  * applications like large transactional databases which want to discard
2010  * pages in anonymous maps after committing to backing store the data
2011  * that was kept in them.  There is no reason to write this data out to
2012  * the swap area if the application is discarding it.
2013  *
2014  * An interface that causes the system to free clean pages and flush
2015  * dirty pages is already available as msync(MS_INVALIDATE).
2016  */
2017 static long madvise_dontneed(struct vm_area_struct * vma,
2018         unsigned long start, unsigned long end)
2019 {
2020         if (vma->vm_flags & VM_LOCKED)
2021                 return -EINVAL;
2022
2023         flush_cache_range(vma->vm_mm, start, end);
2024         zap_page_range(vma->vm_mm, start, end - start);
2025         flush_tlb_range(vma->vm_mm, start, end);
2026         return 0;
2027 }
2028
2029 static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
2030         unsigned long end, int behavior)
2031 {
2032         long error = -EBADF;
2033
2034         switch (behavior) {
2035         case MADV_NORMAL:
2036         case MADV_SEQUENTIAL:
2037         case MADV_RANDOM:
2038                 error = madvise_behavior(vma, start, end, behavior);
2039                 break;
2040
2041         case MADV_WILLNEED:
2042                 error = madvise_willneed(vma, start, end);
2043                 break;
2044
2045         case MADV_DONTNEED:
2046                 error = madvise_dontneed(vma, start, end);
2047                 break;
2048
2049         default:
2050                 error = -EINVAL;
2051                 break;
2052         }
2053
2054         return error;
2055 }
2056
2057 /*
2058  * The madvise(2) system call.
2059  *
2060  * Applications can use madvise() to advise the kernel how it should
2061  * handle paging I/O in this VM area.  The idea is to help the kernel
2062  * use appropriate read-ahead and caching techniques.  The information
2063  * provided is advisory only, and can be safely disregarded by the
2064  * kernel without affecting the correct operation of the application.
2065  *
2066  * behavior values:
2067  *  MADV_NORMAL - the default behavior is to read clusters.  This
2068  *              results in some read-ahead and read-behind.
2069  *  MADV_RANDOM - the system should read the minimum amount of data
2070  *              on any access, since it is unlikely that the appli-
2071  *              cation will need more than what it asks for.
2072  *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
2073  *              once, so they can be aggressively read ahead, and
2074  *              can be freed soon after they are accessed.
2075  *  MADV_WILLNEED - the application is notifying the system to read
2076  *              some pages ahead.
2077  *  MADV_DONTNEED - the application is finished with the given range,
2078  *              so the kernel can free resources associated with it.
2079  *
2080  * return values:
2081  *  zero    - success
2082  *  -EINVAL - start + len < 0, start is not page-aligned,
2083  *              "behavior" is not a valid value, or application
2084  *              is attempting to release locked or shared pages.
2085  *  -ENOMEM - addresses in the specified range are not currently
2086  *              mapped, or are outside the AS of the process.
2087  *  -EIO    - an I/O error occurred while paging in data.
2088  *  -EBADF  - map exists, but area maps something that isn't a file.
2089  *  -EAGAIN - a kernel resource was temporarily unavailable.
2090  */
2091 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
2092 {
2093         unsigned long end;
2094         struct vm_area_struct * vma;
2095         int unmapped_error = 0;
2096         int error = -EINVAL;
2097
2098         down(&current->mm->mmap_sem);
2099
2100         if (start & ~PAGE_MASK)
2101                 goto out;
2102         len = (len + ~PAGE_MASK) & PAGE_MASK;
2103         end = start + len;
2104         if (end < start)
2105                 goto out;
2106
2107         error = 0;
2108         if (end == start)
2109                 goto out;
2110
2111         /*
2112          * If the interval [start,end) covers some unmapped address
2113          * ranges, just ignore them, but return -ENOMEM at the end.
2114          */
2115         vma = find_vma(current->mm, start);
2116         for (;;) {
2117                 /* Still start < end. */
2118                 error = -ENOMEM;
2119                 if (!vma)
2120                         goto out;
2121
2122                 /* Here start < vma->vm_end. */
2123                 if (start < vma->vm_start) {
2124                         unmapped_error = -ENOMEM;
2125                         start = vma->vm_start;
2126                 }
2127
2128                 /* Here vma->vm_start <= start < vma->vm_end. */
2129                 if (end <= vma->vm_end) {
2130                         if (start < end) {
2131                                 error = madvise_vma(vma, start, end,
2132                                                         behavior);
2133                                 if (error)
2134                                         goto out;
2135                         }
2136                         error = unmapped_error;
2137                         goto out;
2138                 }
2139
2140                 /* Here vma->vm_start <= start < vma->vm_end < end. */
2141                 error = madvise_vma(vma, start, vma->vm_end, behavior);
2142                 if (error)
2143                         goto out;
2144                 start = vma->vm_end;
2145                 vma = vma->vm_next;
2146         }
2147
2148 out:
2149         up(&current->mm->mmap_sem);
2150         return error;
2151 }
2152
2153 /*
2154  * Later we can get more picky about what "in core" means precisely.
2155  * For now, simply check to see if the page is in the page cache,
2156  * and is up to date; i.e. that no page-in operation would be required
2157  * at this time if an application were to map and access this page.
2158  */
2159 static unsigned char mincore_page(struct vm_area_struct * vma,
2160         unsigned long pgoff)
2161 {
2162         unsigned char present = 0;
2163         struct address_space * as = &vma->vm_file->f_dentry->d_inode->i_data;
2164         struct page * page, ** hash = page_hash(as, pgoff);
2165
2166         spin_lock(&pagecache_lock);
2167         page = __find_page_nolock(as, pgoff, *hash);
2168         if ((page) && (Page_Uptodate(page)))
2169                 present = 1;
2170         spin_unlock(&pagecache_lock);
2171
2172         return present;
2173 }
2174
2175 static long mincore_vma(struct vm_area_struct * vma,
2176         unsigned long start, unsigned long end, unsigned char * vec)
2177 {
2178         long error, i, remaining;
2179         unsigned char * tmp;
2180
2181         error = -ENOMEM;
2182         if (!vma->vm_file)
2183                 return error;
2184
2185         start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2186         if (end > vma->vm_end)
2187                 end = vma->vm_end;
2188         end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2189
2190         error = -EAGAIN;
2191         tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
2192         if (!tmp)
2193                 return error;
2194
2195         /* (end - start) is # of pages, and also # of bytes in "vec */
2196         remaining = (end - start),
2197
2198         error = 0;
2199         for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
2200                 int j = 0;
2201                 long thispiece = (remaining < PAGE_SIZE) ?
2202                                                 remaining : PAGE_SIZE;
2203
2204                 while (j < thispiece)
2205                         tmp[j++] = mincore_page(vma, start++);
2206
2207                 if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
2208                         error = -EFAULT;
2209                         break;
2210                 }
2211         }
2212
2213         free_page((unsigned long) tmp);
2214         return error;
2215 }
2216
2217 /*
2218  * The mincore(2) system call.
2219  *
2220  * mincore() returns the memory residency status of the pages in the
2221  * current process's address space specified by [addr, addr + len).
2222  * The status is returned in a vector of bytes.  The least significant
2223  * bit of each byte is 1 if the referenced page is in memory, otherwise
2224  * it is zero.
2225  *
2226  * Because the status of a page can change after mincore() checks it
2227  * but before it returns to the application, the returned vector may
2228  * contain stale information.  Only locked pages are guaranteed to
2229  * remain in memory.
2230  *
2231  * return values:
2232  *  zero    - success
2233  *  -EFAULT - vec points to an illegal address
2234  *  -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
2235  *              or len has a nonpositive value
2236  *  -ENOMEM - Addresses in the range [addr, addr + len] are
2237  *              invalid for the address space of this process, or
2238  *              specify one or more pages which are not currently
2239  *              mapped
2240  *  -EAGAIN - A kernel resource was temporarily unavailable.
2241  */
2242 asmlinkage long sys_mincore(unsigned long start, size_t len,
2243         unsigned char * vec)
2244 {
2245         int index = 0;
2246         unsigned long end;
2247         struct vm_area_struct * vma;
2248         int unmapped_error = 0;
2249         long error = -EINVAL;
2250
2251         down(&current->mm->mmap_sem);
2252
2253         if (start & ~PAGE_CACHE_MASK)
2254                 goto out;
2255         len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK;
2256         end = start + len;
2257         if (end < start)
2258                 goto out;
2259
2260         error = 0;
2261         if (end == start)
2262                 goto out;
2263
2264         /*
2265          * If the interval [start,end) covers some unmapped address
2266          * ranges, just ignore them, but return -ENOMEM at the end.
2267          */
2268         vma = find_vma(current->mm, start);
2269         for (;;) {
2270                 /* Still start < end. */
2271                 error = -ENOMEM;
2272                 if (!vma)
2273                         goto out;
2274
2275                 /* Here start < vma->vm_end. */
2276                 if (start < vma->vm_start) {
2277                         unmapped_error = -ENOMEM;
2278                         start = vma->vm_start;
2279                 }
2280
2281                 /* Here vma->vm_start <= start < vma->vm_end. */
2282                 if (end <= vma->vm_end) {
2283                         if (start < end) {
2284                                 error = mincore_vma(vma, start, end,
2285                                                         &vec[index]);
2286                                 if (error)
2287                                         goto out;
2288                         }
2289                         error = unmapped_error;
2290                         goto out;
2291                 }
2292
2293                 /* Here vma->vm_start <= start < vma->vm_end < end. */
2294                 error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
2295                 if (error)
2296                         goto out;
2297                 index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
2298                 start = vma->vm_end;
2299                 vma = vma->vm_next;
2300         }
2301
2302 out:
2303         up(&current->mm->mmap_sem);
2304         return error;
2305 }
2306
2307 static inline
2308 struct page *__read_cache_page(struct address_space *mapping,
2309                                 unsigned long index,
2310                                 int (*filler)(void *,struct page*),
2311                                 void *data)
2312 {
2313         struct page **hash = page_hash(mapping, index);
2314         struct page *page, *cached_page = NULL;
2315         int err;
2316 repeat:
2317         page = __find_get_page(mapping, index, hash);
2318         if (!page) {
2319                 if (!cached_page) {
2320                         cached_page = page_cache_alloc();
2321                         if (!cached_page)
2322                                 return ERR_PTR(-ENOMEM);
2323                 }
2324                 page = cached_page;
2325                 if (add_to_page_cache_unique(page, mapping, index, hash))
2326                         goto repeat;
2327                 cached_page = NULL;
2328                 err = filler(data, page);
2329                 if (err < 0) {
2330                         page_cache_release(page);
2331                         page = ERR_PTR(err);
2332                 }
2333         }
2334         if (cached_page)
2335                 page_cache_free(cached_page);
2336         return page;
2337 }
2338
2339 /*
2340  * Read into the page cache. If a page already exists,
2341  * and Page_Uptodate() is not set, try to fill the page.
2342  */
2343 struct page *read_cache_page(struct address_space *mapping,
2344                                 unsigned long index,
2345                                 int (*filler)(void *,struct page*),
2346                                 void *data)
2347 {
2348         struct page *page = __read_cache_page(mapping, index, filler, data);
2349         int err;
2350
2351         if (IS_ERR(page) || Page_Uptodate(page))
2352                 goto out;
2353
2354         lock_page(page);
2355         if (Page_Uptodate(page)) {
2356                 UnlockPage(page);
2357                 goto out;
2358         }
2359         err = filler(data, page);
2360         if (err < 0) {
2361                 page_cache_release(page);
2362                 page = ERR_PTR(err);
2363         }
2364  out:
2365         return page;
2366 }
2367
2368 static inline struct page * __grab_cache_page(struct address_space *mapping,
2369                                 unsigned long index, struct page **cached_page)
2370 {
2371         struct page *page, **hash = page_hash(mapping, index);
2372 repeat:
2373         page = __find_lock_page(mapping, index, hash);
2374         if (!page) {
2375                 if (!*cached_page) {
2376                         *cached_page = page_cache_alloc();
2377                         if (!*cached_page)
2378                                 return NULL;
2379                 }
2380                 page = *cached_page;
2381                 if (add_to_page_cache_unique(page, mapping, index, hash))
2382                         goto repeat;
2383                 *cached_page = NULL;
2384         }
2385         return page;
2386 }
2387
2388 /*
2389  * Returns locked page at given index in given cache, creating it if needed.
2390  */
2391
2392 struct page *grab_cache_page(struct address_space *mapping, unsigned long index)
2393 {
2394         struct page *cached_page = NULL;
2395         struct page *page = __grab_cache_page(mapping,index,&cached_page);
2396         if (cached_page)
2397                 page_cache_free(cached_page);
2398         return page;
2399 }
2400
2401 static inline void remove_suid(struct inode *inode)
2402 {
2403         unsigned int mode;
2404
2405         /* set S_IGID if S_IXGRP is set, and always set S_ISUID */
2406         mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
2407
2408         /* was any of the uid bits set? */
2409         mode &= inode->i_mode;
2410         if (mode && !capable(CAP_FSETID)) {
2411                 inode->i_mode &= ~mode;
2412                 mark_inode_dirty(inode);
2413         }
2414 }
2415
2416 /*
2417  * Write to a file through the page cache.
2418  *
2419  * We currently put everything into the page cache prior to writing it.
2420  * This is not a problem when writing full pages. With partial pages,
2421  * however, we first have to read the data into the cache, then
2422  * dirty the page, and finally schedule it for writing. Alternatively, we
2423  * could write-through just the portion of data that would go into that
2424  * page, but that would kill performance for applications that write data
2425  * line by line, and it's prone to race conditions.
2426  *
2427  * Note that this routine doesn't try to keep track of dirty pages. Each
2428  * file system has to do this all by itself, unfortunately.
2429  *                                                      okir@monad.swb.de
2430  */
2431 ssize_t
2432 generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
2433 {
2434         struct inode    *inode = file->f_dentry->d_inode;
2435         struct address_space *mapping = inode->i_mapping;
2436         unsigned long   limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
2437         loff_t          pos;
2438         struct page     *page, *cached_page;
2439         unsigned long   written;
2440         long            status;
2441         int             err;
2442
2443         cached_page = NULL;
2444
2445         down(&inode->i_sem);
2446
2447         pos = *ppos;
2448         err = -EINVAL;
2449         if (pos < 0)
2450                 goto out;
2451
2452         err = file->f_error;
2453         if (err) {
2454                 file->f_error = 0;
2455                 goto out;
2456         }
2457
2458         written = 0;
2459
2460         if (file->f_flags & O_APPEND)
2461                 pos = inode->i_size;
2462
2463         /*
2464          * Check whether we've reached the file size limit.
2465          */
2466         err = -EFBIG;
2467         if (limit != RLIM_INFINITY) {
2468                 if (pos >= limit) {
2469                         send_sig(SIGXFSZ, current, 0);
2470                         goto out;
2471                 }
2472                 if (count > limit - pos) {
2473                         send_sig(SIGXFSZ, current, 0);
2474                         count = limit - pos;
2475                 }
2476         }
2477
2478         status  = 0;
2479         if (count) {
2480                 remove_suid(inode);
2481                 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
2482                 mark_inode_dirty(inode);
2483         }
2484
2485         while (count) {
2486                 unsigned long bytes, index, offset;
2487                 char *kaddr;
2488
2489                 /*
2490                  * Try to find the page in the cache. If it isn't there,
2491                  * allocate a free page.
2492                  */
2493                 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
2494                 index = pos >> PAGE_CACHE_SHIFT;
2495                 bytes = PAGE_CACHE_SIZE - offset;
2496                 if (bytes > count)
2497                         bytes = count;
2498
2499                 status = -ENOMEM;       /* we'll assign it later anyway */
2500                 page = __grab_cache_page(mapping, index, &cached_page);
2501                 if (!page)
2502                         break;
2503
2504                 /* We have exclusive IO access to the page.. */
2505                 if (!PageLocked(page)) {
2506                         PAGE_BUG(page);
2507                 }
2508
2509                 status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
2510                 if (status)
2511                         goto unlock;
2512                 kaddr = (char*)page_address(page);
2513                 status = copy_from_user(kaddr+offset, buf, bytes);
2514                 if (status)
2515                         goto fail_write;
2516                 status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
2517                 if (!status)
2518                         status = bytes;
2519
2520                 if (status >= 0) {
2521                         written += status;
2522                         count -= status;
2523                         pos += status;
2524                         buf += status;
2525                 }
2526 unlock:
2527                 /* Mark it unlocked again and drop the page.. */
2528                 UnlockPage(page);
2529                 page_cache_release(page);
2530
2531                 if (status < 0)
2532                         break;
2533         }
2534         *ppos = pos;
2535
2536         if (cached_page)
2537                 page_cache_free(cached_page);
2538
2539         err = written ? written : status;
2540 out:
2541         up(&inode->i_sem);
2542         return err;
2543 fail_write:
2544         status = -EFAULT;
2545         ClearPageUptodate(page);
2546         kunmap(page);
2547         goto unlock;
2548 }
2549
2550 void __init page_cache_init(unsigned long mempages)
2551 {
2552         unsigned long htable_size, order;
2553
2554         htable_size = mempages;
2555         htable_size *= sizeof(struct page *);
2556         for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
2557                 ;
2558
2559         do {
2560                 unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
2561
2562                 page_hash_bits = 0;
2563                 while((tmp >>= 1UL) != 0UL)
2564                         page_hash_bits++;
2565
2566                 page_hash_table = (struct page **)
2567                         __get_free_pages(GFP_ATOMIC, order);
2568         } while(page_hash_table == NULL && --order > 0);
2569
2570         printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
2571                (1 << page_hash_bits), order, (PAGE_SIZE << order));
2572         if (!page_hash_table)
2573                 panic("Failed to allocate page hash table\n");
2574         memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
2575 }