mm/filemap.c

   1 /*
   2  *      linux/mm/filemap.c
   3  *
   4  * Copyright (C) 1994-1999  Linus Torvalds
   5  */
   6
   7 /*
   8  * This file handles the generic file mmap semantics used by
   9  * most "normal" filesystems (but you don't /have/ to use this:
  10  * the NFS filesystem used to do this differently, for example)
  11  */
  12 #include <linux/malloc.h>
  13 #include <linux/shm.h>
  14 #include <linux/mman.h>
  15 #include <linux/locks.h>
  16 #include <linux/pagemap.h>
  17 #include <linux/swap.h>
  18 #include <linux/smp_lock.h>
  19 #include <linux/blkdev.h>
  20 #include <linux/file.h>
  21 #include <linux/swapctl.h>
  22 #include <linux/slab.h>
  23 #include <linux/init.h>
  24 #include <linux/mm.h>
  25
  26 #include <asm/pgalloc.h>
  27 #include <asm/uaccess.h>
  28 #include <asm/mman.h>
  29
  30 #include <linux/highmem.h>
  31
  32 /*
  33  * Shared mappings implemented 30.11.1994. It's not fully working yet,
  34  * though.
  35  *
  36  * Shared mappings now work. 15.8.1995  Bruno.
  37  *
  38  * finished 'unifying' the page and buffer cache and SMP-threaded the
  39  * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
  40  *
  41  * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
  42  */
  43
  44 atomic_t page_cache_size = ATOMIC_INIT(0);      /* bumped in __add_page_to_hash_queue(), dropped in remove_page_from_hash_queue() */
  45 unsigned int page_hash_bits;
  46 struct page **page_hash_table;  /* presumably indexed through page_hash(); set up outside this chunk - confirm */
  47 struct list_head lru_cache;     /* LRU list: shrink_mmap() takes candidates from .prev and re-queues survivors at the head */
  48 /* pagecache_lock is taken below around every hash-chain and per-mapping page-list update. */
  49 static spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
  50 /*
  51  * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
  52  *       the pagemap_lru_lock held.
  53  */
  54 spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;       /* held by shrink_mmap() while it edits lru_cache and nr_lru_pages */
  55 /* Read clustering helpers; page_cluster (declared elsewhere) is the log2 of the cluster size in pages. */
  56 #define CLUSTER_PAGES           (1 << page_cluster)
  57 #define CLUSTER_OFFSET(x)       (((x) >> page_cluster) << page_cluster)
  58 /* Link @page at the front of the hash chain whose head pointer is @p and count it in page_cache_size; callers in this file hold pagecache_lock. */
  59 void __add_page_to_hash_queue(struct page * page, struct page **p)
  60 {
  61         atomic_inc(&page_cache_size);
  62         if((page->next_hash = *p) != NULL)
  63                 (*p)->pprev_hash = &page->next_hash;    /* the old head now points back at our next_hash slot */
  64         *p = page;
  65         page->pprev_hash = p;   /* pprev_hash holds the address of the pointer that references this page */
  66         if (page->buffers)
  67                 PAGE_BUG(page); /* a page being hashed here is not expected to carry buffers */
  68 }
  69 /* Unlink @page from its hash chain (if it is on one) and drop page_cache_size; callers in this file hold pagecache_lock. */
  70 static inline void remove_page_from_hash_queue(struct page * page)
  71 {
  72         if(page->pprev_hash) {
  73                 if(page->next_hash)
  74                         page->next_hash->pprev_hash = page->pprev_hash;
  75                 *page->pprev_hash = page->next_hash;
  76                 page->pprev_hash = NULL;        /* a NULL pprev_hash is what the test above treats as "not hashed" */
  77         }
  78         atomic_dec(&page_cache_size);   /* NOTE: decremented even when the page was not on a chain */
  79 }
  80 /* Invoke the mapping's optional ->sync_page() hook for @page; returns its result, or 0 when no hook is reachable. */
  81 static inline int sync_page(struct page *page)
  82 {
  83         struct address_space *as = page->mapping;
  84
  85         if (!as || !as->a_ops || !as->a_ops->sync_page)
  86                 return 0;
  87         return as->a_ops->sync_page(page);
  88 }
  89
  90 /*
  91  * Unlink a page from the page cache. Caller has to make sure the page
  92  * is locked and that nobody else uses it - or that usage is safe.
  93  * No reference is dropped here; every caller in this file holds pagecache_lock and releases the page itself.
  94  */
  95 static inline void __remove_inode_page(struct page *page)
  96 {
  97         remove_page_from_inode_queue(page);
  98         remove_page_from_hash_queue(page);
  99         page->mapping = NULL;
 100 }
 101 /* Locked wrapper around __remove_inode_page(): takes pagecache_lock itself; @page must already be locked by the caller. */
 102 void remove_inode_page(struct page *page)
 103 {
 104         if (!PageLocked(page))
 105                 PAGE_BUG(page);
 106
 107         spin_lock(&pagecache_lock);
 108         __remove_inode_page(page);
 109         spin_unlock(&pagecache_lock);
 110 }
 111
 112 /**
 113  * invalidate_inode_pages - drop every currently unlocked page of an inode
 114  * @inode: inode whose cached pages should be discarded
 115  *
 116  * Pages that are locked at the time of the scan are skipped and stay in
 117  * the cache; truncate_inode_pages() is the call that removes everything.
 118  */
 119
 120 void invalidate_inode_pages(struct inode * inode)
 121 {
 122         struct list_head *const head = &inode->i_mapping->pages;
 123         struct list_head *pos, *next;
 124
 125         /* lock order: pagecache_lock first, pagemap_lru_lock nested inside */
 126         spin_lock(&pagecache_lock);
 127         spin_lock(&pagemap_lru_lock);
 128
 129         for (pos = head->next; pos != head; pos = next) {
 130                 struct page *page = list_entry(pos, struct page, list);
 131
 132                 /* remember the successor before this page can be unlinked */
 133                 next = pos->next;
 134
 135                 /* We cannot invalidate a locked page */
 136                 if (TryLockPage(page))
 137                         continue;
 138
 139                 __lru_cache_del(page);
 140                 __remove_inode_page(page);
 141                 UnlockPage(page);
 142                 page_cache_release(page);
 143         }
 144
 145         spin_unlock(&pagemap_lru_lock);
 146         spin_unlock(&pagecache_lock);
 147 }
 148
 149 /*
 150  * Truncate the page cache of @mapping at byte offset @lstart, removing the
 151  * pages that are beyond that offset (and zeroing out partial pages).
 152  */
 153 void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
 154 {
 155         struct list_head *head, *curr;
 156         struct page * page;
 157         unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);      /* byte offset of lstart inside its page; 0 if page aligned */
 158         unsigned long start;
 159         /* index of the first page that lies wholly at or beyond lstart */
 160         start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 161
 162 repeat:
 163         head = &mapping->pages;
 164         spin_lock(&pagecache_lock);
 165         curr = head->next;
 166         while (curr != head) {
 167                 unsigned long offset;
 168
 169                 page = list_entry(curr, struct page, list);
 170                 curr = curr->next;
 171
 172                 offset = page->index;
 173
 174                 /* page wholly truncated - free it */
 175                 if (offset >= start) {
 176                         if (TryLockPage(page)) {
 177                                 page_cache_get(page);   /* hold a reference across the sleep below */
 178                                 spin_unlock(&pagecache_lock);
 179                                 wait_on_page(page);
 180                                 page_cache_release(page);
 181                                 goto repeat;
 182                         }
 183                         page_cache_get(page);
 184                         spin_unlock(&pagecache_lock);
 185
 186                         if (!page->buffers || block_flushpage(page, 0))
 187                                 lru_cache_del(page);
 188
 189                         /*
 190                          * We remove the page from the page cache
 191                          * _after_ we have destroyed all buffer-cache
 192                          * references to it. Otherwise some other process
 193                          * might think this inode page is not in the
 194                          * page cache and creates a buffer-cache alias
 195                          * to it causing all sorts of fun problems ...
 196                          */
 197                         remove_inode_page(page);
 198                         ClearPageDirty(page);
 199
 200                         UnlockPage(page);
 201                         page_cache_release(page);       /* pairs with the page_cache_get() a few lines up */
 202                         page_cache_release(page);       /* pairs with the page_cache_get() done when the page was added to the cache */
 203
 204                         /*
 205                          * We have done things without the pagecache lock,
 206                          * so we'll have to repeat the scan.
 207                          * It's not possible to deadlock here because
 208                          * we are guaranteed to make progress. (ie. we have
 209                          * just removed a page)
 210                          */
 211                         goto repeat;
 212                 }
 213                 /*
 214                  * there is only one partial page possible.
 215                  */
 216                 if (!partial)
 217                         continue;
 218
 219                 /* and it's the one preceding the first wholly truncated page */
 220                 if ((offset + 1) != start)
 221                         continue;
 222
 223                 /* partial truncate, clear end of page */
 224                 if (TryLockPage(page)) {
 225                         spin_unlock(&pagecache_lock);
 226                         goto repeat;    /* NOTE(review): retries at once, without sleeping on the page lock */
 227                 }
 228                 page_cache_get(page);
 229                 spin_unlock(&pagecache_lock);
 230
 231                 memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
 232                 if (page->buffers)
 233                         block_flushpage(page, partial);
 234
 235                 partial = 0;    /* the one partial page is handled; later scans skip this branch */
 236
 237                 /*
 238                  * we have dropped the spinlock so we have to
 239                  * restart.
 240                  */
 241                 UnlockPage(page);
 242                 page_cache_release(page);
 243                 goto repeat;
 244         }
 245         spin_unlock(&pagecache_lock);
 246 }
 247
 248 /*
 249  * nr_dirty represents the number of dirty pages that we will write async
 250  * before doing sync writes.  We can only do sync writes if we can
 251  * wait for IO (__GFP_IO set).  Returns 1 as soon as one page has been freed, 0 if the scan ends without freeing anything.
 252  */
 253 int shrink_mmap(int priority, int gfp_mask)
 254 {
 255         int ret = 0, count, nr_dirty;
 256         struct list_head * page_lru;
 257         struct page * page = NULL;
 258         /* scan budget: a larger priority value means fewer pages are examined */
 259         count = nr_lru_pages / (priority + 1);
 260         nr_dirty = priority;
 261
 262         /* we need pagemap_lru_lock for list_del() ... subtle code below */
 263         spin_lock(&pagemap_lru_lock);
 264         while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
 265                 page = list_entry(page_lru, struct page, lru);
 266                 list_del(page_lru);
 267                 /* referenced page: the bit is cleared and the page re-queued without charging the budget */
 268                 if (PageTestandClearReferenced(page))
 269                         goto dispose_continue;
 270
 271                 count--;
 272                 /*
 273                  * Avoid unscalable SMP locking for pages we can
 274                  * immediate tell are untouchable..
 275                  */
 276                 if (!page->buffers && page_count(page) > 1)
 277                         goto dispose_continue;
 278
 279                 if (TryLockPage(page))
 280                         goto dispose_continue;
 281
 282                 /* Release the pagemap_lru lock even if the page is not yet
 283                    queued in any lru queue since we have just locked down
 284                    the page so nobody else may SMP race with us running
 285                    a lru_cache_del() (lru_cache_del() always run with the
 286                    page locked down ;). */
 287                 spin_unlock(&pagemap_lru_lock);
 288
 289                 /* avoid freeing the page while it's locked */
 290                 page_cache_get(page);
 291
 292                 /*
 293                  * Is it a buffer page? Try to clean it up regardless
 294                  * of zone - it's old.
 295                  */
 296                 if (page->buffers) {
 297                         int wait;
 298                         /*
 299                          * 0 - free it if can do so without IO
 300                          * 1 - start write-out of dirty buffers
 301                          * 2 - wait for locked buffers
 302                          */
 303                         wait = (gfp_mask & __GFP_IO) ? (nr_dirty-- < 0) ? 2 : 1 : 0;
 304                         if (!try_to_free_buffers(page, wait))
 305                                 goto unlock_continue;
 306                         /* page was locked, inode can't go away under us */
 307                         if (!page->mapping) {
 308                                 atomic_dec(&buffermem_pages);   /* buffers were freed and the page has no mapping */
 309                                 goto made_buffer_progress;
 310                         }
 311                 }
 312
 313                 /* Take the pagecache_lock spinlock held to avoid
 314                    other tasks to notice the page while we are looking at its
 315                    page count. If it's a pagecache-page we'll free it
 316                    in one atomic transaction after checking its page count. */
 317                 spin_lock(&pagecache_lock);
 318
 319                 /*
 320                  * We can't free pages unless there's just one user
 321                  * (count == 2 because we added one ourselves above).
 322                  */
 323                 if (page_count(page) != 2)
 324                         goto cache_unlock_continue;
 325
 326                 /*
 327                  * Is it a page swap page? If so, we want to
 328                  * drop it if it is no longer used, even if it
 329                  * were to be marked referenced..
 330                  */
 331                 if (PageSwapCache(page)) {
 332                         spin_unlock(&pagecache_lock);
 333                         __delete_from_swap_cache(page);
 334                         goto made_inode_progress;
 335                 }
 336
 337                 /*
 338                  * Page is from a zone we don't care about.
 339                  * Don't drop page cache entries in vain.
 340                  */
 341                 if (page->zone->free_pages > page->zone->pages_high)
 342                         goto cache_unlock_continue;
 343
 344                 /* is it a page-cache page? */
 345                 if (page->mapping) {
 346                         if (!PageDirty(page) && !pgcache_under_min()) {
 347                                 __remove_inode_page(page);
 348                                 spin_unlock(&pagecache_lock);
 349                                 goto made_inode_progress;
 350                         }
 351                         goto cache_unlock_continue;
 352                 }
 353
 354                 printk(KERN_ERR "shrink_mmap: unknown LRU page!\n");
 355                 /* Failure exits: each label undoes one more step, then the page goes back to the head of lru_cache. */
 356 cache_unlock_continue:
 357                 spin_unlock(&pagecache_lock);
 358 unlock_continue:
 359                 spin_lock(&pagemap_lru_lock);
 360                 UnlockPage(page);
 361                 page_cache_release(page);
 362 dispose_continue:
 363                 list_add(page_lru, &lru_cache);
 364         }
 365         goto out;
 366 /* Success exits: entered with pagemap_lru_lock dropped; the page is not put back on lru_cache. */
 367 made_inode_progress:
 368         page_cache_release(page);       /* one release more than the buffer-only path below */
 369 made_buffer_progress:
 370         UnlockPage(page);
 371         page_cache_release(page);
 372         ret = 1;
 373         spin_lock(&pagemap_lru_lock);
 374         /* nr_lru_pages needs the spinlock */
 375         nr_lru_pages--;
 376
 377 out:
 378         spin_unlock(&pagemap_lru_lock);
 379
 380         return ret;
 381 }
 382 /* Walk a hash chain starting at @page for the entry matching (@mapping, @offset); callers in this file hold pagecache_lock. */
 383 static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
 384 {
 385         /*
 386          * Follow the next_hash links until the chain ends or an
 387          * entry with the requested mapping and index turns up.
 388          */
 389         while (page != NULL) {
 390                 if (page->mapping == mapping && page->index == offset) {
 391                         /* a successful lookup marks the page referenced */
 392                         SetPageReferenced(page);
 393                         break;
 394                 }
 395                 page = page->next_hash;
 396         }
 397
 398         /* NULL when nothing matched */
 399         return page;
 400 }
 401
 402 /*
 403  * Kick off write-out for one page's buffers. The caller has
 404  * already locked the page, so no further races need handling.
 405  *
 406  * Only starts the IO; always returns 0.
 407  */
 408 static int writeout_one_page(struct page *page)
 409 {
 410         struct buffer_head *const first = page->buffers;
 411         struct buffer_head *bh = first;
 412
 413         do {
 414                 if (!buffer_locked(bh) && buffer_dirty(bh) && buffer_uptodate(bh)) {
 415                         bh->b_flushtime = jiffies;
 416                         ll_rw_block(WRITE, 1, &bh);
 417                 }
 418                 bh = bh->b_this_page;
 419         } while (bh != first);
 420         return 0;
 421 }
 422 /* Wait for IO on every buffer of @page; returns -EIO if any requested buffer ended up not uptodate, else 0. */
 423 static int waitfor_one_page(struct page *page)
 424 {
 425         struct buffer_head *const first = page->buffers;
 426         struct buffer_head *bh = first;
 427         int error = 0;
 428         do {
 429                 wait_on_buffer(bh);
 430                 if (buffer_req(bh) && !buffer_uptodate(bh))
 431                         error = -EIO;
 432                 bh = bh->b_this_page;
 433         } while (bh != first);
 434         return error;
 435 }
 436 /* Apply @fn to every page of @inode that has buffers and whose index lies in [@start, @end); returns the OR of all @fn results. */
 437 static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end, int (*fn)(struct page *))
 438 {
 439         struct list_head *head, *curr;
 440         struct page *page;
 441         int retval = 0;
 442
 443         head = &inode->i_mapping->pages;
 444
 445         spin_lock(&pagecache_lock);
 446         curr = head->next;
 447         while (curr != head) {
 448                 page = list_entry(curr, struct page, list);
 449                 curr = curr->next;
 450                 if (!page->buffers)
 451                         continue;
 452                 if (page->index >= end)
 453                         continue;
 454                 if (page->index < start)
 455                         continue;
 456                 /* take a reference, then drop the spinlock: lock_page() may sleep */
 457                 page_cache_get(page);
 458                 spin_unlock(&pagecache_lock);
 459                 lock_page(page);
 460
 461                 /* The buffers could have been free'd while we waited for the page lock */
 462                 if (page->buffers)
 463                         retval |= fn(page);
 464
 465                 UnlockPage(page);
 466                 spin_lock(&pagecache_lock);
 467                 curr = page->list.next; /* successor is re-read under the lock, replacing the one saved before it was dropped */
 468                 page_cache_release(page);
 469         }
 470         spin_unlock(&pagecache_lock);
 471
 472         return retval;
 473 }
 474
 475 /*
 476  * Data sync in two passes over the same page range: the first
 477  * submits the writes, the second waits and gathers errors.
 478  */
 479 int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
 480 {
 481         int write_err, wait_err;
 482
 483         write_err = do_buffer_fdatasync(inode, start_idx, end_idx, writeout_one_page);
 484         wait_err = do_buffer_fdatasync(inode, start_idx, end_idx, waitfor_one_page);
 485         return write_err | wait_err;
 486 }
 487
 488 /*
 489  * Add a page to the inode page cache at @index of @mapping.
 490  *
 491  * The caller must have locked the page and
 492  * set all the page flags correctly.. (unlike __add_to_page_cache(), no flag is touched here)
 493  */
 494 void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
 495 {
 496         if (!PageLocked(page))
 497                 BUG();
 498         /* extra reference taken on behalf of the page cache */
 499         page_cache_get(page);
 500         spin_lock(&pagecache_lock);
 501         page->index = index;
 502         add_page_to_inode_queue(mapping, page);
 503         __add_page_to_hash_queue(page, page_hash(mapping, index));
 504         lru_cache_add(page);
 505         spin_unlock(&pagecache_lock);
 506 }
 507
 508 /*
 509  * Insert a page into the page cache. It starts out locked and owned by
 510  * the caller, with the uptodate, error, dirty and referenced bits cleared.
 511  */
 512 static inline void __add_to_page_cache(struct page * page,
 513         struct address_space *mapping, unsigned long offset,
 514         struct page **hash)
 515 {
 516         /* every flag survives except the four state bits listed here */
 517         unsigned long keep = ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced));
 518
 519         if (PageLocked(page))
 520                 BUG();
 521
 522         page->flags = (page->flags & keep) | (1 << PG_locked);
 523         page_cache_get(page);
 524         page->index = offset;
 525         add_page_to_inode_queue(mapping, page);
 526         __add_page_to_hash_queue(page, hash);
 527         lru_cache_add(page);
 528 }
 529 /* Locked front end for __add_to_page_cache(): takes pagecache_lock and looks up the hash chain for (@mapping, @offset) itself. */
 530 void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
 531 {
 532         spin_lock(&pagecache_lock);
 533         __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
 534         spin_unlock(&pagecache_lock);
 535 }
 536 /* Insert @page at (@mapping, @offset) unless an entry already exists there. Returns 0 if inserted, 1 if not. */
 537 static int add_to_page_cache_unique(struct page * page,
 538         struct address_space *mapping, unsigned long offset,
 539         struct page **hash)
 540 {
 541         int err = 1;
 542
 543         /*
 544          * Lookup and insertion happen under a single hold of
 545          * pagecache_lock.
 546          */
 547         spin_lock(&pagecache_lock);
 548         if (__find_page_nolock(mapping, offset, *hash) == NULL) {
 549                 __add_to_page_cache(page, mapping, offset, hash);
 550                 err = 0;
 551         }
 552         spin_unlock(&pagecache_lock);
 553
 554         return err;
 555 }
 556
 557 /*
 558  * Make sure the page at @offset is in the page cache: if it is missing,
 559  * allocate one, insert it and start ->readpage() on it. Returns 0, -ENOMEM or the ->readpage() result.
 560  */
 561 static inline int page_cache_read(struct file * file, unsigned long offset)
 562 {
 563         struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
 564         struct page **hash = page_hash(mapping, offset);
 565         struct page *page;
 566         int error;
 567
 568         spin_lock(&pagecache_lock);
 569         page = __find_page_nolock(mapping, offset, *hash);
 570         spin_unlock(&pagecache_lock);
 571         if (page != NULL)
 572                 return 0;       /* already cached, nothing to do */
 573
 574         page = page_cache_alloc();
 575         if (page == NULL)
 576                 return -ENOMEM;
 577
 578         if (add_to_page_cache_unique(page, mapping, offset, hash)) {
 579                 /*
 580                  * Unlikely: somebody raced with us and inserted a page
 581                  * for this offset first, so ours is not needed.
 582                  */
 583                 page_cache_free(page);
 584                 return 0;
 585         }
 586         error = mapping->a_ops->readpage(file, page);
 587         page_cache_release(page);
 588         return error;
 589 }
 590
 591 /*
 592  * Read a whole cluster in one go. A cluster is usually a 64k-aligned
 593  * run of pages containing the page requested in "offset"; reading stops once offset reaches filesize or on the first error.
 594  */
 595 static int read_cluster_nonblocking(struct file * file, unsigned long offset,
 596         unsigned long filesize)
 597 {
 598         unsigned long left = CLUSTER_PAGES;
 599
 600         /* start from the first page of the cluster that holds offset */
 601         for (offset = CLUSTER_OFFSET(offset); left > 0 && offset < filesize; left--, offset++) {
 602                 int error = page_cache_read(file, offset);
 603
 604                 if (error < 0)
 605                         return error;
 606         }
 607
 608         return 0;
 609 }
 610
 611 /*
 612  * Wait for a page to get unlocked.
 613  *
 614  * This must be called with the caller "holding" the page,
 615  * ie with increased "page->count" so that the page won't
 616  * go away during the wait..
 617  */
 618 void ___wait_on_page(struct page *page)
 619 {
 620         struct task_struct *tsk = current;
 621         DECLARE_WAITQUEUE(wait, tsk);
 622         /* we stay on the page's wait queue for the whole loop */
 623         add_wait_queue(&page->wait, &wait);
 624         do {
 625                 sync_page(page);        /* run the mapping's ->sync_page() hook, if any, before sleeping */
 626                 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 627                 if (!PageLocked(page))  /* lock bit re-tested after the task state is set */
 628                         break;
 629                 schedule();
 630         } while (PageLocked(page));
 631         tsk->state = TASK_RUNNING;
 632         remove_wait_queue(&page->wait, &wait);
 633 }
 634
 635 /* Take the page lock, sleeping in ___wait_on_page() for as long as somebody else holds it. */
 636 void lock_page(struct page *page)
 637 {
 638         for (;;) {
 639                 if (!TryLockPage(page))
 640                         return;
 641                 ___wait_on_page(page);
 642         }
 643 }
 644
 645 /*
 646  * a rather lightweight function, finding and getting a reference to a
 647  * hashed page atomically, waiting for it if it's locked.  Returns NULL if no matching page is hashed.
 648  */
 649 struct page * __find_get_page (struct address_space *mapping,
 650                                 unsigned long offset, struct page **hash)
 651 {
 652         struct page *page;
 653
 654         /*
 655          * We scan the hash list read-only. Addition to and removal from
 656          * the hash-list needs a held write-lock.
 657          */
 658 repeat:
 659         spin_lock(&pagecache_lock);
 660         page = __find_page_nolock(mapping, offset, *hash);
 661         if (page)
 662                 page_cache_get(page);
 663         spin_unlock(&pagecache_lock);
 664
 665         /* Found the page, sleep if locked. */
 666         if (page && PageLocked(page)) {
 667                 struct task_struct *tsk = current;
 668                 DECLARE_WAITQUEUE(wait, tsk);
 669
 670                 sync_page(page);
 671
 672                 __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 673                 add_wait_queue(&page->wait, &wait);
 674                 /* lock bit is tested again now that we are queued; sleep only if still locked */
 675                 if (PageLocked(page))
 676                         schedule();
 677                 __set_task_state(tsk, TASK_RUNNING);
 678                 remove_wait_queue(&page->wait, &wait);
 679
 680                 /*
 681                  * The page might have been unhashed meanwhile. It's
 682                  * not freed though because we hold a reference to it.
 683                  * If this is the case then it will be freed _here_,
 684                  * and we recheck the hash anyway.
 685                  */
 686                 page_cache_release(page);
 687                 goto repeat;
 688         }
 689         /*
 690          * It's not locked so we can return the page and we hold
 691          * a reference to it.
 692          */
 693         return page;
 694 }
 695
 696 /*
 697  * Look up a hashed page and lock it: returns the page locked and with a reference held, or NULL if no matching page is hashed.
 698  */
 699 struct page * __find_lock_page (struct address_space *mapping,
 700                                 unsigned long offset, struct page **hash)
 701 {
 702         struct page *page;
 703
 704         /*
 705          * We scan the hash list read-only. Addition to and removal from
 706          * the hash-list needs a held write-lock.
 707          */
 708 repeat:
 709         spin_lock(&pagecache_lock);
 710         page = __find_page_nolock(mapping, offset, *hash);
 711         if (page)
 712                 page_cache_get(page);
 713         spin_unlock(&pagecache_lock);
 714
 715         /* Found the page; if somebody else holds the lock, sleep and retry. */
 716         if (page && TryLockPage(page)) {
 717                 struct task_struct *tsk = current;
 718                 DECLARE_WAITQUEUE(wait, tsk);
 719
 720                 sync_page(page);
 721
 722                 __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 723                 add_wait_queue(&page->wait, &wait);
 724                 /* lock bit is tested again now that we are queued; sleep only if still locked */
 725                 if (PageLocked(page))
 726                         schedule();
 727                 __set_task_state(tsk, TASK_RUNNING);
 728                 remove_wait_queue(&page->wait, &wait);
 729
 730                 /*
 731                  * The page might have been unhashed meanwhile. It's
 732                  * not freed though because we hold a reference to it.
 733                  * If this is the case then it will be freed _here_,
 734                  * and we recheck the hash anyway.
 735                  */
 736                 page_cache_release(page);
 737                 goto repeat;
 738         }
 739         /*
 740          * Either no page was found (NULL), or TryLockPage() succeeded:
 741          * the page is now locked by us and we hold a reference to it.
 742          */
 743         return page;
 744 }
 745
 746 #if 0   /* change to 1 to define the two switches below; PROFILE_READAHEAD gates the profiling code further down */
 747 #define PROFILE_READAHEAD
 748 #define DEBUG_READAHEAD
 749 #endif
 750
 751 /*
 752  * Read-ahead profiling information
 753  * --------------------------------
 754  * Every PROFILE_MAXREADCOUNT, the following information is written
 755  * to the syslog:
 756  *   Percentage of asynchronous read-ahead.
 757  *   Average of read-ahead fields context value.
 758  * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
 759  * to the syslog.
 760  */
 761
 762 #ifdef PROFILE_READAHEAD
 763
 764 #define PROFILE_MAXREADCOUNT 1000
 765 /* Running sums since the last report; all five are reset when a report is printed. */
 766 static unsigned long total_reada;       /* number of profile_readahead() calls */
 767 static unsigned long total_async;       /* how many of those calls had async != 0 */
 768 static unsigned long total_ramax;
 769 static unsigned long total_ralen;
 770 static unsigned long total_rawin;
 771 /* Add @filp's read-ahead fields to the sums and, once more than PROFILE_MAXREADCOUNT samples are in, log the averages and start over. */
 772 static void profile_readahead(int async, struct file *filp)
 773 {
 774         unsigned long flags;
 775
 776         ++total_reada;
 777         if (async)
 778                 ++total_async;
 779
 780         total_ramax     += filp->f_ramax;
 781         total_ralen     += filp->f_ralen;
 782         total_rawin     += filp->f_rawin;
 783
 784         if (total_reada > PROFILE_MAXREADCOUNT) {
 785                 save_flags(flags);
 786                 cli();
 787                 if (!(total_reada > PROFILE_MAXREADCOUNT)) {    /* threshold re-checked with interrupts off */
 788                         restore_flags(flags);
 789                         return;
 790                 }
 791                 /* the sums are unsigned long, so the averages are printed with %lu */
 792                 printk("Readahead average:  max=%lu, len=%lu, win=%lu, async=%lu%%\n",
 793                         total_ramax/total_reada,
 794                         total_ralen/total_reada,
 795                         total_rawin/total_reada,
 796                         (total_async*100)/total_reada);
 797 #ifdef DEBUG_READAHEAD
 798                 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
 799                         filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);    /* NOTE(review): struct file field types are not visible here - confirm the specifiers */
 800 #endif
 801
 802                 total_reada     = 0;
 803                 total_async     = 0;
 804                 total_ramax     = 0;
 805                 total_ralen     = 0;
 806                 total_rawin     = 0;
 807
 808                 restore_flags(flags);
 809         }
 810 }
 811 #endif  /* defined PROFILE_READAHEAD */
 812
 813 /*
 814  * Read-ahead context:
 815  * -------------------
 816  * The read ahead context fields of the "struct file" are the following:
 817  * - f_raend : position of the first byte after the last page we tried to
 818  *             read ahead.
 819  * - f_ramax : current read-ahead maximum size.
 820  * - f_ralen : length of the current IO read block we tried to read-ahead.
 821  * - f_rawin : length of the current read-ahead window.
 822  *              if last read-ahead was synchronous then
 823  *                      f_rawin = f_ralen
 824  *              otherwise (was asynchronous)
 825  *                      f_rawin = previous value of f_ralen + f_ralen
 826  *
 827  * Read-ahead limits:
 828  * ------------------
 829  * MIN_READAHEAD   : minimum read-ahead size when read-ahead.
 830  * MAX_READAHEAD   : maximum read-ahead size when read-ahead.
 831  *
 832  * Synchronous read-ahead benefits:
 833  * --------------------------------
 834  * Using reasonable IO xfer length from peripheral devices increase system
 835  * performances.
 836  * Reasonable means, in this context, not too large but not too small.
 837  * The actual maximum value is:
 838  *      MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
 839  *      and 32K if defined (4K page size assumed).
 840  *
 841  * Asynchronous read-ahead benefits:
 842  * ---------------------------------
 843  * Overlapping next read request and user process execution increase system
 844  * performance.
 845  *
 846  * Read-ahead risks:
 847  * -----------------
 848  * We have to guess which further data are needed by the user process.
 849  * If these data are often not really needed, it's bad for system
 850  * performances.
 851  * However, we know that files are often accessed sequentially by
 852  * application programs and it seems that it is possible to have some good
 853  * strategy in that guessing.
 854  * We only try to read-ahead files that seem to be read sequentially.
 855  *
 856  * Asynchronous read-ahead risks:
 857  * ------------------------------
 858  * In order to maximize overlapping, we must start some asynchronous read
 859  * request from the device, as soon as possible.
 860  * We must be very careful about:
 861  * - The number of effective pending IO read requests.
 862  *   ONE seems to be the only reasonable value.
 863  * - The total memory pool usage for the file access stream.
 864  *   This maximum memory usage is implicitly 2 IO read chunks:
 865  *   2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
 866  *   64k if defined (4K page size assumed).
 867  */
 868
 869 static inline int get_max_readahead(struct inode * inode)
 870 {
 871         if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
 872                 return MAX_READAHEAD;
 873         return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
 874 }
 875
 876 static void generic_file_readahead(int reada_ok,
 877         struct file * filp, struct inode * inode,
 878         struct page * page)
 879 {
 880         unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
 881         unsigned long index = page->index;
 882         unsigned long max_ahead, ahead;
 883         unsigned long raend;
 884         int max_readahead = get_max_readahead(inode);
 885
 886         raend = filp->f_raend;
 887         max_ahead = 0;
 888
 889 /*
 890  * The current page is locked.
 891  * If the current position is inside the previous read IO request, do not
 892  * try to reread previously read ahead pages.
 893  * Otherwise decide or not to read ahead some pages synchronously.
 894  * If we are not going to read ahead, set the read ahead context for this
 895  * page only.
 896  */
 897         if (PageLocked(page)) {
 898                 if (!filp->f_ralen || index >= raend || index + filp->f_rawin < raend) {
 899                         raend = index;
 900                         if (raend < end_index)
 901                                 max_ahead = filp->f_ramax;
 902                         filp->f_rawin = 0;
 903                         filp->f_ralen = 1;
 904                         if (!max_ahead) {
 905                                 filp->f_raend  = index + filp->f_ralen;
 906                                 filp->f_rawin += filp->f_ralen;
 907                         }
 908                 }
 909         }
 910 /*
 911  * The current page is not locked.
 912  * If we were reading ahead and,
 913  * if the current max read ahead size is not zero and,
 914  * if the current position is inside the last read-ahead IO request,
 915  *   it is the moment to try to read ahead asynchronously.
 916  * We will later force unplug device in order to force asynchronous read IO.
 917  */
 918         else if (reada_ok && filp->f_ramax && raend >= 1 &&
 919                  index <= raend && index + filp->f_ralen >= raend) {
 920 /*
 921  * Add ONE page to max_ahead in order to try to have about the same IO max size
 922  * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
 923  * Compute the position of the last page we have tried to read in order to
 924  * begin to read ahead just at the next page.
 925  */
 926                 raend -= 1;
 927                 if (raend < end_index)
 928                         max_ahead = filp->f_ramax + 1;
 929
 930                 if (max_ahead) {
 931                         filp->f_rawin = filp->f_ralen;
 932                         filp->f_ralen = 0;
 933                         reada_ok      = 2;
 934                 }
 935         }
 936 /*
 937  * Try to read ahead pages.
 938  * We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the
 939  * scheduler, will work enough for us to avoid too bad actuals IO requests.
 940  */
 941         ahead = 0;
 942         while (ahead < max_ahead) {
 943                 ahead ++;
 944                 if ((raend + ahead) >= end_index)
 945                         break;
 946                 if (page_cache_read(filp, raend + ahead) < 0)
 947                         break;
 948         }
 949 /*
 950  * If we tried to read ahead some pages,
 951  * If we tried to read ahead asynchronously,
 952  *   Try to force unplug of the device in order to start an asynchronous
 953  *   read IO request.
 954  * Update the read-ahead context.
 955  * Store the length of the current read-ahead window.
 956  * Double the current max read ahead size.
 957  *   That heuristic avoid to do some large IO for files that are not really
 958  *   accessed sequentially.
 959  */
 960         if (ahead) {
 961                 if (reada_ok == 2) {
 962                         run_task_queue(&tq_disk);
 963                 }
 964
 965                 filp->f_ralen += ahead;
 966                 filp->f_rawin += filp->f_ralen;
 967                 filp->f_raend = raend + ahead + 1;
 968
 969                 filp->f_ramax += filp->f_ramax;
 970
 971                 if (filp->f_ramax > max_readahead)
 972                         filp->f_ramax = max_readahead;
 973
 974 #ifdef PROFILE_READAHEAD
 975                 profile_readahead((reada_ok == 2), filp);
 976 #endif
 977         }
 978
 979         return;
 980 }
 981
 982
 983 /*
 984  * This is a generic file read routine, and uses the
 985  * inode->i_op->readpage() function for the actual low-level
 986  * stuff.
 987  *
 988  * This is really ugly. But the goto's actually try to clarify some
 989  * of the logic when it comes to error handling etc.
 990  */
 991 void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
 992 {
 993         struct inode *inode = filp->f_dentry->d_inode;
 994         struct address_space *mapping = inode->i_mapping;
 995         unsigned long index, offset;
 996         struct page *cached_page;
 997         int reada_ok;
 998         int error;
 999         int max_readahead = get_max_readahead(inode);
1000
1001         cached_page = NULL;
1002         index = *ppos >> PAGE_CACHE_SHIFT;
1003         offset = *ppos & ~PAGE_CACHE_MASK;
1004
1005 /*
1006  * If the current position is outside the previous read-ahead window,
1007  * we reset the current read-ahead context and set read ahead max to zero
1008  * (will be set to just needed value later),
1009  * otherwise, we assume that the file accesses are sequential enough to
1010  * continue read-ahead.
1011  */
1012         if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {
1013                 reada_ok = 0;
1014                 filp->f_raend = 0;
1015                 filp->f_ralen = 0;
1016                 filp->f_ramax = 0;
1017                 filp->f_rawin = 0;
1018         } else {
1019                 reada_ok = 1;
1020         }
1021 /*
1022  * Adjust the current value of read-ahead max.
1023  * If the read operation stay in the first half page, force no readahead.
1024  * Otherwise try to increase read ahead max just enough to do the read request.
1025  * Then, at least MIN_READAHEAD if read ahead is ok,
1026  * and at most MAX_READAHEAD in all cases.
1027  */
1028         if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
1029                 filp->f_ramax = 0;
1030         } else {
1031                 unsigned long needed;
1032
1033                 needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;
1034
1035                 if (filp->f_ramax < needed)
1036                         filp->f_ramax = needed;
1037
1038                 if (reada_ok && filp->f_ramax < MIN_READAHEAD)
1039                                 filp->f_ramax = MIN_READAHEAD;
1040                 if (filp->f_ramax > max_readahead)
1041                         filp->f_ramax = max_readahead;
1042         }
1043
1044         for (;;) {
1045                 struct page *page, **hash;
1046                 unsigned long end_index, nr;
1047
1048                 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1049                 if (index > end_index)
1050                         break;
1051                 nr = PAGE_CACHE_SIZE;
1052                 if (index == end_index) {
1053                         nr = inode->i_size & ~PAGE_CACHE_MASK;
1054                         if (nr <= offset)
1055                                 break;
1056                 }
1057
1058                 nr = nr - offset;
1059
1060                 /*
1061                  * Try to find the data in the page cache..
1062                  */
1063                 hash = page_hash(mapping, index);
1064
1065                 spin_lock(&pagecache_lock);
1066                 page = __find_page_nolock(mapping, index, *hash);
1067                 if (!page)
1068                         goto no_cached_page;
1069 found_page:
1070                 page_cache_get(page);
1071                 spin_unlock(&pagecache_lock);
1072
1073                 if (!Page_Uptodate(page))
1074                         goto page_not_up_to_date;
1075                 generic_file_readahead(reada_ok, filp, inode, page);
1076 page_ok:
1077                 /* If users can be writing to this page using arbitrary
1078                  * virtual addresses, take care about potential aliasing
1079                  * before reading the page on the kernel side.
1080                  */
1081                 if (page->mapping->i_mmap_shared != NULL)
1082                         flush_dcache_page(page);
1083
1084                 /*
1085                  * Ok, we have the page, and it's up-to-date, so
1086                  * now we can copy it to user space...
1087                  *
1088                  * The actor routine returns how many bytes were actually used..
1089                  * NOTE! This may not be the same as how much of a user buffer
1090                  * we filled up (we may be padding etc), so we can only update
1091                  * "pos" here (the actor routine has to update the user buffer
1092                  * pointers and the remaining count).
1093                  */
1094                 nr = actor(desc, page, offset, nr);
1095                 offset += nr;
1096                 index += offset >> PAGE_CACHE_SHIFT;
1097                 offset &= ~PAGE_CACHE_MASK;
1098
1099                 page_cache_release(page);
1100                 if (nr && desc->count)
1101                         continue;
1102                 break;
1103
1104 /*
1105  * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
1106  */
1107 page_not_up_to_date:
1108                 generic_file_readahead(reada_ok, filp, inode, page);
1109
1110                 if (Page_Uptodate(page))
1111                         goto page_ok;
1112
1113                 /* Get exclusive access to the page ... */
1114                 lock_page(page);
1115                 if (Page_Uptodate(page)) {
1116                         UnlockPage(page);
1117                         goto page_ok;
1118                 }
1119
1120 readpage:
1121                 /* ... and start the actual read. The read will unlock the page. */
1122                 error = mapping->a_ops->readpage(filp, page);
1123
1124                 if (!error) {
1125                         if (Page_Uptodate(page))
1126                                 goto page_ok;
1127
1128                         /* Again, try some read-ahead while waiting for the page to finish.. */
1129                         generic_file_readahead(reada_ok, filp, inode, page);
1130                         wait_on_page(page);
1131                         if (Page_Uptodate(page))
1132                                 goto page_ok;
1133                         error = -EIO;
1134                 }
1135
1136                 /* UHHUH! A synchronous read error occurred. Report it */
1137                 desc->error = error;
1138                 page_cache_release(page);
1139                 break;
1140
1141 no_cached_page:
1142                 /*
1143                  * Ok, it wasn't cached, so we need to create a new
1144                  * page..
1145                  *
1146                  * We get here with the page cache lock held.
1147                  */
1148                 if (!cached_page) {
1149                         spin_unlock(&pagecache_lock);
1150                         cached_page = page_cache_alloc();
1151                         if (!cached_page) {
1152                                 desc->error = -ENOMEM;
1153                                 break;
1154                         }
1155
1156                         /*
1157                          * Somebody may have added the page while we
1158                          * dropped the page cache lock. Check for that.
1159                          */
1160                         spin_lock(&pagecache_lock);
1161                         page = __find_page_nolock(mapping, index, *hash);
1162                         if (page)
1163                                 goto found_page;
1164                 }
1165
1166                 /*
1167                  * Ok, add the new page to the hash-queues...
1168                  */
1169                 page = cached_page;
1170                 __add_to_page_cache(page, mapping, index, hash);
1171                 spin_unlock(&pagecache_lock);
1172                 cached_page = NULL;
1173
1174                 goto readpage;
1175         }
1176
1177         *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1178         filp->f_reada = 1;
1179         if (cached_page)
1180                 page_cache_free(cached_page);
1181         UPDATE_ATIME(inode);
1182 }
1183
1184 static int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1185 {
1186         unsigned long kaddr;
1187         unsigned long left, count = desc->count;
1188
1189         if (size > count)
1190                 size = count;
1191
1192         kaddr = kmap(page);
1193         left = __copy_to_user(desc->buf, (void *)(kaddr + offset), size);
1194         kunmap(page);
1195
1196         if (left) {
1197                 size -= left;
1198                 desc->error = -EFAULT;
1199         }
1200         desc->count = count - size;
1201         desc->written += size;
1202         desc->buf += size;
1203         return size;
1204 }
1205
1206 /*
1207  * This is the "read()" routine for all filesystems
1208  * that can use the page cache directly.
1209  */
1210 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1211 {
1212         ssize_t retval;
1213
1214         retval = -EFAULT;
1215         if (access_ok(VERIFY_WRITE, buf, count)) {
1216                 retval = 0;
1217
1218                 if (count) {
1219                         read_descriptor_t desc;
1220
1221                         desc.written = 0;
1222                         desc.count = count;
1223                         desc.buf = buf;
1224                         desc.error = 0;
1225                         do_generic_file_read(filp, ppos, &desc, file_read_actor);
1226
1227                         retval = desc.written;
1228                         if (!retval)
1229                                 retval = desc.error;
1230                 }
1231         }
1232         return retval;
1233 }
1234
1235 static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
1236 {
1237         unsigned long kaddr;
1238         ssize_t written;
1239         unsigned long count = desc->count;
1240         struct file *file = (struct file *) desc->buf;
1241         mm_segment_t old_fs;
1242
1243         if (size > count)
1244                 size = count;
1245         old_fs = get_fs();
1246         set_fs(KERNEL_DS);
1247
1248         kaddr = kmap(page);
1249         written = file->f_op->write(file, (char *)kaddr + offset,
1250                                                  size, &file->f_pos);
1251         kunmap(page);
1252         set_fs(old_fs);
1253         if (written < 0) {
1254                 desc->error = written;
1255                 written = 0;
1256         }
1257         desc->count = count - written;
1258         desc->written += written;
1259         return written;
1260 }
1261
1262 asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
1263 {
1264         ssize_t retval;
1265         struct file * in_file, * out_file;
1266         struct inode * in_inode, * out_inode;
1267
1268         /*
1269          * Get input file, and verify that it is ok..
1270          */
1271         retval = -EBADF;
1272         in_file = fget(in_fd);
1273         if (!in_file)
1274                 goto out;
1275         if (!(in_file->f_mode & FMODE_READ))
1276                 goto fput_in;
1277         retval = -EINVAL;
1278         in_inode = in_file->f_dentry->d_inode;
1279         if (!in_inode)
1280                 goto fput_in;
1281         if (!in_inode->i_mapping->a_ops->readpage)
1282                 goto fput_in;
1283         retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
1284         if (retval)
1285                 goto fput_in;
1286
1287         /*
1288          * Get output file, and verify that it is ok..
1289          */
1290         retval = -EBADF;
1291         out_file = fget(out_fd);
1292         if (!out_file)
1293                 goto fput_in;
1294         if (!(out_file->f_mode & FMODE_WRITE))
1295                 goto fput_out;
1296         retval = -EINVAL;
1297         if (!out_file->f_op || !out_file->f_op->write)
1298                 goto fput_out;
1299         out_inode = out_file->f_dentry->d_inode;
1300         if (!out_inode)
1301                 goto fput_out;
1302         retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
1303         if (retval)
1304                 goto fput_out;
1305
1306         retval = 0;
1307         if (count) {
1308                 read_descriptor_t desc;
1309                 loff_t pos = 0, *ppos;
1310
1311                 retval = -EFAULT;
1312                 ppos = &in_file->f_pos;
1313                 if (offset) {
1314                         if (get_user(pos, offset))
1315                                 goto fput_out;
1316                         ppos = &pos;
1317                 }
1318
1319                 desc.written = 0;
1320                 desc.count = count;
1321                 desc.buf = (char *) out_file;
1322                 desc.error = 0;
1323                 do_generic_file_read(in_file, ppos, &desc, file_send_actor);
1324
1325                 retval = desc.written;
1326                 if (!retval)
1327                         retval = desc.error;
1328                 if (offset)
1329                         put_user(pos, offset);
1330         }
1331
1332 fput_out:
1333         fput(out_file);
1334 fput_in:
1335         fput(in_file);
1336 out:
1337         return retval;
1338 }
1339
1340 /*
1341  * Read-ahead and flush behind for MADV_SEQUENTIAL areas.  Since we are
1342  * sure this is sequential access, we don't need a flexible read-ahead
1343  * window size -- we can always use a large fixed size window.
1344  */
1345 static void nopage_sequential_readahead(struct vm_area_struct * vma,
1346         unsigned long pgoff, unsigned long filesize)
1347 {
1348         unsigned long ra_window;
1349
1350         ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
1351         ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);
1352
1353         /* vm_raend is zero if we haven't read ahead in this area yet.  */
1354         if (vma->vm_raend == 0)
1355                 vma->vm_raend = vma->vm_pgoff + ra_window;
1356
1357         /*
1358          * If we've just faulted the page half-way through our window,
1359          * then schedule reads for the next window, and release the
1360          * pages in the previous window.
1361          */
1362         if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
1363                 unsigned long start = vma->vm_pgoff + vma->vm_raend;
1364                 unsigned long end = start + ra_window;
1365
1366                 if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
1367                         end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;
1368                 if (start > end)
1369                         return;
1370
1371                 while ((start < end) && (start < filesize)) {
1372                         if (read_cluster_nonblocking(vma->vm_file,
1373                                                         start, filesize) < 0)
1374                                 break;
1375                         start += CLUSTER_PAGES;
1376                 }
1377                 run_task_queue(&tq_disk);
1378
1379                 /* if we're far enough past the beginning of this area,
1380                    recycle pages that are in the previous window. */
1381                 if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
1382                         unsigned long window = ra_window << PAGE_SHIFT;
1383
1384                         end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
1385                         end -= window + window;
1386                         filemap_sync(vma, end - window, window, MS_INVALIDATE);
1387                 }
1388
1389                 vma->vm_raend += ra_window;
1390         }
1391
1392         return;
1393 }
1394
1395 /*
1396  * filemap_nopage() is invoked via the vma operations vector for a
1397  * mapped memory region to read in file data during a page fault.
1398  *
1399  * The goto's are kind of ugly, but this streamlines the normal case of having
1400  * it in the page cache, and handles the special cases reasonably without
1401  * having a lot of duplicated code.
1402  */
1403 struct page * filemap_nopage(struct vm_area_struct * area,
1404         unsigned long address, int no_share)
1405 {
1406         int error;
1407         struct file *file = area->vm_file;
1408         struct inode *inode = file->f_dentry->d_inode;
1409         struct address_space *mapping = inode->i_mapping;
1410         struct page *page, **hash, *old_page;
1411         unsigned long size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1412
1413         unsigned long pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1414
1415         /*
1416          * Semantics for shared and private memory areas are different
1417          * past the end of the file. A shared mapping past the last page
1418          * of the file is an error and results in a SIGBUS, while a
1419          * private mapping just maps in a zero page.
1420          */
1421         if ((pgoff >= size) && (area->vm_mm == current->mm))
1422                 return NULL;
1423
1424         /*
1425          * Do we have something in the page cache already?
1426          */
1427         hash = page_hash(mapping, pgoff);
1428 retry_find:
1429         page = __find_get_page(mapping, pgoff, hash);
1430         if (!page)
1431                 goto no_cached_page;
1432
1433         /*
1434          * Ok, found a page in the page cache, now we need to check
1435          * that it's up-to-date.
1436          */
1437         if (!Page_Uptodate(page))
1438                 goto page_not_uptodate;
1439
1440 success:
1441         /*
1442          * Try read-ahead for sequential areas.
1443          */
1444         if (VM_SequentialReadHint(area))
1445                 nopage_sequential_readahead(area, pgoff, size);
1446
1447         /*
1448          * Found the page and have a reference on it, need to check sharing
1449          * and possibly copy it over to another page..
1450          */
1451         old_page = page;
1452         if (no_share) {
1453                 struct page *new_page = page_cache_alloc();
1454
1455                 if (new_page) {
1456                         copy_user_highpage(new_page, old_page, address);
1457                         flush_page_to_ram(new_page);
1458                 } else
1459                         new_page = NOPAGE_OOM;
1460                 page_cache_release(page);
1461                 return new_page;
1462         }
1463
1464         flush_page_to_ram(old_page);
1465         return old_page;
1466
1467 no_cached_page:
1468         /*
1469          * If the requested offset is within our file, try to read a whole
1470          * cluster of pages at once.
1471          *
1472          * Otherwise, we're off the end of a privately mapped file,
1473          * so we need to map a zero page.
1474          */
1475         if ((pgoff < size) && !VM_RandomReadHint(area))
1476                 error = read_cluster_nonblocking(file, pgoff, size);
1477         else
1478                 error = page_cache_read(file, pgoff);
1479
1480         /*
1481          * The page we want has now been added to the page cache.
1482          * In the unlikely event that someone removed it in the
1483          * meantime, we'll just come back here and read it again.
1484          */
1485         if (error >= 0)
1486                 goto retry_find;
1487
1488         /*
1489          * An error return from page_cache_read can result if the
1490          * system is low on memory, or a problem occurs while trying
1491          * to schedule I/O.
1492          */
1493         if (error == -ENOMEM)
1494                 return NOPAGE_OOM;
1495         return NULL;
1496
1497 page_not_uptodate:
1498         lock_page(page);
1499         if (Page_Uptodate(page)) {
1500                 UnlockPage(page);
1501                 goto success;
1502         }
1503
1504         if (!mapping->a_ops->readpage(file, page)) {
1505                 wait_on_page(page);
1506                 if (Page_Uptodate(page))
1507                         goto success;
1508         }
1509
1510         /*
1511          * Umm, take care of errors if the page isn't up-to-date.
1512          * Try to re-read it _once_. We do this synchronously,
1513          * because there really aren't any performance issues here
1514          * and we need to check for errors.
1515          */
1516         lock_page(page);
1517         if (Page_Uptodate(page)) {
1518                 UnlockPage(page);
1519                 goto success;
1520         }
1521         ClearPageError(page);
1522         if (!mapping->a_ops->readpage(file, page)) {
1523                 wait_on_page(page);
1524                 if (Page_Uptodate(page))
1525                         goto success;
1526         }
1527
1528         /*
1529          * Things didn't work out. Return zero to tell the
1530          * mm layer so, possibly freeing the page cache page first.
1531          */
1532         page_cache_release(page);
1533         return NULL;
1534 }
1535
1536 static int filemap_write_page(struct file *file,
1537                               struct page * page,
1538                               int wait)
1539 {
1540         /*
1541          * If a task terminates while we're swapping the page, the vma and
1542          * and file could be released: try_to_swap_out has done a get_file.
1543          * vma/file is guaranteed to exist in the unmap/sync cases because
1544          * mmap_sem is held.
1545          */
1546         return page->mapping->a_ops->writepage(file, page);
1547 }
1548
1549
1550 /*
1551  * The page cache takes care of races between somebody
1552  * trying to swap something out and swap something in
1553  * at the same time..
1554  */
1555 extern void wakeup_bdflush(int);
1556 int filemap_swapout(struct page * page, struct file * file)
1557 {
1558         int retval = filemap_write_page(file, page, 0);
1559         wakeup_bdflush(0);
1560         return retval;
1561 }
1562
1563 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
1564         unsigned long address, unsigned int flags)
1565 {
1566         unsigned long pgoff;
1567         pte_t pte = *ptep;
1568         struct page *page;
1569         int error;
1570
1571         if (!(flags & MS_INVALIDATE)) {
1572                 if (!pte_present(pte))
1573                         return 0;
1574                 if (!pte_dirty(pte))
1575                         return 0;
1576                 flush_page_to_ram(pte_page(pte));
1577                 flush_cache_page(vma, address);
1578                 set_pte(ptep, pte_mkclean(pte));
1579                 flush_tlb_page(vma, address);
1580                 page = pte_page(pte);
1581                 page_cache_get(page);
1582         } else {
1583                 if (pte_none(pte))
1584                         return 0;
1585                 flush_cache_page(vma, address);
1586                 pte_clear(ptep);
1587                 flush_tlb_page(vma, address);
1588                 if (!pte_present(pte)) {
1589                         swap_free(pte_to_swp_entry(pte));
1590                         return 0;
1591                 }
1592                 page = pte_page(pte);
1593                 if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
1594                         page_cache_free(page);
1595                         return 0;
1596                 }
1597         }
1598         pgoff = (address - vma->vm_start) >> PAGE_CACHE_SHIFT;
1599         pgoff += vma->vm_pgoff;
1600         if (page->index != pgoff) {
1601                 printk("weirdness: pgoff=%lu index=%lu address=%lu vm_start=%lu vm_pgoff=%lu\n",
1602                         pgoff, page->index, address, vma->vm_start, vma->vm_pgoff);
1603         }
1604         lock_page(page);
1605         error = filemap_write_page(vma->vm_file, page, 1);
1606         UnlockPage(page);
1607         page_cache_free(page);
1608         return error;
1609 }
1610
1611 static inline int filemap_sync_pte_range(pmd_t * pmd,
1612         unsigned long address, unsigned long size,
1613         struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
1614 {
1615         pte_t * pte;
1616         unsigned long end;
1617         int error;
1618
1619         if (pmd_none(*pmd))
1620                 return 0;
1621         if (pmd_bad(*pmd)) {
1622                 pmd_ERROR(*pmd);
1623                 pmd_clear(pmd);
1624                 return 0;
1625         }
1626         pte = pte_offset(pmd, address);
1627         offset += address & PMD_MASK;
1628         address &= ~PMD_MASK;
1629         end = address + size;
1630         if (end > PMD_SIZE)
1631                 end = PMD_SIZE;
1632         error = 0;
1633         do {
1634                 error |= filemap_sync_pte(pte, vma, address + offset, flags);
1635                 address += PAGE_SIZE;
1636                 pte++;
1637         } while (address && (address < end));
1638         return error;
1639 }
1640
1641 static inline int filemap_sync_pmd_range(pgd_t * pgd,
1642         unsigned long address, unsigned long size,
1643         struct vm_area_struct *vma, unsigned int flags)
1644 {
1645         pmd_t * pmd;
1646         unsigned long offset, end;
1647         int error;
1648
1649         if (pgd_none(*pgd))
1650                 return 0;
1651         if (pgd_bad(*pgd)) {
1652                 pgd_ERROR(*pgd);
1653                 pgd_clear(pgd);
1654                 return 0;
1655         }
1656         pmd = pmd_offset(pgd, address);
1657         offset = address & PGDIR_MASK;
1658         address &= ~PGDIR_MASK;
1659         end = address + size;
1660         if (end > PGDIR_SIZE)
1661                 end = PGDIR_SIZE;
1662         error = 0;
1663         do {
1664                 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
1665                 address = (address + PMD_SIZE) & PMD_MASK;
1666                 pmd++;
1667         } while (address && (address < end));
1668         return error;
1669 }
1670
1671 int filemap_sync(struct vm_area_struct * vma, unsigned long address,
1672         size_t size, unsigned int flags)
1673 {
1674         pgd_t * dir;
1675         unsigned long end = address + size;
1676         int error = 0;
1677
1678         dir = pgd_offset(vma->vm_mm, address);
1679         flush_cache_range(vma->vm_mm, end - size, end);
1680         if (address >= end)
1681                 BUG();
1682         do {
1683                 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
1684                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
1685                 dir++;
1686         } while (address && (address < end));
1687         flush_tlb_range(vma->vm_mm, end - size, end);
1688         return error;
1689 }
1690
1691 /*
1692  * This handles (potentially partial) area unmaps..
1693  */
1694 static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
1695 {
1696         filemap_sync(vma, start, len, MS_ASYNC);
1697 }
1698
1699 /*
1700  * Shared mappings need to be able to do the right thing at
1701  * close/unmap/sync. They will also use the private file as
1702  * backing-store for swapping..
1703  */
1704 static struct vm_operations_struct file_shared_mmap = {
1705         unmap:          filemap_unmap,          /* unmap - we need to sync the pages */
1706         sync:           filemap_sync,
1707         nopage:         filemap_nopage,
1708         swapout:        filemap_swapout,
1709 };
1710
1711 /*
1712  * Private mappings just need to be able to load in the map.
1713  *
1714  * (This is actually used for shared mappings as well, if we
1715  * know they can't ever get write permissions..)
1716  */
1717 static struct vm_operations_struct file_private_mmap = {
1718         nopage:         filemap_nopage,
1719 };
1720
1721 /* This is used for a general mmap of a disk file */
1722
1723 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1724 {
1725         struct vm_operations_struct * ops;
1726         struct inode *inode = file->f_dentry->d_inode;
1727
1728         ops = &file_private_mmap;
1729         if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1730                 if (!inode->i_mapping->a_ops->writepage)
1731                         return -EINVAL;
1732                 ops = &file_shared_mmap;
1733         }
1734         if (!inode->i_sb || !S_ISREG(inode->i_mode))
1735                 return -EACCES;
1736         if (!inode->i_mapping->a_ops->readpage)
1737                 return -ENOEXEC;
1738         UPDATE_ATIME(inode);
1739         vma->vm_ops = ops;
1740         return 0;
1741 }
1742
1743 /*
1744  * The msync() system call.
1745  */
1746
1747 static int msync_interval(struct vm_area_struct * vma,
1748         unsigned long start, unsigned long end, int flags)
1749 {
1750         if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
1751                 int error;
1752                 error = vma->vm_ops->sync(vma, start, end-start, flags);
1753                 if (!error && (flags & MS_SYNC)) {
1754                         struct file * file = vma->vm_file;
1755                         if (file && file->f_op && file->f_op->fsync) {
1756                                 down(&file->f_dentry->d_inode->i_sem);
1757                                 error = file->f_op->fsync(file, file->f_dentry, 1);
1758                                 up(&file->f_dentry->d_inode->i_sem);
1759                         }
1760                 }
1761                 return error;
1762         }
1763         return 0;
1764 }
1765
1766 asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
1767 {
1768         unsigned long end;
1769         struct vm_area_struct * vma;
1770         int unmapped_error, error = -EINVAL;
1771
1772         down(&current->mm->mmap_sem);
1773         if (start & ~PAGE_MASK)
1774                 goto out;
1775         len = (len + ~PAGE_MASK) & PAGE_MASK;
1776         end = start + len;
1777         if (end < start)
1778                 goto out;
1779         if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1780                 goto out;
1781         error = 0;
1782         if (end == start)
1783                 goto out;
1784         /*
1785          * If the interval [start,end) covers some unmapped address ranges,
1786          * just ignore them, but return -EFAULT at the end.
1787          */
1788         vma = find_vma(current->mm, start);
1789         unmapped_error = 0;
1790         for (;;) {
1791                 /* Still start < end. */
1792                 error = -EFAULT;
1793                 if (!vma)
1794                         goto out;
1795                 /* Here start < vma->vm_end. */
1796                 if (start < vma->vm_start) {
1797                         unmapped_error = -EFAULT;
1798                         start = vma->vm_start;
1799                 }
1800                 /* Here vma->vm_start <= start < vma->vm_end. */
1801                 if (end <= vma->vm_end) {
1802                         if (start < end) {
1803                                 error = msync_interval(vma, start, end, flags);
1804                                 if (error)
1805                                         goto out;
1806                         }
1807                         error = unmapped_error;
1808                         goto out;
1809                 }
1810                 /* Here vma->vm_start <= start < vma->vm_end < end. */
1811                 error = msync_interval(vma, start, vma->vm_end, flags);
1812                 if (error)
1813                         goto out;
1814                 start = vma->vm_end;
1815                 vma = vma->vm_next;
1816         }
1817 out:
1818         up(&current->mm->mmap_sem);
1819         return error;
1820 }
1821
1822 static inline void setup_read_behavior(struct vm_area_struct * vma,
1823         int behavior)
1824 {
1825         VM_ClearReadHint(vma);
1826         switch(behavior) {
1827                 case MADV_SEQUENTIAL:
1828                         vma->vm_flags |= VM_SEQ_READ;
1829                         break;
1830                 case MADV_RANDOM:
1831                         vma->vm_flags |= VM_RAND_READ;
1832                         break;
1833                 default:
1834                         break;
1835         }
1836         return;
1837 }
1838
1839 static long madvise_fixup_start(struct vm_area_struct * vma,
1840         unsigned long end, int behavior)
1841 {
1842         struct vm_area_struct * n;
1843
1844         n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1845         if (!n)
1846                 return -EAGAIN;
1847         *n = *vma;
1848         n->vm_end = end;
1849         setup_read_behavior(n, behavior);
1850         n->vm_raend = 0;
1851         get_file(n->vm_file);
1852         if (n->vm_ops && n->vm_ops->open)
1853                 n->vm_ops->open(n);
1854         vmlist_modify_lock(vma->vm_mm);
1855         vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
1856         vma->vm_start = end;
1857         insert_vm_struct(current->mm, n);
1858         vmlist_modify_unlock(vma->vm_mm);
1859         return 0;
1860 }
1861
1862 static long madvise_fixup_end(struct vm_area_struct * vma,
1863         unsigned long start, int behavior)
1864 {
1865         struct vm_area_struct * n;
1866
1867         n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1868         if (!n)
1869                 return -EAGAIN;
1870         *n = *vma;
1871         n->vm_start = start;
1872         n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
1873         setup_read_behavior(n, behavior);
1874         n->vm_raend = 0;
1875         get_file(n->vm_file);
1876         if (n->vm_ops && n->vm_ops->open)
1877                 n->vm_ops->open(n);
1878         vmlist_modify_lock(vma->vm_mm);
1879         vma->vm_end = start;
1880         insert_vm_struct(current->mm, n);
1881         vmlist_modify_unlock(vma->vm_mm);
1882         return 0;
1883 }
1884
1885 static long madvise_fixup_middle(struct vm_area_struct * vma,
1886         unsigned long start, unsigned long end, int behavior)
1887 {
1888         struct vm_area_struct * left, * right;
1889
1890         left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1891         if (!left)
1892                 return -EAGAIN;
1893         right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1894         if (!right) {
1895                 kmem_cache_free(vm_area_cachep, left);
1896                 return -EAGAIN;
1897         }
1898         *left = *vma;
1899         *right = *vma;
1900         left->vm_end = start;
1901         right->vm_start = end;
1902         right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
1903         left->vm_raend = 0;
1904         right->vm_raend = 0;
1905         atomic_add(2, &vma->vm_file->f_count);
1906
1907         if (vma->vm_ops && vma->vm_ops->open) {
1908                 vma->vm_ops->open(left);
1909                 vma->vm_ops->open(right);
1910         }
1911         vmlist_modify_lock(vma->vm_mm);
1912         vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
1913         vma->vm_start = start;
1914         vma->vm_end = end;
1915         setup_read_behavior(vma, behavior);
1916         vma->vm_raend = 0;
1917         insert_vm_struct(current->mm, left);
1918         insert_vm_struct(current->mm, right);
1919         vmlist_modify_unlock(vma->vm_mm);
1920         return 0;
1921 }
1922
1923 /*
1924  * We can potentially split a vm area into separate
1925  * areas, each area with its own behavior.
1926  */
1927 static long madvise_behavior(struct vm_area_struct * vma,
1928         unsigned long start, unsigned long end, int behavior)
1929 {
1930         int error = 0;
1931
1932         /* This caps the number of vma's this process can own */
1933         if (vma->vm_mm->map_count > MAX_MAP_COUNT)
1934                 return -ENOMEM;
1935
1936         if (start == vma->vm_start) {
1937                 if (end == vma->vm_end) {
1938                         setup_read_behavior(vma, behavior);
1939                         vma->vm_raend = 0;
1940                 } else
1941                         error = madvise_fixup_start(vma, end, behavior);
1942         } else {
1943                 if (end == vma->vm_end)
1944                         error = madvise_fixup_end(vma, start, behavior);
1945                 else
1946                         error = madvise_fixup_middle(vma, start, end, behavior);
1947         }
1948
1949         return error;
1950 }
1951
1952 /*
1953  * Schedule all required I/O operations, then run the disk queue
1954  * to make sure they are started.  Do not wait for completion.
1955  */
1956 static long madvise_willneed(struct vm_area_struct * vma,
1957         unsigned long start, unsigned long end)
1958 {
1959         long error = -EBADF;
1960         struct file * file;
1961         unsigned long size, rlim_rss;
1962
1963         /* Doesn't work if there's no mapped file. */
1964         if (!vma->vm_file)
1965                 return error;
1966         file = vma->vm_file;
1967         size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
1968                                                         PAGE_CACHE_SHIFT;
1969
1970         start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1971         if (end > vma->vm_end)
1972                 end = vma->vm_end;
1973         end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1974
1975         /* Make sure this doesn't exceed the process's max rss. */
1976         error = -EIO;
1977         rlim_rss = current->rlim ?  current->rlim[RLIMIT_RSS].rlim_cur :
1978                                 LONG_MAX; /* default: see resource.h */
1979         if ((vma->vm_mm->rss + (end - start)) > rlim_rss)
1980                 return error;
1981
1982         /* round to cluster boundaries if this isn't a "random" area. */
1983         if (!VM_RandomReadHint(vma)) {
1984                 start = CLUSTER_OFFSET(start);
1985                 end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);
1986
1987                 while ((start < end) && (start < size)) {
1988                         error = read_cluster_nonblocking(file, start, size);
1989                         start += CLUSTER_PAGES;
1990                         if (error < 0)
1991                                 break;
1992                 }
1993         } else {
1994                 while ((start < end) && (start < size)) {
1995                         error = page_cache_read(file, start);
1996                         start++;
1997                         if (error < 0)
1998                                 break;
1999                 }
2000         }
2001
2002         /* Don't wait for someone else to push these requests. */
2003         run_task_queue(&tq_disk);
2004
2005         return error;
2006 }
2007
2008 /*
2009  * Application no longer needs these pages.  If the pages are dirty,
2010  * it's OK to just throw them away.  The app will be more careful about
2011  * data it wants to keep.  Be sure to free swap resources too.  The
2012  * zap_page_range call sets things up for shrink_mmap to actually free
2013  * these pages later if no one else has touched them in the meantime,
2014  * although we could add these pages to a global reuse list for
2015  * shrink_mmap to pick up before reclaiming other pages.
2016  *
2017  * NB: This interface discards data rather than pushes it out to swap,
2018  * as some implementations do.  This has performance implications for
2019  * applications like large transactional databases which want to discard
2020  * pages in anonymous maps after committing to backing store the data
2021  * that was kept in them.  There is no reason to write this data out to
2022  * the swap area if the application is discarding it.
2023  *
2024  * An interface that causes the system to free clean pages and flush
2025  * dirty pages is already available as msync(MS_INVALIDATE).
2026  */
2027 static long madvise_dontneed(struct vm_area_struct * vma,
2028         unsigned long start, unsigned long end)
2029 {
2030         if (vma->vm_flags & VM_LOCKED)
2031                 return -EINVAL;
2032
2033         flush_cache_range(vma->vm_mm, start, end);
2034         zap_page_range(vma->vm_mm, start, end - start);
2035         flush_tlb_range(vma->vm_mm, start, end);
2036         return 0;
2037 }
2038
2039 static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
2040         unsigned long end, int behavior)
2041 {
2042         long error = -EBADF;
2043
2044         switch (behavior) {
2045         case MADV_NORMAL:
2046         case MADV_SEQUENTIAL:
2047         case MADV_RANDOM:
2048                 error = madvise_behavior(vma, start, end, behavior);
2049                 break;
2050
2051         case MADV_WILLNEED:
2052                 error = madvise_willneed(vma, start, end);
2053                 break;
2054
2055         case MADV_DONTNEED:
2056                 error = madvise_dontneed(vma, start, end);
2057                 break;
2058
2059         default:
2060                 error = -EINVAL;
2061                 break;
2062         }
2063
2064         return error;
2065 }
2066
2067 /*
2068  * The madvise(2) system call.
2069  *
2070  * Applications can use madvise() to advise the kernel how it should
2071  * handle paging I/O in this VM area.  The idea is to help the kernel
2072  * use appropriate read-ahead and caching techniques.  The information
2073  * provided is advisory only, and can be safely disregarded by the
2074  * kernel without affecting the correct operation of the application.
2075  *
2076  * behavior values:
2077  *  MADV_NORMAL - the default behavior is to read clusters.  This
2078  *              results in some read-ahead and read-behind.
2079  *  MADV_RANDOM - the system should read the minimum amount of data
2080  *              on any access, since it is unlikely that the appli-
2081  *              cation will need more than what it asks for.
2082  *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
2083  *              once, so they can be aggressively read ahead, and
2084  *              can be freed soon after they are accessed.
2085  *  MADV_WILLNEED - the application is notifying the system to read
2086  *              some pages ahead.
2087  *  MADV_DONTNEED - the application is finished with the given range,
2088  *              so the kernel can free resources associated with it.
2089  *
2090  * return values:
2091  *  zero    - success
2092  *  -EINVAL - start + len < 0, start is not page-aligned,
2093  *              "behavior" is not a valid value, or application
2094  *              is attempting to release locked or shared pages.
2095  *  -ENOMEM - addresses in the specified range are not currently
2096  *              mapped, or are outside the AS of the process.
2097  *  -EIO    - an I/O error occurred while paging in data.
2098  *  -EBADF  - map exists, but area maps something that isn't a file.
2099  *  -EAGAIN - a kernel resource was temporarily unavailable.
2100  */
2101 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
2102 {
2103         unsigned long end;
2104         struct vm_area_struct * vma;
2105         int unmapped_error = 0;
2106         int error = -EINVAL;
2107
2108         down(&current->mm->mmap_sem);
2109
2110         if (start & ~PAGE_MASK)
2111                 goto out;
2112         len = (len + ~PAGE_MASK) & PAGE_MASK;
2113         end = start + len;
2114         if (end < start)
2115                 goto out;
2116
2117         error = 0;
2118         if (end == start)
2119                 goto out;
2120
2121         /*
2122          * If the interval [start,end) covers some unmapped address
2123          * ranges, just ignore them, but return -ENOMEM at the end.
2124          */
2125         vma = find_vma(current->mm, start);
2126         for (;;) {
2127                 /* Still start < end. */
2128                 error = -ENOMEM;
2129                 if (!vma)
2130                         goto out;
2131
2132                 /* Here start < vma->vm_end. */
2133                 if (start < vma->vm_start) {
2134                         unmapped_error = -ENOMEM;
2135                         start = vma->vm_start;
2136                 }
2137
2138                 /* Here vma->vm_start <= start < vma->vm_end. */
2139                 if (end <= vma->vm_end) {
2140                         if (start < end) {
2141                                 error = madvise_vma(vma, start, end,
2142                                                         behavior);
2143                                 if (error)
2144                                         goto out;
2145                         }
2146                         error = unmapped_error;
2147                         goto out;
2148                 }
2149
2150                 /* Here vma->vm_start <= start < vma->vm_end < end. */
2151                 error = madvise_vma(vma, start, vma->vm_end, behavior);
2152                 if (error)
2153                         goto out;
2154                 start = vma->vm_end;
2155                 vma = vma->vm_next;
2156         }
2157
2158 out:
2159         up(&current->mm->mmap_sem);
2160         return error;
2161 }
2162
2163 /*
2164  * Later we can get more picky about what "in core" means precisely.
2165  * For now, simply check to see if the page is in the page cache,
2166  * and is up to date; i.e. that no page-in operation would be required
2167  * at this time if an application were to map and access this page.
2168  */
2169 static unsigned char mincore_page(struct vm_area_struct * vma,
2170         unsigned long pgoff)
2171 {
2172         unsigned char present = 0;
2173         struct address_space * as = &vma->vm_file->f_dentry->d_inode->i_data;
2174         struct page * page, ** hash = page_hash(as, pgoff);
2175
2176         spin_lock(&pagecache_lock);
2177         page = __find_page_nolock(as, pgoff, *hash);
2178         if ((page) && (Page_Uptodate(page)))
2179                 present = 1;
2180         spin_unlock(&pagecache_lock);
2181
2182         return present;
2183 }
2184
2185 static long mincore_vma(struct vm_area_struct * vma,
2186         unsigned long start, unsigned long end, unsigned char * vec)
2187 {
2188         long error, i, remaining;
2189         unsigned char * tmp;
2190
2191         error = -ENOMEM;
2192         if (!vma->vm_file)
2193                 return error;
2194
2195         start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2196         if (end > vma->vm_end)
2197                 end = vma->vm_end;
2198         end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2199
2200         error = -EAGAIN;
2201         tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
2202         if (!tmp)
2203                 return error;
2204
2205         /* (end - start) is # of pages, and also # of bytes in "vec */
2206         remaining = (end - start),
2207
2208         error = 0;
2209         for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
2210                 int j = 0;
2211                 long thispiece = (remaining < PAGE_SIZE) ?
2212                                                 remaining : PAGE_SIZE;
2213
2214                 while (j < thispiece)
2215                         tmp[j++] = mincore_page(vma, start++);
2216
2217                 if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
2218                         error = -EFAULT;
2219                         break;
2220                 }
2221         }
2222
2223         free_page((unsigned long) tmp);
2224         return error;
2225 }
2226
2227 /*
2228  * The mincore(2) system call.
2229  *
2230  * mincore() returns the memory residency status of the pages in the
2231  * current process's address space specified by [addr, addr + len).
2232  * The status is returned in a vector of bytes.  The least significant
2233  * bit of each byte is 1 if the referenced page is in memory, otherwise
2234  * it is zero.
2235  *
2236  * Because the status of a page can change after mincore() checks it
2237  * but before it returns to the application, the returned vector may
2238  * contain stale information.  Only locked pages are guaranteed to
2239  * remain in memory.
2240  *
2241  * return values:
2242  *  zero    - success
2243  *  -EFAULT - vec points to an illegal address
2244  *  -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
2245  *              or len has a nonpositive value
2246  *  -ENOMEM - Addresses in the range [addr, addr + len] are
2247  *              invalid for the address space of this process, or
2248  *              specify one or more pages which are not currently
2249  *              mapped
2250  *  -EAGAIN - A kernel resource was temporarily unavailable.
2251  */
2252 asmlinkage long sys_mincore(unsigned long start, size_t len,
2253         unsigned char * vec)
2254 {
2255         int index = 0;
2256         unsigned long end;
2257         struct vm_area_struct * vma;
2258         int unmapped_error = 0;
2259         long error = -EINVAL;
2260
2261         down(&current->mm->mmap_sem);
2262
2263         if (start & ~PAGE_CACHE_MASK)
2264                 goto out;
2265         len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK;
2266         end = start + len;
2267         if (end < start)
2268                 goto out;
2269
2270         error = 0;
2271         if (end == start)
2272                 goto out;
2273
2274         /*
2275          * If the interval [start,end) covers some unmapped address
2276          * ranges, just ignore them, but return -ENOMEM at the end.
2277          */
2278         vma = find_vma(current->mm, start);
2279         for (;;) {
2280                 /* Still start < end. */
2281                 error = -ENOMEM;
2282                 if (!vma)
2283                         goto out;
2284
2285                 /* Here start < vma->vm_end. */
2286                 if (start < vma->vm_start) {
2287                         unmapped_error = -ENOMEM;
2288                         start = vma->vm_start;
2289                 }
2290
2291                 /* Here vma->vm_start <= start < vma->vm_end. */
2292                 if (end <= vma->vm_end) {
2293                         if (start < end) {
2294                                 error = mincore_vma(vma, start, end,
2295                                                         &vec[index]);
2296                                 if (error)
2297                                         goto out;
2298                         }
2299                         error = unmapped_error;
2300                         goto out;
2301                 }
2302
2303                 /* Here vma->vm_start <= start < vma->vm_end < end. */
2304                 error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
2305                 if (error)
2306                         goto out;
2307                 index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
2308                 start = vma->vm_end;
2309                 vma = vma->vm_next;
2310         }
2311
2312 out:
2313         up(&current->mm->mmap_sem);
2314         return error;
2315 }
2316
2317 static inline
2318 struct page *__read_cache_page(struct address_space *mapping,
2319                                 unsigned long index,
2320                                 int (*filler)(void *,struct page*),
2321                                 void *data)
2322 {
2323         struct page **hash = page_hash(mapping, index);
2324         struct page *page, *cached_page = NULL;
2325         int err;
2326 repeat:
2327         page = __find_get_page(mapping, index, hash);
2328         if (!page) {
2329                 if (!cached_page) {
2330                         cached_page = page_cache_alloc();
2331                         if (!cached_page)
2332                                 return ERR_PTR(-ENOMEM);
2333                 }
2334                 page = cached_page;
2335                 if (add_to_page_cache_unique(page, mapping, index, hash))
2336                         goto repeat;
2337                 cached_page = NULL;
2338                 err = filler(data, page);
2339                 if (err < 0) {
2340                         page_cache_release(page);
2341                         page = ERR_PTR(err);
2342                 }
2343         }
2344         if (cached_page)
2345                 page_cache_free(cached_page);
2346         return page;
2347 }
2348
/*
 * Read a page into the page cache.  If the page already exists but
 * Page_Uptodate() is not set, try to fill it.
 *
 * Returns the page, or an ERR_PTR() value if the lookup/allocation or
 * the filler failed.  After a failed refill the page reference is
 * dropped.
 */
struct page *read_cache_page(struct address_space *mapping,
				unsigned long index,
				int (*filler)(void *,struct page*),
				void *data)
{
	struct page *page = __read_cache_page(mapping, index, filler, data);
	int err;

	if (IS_ERR(page))
		return page;
	if (Page_Uptodate(page))
		return page;

	lock_page(page);
	/* Re-test now that the page lock is held. */
	if (Page_Uptodate(page)) {
		UnlockPage(page);
		return page;
	}

	/* NOTE(review): the page is not unlocked here - presumably the
	 * filler does that; confirm against the fillers in use. */
	err = filler(data, page);
	if (err < 0) {
		page_cache_release(page);
		page = ERR_PTR(err);
	}
	return page;
}
2377
2378 static inline struct page * __grab_cache_page(struct address_space *mapping,
2379                                 unsigned long index, struct page **cached_page)
2380 {
2381         struct page *page, **hash = page_hash(mapping, index);
2382 repeat:
2383         page = __find_lock_page(mapping, index, hash);
2384         if (!page) {
2385                 if (!*cached_page) {
2386                         *cached_page = page_cache_alloc();
2387                         if (!*cached_page)
2388                                 return NULL;
2389                 }
2390                 page = *cached_page;
2391                 if (add_to_page_cache_unique(page, mapping, index, hash))
2392                         goto repeat;
2393                 *cached_page = NULL;
2394         }
2395         return page;
2396 }
2397
2398 /*
2399  * Returns locked page at given index in given cache, creating it if needed.
2400  */
2401
2402 struct page *grab_cache_page(struct address_space *mapping, unsigned long index)
2403 {
2404         struct page *cached_page = NULL;
2405         struct page *page = __grab_cache_page(mapping,index,&cached_page);
2406         if (cached_page)
2407                 page_cache_free(cached_page);
2408         return page;
2409 }
2410
2411 static inline void remove_suid(struct inode *inode)
2412 {
2413         unsigned int mode;
2414
2415         /* set S_IGID if S_IXGRP is set, and always set S_ISUID */
2416         mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
2417
2418         /* was any of the uid bits set? */
2419         mode &= inode->i_mode;
2420         if (mode && !capable(CAP_FSETID)) {
2421                 inode->i_mode &= ~mode;
2422                 mark_inode_dirty(inode);
2423         }
2424 }
2425
2426 /*
2427  * Write to a file through the page cache.
2428  *
2429  * We currently put everything into the page cache prior to writing it.
2430  * This is not a problem when writing full pages. With partial pages,
2431  * however, we first have to read the data into the cache, then
2432  * dirty the page, and finally schedule it for writing. Alternatively, we
2433  * could write-through just the portion of data that would go into that
2434  * page, but that would kill performance for applications that write data
2435  * line by line, and it's prone to race conditions.
2436  *
2437  * Note that this routine doesn't try to keep track of dirty pages. Each
2438  * file system has to do this all by itself, unfortunately.
2439  *                                                      okir@monad.swb.de
2440  */
2441 ssize_t
2442 generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
2443 {
2444         struct inode    *inode = file->f_dentry->d_inode;
2445         struct address_space *mapping = inode->i_mapping;
2446         unsigned long   limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
2447         loff_t          pos;
2448         struct page     *page, *cached_page;
2449         unsigned long   written;
2450         long            status;
2451         int             err;
2452
2453         cached_page = NULL;
2454
2455         down(&inode->i_sem);
2456
2457         pos = *ppos;
2458         err = -EINVAL;
2459         if (pos < 0)
2460                 goto out;
2461
2462         err = file->f_error;
2463         if (err) {
2464                 file->f_error = 0;
2465                 goto out;
2466         }
2467
2468         written = 0;
2469
2470         if (file->f_flags & O_APPEND)
2471                 pos = inode->i_size;
2472
2473         /*
2474          * Check whether we've reached the file size limit.
2475          */
2476         err = -EFBIG;
2477         if (limit != RLIM_INFINITY) {
2478                 if (pos >= limit) {
2479                         send_sig(SIGXFSZ, current, 0);
2480                         goto out;
2481                 }
2482                 if (count > limit - pos) {
2483                         send_sig(SIGXFSZ, current, 0);
2484                         count = limit - pos;
2485                 }
2486         }
2487
2488         status  = 0;
2489         if (count) {
2490                 remove_suid(inode);
2491                 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
2492                 mark_inode_dirty(inode);
2493         }
2494
2495         while (count) {
2496                 unsigned long bytes, index, offset;
2497                 char *kaddr;
2498
2499                 /*
2500                  * Try to find the page in the cache. If it isn't there,
2501                  * allocate a free page.
2502                  */
2503                 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
2504                 index = pos >> PAGE_CACHE_SHIFT;
2505                 bytes = PAGE_CACHE_SIZE - offset;
2506                 if (bytes > count)
2507                         bytes = count;
2508
2509                 status = -ENOMEM;       /* we'll assign it later anyway */
2510                 page = __grab_cache_page(mapping, index, &cached_page);
2511                 if (!page)
2512                         break;
2513
2514                 /* We have exclusive IO access to the page.. */
2515                 if (!PageLocked(page)) {
2516                         PAGE_BUG(page);
2517                 }
2518
2519                 status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
2520                 if (status)
2521                         goto unlock;
2522                 kaddr = page_address(page);
2523                 status = copy_from_user(kaddr+offset, buf, bytes);
2524                 flush_dcache_page(page);
2525                 if (status)
2526                         goto fail_write;
2527                 status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
2528                 if (!status)
2529                         status = bytes;
2530
2531                 if (status >= 0) {
2532                         written += status;
2533                         count -= status;
2534                         pos += status;
2535                         buf += status;
2536                 }
2537 unlock:
2538                 /* Mark it unlocked again and drop the page.. */
2539                 UnlockPage(page);
2540                 page_cache_release(page);
2541
2542                 if (status < 0)
2543                         break;
2544         }
2545         *ppos = pos;
2546
2547         if (cached_page)
2548                 page_cache_free(cached_page);
2549
2550         err = written ? written : status;
2551 out:
2552         up(&inode->i_sem);
2553         return err;
2554 fail_write:
2555         status = -EFAULT;
2556         ClearPageUptodate(page);
2557         kunmap(page);
2558         goto unlock;
2559 }
2560
2561 void __init page_cache_init(unsigned long mempages)
2562 {
2563         unsigned long htable_size, order;
2564
2565         htable_size = mempages;
2566         htable_size *= sizeof(struct page *);
2567         for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
2568                 ;
2569
2570         do {
2571                 unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
2572
2573                 page_hash_bits = 0;
2574                 while((tmp >>= 1UL) != 0UL)
2575                         page_hash_bits++;
2576
2577                 page_hash_table = (struct page **)
2578                         __get_free_pages(GFP_ATOMIC, order);
2579         } while(page_hash_table == NULL && --order > 0);
2580
2581         printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
2582                (1 << page_hash_bits), order, (PAGE_SIZE << order));
2583         if (!page_hash_table)
2584                 panic("Failed to allocate page hash table\n");
2585         memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
2586 }