mm/filemap.c

   1 /*
   2  *      linux/mm/filemap.c
   3  *
   4  * Copyright (C) 1994-1999  Linus Torvalds
   5  */
   6
   7 /*
   8  * This file handles the generic file mmap semantics used by
   9  * most "normal" filesystems (but you don't /have/ to use this:
  10  * the NFS filesystem used to do this differently, for example)
  11  */
  12 #include <linux/malloc.h>
  13 #include <linux/shm.h>
  14 #include <linux/mman.h>
  15 #include <linux/locks.h>
  16 #include <linux/pagemap.h>
  17 #include <linux/swap.h>
  18 #include <linux/smp_lock.h>
  19 #include <linux/blkdev.h>
  20 #include <linux/file.h>
  21 #include <linux/swapctl.h>
  22 #include <linux/slab.h>
  23 #include <linux/init.h>
  24 #include <linux/mm.h>
  25
  26 #include <asm/pgalloc.h>
  27 #include <asm/uaccess.h>
  28 #include <asm/mman.h>
  29
  30 #include <linux/highmem.h>
  31
  32 /*
  33  * Shared mappings implemented 30.11.1994. It's not fully working yet,
  34  * though.
  35  *
  36  * Shared mappings now work. 15.8.1995  Bruno.
  37  *
  38  * finished 'unifying' the page and buffer cache and SMP-threaded the
  39  * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
  40  *
  41  * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
  42  */
  43
  44 atomic_t page_cache_size = ATOMIC_INIT(0);
  45 unsigned int page_hash_bits;
  46 struct page **page_hash_table;
  47 struct list_head lru_cache;
  48
  49 static spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
  50 /*
  51  * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
  52  *       the pagemap_lru_lock held.
  53  */
  54 spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;
  55
  56 #define CLUSTER_PAGES           (1 << page_cluster)
  57 #define CLUSTER_OFFSET(x)       (((x) >> page_cluster) << page_cluster)
  58
  59 void __add_page_to_hash_queue(struct page * page, struct page **p)
  60 {
  61         atomic_inc(&page_cache_size);
  62         if((page->next_hash = *p) != NULL)
  63                 (*p)->pprev_hash = &page->next_hash;
  64         *p = page;
  65         page->pprev_hash = p;
  66         if (page->buffers)
  67                 PAGE_BUG(page);
  68 }
  69
  70 static inline void remove_page_from_hash_queue(struct page * page)
  71 {
  72         if(page->pprev_hash) {
  73                 if(page->next_hash)
  74                         page->next_hash->pprev_hash = page->pprev_hash;
  75                 *page->pprev_hash = page->next_hash;
  76                 page->pprev_hash = NULL;
  77         }
  78         atomic_dec(&page_cache_size);
  79 }
  80
  81 static inline int sync_page(struct page *page)
  82 {
  83         struct address_space *mapping = page->mapping;
  84
  85         if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
  86                 return mapping->a_ops->sync_page(page);
  87         return 0;
  88 }
  89
  90 /*
  91  * Remove a page from the page cache and free it. Caller has to make
  92  * sure the page is locked and that nobody else uses it - or that usage
  93  * is safe.
  94  */
  95 static inline void __remove_inode_page(struct page *page)
  96 {
  97         remove_page_from_inode_queue(page);
  98         remove_page_from_hash_queue(page);
  99         page->mapping = NULL;
 100 }
 101
 102 void remove_inode_page(struct page *page)
 103 {
 104         if (!PageLocked(page))
 105                 PAGE_BUG(page);
 106
 107         spin_lock(&pagecache_lock);
 108         __remove_inode_page(page);
 109         spin_unlock(&pagecache_lock);
 110 }
 111
 112 /**
 113  * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
 114  * @inode: the inode which pages we want to invalidate
 115  *
 116  * This function only removes the unlocked pages, if you want to
 117  * remove all the pages of one inode, you must call truncate_inode_pages.
 118  */
 119
 120 void invalidate_inode_pages(struct inode * inode)
 121 {
 122         struct list_head *head, *curr;
 123         struct page * page;
 124
 125         head = &inode->i_mapping->pages;
 126
 127         spin_lock(&pagecache_lock);
 128         spin_lock(&pagemap_lru_lock);
 129         curr = head->next;
 130
 131         while (curr != head) {
 132                 page = list_entry(curr, struct page, list);
 133                 curr = curr->next;
 134
 135                 /* We cannot invalidate a locked page */
 136                 if (TryLockPage(page))
 137                         continue;
 138
 139                 __lru_cache_del(page);
 140                 __remove_inode_page(page);
 141                 UnlockPage(page);
 142                 page_cache_release(page);
 143         }
 144
 145         spin_unlock(&pagemap_lru_lock);
 146         spin_unlock(&pagecache_lock);
 147 }
 148
 149 /*
 150  * Truncate the page cache at a set offset, removing the pages
 151  * that are beyond that offset (and zeroing out partial pages).
 152  */
 153 void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
 154 {
 155         struct list_head *head, *curr;
 156         struct page * page;
 157         unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
 158         unsigned long start;
 159
 160         start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 161
 162 repeat:
 163         head = &mapping->pages;
 164         spin_lock(&pagecache_lock);
 165         curr = head->next;
 166         while (curr != head) {
 167                 unsigned long offset;
 168
 169                 page = list_entry(curr, struct page, list);
 170                 curr = curr->next;
 171
 172                 offset = page->index;
 173
 174                 /* page wholly truncated - free it */
 175                 if (offset >= start) {
 176                         if (TryLockPage(page)) {
 177                                 page_cache_get(page);
 178                                 spin_unlock(&pagecache_lock);
 179                                 wait_on_page(page);
 180                                 page_cache_release(page);
 181                                 goto repeat;
 182                         }
 183                         page_cache_get(page);
 184                         spin_unlock(&pagecache_lock);
 185
 186                         if (!page->buffers || block_flushpage(page, 0))
 187                                 lru_cache_del(page);
 188
 189                         /*
 190                          * We remove the page from the page cache
 191                          * _after_ we have destroyed all buffer-cache
 192                          * references to it. Otherwise some other process
 193                          * might think this inode page is not in the
 194                          * page cache and creates a buffer-cache alias
 195                          * to it causing all sorts of fun problems ...
 196                          */
 197                         remove_inode_page(page);
 198                         ClearPageDirty(page);
 199
 200                         UnlockPage(page);
 201                         page_cache_release(page);
 202                         page_cache_release(page);
 203
 204                         /*
 205                          * We have done things without the pagecache lock,
 206                          * so we'll have to repeat the scan.
 207                          * It's not possible to deadlock here because
 208                          * we are guaranteed to make progress. (ie. we have
 209                          * just removed a page)
 210                          */
 211                         goto repeat;
 212                 }
 213                 /*
 214                  * there is only one partial page possible.
 215                  */
 216                 if (!partial)
 217                         continue;
 218
 219                 /* and it's the one preceeding the first wholly truncated page */
 220                 if ((offset + 1) != start)
 221                         continue;
 222
 223                 /* partial truncate, clear end of page */
 224                 if (TryLockPage(page)) {
 225                         spin_unlock(&pagecache_lock);
 226                         goto repeat;
 227                 }
 228                 page_cache_get(page);
 229                 spin_unlock(&pagecache_lock);
 230
 231                 memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
 232                 if (page->buffers)
 233                         block_flushpage(page, partial);
 234
 235                 partial = 0;
 236
 237                 /*
 238                  * we have dropped the spinlock so we have to
 239                  * restart.
 240                  */
 241                 UnlockPage(page);
 242                 page_cache_release(page);
 243                 goto repeat;
 244         }
 245         spin_unlock(&pagecache_lock);
 246 }
 247
 248 /*
 249  * nr_dirty represents the number of dirty pages that we will write async
 250  * before doing sync writes.  We can only do sync writes if we can
 251  * wait for IO (__GFP_IO set).
 252  */
 253 int shrink_mmap(int priority, int gfp_mask)
 254 {
 255         int ret = 0, count, nr_dirty;
 256         struct list_head * page_lru;
 257         struct page * page = NULL;
 258
 259         count = nr_lru_pages / (priority + 1);
 260         nr_dirty = priority;
 261
 262         /* we need pagemap_lru_lock for list_del() ... subtle code below */
 263         spin_lock(&pagemap_lru_lock);
 264         while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
 265                 page = list_entry(page_lru, struct page, lru);
 266                 list_del(page_lru);
 267
 268                 if (PageTestandClearReferenced(page))
 269                         goto dispose_continue;
 270
 271                 count--;
 272                 /*
 273                  * Avoid unscalable SMP locking for pages we can
 274                  * immediate tell are untouchable..
 275                  */
 276                 if (!page->buffers && page_count(page) > 1)
 277                         goto dispose_continue;
 278
 279                 if (TryLockPage(page))
 280                         goto dispose_continue;
 281
 282                 /* Release the pagemap_lru lock even if the page is not yet
 283                    queued in any lru queue since we have just locked down
 284                    the page so nobody else may SMP race with us running
 285                    a lru_cache_del() (lru_cache_del() always run with the
 286                    page locked down ;). */
 287                 spin_unlock(&pagemap_lru_lock);
 288
 289                 /* avoid freeing the page while it's locked */
 290                 page_cache_get(page);
 291
 292                 /*
 293                  * Is it a buffer page? Try to clean it up regardless
 294                  * of zone - it's old.
 295                  */
 296                 if (page->buffers) {
 297                         int wait = ((gfp_mask & __GFP_IO) && (nr_dirty-- < 0));
 298                         if (!try_to_free_buffers(page, wait))
 299                                 goto unlock_continue;
 300                         /* page was locked, inode can't go away under us */
 301                         if (!page->mapping) {
 302                                 atomic_dec(&buffermem_pages);
 303                                 goto made_buffer_progress;
 304                         }
 305                 }
 306
 307                 /* Take the pagecache_lock spinlock held to avoid
 308                    other tasks to notice the page while we are looking at its
 309                    page count. If it's a pagecache-page we'll free it
 310                    in one atomic transaction after checking its page count. */
 311                 spin_lock(&pagecache_lock);
 312
 313                 /*
 314                  * We can't free pages unless there's just one user
 315                  * (count == 2 because we added one ourselves above).
 316                  */
 317                 if (page_count(page) != 2)
 318                         goto cache_unlock_continue;
 319
 320                 /*
 321                  * Is it a page swap page? If so, we want to
 322                  * drop it if it is no longer used, even if it
 323                  * were to be marked referenced..
 324                  */
 325                 if (PageSwapCache(page)) {
 326                         spin_unlock(&pagecache_lock);
 327                         __delete_from_swap_cache(page);
 328                         goto made_inode_progress;
 329                 }
 330
 331                 /*
 332                  * Page is from a zone we don't care about.
 333                  * Don't drop page cache entries in vain.
 334                  */
 335                 if (page->zone->free_pages > page->zone->pages_high)
 336                         goto cache_unlock_continue;
 337
 338                 /* is it a page-cache page? */
 339                 if (page->mapping) {
 340                         if (!PageDirty(page) && !pgcache_under_min()) {
 341                                 __remove_inode_page(page);
 342                                 spin_unlock(&pagecache_lock);
 343                                 goto made_inode_progress;
 344                         }
 345                         goto cache_unlock_continue;
 346                 }
 347
 348                 printk(KERN_ERR "shrink_mmap: unknown LRU page!\n");
 349
 350 cache_unlock_continue:
 351                 spin_unlock(&pagecache_lock);
 352 unlock_continue:
 353                 spin_lock(&pagemap_lru_lock);
 354                 UnlockPage(page);
 355                 page_cache_release(page);
 356 dispose_continue:
 357                 list_add(page_lru, &lru_cache);
 358         }
 359         goto out;
 360
 361 made_inode_progress:
 362         page_cache_release(page);
 363 made_buffer_progress:
 364         UnlockPage(page);
 365         page_cache_release(page);
 366         ret = 1;
 367         spin_lock(&pagemap_lru_lock);
 368         /* nr_lru_pages needs the spinlock */
 369         nr_lru_pages--;
 370
 371 out:
 372         spin_unlock(&pagemap_lru_lock);
 373
 374         return ret;
 375 }
 376
 377 static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
 378 {
 379         goto inside;
 380
 381         for (;;) {
 382                 page = page->next_hash;
 383 inside:
 384                 if (!page)
 385                         goto not_found;
 386                 if (page->mapping != mapping)
 387                         continue;
 388                 if (page->index == offset)
 389                         break;
 390         }
 391         SetPageReferenced(page);
 392 not_found:
 393         return page;
 394 }
 395
 396 /*
 397  * By the time this is called, the page is locked and
 398  * we don't have to worry about any races any more.
 399  *
 400  * Start the IO..
 401  */
 402 static int writeout_one_page(struct page *page)
 403 {
 404         struct buffer_head *bh, *head = page->buffers;
 405
 406         bh = head;
 407         do {
 408                 if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
 409                         continue;
 410
 411                 bh->b_flushtime = 0;
 412                 ll_rw_block(WRITE, 1, &bh);
 413         } while ((bh = bh->b_this_page) != head);
 414         return 0;
 415 }
 416
 417 static int waitfor_one_page(struct page *page)
 418 {
 419         int error = 0;
 420         struct buffer_head *bh, *head = page->buffers;
 421
 422         bh = head;
 423         do {
 424                 wait_on_buffer(bh);
 425                 if (buffer_req(bh) && !buffer_uptodate(bh))
 426                         error = -EIO;
 427         } while ((bh = bh->b_this_page) != head);
 428         return error;
 429 }
 430
 431 static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end, int (*fn)(struct page *))
 432 {
 433         struct list_head *head, *curr;
 434         struct page *page;
 435         int retval = 0;
 436
 437         head = &inode->i_mapping->pages;
 438
 439         spin_lock(&pagecache_lock);
 440         curr = head->next;
 441         while (curr != head) {
 442                 page = list_entry(curr, struct page, list);
 443                 curr = curr->next;
 444                 if (!page->buffers)
 445                         continue;
 446                 if (page->index >= end)
 447                         continue;
 448                 if (page->index < start)
 449                         continue;
 450
 451                 page_cache_get(page);
 452                 spin_unlock(&pagecache_lock);
 453                 lock_page(page);
 454
 455                 /* The buffers could have been free'd while we waited for the page lock */
 456                 if (page->buffers)
 457                         retval |= fn(page);
 458
 459                 UnlockPage(page);
 460                 spin_lock(&pagecache_lock);
 461                 curr = page->list.next;
 462                 page_cache_release(page);
 463         }
 464         spin_unlock(&pagecache_lock);
 465
 466         return retval;
 467 }
 468
 469 /*
 470  * Two-stage data sync: first start the IO, then go back and
 471  * collect the information..
 472  */
 473 int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
 474 {
 475         int retval;
 476
 477         retval = do_buffer_fdatasync(inode, start_idx, end_idx, writeout_one_page);
 478         retval |= do_buffer_fdatasync(inode, start_idx, end_idx, waitfor_one_page);
 479         return retval;
 480 }
 481
 482 /*
 483  * Add a page to the inode page cache.
 484  *
 485  * The caller must have locked the page and
 486  * set all the page flags correctly..
 487  */
 488 void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
 489 {
 490         if (!PageLocked(page))
 491                 BUG();
 492
 493         page_cache_get(page);
 494         spin_lock(&pagecache_lock);
 495         page->index = index;
 496         add_page_to_inode_queue(mapping, page);
 497         __add_page_to_hash_queue(page, page_hash(mapping, index));
 498         lru_cache_add(page);
 499         spin_unlock(&pagecache_lock);
 500 }
 501
 502 /*
 503  * This adds a page to the page cache, starting out as locked,
 504  * owned by us, but unreferenced, not uptodate and with no errors.
 505  */
 506 static inline void __add_to_page_cache(struct page * page,
 507         struct address_space *mapping, unsigned long offset,
 508         struct page **hash)
 509 {
 510         struct page *alias;
 511         unsigned long flags;
 512
 513         if (PageLocked(page))
 514                 BUG();
 515
 516         flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced));
 517         page->flags = flags | (1 << PG_locked);
 518         page_cache_get(page);
 519         page->index = offset;
 520         add_page_to_inode_queue(mapping, page);
 521         __add_page_to_hash_queue(page, hash);
 522         lru_cache_add(page);
 523         alias = __find_page_nolock(mapping, offset, *hash);
 524         if (alias != page)
 525                 BUG();
 526 }
 527
 528 void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
 529 {
 530         spin_lock(&pagecache_lock);
 531         __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
 532         spin_unlock(&pagecache_lock);
 533 }
 534
 535 static int add_to_page_cache_unique(struct page * page,
 536         struct address_space *mapping, unsigned long offset,
 537         struct page **hash)
 538 {
 539         int err;
 540         struct page *alias;
 541
 542         spin_lock(&pagecache_lock);
 543         alias = __find_page_nolock(mapping, offset, *hash);
 544
 545         err = 1;
 546         if (!alias) {
 547                 __add_to_page_cache(page,mapping,offset,hash);
 548                 err = 0;
 549         }
 550
 551         spin_unlock(&pagecache_lock);
 552         return err;
 553 }
 554
 555 /*
 556  * This adds the requested page to the page cache if it isn't already there,
 557  * and schedules an I/O to read in its contents from disk.
 558  */
 559 static inline int page_cache_read(struct file * file, unsigned long offset)
 560 {
 561         struct inode *inode = file->f_dentry->d_inode;
 562         struct address_space *mapping = inode->i_mapping;
 563         struct page **hash = page_hash(mapping, offset);
 564         struct page *page;
 565
 566         spin_lock(&pagecache_lock);
 567         page = __find_page_nolock(mapping, offset, *hash);
 568         spin_unlock(&pagecache_lock);
 569         if (page)
 570                 return 0;
 571
 572         page = page_cache_alloc();
 573         if (!page)
 574                 return -ENOMEM;
 575
 576         if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
 577                 int error = mapping->a_ops->readpage(file, page);
 578                 page_cache_release(page);
 579                 return error;
 580         }
 581         /*
 582          * We arrive here in the unlikely event that someone
 583          * raced with us and added our page to the cache first.
 584          */
 585         page_cache_free(page);
 586         return 0;
 587 }
 588
 589 /*
 590  * Read in an entire cluster at once.  A cluster is usually a 64k-
 591  * aligned block that includes the page requested in "offset."
 592  */
 593 static int read_cluster_nonblocking(struct file * file, unsigned long offset,
 594         unsigned long filesize)
 595 {
 596         unsigned long pages = CLUSTER_PAGES;
 597
 598         offset = CLUSTER_OFFSET(offset);
 599         while ((pages-- > 0) && (offset < filesize)) {
 600                 int error = page_cache_read(file, offset);
 601                 if (error < 0)
 602                         return error;
 603                 offset ++;
 604         }
 605
 606         return 0;
 607 }
 608
 609 /*
 610  * Wait for a page to get unlocked.
 611  *
 612  * This must be called with the caller "holding" the page,
 613  * ie with increased "page->count" so that the page won't
 614  * go away during the wait..
 615  */
 616 void ___wait_on_page(struct page *page)
 617 {
 618         struct task_struct *tsk = current;
 619         DECLARE_WAITQUEUE(wait, tsk);
 620
 621         add_wait_queue(&page->wait, &wait);
 622         do {
 623                 sync_page(page);
 624                 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 625                 if (!PageLocked(page))
 626                         break;
 627                 schedule();
 628         } while (PageLocked(page));
 629         tsk->state = TASK_RUNNING;
 630         remove_wait_queue(&page->wait, &wait);
 631 }
 632
 633 /*
 634  * Get an exclusive lock on the page..
 635  */
 636 void lock_page(struct page *page)
 637 {
 638         while (TryLockPage(page))
 639                 ___wait_on_page(page);
 640 }
 641
 642
 643 /*
 644  * a rather lightweight function, finding and getting a reference to a
 645  * hashed page atomically, waiting for it if it's locked.
 646  */
 647 struct page * __find_get_page (struct address_space *mapping,
 648                                 unsigned long offset, struct page **hash)
 649 {
 650         struct page *page;
 651
 652         /*
 653          * We scan the hash list read-only. Addition to and removal from
 654          * the hash-list needs a held write-lock.
 655          */
 656 repeat:
 657         spin_lock(&pagecache_lock);
 658         page = __find_page_nolock(mapping, offset, *hash);
 659         if (page)
 660                 page_cache_get(page);
 661         spin_unlock(&pagecache_lock);
 662
 663         /* Found the page, sleep if locked. */
 664         if (page && PageLocked(page)) {
 665                 struct task_struct *tsk = current;
 666                 DECLARE_WAITQUEUE(wait, tsk);
 667
 668                 sync_page(page);
 669
 670                 __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 671                 add_wait_queue(&page->wait, &wait);
 672
 673                 if (PageLocked(page))
 674                         schedule();
 675                 __set_task_state(tsk, TASK_RUNNING);
 676                 remove_wait_queue(&page->wait, &wait);
 677
 678                 /*
 679                  * The page might have been unhashed meanwhile. It's
 680                  * not freed though because we hold a reference to it.
 681                  * If this is the case then it will be freed _here_,
 682                  * and we recheck the hash anyway.
 683                  */
 684                 page_cache_release(page);
 685                 goto repeat;
 686         }
 687         /*
 688          * It's not locked so we can return the page and we hold
 689          * a reference to it.
 690          */
 691         return page;
 692 }
 693
 694 /*
 695  * Get the lock to a page atomically.
 696  */
 697 struct page * __find_lock_page (struct address_space *mapping,
 698                                 unsigned long offset, struct page **hash)
 699 {
 700         struct page *page;
 701
 702         /*
 703          * We scan the hash list read-only. Addition to and removal from
 704          * the hash-list needs a held write-lock.
 705          */
 706 repeat:
 707         spin_lock(&pagecache_lock);
 708         page = __find_page_nolock(mapping, offset, *hash);
 709         if (page)
 710                 page_cache_get(page);
 711         spin_unlock(&pagecache_lock);
 712
 713         /* Found the page, sleep if locked. */
 714         if (page && TryLockPage(page)) {
 715                 struct task_struct *tsk = current;
 716                 DECLARE_WAITQUEUE(wait, tsk);
 717
 718                 sync_page(page);
 719
 720                 __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 721                 add_wait_queue(&page->wait, &wait);
 722
 723                 if (PageLocked(page))
 724                         schedule();
 725                 __set_task_state(tsk, TASK_RUNNING);
 726                 remove_wait_queue(&page->wait, &wait);
 727
 728                 /*
 729                  * The page might have been unhashed meanwhile. It's
 730                  * not freed though because we hold a reference to it.
 731                  * If this is the case then it will be freed _here_,
 732                  * and we recheck the hash anyway.
 733                  */
 734                 page_cache_release(page);
 735                 goto repeat;
 736         }
 737         /*
 738          * It's not locked so we can return the page and we hold
 739          * a reference to it.
 740          */
 741         return page;
 742 }
 743
 744 #if 0
 745 #define PROFILE_READAHEAD
 746 #define DEBUG_READAHEAD
 747 #endif
 748
 749 /*
 750  * Read-ahead profiling information
 751  * --------------------------------
 752  * Every PROFILE_MAXREADCOUNT, the following information is written
 753  * to the syslog:
 754  *   Percentage of asynchronous read-ahead.
 755  *   Average of read-ahead fields context value.
 756  * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
 757  * to the syslog.
 758  */
 759
 760 #ifdef PROFILE_READAHEAD
 761
 762 #define PROFILE_MAXREADCOUNT 1000
 763
 764 static unsigned long total_reada;
 765 static unsigned long total_async;
 766 static unsigned long total_ramax;
 767 static unsigned long total_ralen;
 768 static unsigned long total_rawin;
 769
 770 static void profile_readahead(int async, struct file *filp)
 771 {
 772         unsigned long flags;
 773
 774         ++total_reada;
 775         if (async)
 776                 ++total_async;
 777
 778         total_ramax     += filp->f_ramax;
 779         total_ralen     += filp->f_ralen;
 780         total_rawin     += filp->f_rawin;
 781
 782         if (total_reada > PROFILE_MAXREADCOUNT) {
 783                 save_flags(flags);
 784                 cli();
 785                 if (!(total_reada > PROFILE_MAXREADCOUNT)) {
 786                         restore_flags(flags);
 787                         return;
 788                 }
 789
 790                 printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
 791                         total_ramax/total_reada,
 792                         total_ralen/total_reada,
 793                         total_rawin/total_reada,
 794                         (total_async*100)/total_reada);
 795 #ifdef DEBUG_READAHEAD
 796                 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
 797                         filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
 798 #endif
 799
 800                 total_reada     = 0;
 801                 total_async     = 0;
 802                 total_ramax     = 0;
 803                 total_ralen     = 0;
 804                 total_rawin     = 0;
 805
 806                 restore_flags(flags);
 807         }
 808 }
 809 #endif  /* defined PROFILE_READAHEAD */
 810
 811 /*
 812  * Read-ahead context:
 813  * -------------------
 814  * The read ahead context fields of the "struct file" are the following:
 815  * - f_raend : position of the first byte after the last page we tried to
 816  *             read ahead.
 817  * - f_ramax : current read-ahead maximum size.
 818  * - f_ralen : length of the current IO read block we tried to read-ahead.
 819  * - f_rawin : length of the current read-ahead window.
 820  *              if last read-ahead was synchronous then
 821  *                      f_rawin = f_ralen
 822  *              otherwise (was asynchronous)
 823  *                      f_rawin = previous value of f_ralen + f_ralen
 824  *
 825  * Read-ahead limits:
 826  * ------------------
 827  * MIN_READAHEAD   : minimum read-ahead size when read-ahead.
 828  * MAX_READAHEAD   : maximum read-ahead size when read-ahead.
 829  *
 830  * Synchronous read-ahead benefits:
 831  * --------------------------------
 832  * Using reasonable IO xfer length from peripheral devices increase system
 833  * performances.
 834  * Reasonable means, in this context, not too large but not too small.
 835  * The actual maximum value is:
 836  *      MAX_READAHEAD + PAGE_CACHE_SIZE = 76k is CONFIG_READA_SMALL is undefined
 837  *      and 32K if defined (4K page size assumed).
 838  *
 839  * Asynchronous read-ahead benefits:
 840  * ---------------------------------
 841  * Overlapping next read request and user process execution increase system
 842  * performance.
 843  *
 844  * Read-ahead risks:
 845  * -----------------
 846  * We have to guess which further data are needed by the user process.
 847  * If these data are often not really needed, it's bad for system
 848  * performances.
 849  * However, we know that files are often accessed sequentially by
 850  * application programs and it seems that it is possible to have some good
 851  * strategy in that guessing.
 852  * We only try to read-ahead files that seems to be read sequentially.
 853  *
 854  * Asynchronous read-ahead risks:
 855  * ------------------------------
 856  * In order to maximize overlapping, we must start some asynchronous read
 857  * request from the device, as soon as possible.
 858  * We must be very careful about:
 859  * - The number of effective pending IO read requests.
 860  *   ONE seems to be the only reasonable value.
 861  * - The total memory pool usage for the file access stream.
 862  *   This maximum memory usage is implicitly 2 IO read chunks:
 863  *   2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
 864  *   64k if defined (4K page size assumed).
 865  */
 866
 867 static inline int get_max_readahead(struct inode * inode)
 868 {
 869         if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
 870                 return MAX_READAHEAD;
 871         return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
 872 }
 873
 874 static void generic_file_readahead(int reada_ok,
 875         struct file * filp, struct inode * inode,
 876         struct page * page)
 877 {
 878         unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
 879         unsigned long index = page->index;
 880         unsigned long max_ahead, ahead;
 881         unsigned long raend;
 882         int max_readahead = get_max_readahead(inode);
 883
 884         raend = filp->f_raend;
 885         max_ahead = 0;
 886
 887 /*
 888  * The current page is locked.
 889  * If the current position is inside the previous read IO request, do not
 890  * try to reread previously read ahead pages.
 891  * Otherwise decide or not to read ahead some pages synchronously.
 892  * If we are not going to read ahead, set the read ahead context for this
 893  * page only.
 894  */
 895         if (PageLocked(page)) {
 896                 if (!filp->f_ralen || index >= raend || index + filp->f_ralen < raend) {
 897                         raend = index;
 898                         if (raend < end_index)
 899                                 max_ahead = filp->f_ramax;
 900                         filp->f_rawin = 0;
 901                         filp->f_ralen = 1;
 902                         if (!max_ahead) {
 903                                 filp->f_raend  = index + filp->f_ralen;
 904                                 filp->f_rawin += filp->f_ralen;
 905                         }
 906                 }
 907         }
 908 /*
 909  * The current page is not locked.
 910  * If we were reading ahead and,
 911  * if the current max read ahead size is not zero and,
 912  * if the current position is inside the last read-ahead IO request,
 913  *   it is the moment to try to read ahead asynchronously.
 914  * We will later force unplug device in order to force asynchronous read IO.
 915  */
 916         else if (reada_ok && filp->f_ramax && raend >= 1 &&
 917                  index <= raend && index + filp->f_ralen >= raend) {
 918 /*
 919  * Add ONE page to max_ahead in order to try to have about the same IO max size
 920  * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
 921  * Compute the position of the last page we have tried to read in order to
 922  * begin to read ahead just at the next page.
 923  */
 924                 raend -= 1;
 925                 if (raend < end_index)
 926                         max_ahead = filp->f_ramax + 1;
 927
 928                 if (max_ahead) {
 929                         filp->f_rawin = filp->f_ralen;
 930                         filp->f_ralen = 0;
 931                         reada_ok      = 2;
 932                 }
 933         }
 934 /*
 935  * Try to read ahead pages.
 936  * We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the
 937  * scheduler, will work enough for us to avoid too bad actuals IO requests.
 938  */
 939         ahead = 0;
 940         while (ahead < max_ahead) {
 941                 ahead ++;
 942                 if ((raend + ahead) >= end_index)
 943                         break;
 944                 if (page_cache_read(filp, raend + ahead) < 0)
 945                         break;
 946         }
 947 /*
 948  * If we tried to read ahead some pages,
 949  * If we tried to read ahead asynchronously,
 950  *   Try to force unplug of the device in order to start an asynchronous
 951  *   read IO request.
 952  * Update the read-ahead context.
 953  * Store the length of the current read-ahead window.
 954  * Double the current max read ahead size.
 955  *   That heuristic avoid to do some large IO for files that are not really
 956  *   accessed sequentially.
 957  */
 958         if (ahead) {
 959                 if (reada_ok == 2) {
 960                         run_task_queue(&tq_disk);
 961                 }
 962
 963                 filp->f_ralen += ahead;
 964                 filp->f_rawin += filp->f_ralen;
 965                 filp->f_raend = raend + ahead + 1;
 966
 967                 filp->f_ramax += filp->f_ramax;
 968
 969                 if (filp->f_ramax > max_readahead)
 970                         filp->f_ramax = max_readahead;
 971
 972 #ifdef PROFILE_READAHEAD
 973                 profile_readahead((reada_ok == 2), filp);
 974 #endif
 975         }
 976
 977         return;
 978 }
 979
 980
 981 /*
 982  * This is a generic file read routine, and uses the
 983  * inode->i_op->readpage() function for the actual low-level
 984  * stuff.
 985  *
 986  * This is really ugly. But the goto's actually try to clarify some
 987  * of the logic when it comes to error handling etc.
 988  */
 989 void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
 990 {
 991         struct inode *inode = filp->f_dentry->d_inode;
 992         struct address_space *mapping = inode->i_mapping;
 993         unsigned long index, offset;
 994         struct page *cached_page;
 995         int reada_ok;
 996         int error;
 997         int max_readahead = get_max_readahead(inode);
 998
 999         cached_page = NULL;
1000         index = *ppos >> PAGE_CACHE_SHIFT;
1001         offset = *ppos & ~PAGE_CACHE_MASK;
1002
1003 /*
1004  * If the current position is outside the previous read-ahead window,
1005  * we reset the current read-ahead context and set read ahead max to zero
1006  * (will be set to just needed value later),
1007  * otherwise, we assume that the file accesses are sequential enough to
1008  * continue read-ahead.
1009  */
1010         if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {
1011                 reada_ok = 0;
1012                 filp->f_raend = 0;
1013                 filp->f_ralen = 0;
1014                 filp->f_ramax = 0;
1015                 filp->f_rawin = 0;
1016         } else {
1017                 reada_ok = 1;
1018         }
1019 /*
1020  * Adjust the current value of read-ahead max.
1021  * If the read operation stay in the first half page, force no readahead.
1022  * Otherwise try to increase read ahead max just enough to do the read request.
1023  * Then, at least MIN_READAHEAD if read ahead is ok,
1024  * and at most MAX_READAHEAD in all cases.
1025  */
1026         if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
1027                 filp->f_ramax = 0;
1028         } else {
1029                 unsigned long needed;
1030
1031                 needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;
1032
1033                 if (filp->f_ramax < needed)
1034                         filp->f_ramax = needed;
1035
1036                 if (reada_ok && filp->f_ramax < MIN_READAHEAD)
1037                                 filp->f_ramax = MIN_READAHEAD;
1038                 if (filp->f_ramax > max_readahead)
1039                         filp->f_ramax = max_readahead;
1040         }
1041
1042         for (;;) {
1043                 struct page *page, **hash;
1044                 unsigned long end_index, nr;
1045
1046                 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1047                 if (index > end_index)
1048                         break;
1049                 nr = PAGE_CACHE_SIZE;
1050                 if (index == end_index) {
1051                         nr = inode->i_size & ~PAGE_CACHE_MASK;
1052                         if (nr <= offset)
1053                                 break;
1054                 }
1055
1056                 nr = nr - offset;
1057
1058                 /*
1059                  * Try to find the data in the page cache..
1060                  */
1061                 hash = page_hash(mapping, index);
1062
1063                 spin_lock(&pagecache_lock);
1064                 page = __find_page_nolock(mapping, index, *hash);
1065                 if (!page)
1066                         goto no_cached_page;
1067 found_page:
1068                 page_cache_get(page);
1069                 spin_unlock(&pagecache_lock);
1070
1071                 if (!Page_Uptodate(page))
1072                         goto page_not_up_to_date;
1073 page_ok:
1074                 /*
1075                  * Ok, we have the page, and it's up-to-date, so
1076                  * now we can copy it to user space...
1077                  *
1078                  * The actor routine returns how many bytes were actually used..
1079                  * NOTE! This may not be the same as how much of a user buffer
1080                  * we filled up (we may be padding etc), so we can only update
1081                  * "pos" here (the actor routine has to update the user buffer
1082                  * pointers and the remaining count).
1083                  */
1084                 nr = actor(desc, page, offset, nr);
1085                 offset += nr;
1086                 index += offset >> PAGE_CACHE_SHIFT;
1087                 offset &= ~PAGE_CACHE_MASK;
1088
1089                 page_cache_release(page);
1090                 if (nr && desc->count)
1091                         continue;
1092                 break;
1093
1094 /*
1095  * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
1096  */
1097 page_not_up_to_date:
1098                 generic_file_readahead(reada_ok, filp, inode, page);
1099
1100                 if (Page_Uptodate(page))
1101                         goto page_ok;
1102
1103                 /* Get exclusive access to the page ... */
1104                 lock_page(page);
1105                 if (Page_Uptodate(page)) {
1106                         UnlockPage(page);
1107                         goto page_ok;
1108                 }
1109
1110 readpage:
1111                 /* ... and start the actual read. The read will unlock the page. */
1112                 error = mapping->a_ops->readpage(filp, page);
1113
1114                 if (!error) {
1115                         if (Page_Uptodate(page))
1116                                 goto page_ok;
1117
1118                         /* Again, try some read-ahead while waiting for the page to finish.. */
1119                         generic_file_readahead(reada_ok, filp, inode, page);
1120                         wait_on_page(page);
1121                         if (Page_Uptodate(page))
1122                                 goto page_ok;
1123                         error = -EIO;
1124                 }
1125
1126                 /* UHHUH! A synchronous read error occurred. Report it */
1127                 desc->error = error;
1128                 page_cache_release(page);
1129                 break;
1130
1131 no_cached_page:
1132                 /*
1133                  * Ok, it wasn't cached, so we need to create a new
1134                  * page..
1135                  *
1136                  * We get here with the page cache lock held.
1137                  */
1138                 if (!cached_page) {
1139                         spin_unlock(&pagecache_lock);
1140                         cached_page = page_cache_alloc();
1141                         if (!cached_page) {
1142                                 desc->error = -ENOMEM;
1143                                 break;
1144                         }
1145
1146                         /*
1147                          * Somebody may have added the page while we
1148                          * dropped the page cache lock. Check for that.
1149                          */
1150                         spin_lock(&pagecache_lock);
1151                         page = __find_page_nolock(mapping, index, *hash);
1152                         if (page)
1153                                 goto found_page;
1154                 }
1155
1156                 /*
1157                  * Ok, add the new page to the hash-queues...
1158                  */
1159                 page = cached_page;
1160                 __add_to_page_cache(page, mapping, index, hash);
1161                 spin_unlock(&pagecache_lock);
1162                 cached_page = NULL;
1163
1164                 goto readpage;
1165         }
1166
1167         *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1168         filp->f_reada = 1;
1169         if (cached_page)
1170                 page_cache_free(cached_page);
1171         UPDATE_ATIME(inode);
1172 }
1173
1174 static int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1175 {
1176         unsigned long kaddr;
1177         unsigned long left, count = desc->count;
1178
1179         if (size > count)
1180                 size = count;
1181
1182         kaddr = kmap(page);
1183         left = __copy_to_user(desc->buf, (void *)(kaddr + offset), size);
1184         kunmap(page);
1185
1186         if (left) {
1187                 size -= left;
1188                 desc->error = -EFAULT;
1189         }
1190         desc->count = count - size;
1191         desc->written += size;
1192         desc->buf += size;
1193         return size;
1194 }
1195
1196 /*
1197  * This is the "read()" routine for all filesystems
1198  * that can use the page cache directly.
1199  */
1200 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1201 {
1202         ssize_t retval;
1203
1204         retval = -EFAULT;
1205         if (access_ok(VERIFY_WRITE, buf, count)) {
1206                 retval = 0;
1207
1208                 if (count) {
1209                         read_descriptor_t desc;
1210
1211                         desc.written = 0;
1212                         desc.count = count;
1213                         desc.buf = buf;
1214                         desc.error = 0;
1215                         do_generic_file_read(filp, ppos, &desc, file_read_actor);
1216
1217                         retval = desc.written;
1218                         if (!retval)
1219                                 retval = desc.error;
1220                 }
1221         }
1222         return retval;
1223 }
1224
1225 static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
1226 {
1227         unsigned long kaddr;
1228         ssize_t written;
1229         unsigned long count = desc->count;
1230         struct file *file = (struct file *) desc->buf;
1231         mm_segment_t old_fs;
1232
1233         if (size > count)
1234                 size = count;
1235         old_fs = get_fs();
1236         set_fs(KERNEL_DS);
1237
1238         kaddr = kmap(page);
1239         written = file->f_op->write(file, (char *)kaddr + offset,
1240                                                  size, &file->f_pos);
1241         kunmap(page);
1242         set_fs(old_fs);
1243         if (written < 0) {
1244                 desc->error = written;
1245                 written = 0;
1246         }
1247         desc->count = count - written;
1248         desc->written += written;
1249         return written;
1250 }
1251
1252 asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
1253 {
1254         ssize_t retval;
1255         struct file * in_file, * out_file;
1256         struct inode * in_inode, * out_inode;
1257
1258         /*
1259          * Get input file, and verify that it is ok..
1260          */
1261         retval = -EBADF;
1262         in_file = fget(in_fd);
1263         if (!in_file)
1264                 goto out;
1265         if (!(in_file->f_mode & FMODE_READ))
1266                 goto fput_in;
1267         retval = -EINVAL;
1268         in_inode = in_file->f_dentry->d_inode;
1269         if (!in_inode)
1270                 goto fput_in;
1271         if (!in_inode->i_mapping->a_ops->readpage)
1272                 goto fput_in;
1273         retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
1274         if (retval)
1275                 goto fput_in;
1276
1277         /*
1278          * Get output file, and verify that it is ok..
1279          */
1280         retval = -EBADF;
1281         out_file = fget(out_fd);
1282         if (!out_file)
1283                 goto fput_in;
1284         if (!(out_file->f_mode & FMODE_WRITE))
1285                 goto fput_out;
1286         retval = -EINVAL;
1287         if (!out_file->f_op || !out_file->f_op->write)
1288                 goto fput_out;
1289         out_inode = out_file->f_dentry->d_inode;
1290         if (!out_inode)
1291                 goto fput_out;
1292         retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
1293         if (retval)
1294                 goto fput_out;
1295
1296         retval = 0;
1297         if (count) {
1298                 read_descriptor_t desc;
1299                 loff_t pos = 0, *ppos;
1300
1301                 retval = -EFAULT;
1302                 ppos = &in_file->f_pos;
1303                 if (offset) {
1304                         if (get_user(pos, offset))
1305                                 goto fput_out;
1306                         ppos = &pos;
1307                 }
1308
1309                 desc.written = 0;
1310                 desc.count = count;
1311                 desc.buf = (char *) out_file;
1312                 desc.error = 0;
1313                 do_generic_file_read(in_file, ppos, &desc, file_send_actor);
1314
1315                 retval = desc.written;
1316                 if (!retval)
1317                         retval = desc.error;
1318                 if (offset)
1319                         put_user(pos, offset);
1320         }
1321
1322 fput_out:
1323         fput(out_file);
1324 fput_in:
1325         fput(in_file);
1326 out:
1327         return retval;
1328 }
1329
1330 /*
1331  * Read-ahead and flush behind for MADV_SEQUENTIAL areas.  Since we are
1332  * sure this is sequential access, we don't need a flexible read-ahead
1333  * window size -- we can always use a large fixed size window.
1334  */
1335 static void nopage_sequential_readahead(struct vm_area_struct * vma,
1336         unsigned long pgoff, unsigned long filesize)
1337 {
1338         unsigned long ra_window;
1339
1340         ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
1341         ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);
1342
1343         /* vm_raend is zero if we haven't read ahead in this area yet.  */
1344         if (vma->vm_raend == 0)
1345                 vma->vm_raend = vma->vm_pgoff + ra_window;
1346
1347         /*
1348          * If we've just faulted the page half-way through our window,
1349          * then schedule reads for the next window, and release the
1350          * pages in the previous window.
1351          */
1352         if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
1353                 unsigned long start = vma->vm_pgoff + vma->vm_raend;
1354                 unsigned long end = start + ra_window;
1355
1356                 if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
1357                         end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;
1358                 if (start > end)
1359                         return;
1360
1361                 while ((start < end) && (start < filesize)) {
1362                         if (read_cluster_nonblocking(vma->vm_file,
1363                                                         start, filesize) < 0)
1364                                 break;
1365                         start += CLUSTER_PAGES;
1366                 }
1367                 run_task_queue(&tq_disk);
1368
1369                 /* if we're far enough past the beginning of this area,
1370                    recycle pages that are in the previous window. */
1371                 if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
1372                         unsigned long window = ra_window << PAGE_SHIFT;
1373
1374                         end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
1375                         end -= window + window;
1376                         filemap_sync(vma, end - window, window, MS_INVALIDATE);
1377                 }
1378
1379                 vma->vm_raend += ra_window;
1380         }
1381
1382         return;
1383 }
1384
1385 /*
1386  * filemap_nopage() is invoked via the vma operations vector for a
1387  * mapped memory region to read in file data during a page fault.
1388  *
1389  * The goto's are kind of ugly, but this streamlines the normal case of having
1390  * it in the page cache, and handles the special cases reasonably without
1391  * having a lot of duplicated code.
1392  */
1393 struct page * filemap_nopage(struct vm_area_struct * area,
1394         unsigned long address, int no_share)
1395 {
1396         int error;
1397         struct file *file = area->vm_file;
1398         struct inode *inode = file->f_dentry->d_inode;
1399         struct address_space *mapping = inode->i_mapping;
1400         struct page *page, **hash, *old_page;
1401         unsigned long size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1402
1403         unsigned long pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1404
1405         /*
1406          * Semantics for shared and private memory areas are different
1407          * past the end of the file. A shared mapping past the last page
1408          * of the file is an error and results in a SIGBUS, while a
1409          * private mapping just maps in a zero page.
1410          */
1411         if ((pgoff >= size) && (area->vm_mm == current->mm))
1412                 return NULL;
1413
1414         /*
1415          * Do we have something in the page cache already?
1416          */
1417         hash = page_hash(mapping, pgoff);
1418 retry_find:
1419         page = __find_get_page(mapping, pgoff, hash);
1420         if (!page)
1421                 goto no_cached_page;
1422
1423         /*
1424          * Ok, found a page in the page cache, now we need to check
1425          * that it's up-to-date.
1426          */
1427         if (!Page_Uptodate(page))
1428                 goto page_not_uptodate;
1429
1430 success:
1431         /*
1432          * Try read-ahead for sequential areas.
1433          */
1434         if (VM_SequentialReadHint(area))
1435                 nopage_sequential_readahead(area, pgoff, size);
1436
1437         /*
1438          * Found the page and have a reference on it, need to check sharing
1439          * and possibly copy it over to another page..
1440          */
1441         old_page = page;
1442         if (no_share) {
1443                 struct page *new_page = page_cache_alloc();
1444
1445                 if (new_page) {
1446                         copy_user_highpage(new_page, old_page, address);
1447                         flush_page_to_ram(new_page);
1448                 } else
1449                         new_page = NOPAGE_OOM;
1450                 page_cache_release(page);
1451                 return new_page;
1452         }
1453
1454         flush_page_to_ram(old_page);
1455         return old_page;
1456
1457 no_cached_page:
1458         /*
1459          * If the requested offset is within our file, try to read a whole
1460          * cluster of pages at once.
1461          *
1462          * Otherwise, we're off the end of a privately mapped file,
1463          * so we need to map a zero page.
1464          */
1465         if ((pgoff < size) && !VM_RandomReadHint(area))
1466                 error = read_cluster_nonblocking(file, pgoff, size);
1467         else
1468                 error = page_cache_read(file, pgoff);
1469
1470         /*
1471          * The page we want has now been added to the page cache.
1472          * In the unlikely event that someone removed it in the
1473          * meantime, we'll just come back here and read it again.
1474          */
1475         if (error >= 0)
1476                 goto retry_find;
1477
1478         /*
1479          * An error return from page_cache_read can result if the
1480          * system is low on memory, or a problem occurs while trying
1481          * to schedule I/O.
1482          */
1483         if (error == -ENOMEM)
1484                 return NOPAGE_OOM;
1485         return NULL;
1486
1487 page_not_uptodate:
1488         lock_page(page);
1489         if (Page_Uptodate(page)) {
1490                 UnlockPage(page);
1491                 goto success;
1492         }
1493
1494         if (!mapping->a_ops->readpage(file, page)) {
1495                 wait_on_page(page);
1496                 if (Page_Uptodate(page))
1497                         goto success;
1498         }
1499
1500         /*
1501          * Umm, take care of errors if the page isn't up-to-date.
1502          * Try to re-read it _once_. We do this synchronously,
1503          * because there really aren't any performance issues here
1504          * and we need to check for errors.
1505          */
1506         lock_page(page);
1507         if (Page_Uptodate(page)) {
1508                 UnlockPage(page);
1509                 goto success;
1510         }
1511         ClearPageError(page);
1512         if (!mapping->a_ops->readpage(file, page)) {
1513                 wait_on_page(page);
1514                 if (Page_Uptodate(page))
1515                         goto success;
1516         }
1517
1518         /*
1519          * Things didn't work out. Return zero to tell the
1520          * mm layer so, possibly freeing the page cache page first.
1521          */
1522         page_cache_release(page);
1523         return NULL;
1524 }
1525
1526 static int filemap_write_page(struct file *file,
1527                               struct page * page,
1528                               int wait)
1529 {
1530         /*
1531          * If a task terminates while we're swapping the page, the vma and
1532          * and file could be released: try_to_swap_out has done a get_file.
1533          * vma/file is guaranteed to exist in the unmap/sync cases because
1534          * mmap_sem is held.
1535          */
1536         return page->mapping->a_ops->writepage(file, page);
1537 }
1538
1539
1540 /*
1541  * The page cache takes care of races between somebody
1542  * trying to swap something out and swap something in
1543  * at the same time..
1544  */
1545 extern void wakeup_bdflush(int);
1546 int filemap_swapout(struct page * page, struct file * file)
1547 {
1548         int retval = filemap_write_page(file, page, 0);
1549         wakeup_bdflush(0);
1550         return retval;
1551 }
1552
1553 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
1554         unsigned long address, unsigned int flags)
1555 {
1556         unsigned long pgoff;
1557         pte_t pte = *ptep;
1558         struct page *page;
1559         int error;
1560
1561         if (!(flags & MS_INVALIDATE)) {
1562                 if (!pte_present(pte))
1563                         return 0;
1564                 if (!pte_dirty(pte))
1565                         return 0;
1566                 flush_page_to_ram(pte_page(pte));
1567                 flush_cache_page(vma, address);
1568                 set_pte(ptep, pte_mkclean(pte));
1569                 flush_tlb_page(vma, address);
1570                 page = pte_page(pte);
1571                 page_cache_get(page);
1572         } else {
1573                 if (pte_none(pte))
1574                         return 0;
1575                 flush_cache_page(vma, address);
1576                 pte_clear(ptep);
1577                 flush_tlb_page(vma, address);
1578                 if (!pte_present(pte)) {
1579                         swap_free(pte_to_swp_entry(pte));
1580                         return 0;
1581                 }
1582                 page = pte_page(pte);
1583                 if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
1584                         page_cache_free(page);
1585                         return 0;
1586                 }
1587         }
1588         pgoff = (address - vma->vm_start) >> PAGE_CACHE_SHIFT;
1589         pgoff += vma->vm_pgoff;
1590         if (page->index != pgoff) {
1591                 printk("weirdness: pgoff=%lu index=%lu address=%lu vm_start=%lu vm_pgoff=%lu\n",
1592                         pgoff, page->index, address, vma->vm_start, vma->vm_pgoff);
1593         }
1594         lock_page(page);
1595         error = filemap_write_page(vma->vm_file, page, 1);
1596         UnlockPage(page);
1597         page_cache_free(page);
1598         return error;
1599 }
1600
1601 static inline int filemap_sync_pte_range(pmd_t * pmd,
1602         unsigned long address, unsigned long size,
1603         struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
1604 {
1605         pte_t * pte;
1606         unsigned long end;
1607         int error;
1608
1609         if (pmd_none(*pmd))
1610                 return 0;
1611         if (pmd_bad(*pmd)) {
1612                 pmd_ERROR(*pmd);
1613                 pmd_clear(pmd);
1614                 return 0;
1615         }
1616         pte = pte_offset(pmd, address);
1617         offset += address & PMD_MASK;
1618         address &= ~PMD_MASK;
1619         end = address + size;
1620         if (end > PMD_SIZE)
1621                 end = PMD_SIZE;
1622         error = 0;
1623         do {
1624                 error |= filemap_sync_pte(pte, vma, address + offset, flags);
1625                 address += PAGE_SIZE;
1626                 pte++;
1627         } while (address && (address < end));
1628         return error;
1629 }
1630
1631 static inline int filemap_sync_pmd_range(pgd_t * pgd,
1632         unsigned long address, unsigned long size,
1633         struct vm_area_struct *vma, unsigned int flags)
1634 {
1635         pmd_t * pmd;
1636         unsigned long offset, end;
1637         int error;
1638
1639         if (pgd_none(*pgd))
1640                 return 0;
1641         if (pgd_bad(*pgd)) {
1642                 pgd_ERROR(*pgd);
1643                 pgd_clear(pgd);
1644                 return 0;
1645         }
1646         pmd = pmd_offset(pgd, address);
1647         offset = address & PGDIR_MASK;
1648         address &= ~PGDIR_MASK;
1649         end = address + size;
1650         if (end > PGDIR_SIZE)
1651                 end = PGDIR_SIZE;
1652         error = 0;
1653         do {
1654                 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
1655                 address = (address + PMD_SIZE) & PMD_MASK;
1656                 pmd++;
1657         } while (address && (address < end));
1658         return error;
1659 }
1660
1661 int filemap_sync(struct vm_area_struct * vma, unsigned long address,
1662         size_t size, unsigned int flags)
1663 {
1664         pgd_t * dir;
1665         unsigned long end = address + size;
1666         int error = 0;
1667
1668         dir = pgd_offset(vma->vm_mm, address);
1669         flush_cache_range(vma->vm_mm, end - size, end);
1670         if (address >= end)
1671                 BUG();
1672         do {
1673                 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
1674                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
1675                 dir++;
1676         } while (address && (address < end));
1677         flush_tlb_range(vma->vm_mm, end - size, end);
1678         return error;
1679 }
1680
1681 /*
1682  * This handles (potentially partial) area unmaps..
1683  */
1684 static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
1685 {
1686         filemap_sync(vma, start, len, MS_ASYNC);
1687 }
1688
1689 /*
1690  * Shared mappings need to be able to do the right thing at
1691  * close/unmap/sync. They will also use the private file as
1692  * backing-store for swapping..
1693  */
1694 static struct vm_operations_struct file_shared_mmap = {
1695         unmap:          filemap_unmap,          /* unmap - we need to sync the pages */
1696         sync:           filemap_sync,
1697         nopage:         filemap_nopage,
1698         swapout:        filemap_swapout,
1699 };
1700
1701 /*
1702  * Private mappings just need to be able to load in the map.
1703  *
1704  * (This is actually used for shared mappings as well, if we
1705  * know they can't ever get write permissions..)
1706  */
1707 static struct vm_operations_struct file_private_mmap = {
1708         nopage:         filemap_nopage,
1709 };
1710
1711 /* This is used for a general mmap of a disk file */
1712
1713 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1714 {
1715         struct vm_operations_struct * ops;
1716         struct inode *inode = file->f_dentry->d_inode;
1717
1718         ops = &file_private_mmap;
1719         if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1720                 if (!inode->i_mapping->a_ops->writepage)
1721                         return -EINVAL;
1722                 ops = &file_shared_mmap;
1723         }
1724         if (!inode->i_sb || !S_ISREG(inode->i_mode))
1725                 return -EACCES;
1726         if (!inode->i_mapping->a_ops->readpage)
1727                 return -ENOEXEC;
1728         UPDATE_ATIME(inode);
1729         vma->vm_ops = ops;
1730         return 0;
1731 }
1732
1733 /*
1734  * The msync() system call.
1735  */
1736
1737 static int msync_interval(struct vm_area_struct * vma,
1738         unsigned long start, unsigned long end, int flags)
1739 {
1740         if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
1741                 int error;
1742                 error = vma->vm_ops->sync(vma, start, end-start, flags);
1743                 if (!error && (flags & MS_SYNC)) {
1744                         struct file * file = vma->vm_file;
1745                         if (file && file->f_op && file->f_op->fsync) {
1746                                 down(&file->f_dentry->d_inode->i_sem);
1747                                 lock_kernel();
1748                                 error = file->f_op->fsync(file, file->f_dentry, 1);
1749                                 unlock_kernel();
1750                                 up(&file->f_dentry->d_inode->i_sem);
1751                         }
1752                 }
1753                 return error;
1754         }
1755         return 0;
1756 }
1757
1758 asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
1759 {
1760         unsigned long end;
1761         struct vm_area_struct * vma;
1762         int unmapped_error, error = -EINVAL;
1763
1764         down(&current->mm->mmap_sem);
1765         if (start & ~PAGE_MASK)
1766                 goto out;
1767         len = (len + ~PAGE_MASK) & PAGE_MASK;
1768         end = start + len;
1769         if (end < start)
1770                 goto out;
1771         if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1772                 goto out;
1773         error = 0;
1774         if (end == start)
1775                 goto out;
1776         /*
1777          * If the interval [start,end) covers some unmapped address ranges,
1778          * just ignore them, but return -EFAULT at the end.
1779          */
1780         vma = find_vma(current->mm, start);
1781         unmapped_error = 0;
1782         for (;;) {
1783                 /* Still start < end. */
1784                 error = -EFAULT;
1785                 if (!vma)
1786                         goto out;
1787                 /* Here start < vma->vm_end. */
1788                 if (start < vma->vm_start) {
1789                         unmapped_error = -EFAULT;
1790                         start = vma->vm_start;
1791                 }
1792                 /* Here vma->vm_start <= start < vma->vm_end. */
1793                 if (end <= vma->vm_end) {
1794                         if (start < end) {
1795                                 error = msync_interval(vma, start, end, flags);
1796                                 if (error)
1797                                         goto out;
1798                         }
1799                         error = unmapped_error;
1800                         goto out;
1801                 }
1802                 /* Here vma->vm_start <= start < vma->vm_end < end. */
1803                 error = msync_interval(vma, start, vma->vm_end, flags);
1804                 if (error)
1805                         goto out;
1806                 start = vma->vm_end;
1807                 vma = vma->vm_next;
1808         }
1809 out:
1810         up(&current->mm->mmap_sem);
1811         return error;
1812 }
1813
1814 static inline void setup_read_behavior(struct vm_area_struct * vma,
1815         int behavior)
1816 {
1817         VM_ClearReadHint(vma);
1818         switch(behavior) {
1819                 case MADV_SEQUENTIAL:
1820                         vma->vm_flags |= VM_SEQ_READ;
1821                         break;
1822                 case MADV_RANDOM:
1823                         vma->vm_flags |= VM_RAND_READ;
1824                         break;
1825                 default:
1826                         break;
1827         }
1828         return;
1829 }
1830
1831 static long madvise_fixup_start(struct vm_area_struct * vma,
1832         unsigned long end, int behavior)
1833 {
1834         struct vm_area_struct * n;
1835
1836         n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1837         if (!n)
1838                 return -EAGAIN;
1839         *n = *vma;
1840         n->vm_end = end;
1841         setup_read_behavior(n, behavior);
1842         n->vm_raend = 0;
1843         get_file(n->vm_file);
1844         if (n->vm_ops && n->vm_ops->open)
1845                 n->vm_ops->open(n);
1846         vmlist_modify_lock(vma->vm_mm);
1847         vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
1848         vma->vm_start = end;
1849         insert_vm_struct(current->mm, n);
1850         vmlist_modify_unlock(vma->vm_mm);
1851         return 0;
1852 }
1853
1854 static long madvise_fixup_end(struct vm_area_struct * vma,
1855         unsigned long start, int behavior)
1856 {
1857         struct vm_area_struct * n;
1858
1859         n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1860         if (!n)
1861                 return -EAGAIN;
1862         *n = *vma;
1863         n->vm_start = start;
1864         n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
1865         setup_read_behavior(n, behavior);
1866         n->vm_raend = 0;
1867         get_file(n->vm_file);
1868         if (n->vm_ops && n->vm_ops->open)
1869                 n->vm_ops->open(n);
1870         vmlist_modify_lock(vma->vm_mm);
1871         vma->vm_end = start;
1872         insert_vm_struct(current->mm, n);
1873         vmlist_modify_unlock(vma->vm_mm);
1874         return 0;
1875 }
1876
1877 static long madvise_fixup_middle(struct vm_area_struct * vma,
1878         unsigned long start, unsigned long end, int behavior)
1879 {
1880         struct vm_area_struct * left, * right;
1881
1882         left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1883         if (!left)
1884                 return -EAGAIN;
1885         right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1886         if (!right) {
1887                 kmem_cache_free(vm_area_cachep, left);
1888                 return -EAGAIN;
1889         }
1890         *left = *vma;
1891         *right = *vma;
1892         left->vm_end = start;
1893         right->vm_start = end;
1894         right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
1895         left->vm_raend = 0;
1896         right->vm_raend = 0;
1897         atomic_add(2, &vma->vm_file->f_count);
1898
1899         if (vma->vm_ops && vma->vm_ops->open) {
1900                 vma->vm_ops->open(left);
1901                 vma->vm_ops->open(right);
1902         }
1903         vmlist_modify_lock(vma->vm_mm);
1904         vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
1905         vma->vm_start = start;
1906         vma->vm_end = end;
1907         setup_read_behavior(vma, behavior);
1908         vma->vm_raend = 0;
1909         insert_vm_struct(current->mm, left);
1910         insert_vm_struct(current->mm, right);
1911         vmlist_modify_unlock(vma->vm_mm);
1912         return 0;
1913 }
1914
1915 /*
1916  * We can potentially split a vm area into separate
1917  * areas, each area with its own behavior.
1918  */
1919 static long madvise_behavior(struct vm_area_struct * vma,
1920         unsigned long start, unsigned long end, int behavior)
1921 {
1922         int error = 0;
1923
1924         /* This caps the number of vma's this process can own */
1925         if (vma->vm_mm->map_count > MAX_MAP_COUNT)
1926                 return -ENOMEM;
1927
1928         if (start == vma->vm_start) {
1929                 if (end == vma->vm_end) {
1930                         setup_read_behavior(vma, behavior);
1931                         vma->vm_raend = 0;
1932                 } else
1933                         error = madvise_fixup_start(vma, end, behavior);
1934         } else {
1935                 if (end == vma->vm_end)
1936                         error = madvise_fixup_end(vma, start, behavior);
1937                 else
1938                         error = madvise_fixup_middle(vma, start, end, behavior);
1939         }
1940
1941         return error;
1942 }
1943
1944 /*
1945  * Schedule all required I/O operations, then run the disk queue
1946  * to make sure they are started.  Do not wait for completion.
1947  */
1948 static long madvise_willneed(struct vm_area_struct * vma,
1949         unsigned long start, unsigned long end)
1950 {
1951         long error = -EBADF;
1952         struct file * file;
1953         unsigned long size, rlim_rss;
1954
1955         /* Doesn't work if there's no mapped file. */
1956         if (!vma->vm_file)
1957                 return error;
1958         file = vma->vm_file;
1959         size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
1960                                                         PAGE_CACHE_SHIFT;
1961
1962         start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1963         if (end > vma->vm_end)
1964                 end = vma->vm_end;
1965         end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1966
1967         /* Make sure this doesn't exceed the process's max rss. */
1968         error = -EIO;
1969         rlim_rss = current->rlim ?  current->rlim[RLIMIT_RSS].rlim_cur :
1970                                 LONG_MAX; /* default: see resource.h */
1971         if ((vma->vm_mm->rss + (end - start)) > rlim_rss)
1972                 return error;
1973
1974         /* round to cluster boundaries if this isn't a "random" area. */
1975         if (!VM_RandomReadHint(vma)) {
1976                 start = CLUSTER_OFFSET(start);
1977                 end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);
1978
1979                 while ((start < end) && (start < size)) {
1980                         error = read_cluster_nonblocking(file, start, size);
1981                         start += CLUSTER_PAGES;
1982                         if (error < 0)
1983                                 break;
1984                 }
1985         } else {
1986                 while ((start < end) && (start < size)) {
1987                         error = page_cache_read(file, start);
1988                         start++;
1989                         if (error < 0)
1990                                 break;
1991                 }
1992         }
1993
1994         /* Don't wait for someone else to push these requests. */
1995         run_task_queue(&tq_disk);
1996
1997         return error;
1998 }
1999
2000 /*
2001  * Application no longer needs these pages.  If the pages are dirty,
2002  * it's OK to just throw them away.  The app will be more careful about
2003  * data it wants to keep.  Be sure to free swap resources too.  The
2004  * zap_page_range call sets things up for shrink_mmap to actually free
2005  * these pages later if no one else has touched them in the meantime,
2006  * although we could add these pages to a global reuse list for
2007  * shrink_mmap to pick up before reclaiming other pages.
2008  *
2009  * NB: This interface discards data rather than pushes it out to swap,
2010  * as some implementations do.  This has performance implications for
2011  * applications like large transactional databases which want to discard
2012  * pages in anonymous maps after committing to backing store the data
2013  * that was kept in them.  There is no reason to write this data out to
2014  * the swap area if the application is discarding it.
2015  *
2016  * An interface that causes the system to free clean pages and flush
2017  * dirty pages is already available as msync(MS_INVALIDATE).
2018  */
2019 static long madvise_dontneed(struct vm_area_struct * vma,
2020         unsigned long start, unsigned long end)
2021 {
2022         if (vma->vm_flags & VM_LOCKED)
2023                 return -EINVAL;
2024
2025         flush_cache_range(vma->vm_mm, start, end);
2026         zap_page_range(vma->vm_mm, start, end - start);
2027         flush_tlb_range(vma->vm_mm, start, end);
2028         return 0;
2029 }
2030
2031 static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
2032         unsigned long end, int behavior)
2033 {
2034         long error = -EBADF;
2035
2036         switch (behavior) {
2037         case MADV_NORMAL:
2038         case MADV_SEQUENTIAL:
2039         case MADV_RANDOM:
2040                 error = madvise_behavior(vma, start, end, behavior);
2041                 break;
2042
2043         case MADV_WILLNEED:
2044                 error = madvise_willneed(vma, start, end);
2045                 break;
2046
2047         case MADV_DONTNEED:
2048                 error = madvise_dontneed(vma, start, end);
2049                 break;
2050
2051         default:
2052                 error = -EINVAL;
2053                 break;
2054         }
2055
2056         return error;
2057 }
2058
2059 /*
2060  * The madvise(2) system call.
2061  *
2062  * Applications can use madvise() to advise the kernel how it should
2063  * handle paging I/O in this VM area.  The idea is to help the kernel
2064  * use appropriate read-ahead and caching techniques.  The information
2065  * provided is advisory only, and can be safely disregarded by the
2066  * kernel without affecting the correct operation of the application.
2067  *
2068  * behavior values:
2069  *  MADV_NORMAL - the default behavior is to read clusters.  This
2070  *              results in some read-ahead and read-behind.
2071  *  MADV_RANDOM - the system should read the minimum amount of data
2072  *              on any access, since it is unlikely that the appli-
2073  *              cation will need more than what it asks for.
2074  *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
2075  *              once, so they can be aggressively read ahead, and
2076  *              can be freed soon after they are accessed.
2077  *  MADV_WILLNEED - the application is notifying the system to read
2078  *              some pages ahead.
2079  *  MADV_DONTNEED - the application is finished with the given range,
2080  *              so the kernel can free resources associated with it.
2081  *
2082  * return values:
2083  *  zero    - success
2084  *  -EINVAL - start + len < 0, start is not page-aligned,
2085  *              "behavior" is not a valid value, or application
2086  *              is attempting to release locked or shared pages.
2087  *  -ENOMEM - addresses in the specified range are not currently
2088  *              mapped, or are outside the AS of the process.
2089  *  -EIO    - an I/O error occurred while paging in data.
2090  *  -EBADF  - map exists, but area maps something that isn't a file.
2091  *  -EAGAIN - a kernel resource was temporarily unavailable.
2092  */
2093 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
2094 {
2095         unsigned long end;
2096         struct vm_area_struct * vma;
2097         int unmapped_error = 0;
2098         int error = -EINVAL;
2099
2100         down(&current->mm->mmap_sem);
2101
2102         if (start & ~PAGE_MASK)
2103                 goto out;
2104         len = (len + ~PAGE_MASK) & PAGE_MASK;
2105         end = start + len;
2106         if (end < start)
2107                 goto out;
2108
2109         error = 0;
2110         if (end == start)
2111                 goto out;
2112
2113         /*
2114          * If the interval [start,end) covers some unmapped address
2115          * ranges, just ignore them, but return -ENOMEM at the end.
2116          */
2117         vma = find_vma(current->mm, start);
2118         for (;;) {
2119                 /* Still start < end. */
2120                 error = -ENOMEM;
2121                 if (!vma)
2122                         goto out;
2123
2124                 /* Here start < vma->vm_end. */
2125                 if (start < vma->vm_start) {
2126                         unmapped_error = -ENOMEM;
2127                         start = vma->vm_start;
2128                 }
2129
2130                 /* Here vma->vm_start <= start < vma->vm_end. */
2131                 if (end <= vma->vm_end) {
2132                         if (start < end) {
2133                                 error = madvise_vma(vma, start, end,
2134                                                         behavior);
2135                                 if (error)
2136                                         goto out;
2137                         }
2138                         error = unmapped_error;
2139                         goto out;
2140                 }
2141
2142                 /* Here vma->vm_start <= start < vma->vm_end < end. */
2143                 error = madvise_vma(vma, start, vma->vm_end, behavior);
2144                 if (error)
2145                         goto out;
2146                 start = vma->vm_end;
2147                 vma = vma->vm_next;
2148         }
2149
2150 out:
2151         up(&current->mm->mmap_sem);
2152         return error;
2153 }
2154
2155 /*
2156  * Later we can get more picky about what "in core" means precisely.
2157  * For now, simply check to see if the page is in the page cache,
2158  * and is up to date; i.e. that no page-in operation would be required
2159  * at this time if an application were to map and access this page.
2160  */
2161 static unsigned char mincore_page(struct vm_area_struct * vma,
2162         unsigned long pgoff)
2163 {
2164         unsigned char present = 0;
2165         struct address_space * as = &vma->vm_file->f_dentry->d_inode->i_data;
2166         struct page * page, ** hash = page_hash(as, pgoff);
2167
2168         spin_lock(&pagecache_lock);
2169         page = __find_page_nolock(as, pgoff, *hash);
2170         if ((page) && (Page_Uptodate(page)))
2171                 present = 1;
2172         spin_unlock(&pagecache_lock);
2173
2174         return present;
2175 }
2176
2177 static long mincore_vma(struct vm_area_struct * vma,
2178         unsigned long start, unsigned long end, unsigned char * vec)
2179 {
2180         long error, i, remaining;
2181         unsigned char * tmp;
2182
2183         error = -ENOMEM;
2184         if (!vma->vm_file)
2185                 return error;
2186
2187         start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2188         if (end > vma->vm_end)
2189                 end = vma->vm_end;
2190         end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2191
2192         error = -EAGAIN;
2193         tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
2194         if (!tmp)
2195                 return error;
2196
2197         /* (end - start) is # of pages, and also # of bytes in "vec */
2198         remaining = (end - start),
2199
2200         error = 0;
2201         for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
2202                 int j = 0;
2203                 long thispiece = (remaining < PAGE_SIZE) ?
2204                                                 remaining : PAGE_SIZE;
2205
2206                 while (j < thispiece)
2207                         tmp[j++] = mincore_page(vma, start++);
2208
2209                 if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
2210                         error = -EFAULT;
2211                         break;
2212                 }
2213         }
2214
2215         free_page((unsigned long) tmp);
2216         return error;
2217 }
2218
2219 /*
2220  * The mincore(2) system call.
2221  *
2222  * mincore() returns the memory residency status of the pages in the
2223  * current process's address space specified by [addr, addr + len).
2224  * The status is returned in a vector of bytes.  The least significant
2225  * bit of each byte is 1 if the referenced page is in memory, otherwise
2226  * it is zero.
2227  *
2228  * Because the status of a page can change after mincore() checks it
2229  * but before it returns to the application, the returned vector may
2230  * contain stale information.  Only locked pages are guaranteed to
2231  * remain in memory.
2232  *
2233  * return values:
2234  *  zero    - success
2235  *  -EFAULT - vec points to an illegal address
2236  *  -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
2237  *              or len has a nonpositive value
2238  *  -ENOMEM - Addresses in the range [addr, addr + len] are
2239  *              invalid for the address space of this process, or
2240  *              specify one or more pages which are not currently
2241  *              mapped
2242  *  -EAGAIN - A kernel resource was temporarily unavailable.
2243  */
2244 asmlinkage long sys_mincore(unsigned long start, size_t len,
2245         unsigned char * vec)
2246 {
2247         int index = 0;
2248         unsigned long end;
2249         struct vm_area_struct * vma;
2250         int unmapped_error = 0;
2251         long error = -EINVAL;
2252
2253         down(&current->mm->mmap_sem);
2254
2255         if (start & ~PAGE_CACHE_MASK)
2256                 goto out;
2257         len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK;
2258         end = start + len;
2259         if (end < start)
2260                 goto out;
2261
2262         error = 0;
2263         if (end == start)
2264                 goto out;
2265
2266         /*
2267          * If the interval [start,end) covers some unmapped address
2268          * ranges, just ignore them, but return -ENOMEM at the end.
2269          */
2270         vma = find_vma(current->mm, start);
2271         for (;;) {
2272                 /* Still start < end. */
2273                 error = -ENOMEM;
2274                 if (!vma)
2275                         goto out;
2276
2277                 /* Here start < vma->vm_end. */
2278                 if (start < vma->vm_start) {
2279                         unmapped_error = -ENOMEM;
2280                         start = vma->vm_start;
2281                 }
2282
2283                 /* Here vma->vm_start <= start < vma->vm_end. */
2284                 if (end <= vma->vm_end) {
2285                         if (start < end) {
2286                                 error = mincore_vma(vma, start, end,
2287                                                         &vec[index]);
2288                                 if (error)
2289                                         goto out;
2290                         }
2291                         error = unmapped_error;
2292                         goto out;
2293                 }
2294
2295                 /* Here vma->vm_start <= start < vma->vm_end < end. */
2296                 error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
2297                 if (error)
2298                         goto out;
2299                 index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
2300                 start = vma->vm_end;
2301                 vma = vma->vm_next;
2302         }
2303
2304 out:
2305         up(&current->mm->mmap_sem);
2306         return error;
2307 }
2308
2309 static inline
2310 struct page *__read_cache_page(struct address_space *mapping,
2311                                 unsigned long index,
2312                                 int (*filler)(void *,struct page*),
2313                                 void *data)
2314 {
2315         struct page **hash = page_hash(mapping, index);
2316         struct page *page, *cached_page = NULL;
2317         int err;
2318 repeat:
2319         page = __find_get_page(mapping, index, hash);
2320         if (!page) {
2321                 if (!cached_page) {
2322                         cached_page = page_cache_alloc();
2323                         if (!cached_page)
2324                                 return ERR_PTR(-ENOMEM);
2325                 }
2326                 page = cached_page;
2327                 if (add_to_page_cache_unique(page, mapping, index, hash))
2328                         goto repeat;
2329                 cached_page = NULL;
2330                 err = filler(data, page);
2331                 if (err < 0) {
2332                         page_cache_release(page);
2333                         page = ERR_PTR(err);
2334                 }
2335         }
2336         if (cached_page)
2337                 page_cache_free(cached_page);
2338         return page;
2339 }
2340
2341 /*
2342  * Read into the page cache. If a page already exists,
2343  * and Page_Uptodate() is not set, try to fill the page.
2344  */
2345 struct page *read_cache_page(struct address_space *mapping,
2346                                 unsigned long index,
2347                                 int (*filler)(void *,struct page*),
2348                                 void *data)
2349 {
2350         struct page *page = __read_cache_page(mapping, index, filler, data);
2351         int err;
2352
2353         if (IS_ERR(page) || Page_Uptodate(page))
2354                 goto out;
2355
2356         lock_page(page);
2357         if (Page_Uptodate(page)) {
2358                 UnlockPage(page);
2359                 goto out;
2360         }
2361         err = filler(data, page);
2362         if (err < 0) {
2363                 page_cache_release(page);
2364                 page = ERR_PTR(err);
2365         }
2366  out:
2367         return page;
2368 }
2369
2370 static inline struct page * __grab_cache_page(struct address_space *mapping,
2371                                 unsigned long index, struct page **cached_page)
2372 {
2373         struct page *page, **hash = page_hash(mapping, index);
2374 repeat:
2375         page = __find_lock_page(mapping, index, hash);
2376         if (!page) {
2377                 if (!*cached_page) {
2378                         *cached_page = page_cache_alloc();
2379                         if (!*cached_page)
2380                                 return NULL;
2381                 }
2382                 page = *cached_page;
2383                 if (add_to_page_cache_unique(page, mapping, index, hash))
2384                         goto repeat;
2385                 *cached_page = NULL;
2386         }
2387         return page;
2388 }
2389
2390 /*
2391  * Returns locked page at given index in given cache, creating it if needed.
2392  */
2393
2394 struct page *grab_cache_page(struct address_space *mapping, unsigned long index)
2395 {
2396         struct page *cached_page = NULL;
2397         struct page *page = __grab_cache_page(mapping,index,&cached_page);
2398         if (cached_page)
2399                 page_cache_free(cached_page);
2400         return page;
2401 }
2402
2403 static inline void remove_suid(struct inode *inode)
2404 {
2405         unsigned int mode;
2406
2407         /* set S_IGID if S_IXGRP is set, and always set S_ISUID */
2408         mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
2409
2410         /* was any of the uid bits set? */
2411         mode &= inode->i_mode;
2412         if (mode && !capable(CAP_FSETID)) {
2413                 inode->i_mode &= ~mode;
2414                 mark_inode_dirty(inode);
2415         }
2416 }
2417
2418 /*
2419  * Write to a file through the page cache.
2420  *
2421  * We currently put everything into the page cache prior to writing it.
2422  * This is not a problem when writing full pages. With partial pages,
2423  * however, we first have to read the data into the cache, then
2424  * dirty the page, and finally schedule it for writing. Alternatively, we
2425  * could write-through just the portion of data that would go into that
2426  * page, but that would kill performance for applications that write data
2427  * line by line, and it's prone to race conditions.
2428  *
2429  * Note that this routine doesn't try to keep track of dirty pages. Each
2430  * file system has to do this all by itself, unfortunately.
2431  *                                                      okir@monad.swb.de
2432  */
2433 ssize_t
2434 generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
2435 {
2436         struct inode    *inode = file->f_dentry->d_inode;
2437         struct address_space *mapping = inode->i_mapping;
2438         unsigned long   limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
2439         loff_t          pos;
2440         struct page     *page, *cached_page;
2441         unsigned long   written;
2442         long            status;
2443         int             err;
2444
2445         cached_page = NULL;
2446
2447         down(&inode->i_sem);
2448
2449         pos = *ppos;
2450         err = -EINVAL;
2451         if (pos < 0)
2452                 goto out;
2453
2454         err = file->f_error;
2455         if (err) {
2456                 file->f_error = 0;
2457                 goto out;
2458         }
2459
2460         written = 0;
2461
2462         if (file->f_flags & O_APPEND)
2463                 pos = inode->i_size;
2464
2465         /*
2466          * Check whether we've reached the file size limit.
2467          */
2468         err = -EFBIG;
2469         if (limit != RLIM_INFINITY) {
2470                 if (pos >= limit) {
2471                         send_sig(SIGXFSZ, current, 0);
2472                         goto out;
2473                 }
2474                 if (count > limit - pos) {
2475                         send_sig(SIGXFSZ, current, 0);
2476                         count = limit - pos;
2477                 }
2478         }
2479
2480         status  = 0;
2481         if (count) {
2482                 remove_suid(inode);
2483                 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
2484                 mark_inode_dirty(inode);
2485         }
2486
2487         while (count) {
2488                 unsigned long bytes, index, offset;
2489                 char *kaddr;
2490
2491                 /*
2492                  * Try to find the page in the cache. If it isn't there,
2493                  * allocate a free page.
2494                  */
2495                 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
2496                 index = pos >> PAGE_CACHE_SHIFT;
2497                 bytes = PAGE_CACHE_SIZE - offset;
2498                 if (bytes > count)
2499                         bytes = count;
2500
2501                 status = -ENOMEM;       /* we'll assign it later anyway */
2502                 page = __grab_cache_page(mapping, index, &cached_page);
2503                 if (!page)
2504                         break;
2505
2506                 /* We have exclusive IO access to the page.. */
2507                 if (!PageLocked(page)) {
2508                         PAGE_BUG(page);
2509                 }
2510
2511                 status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
2512                 if (status)
2513                         goto unlock;
2514                 kaddr = (char*)page_address(page);
2515                 status = copy_from_user(kaddr+offset, buf, bytes);
2516                 if (status)
2517                         goto fail_write;
2518                 status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
2519                 if (!status)
2520                         status = bytes;
2521
2522                 if (status >= 0) {
2523                         written += status;
2524                         count -= status;
2525                         pos += status;
2526                         buf += status;
2527                 }
2528 unlock:
2529                 /* Mark it unlocked again and drop the page.. */
2530                 UnlockPage(page);
2531                 page_cache_release(page);
2532
2533                 if (status < 0)
2534                         break;
2535         }
2536         *ppos = pos;
2537
2538         if (cached_page)
2539                 page_cache_free(cached_page);
2540
2541         err = written ? written : status;
2542 out:
2543         up(&inode->i_sem);
2544         return err;
2545 fail_write:
2546         status = -EFAULT;
2547         ClearPageUptodate(page);
2548         kunmap(page);
2549         goto unlock;
2550 }
2551
2552 void __init page_cache_init(unsigned long mempages)
2553 {
2554         unsigned long htable_size, order;
2555
2556         htable_size = mempages;
2557         htable_size *= sizeof(struct page *);
2558         for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
2559                 ;
2560
2561         do {
2562                 unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
2563
2564                 page_hash_bits = 0;
2565                 while((tmp >>= 1UL) != 0UL)
2566                         page_hash_bits++;
2567
2568                 page_hash_table = (struct page **)
2569                         __get_free_pages(GFP_ATOMIC, order);
2570         } while(page_hash_table == NULL && --order > 0);
2571
2572         printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
2573                (1 << page_hash_bits), order, (PAGE_SIZE << order));
2574         if (!page_hash_table)
2575                 panic("Failed to allocate page hash table\n");
2576         memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
2577 }