/*
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/malloc.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/locks.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/swapctl.h>
#include <linux/slab.h>
#include <linux/init.h>

#include <asm/pgalloc.h>
#include <asm/uaccess.h>

#include <linux/highmem.h>
/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet, though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 */
atomic_t page_cache_size = ATOMIC_INIT(0);
unsigned int page_hash_bits;
struct page **page_hash_table;

spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;

/*
 * NOTE: to avoid deadlocking you must never acquire the pagecache_lock
 * with the pagemap_lru_lock held.
 */
spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;
#define CLUSTER_PAGES           (1 << page_cluster)
#define CLUSTER_OFFSET(x)       (((x) >> page_cluster) << page_cluster)
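
/*
 * Worked example (illustrative numbers only, assuming page_cluster == 4):
 * CLUSTER_PAGES is then 16 pages, and CLUSTER_OFFSET() masks off the low
 * bits of an index to give the start of the cluster containing it:
 *
 *      CLUSTER_OFFSET(19) == (19 >> 4) << 4 == 1 << 4 == 16
 *
 * so a read around page 19 is rounded down to a 16-page (64k with 4K
 * pages) aligned cluster starting at page 16.
 */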
static void add_page_to_hash_queue(struct page * page, struct page **p)
        struct page *next = *p;

        page->next_hash = next;
        next->pprev_hash = &page->next_hash;
        atomic_inc(&page_cache_size);
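
/*
 * Hash chain layout (illustrative sketch): each bucket of page_hash_table
 * heads a singly linked list through page->next_hash, while page->pprev_hash
 * points back at whatever pointer currently points at this page (either the
 * bucket slot itself or the previous page's next_hash).  With bucket -> A -> B,
 * A->pprev_hash is &bucket and B->pprev_hash is &A->next_hash, which is why
 * remove_page_from_hash_queue() below can unlink a page through *pprev_hash
 * without walking the chain.
 */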
static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page)
        struct list_head *head = &mapping->clean_pages;

        list_add(&page->list, head);
        page->mapping = mapping;

static inline void remove_page_from_inode_queue(struct page * page)
        struct address_space * mapping = page->mapping;

        list_del(&page->list);

static inline void remove_page_from_hash_queue(struct page * page)
        struct page *next = page->next_hash;
        struct page **pprev = page->pprev_hash;

        next->pprev_hash = pprev;
        page->pprev_hash = NULL;
        atomic_dec(&page_cache_size);
/*
 * Remove a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
 * is safe.
 */
void __remove_inode_page(struct page *page)
        if (PageDirty(page)) BUG();
        remove_page_from_inode_queue(page);
        remove_page_from_hash_queue(page);
        page->mapping = NULL;
void remove_inode_page(struct page *page)
        if (!PageLocked(page))

        spin_lock(&pagecache_lock);
        __remove_inode_page(page);
        spin_unlock(&pagecache_lock);

static inline int sync_page(struct page *page)
        struct address_space *mapping = page->mapping;

        if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
                return mapping->a_ops->sync_page(page);
/*
 * Add a page to the dirty page list.
 */
void __set_page_dirty(struct page *page)
        struct address_space *mapping = page->mapping;

        spin_lock(&pagecache_lock);
        list_del(&page->list);
        list_add(&page->list, &mapping->dirty_pages);
        spin_unlock(&pagecache_lock);

        mark_inode_dirty_pages(mapping->host);
/**
 * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
 * @inode: the inode whose pages we want to invalidate
 *
 * This function only removes the unlocked pages; if you want to
 * remove all the pages of one inode, you must call truncate_inode_pages.
 */
void invalidate_inode_pages(struct inode * inode)
        struct list_head *head, *curr;

        head = &inode->i_mapping->clean_pages;

        spin_lock(&pagecache_lock);
        spin_lock(&pagemap_lru_lock);

        while (curr != head) {
                page = list_entry(curr, struct page, list);

                /* We cannot invalidate something in use.. */
                if (page_count(page) != 1)
                if (TryLockPage(page))

                __lru_cache_del(page);
                __remove_inode_page(page);
                page_cache_release(page);

        spin_unlock(&pagemap_lru_lock);
        spin_unlock(&pagecache_lock);
static inline void truncate_partial_page(struct page *page, unsigned partial)
        memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);

                block_flushpage(page, partial);

static inline void truncate_complete_page(struct page *page)
        /* Leave it on the LRU if it gets converted into anonymous buffers */
        if (!page->buffers || block_flushpage(page, 0))

        /*
         * We remove the page from the page cache _after_ we have
         * destroyed all buffer-cache references to it. Otherwise some
         * other process might think this inode page is not in the
         * page cache and create a buffer-cache alias to it, causing
         * all sorts of fun problems ...
         */
        ClearPageDirty(page);
        ClearPageUptodate(page);
        remove_inode_page(page);
        page_cache_release(page);
void truncate_list_pages(struct list_head *head, unsigned long start, unsigned partial)
        struct list_head *curr;

        spin_lock(&pagecache_lock);

        while (curr != head) {
                unsigned long offset;

                page = list_entry(curr, struct page, list);
                offset = page->index;

                /* Is this one of the pages to truncate? */
                if ((offset >= start) || (partial && (offset + 1) == start)) {
                        if (TryLockPage(page)) {
                                page_cache_get(page);
                                spin_unlock(&pagecache_lock);
                                page_cache_release(page);

                        page_cache_get(page);
                        spin_unlock(&pagecache_lock);

                        if (partial && (offset + 1) == start) {
                                truncate_partial_page(page, partial);
                        truncate_complete_page(page);

                        page_cache_release(page);

                        /*
                         * We have done things without the pagecache lock,
                         * so we'll have to repeat the scan.
                         * It's not possible to deadlock here because
                         * we are guaranteed to make progress. (ie. we have
                         * just removed a page)
                         */
        spin_unlock(&pagecache_lock);
/**
 * truncate_inode_pages - truncate *all* the pages from an offset
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 *
 * Truncate the page cache at a set offset, removing the pages
 * that are beyond that offset (and zeroing out partial pages).
 * If any page is locked we wait for it to become unlocked.
 */
void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
        unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);

        truncate_list_pages(&mapping->clean_pages, start, partial);
        truncate_list_pages(&mapping->dirty_pages, start, partial);
        truncate_list_pages(&mapping->locked_pages, start, partial);
static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
                page = page->next_hash;

                if (page->mapping != mapping)
                if (page->index == offset)

        /*
         * Touching the page may move it to the active list.
         * If we end up with too few inactive pages, we wake
         * up kswapd.
         */
        if (inactive_shortage() > inactive_target / 2 && free_shortage())
/*
 * By the time this is called, the page is locked and
 * we don't have to worry about any races any more.
 */
static int writeout_one_page(struct page *page)
        struct buffer_head *bh, *head = page->buffers;

                if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))

                bh->b_flushtime = jiffies;
                ll_rw_block(WRITE, 1, &bh);
        } while ((bh = bh->b_this_page) != head);

static int waitfor_one_page(struct page *page)
        struct buffer_head *bh, *head = page->buffers;

                if (buffer_req(bh) && !buffer_uptodate(bh))
        } while ((bh = bh->b_this_page) != head);

static int do_buffer_fdatasync(struct list_head *head, unsigned long start, unsigned long end, int (*fn)(struct page *))
        struct list_head *curr;

        spin_lock(&pagecache_lock);

        while (curr != head) {
                page = list_entry(curr, struct page, list);

                if (page->index >= end)
                if (page->index < start)

                page_cache_get(page);
                spin_unlock(&pagecache_lock);

                /* The buffers could have been free'd while we waited for the page lock */

                spin_lock(&pagecache_lock);
                curr = page->list.next;
                page_cache_release(page);

        spin_unlock(&pagecache_lock);
/*
 * Two-stage data sync: first start the IO, then go back and
 * collect the information..
 */
int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
        /* writeout dirty buffers on pages from both clean and dirty lists */
        retval = do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, writeout_one_page);
        retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, writeout_one_page);
        retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, writeout_one_page);

        /* now wait for locked buffers on pages from both clean and dirty lists */
        retval |= do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, waitfor_one_page);
        retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, waitfor_one_page);
        retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, waitfor_one_page);
/**
 * filemap_fdatasync - walk the list of dirty pages of the given address
 * space and writepage() all of them.
 *
 * @mapping: address space structure to write
 */
void filemap_fdatasync(struct address_space * mapping)
        int (*writepage)(struct page *) = mapping->a_ops->writepage;

        spin_lock(&pagecache_lock);

        while (!list_empty(&mapping->dirty_pages)) {
                struct page *page = list_entry(mapping->dirty_pages.next, struct page, list);

                list_del(&page->list);
                list_add(&page->list, &mapping->locked_pages);

                if (!PageDirty(page))

                page_cache_get(page);
                spin_unlock(&pagecache_lock);

                if (PageDirty(page)) {
                        ClearPageDirty(page);

                page_cache_release(page);
                spin_lock(&pagecache_lock);

        spin_unlock(&pagecache_lock);
/**
 * filemap_fdatawait - walk the list of locked pages of the given address
 * space and wait for all of them.
 *
 * @mapping: address space structure to wait for
 */
void filemap_fdatawait(struct address_space * mapping)
        spin_lock(&pagecache_lock);

        while (!list_empty(&mapping->locked_pages)) {
                struct page *page = list_entry(mapping->locked_pages.next, struct page, list);

                list_del(&page->list);
                list_add(&page->list, &mapping->clean_pages);

                if (!PageLocked(page))

                page_cache_get(page);
                spin_unlock(&pagecache_lock);

                ___wait_on_page(page);

                page_cache_release(page);
                spin_lock(&pagecache_lock);

        spin_unlock(&pagecache_lock);
/*
 * Add a page to the inode page cache.
 *
 * The caller must have locked the page and
 * set all the page flags correctly..
 */
void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
        if (!PageLocked(page))

        page_cache_get(page);
        spin_lock(&pagecache_lock);
        add_page_to_inode_queue(mapping, page);
        add_page_to_hash_queue(page, page_hash(mapping, index));
        spin_unlock(&pagecache_lock);
/*
 * This adds a page to the page cache, starting out as locked,
 * owned by us, but unreferenced, not uptodate and with no errors.
 */
static inline void __add_to_page_cache(struct page * page,
        struct address_space *mapping, unsigned long offset,
        struct page **hash)

        if (PageLocked(page))

        flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced) | (1 << PG_arch_1));
        page->flags = flags | (1 << PG_locked);
        page_cache_get(page);
        page->index = offset;
        add_page_to_inode_queue(mapping, page);
        add_page_to_hash_queue(page, hash);
void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
        spin_lock(&pagecache_lock);
        __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
        spin_unlock(&pagecache_lock);

static int add_to_page_cache_unique(struct page * page,
        struct address_space *mapping, unsigned long offset,
        struct page **hash)

        spin_lock(&pagecache_lock);
        alias = __find_page_nolock(mapping, offset, *hash);

                __add_to_page_cache(page, mapping, offset, hash);

        spin_unlock(&pagecache_lock);
/*
 * This adds the requested page to the page cache if it isn't already there,
 * and schedules an I/O to read in its contents from disk.
 */
static inline int page_cache_read(struct file * file, unsigned long offset)
        struct inode *inode = file->f_dentry->d_inode;
        struct address_space *mapping = inode->i_mapping;
        struct page **hash = page_hash(mapping, offset);

        spin_lock(&pagecache_lock);
        page = __find_page_nolock(mapping, offset, *hash);
        spin_unlock(&pagecache_lock);

        page = page_cache_alloc();

        if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
                int error = mapping->a_ops->readpage(file, page);
                page_cache_release(page);

        /*
         * We arrive here in the unlikely event that someone
         * raced with us and added our page to the cache first.
         */
        page_cache_free(page);
/*
 * Read in an entire cluster at once. A cluster is usually a 64k-
 * aligned block that includes the page requested in "offset."
 */
static int read_cluster_nonblocking(struct file * file, unsigned long offset,
        unsigned long filesize)
        unsigned long pages = CLUSTER_PAGES;

        offset = CLUSTER_OFFSET(offset);
        while ((pages-- > 0) && (offset < filesize)) {
                int error = page_cache_read(file, offset);

/*
 * Wait for a page to get unlocked.
 *
 * This must be called with the caller "holding" the page,
 * ie with increased "page->count" so that the page won't
 * go away during the wait..
 */
void ___wait_on_page(struct page *page)
        struct task_struct *tsk = current;
        DECLARE_WAITQUEUE(wait, tsk);

        add_wait_queue(&page->wait, &wait);
                set_task_state(tsk, TASK_UNINTERRUPTIBLE);
                if (!PageLocked(page))
                run_task_queue(&tq_disk);
        } while (PageLocked(page));
        tsk->state = TASK_RUNNING;
        remove_wait_queue(&page->wait, &wait);

/*
 * Get a lock on the page, assuming we need to sleep
 */
static void __lock_page(struct page *page)
        struct task_struct *tsk = current;
        DECLARE_WAITQUEUE(wait, tsk);

        add_wait_queue_exclusive(&page->wait, &wait);
                set_task_state(tsk, TASK_UNINTERRUPTIBLE);
                if (PageLocked(page)) {
                        run_task_queue(&tq_disk);
                if (!TryLockPage(page))
        tsk->state = TASK_RUNNING;
        remove_wait_queue(&page->wait, &wait);

/*
 * Get an exclusive lock on the page, optimistically
 * assuming it's not locked..
 */
void lock_page(struct page *page)
        if (TryLockPage(page))

/*
 * a rather lightweight function, finding and getting a reference to a
 * hashed page atomically, waiting for it if it's locked.
 */
struct page * __find_get_page(struct address_space *mapping,
                              unsigned long offset, struct page **hash)
        /*
         * We scan the hash list read-only. Addition to and removal from
         * the hash-list needs a held write-lock.
         */
        spin_lock(&pagecache_lock);
        page = __find_page_nolock(mapping, offset, *hash);
                page_cache_get(page);
        spin_unlock(&pagecache_lock);

/*
 * Get the lock to a page atomically.
 */
struct page * __find_lock_page (struct address_space *mapping,
                                unsigned long offset, struct page **hash)
        /*
         * We scan the hash list read-only. Addition to and removal from
         * the hash-list needs a held write-lock.
         */
        spin_lock(&pagecache_lock);
        page = __find_page_nolock(mapping, offset, *hash);
                page_cache_get(page);
                spin_unlock(&pagecache_lock);

                /* Is the page still hashed? Ok, good.. */

                /* Nope: we raced. Release and try again.. */
                page_cache_release(page);
        spin_unlock(&pagecache_lock);
#define PROFILE_READAHEAD
#define DEBUG_READAHEAD

/*
 * We combine this with read-ahead to deactivate pages when we
 * think there's sequential IO going on. Note that this is
 * harmless since we don't actually evict the pages from memory
 * but just move them to the inactive list.
 *
 * - make the readahead code smarter
 * - move readahead to the VMA level so we can do the same
 */
static void drop_behind(struct file * file, unsigned long index)
        struct inode *inode = file->f_dentry->d_inode;
        struct address_space *mapping = inode->i_mapping;

        /* Nothing to drop-behind if we're on the first page. */

        if (index > file->f_rawin)
                start = index - file->f_rawin;

        /*
         * Go backwards from index-1 and drop all pages in the
         * readahead window. Since the readahead window may have
         * been increased since the last time we were called, we
         * stop when the page isn't there.
         */
        spin_lock(&pagecache_lock);
        while (--index >= start) {
                hash = page_hash(mapping, index);
                page = __find_page_nolock(mapping, index, *hash);
                        deactivate_page(page);
        spin_unlock(&pagecache_lock);
/*
 * Read-ahead profiling information
 * --------------------------------
 * Every PROFILE_MAXREADCOUNT, the following information is written:
 *   Percentage of asynchronous read-ahead.
 *   Average of read-ahead fields context value.
 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
 * as well.
 */
#ifdef PROFILE_READAHEAD

#define PROFILE_MAXREADCOUNT 1000

static unsigned long total_reada;
static unsigned long total_async;
static unsigned long total_ramax;
static unsigned long total_ralen;
static unsigned long total_rawin;

static void profile_readahead(int async, struct file *filp)
        total_ramax     += filp->f_ramax;
        total_ralen     += filp->f_ralen;
        total_rawin     += filp->f_rawin;

        if (total_reada > PROFILE_MAXREADCOUNT) {
                if (!(total_reada > PROFILE_MAXREADCOUNT)) {
                        restore_flags(flags);

                printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
                        total_ramax/total_reada,
                        total_ralen/total_reada,
                        total_rawin/total_reada,
                        (total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
                printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
                        filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);

                restore_flags(flags);
#endif /* defined PROFILE_READAHEAD */
/*
 * Read-ahead context:
 * -------------------
 * The read ahead context fields of the "struct file" are the following:
 * - f_raend : position of the first byte after the last page we tried to
 *             read ahead.
 * - f_ramax : current read-ahead maximum size.
 * - f_ralen : length of the current IO read block we tried to read-ahead.
 * - f_rawin : length of the current read-ahead window.
 *              if last read-ahead was synchronous then
 *                      f_rawin = f_ralen
 *              otherwise (was asynchronous)
 *                      f_rawin = previous value of f_ralen + f_ralen
 *
 * MIN_READAHEAD : minimum read-ahead size when reading ahead.
 * MAX_READAHEAD : maximum read-ahead size when reading ahead.
 *
 * Synchronous read-ahead benefits:
 * --------------------------------
 * Using a reasonable IO xfer length from peripheral devices increases system
 * performance.
 * Reasonable means, in this context, not too large but not too small.
 * The actual maximum value is:
 *      MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
 *      and 32K if defined (4K page size assumed).
 *
 * Asynchronous read-ahead benefits:
 * ---------------------------------
 * Overlapping the next read request with user process execution increases
 * system performance.
 *
 * We have to guess which further data are needed by the user process.
 * If these data are often not really needed, it's bad for system performance.
 * However, we know that files are often accessed sequentially by
 * application programs, and it seems that it is possible to have some good
 * strategy in that guessing.
 * We only try to read-ahead files that seem to be read sequentially.
 *
 * Asynchronous read-ahead risks:
 * ------------------------------
 * In order to maximize overlapping, we must start some asynchronous read
 * request from the device, as soon as possible.
 * We must be very careful about:
 * - The number of effective pending IO read requests.
 *   ONE seems to be the only reasonable value.
 * - The total memory pool usage for the file access stream.
 *   This maximum memory usage is implicitly 2 IO read chunks:
 *   2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
 *   64k if defined (4K page size assumed).
 */
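
/*
 * Worked example (illustrative numbers only, not taken from this file):
 * suppose MIN_READAHEAD is 3 pages and MAX_READAHEAD is 31 pages.  A
 * sequential reader whose last read-ahead was a synchronous 4-page block
 * ending just before index 20 would carry:
 *
 *      f_raend = 20    (first index after the last read-ahead)
 *      f_ralen = 4     (length of that read-ahead block)
 *      f_rawin = 4     (synchronous: window == f_ralen)
 *
 * If the next access then triggers an asynchronous read-ahead of another
 * 4 pages, f_ralen describes the new block and the window covers the
 * previous block plus the new one:
 *
 *      f_ralen = 4
 *      f_rawin = 4 + 4 = 8
 *      f_raend = first index past the newly read-ahead pages
 */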
static inline int get_max_readahead(struct inode * inode)
        if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
                return MAX_READAHEAD;
        return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
static void generic_file_readahead(int reada_ok,
        struct file * filp, struct inode * inode,
        struct page * page)
        unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
        unsigned long index = page->index;
        unsigned long max_ahead, ahead;

        int max_readahead = get_max_readahead(inode);

        raend = filp->f_raend;

        /*
         * The current page is locked.
         * If the current position is inside the previous read IO request, do not
         * try to reread previously read ahead pages.
         * Otherwise decide whether or not to read ahead some pages synchronously.
         * If we are not going to read ahead, set the read ahead context for this
         * page only.
         */
        if (PageLocked(page)) {
                if (!filp->f_ralen || index >= raend || index + filp->f_rawin < raend) {

                        if (raend < end_index)
                                max_ahead = filp->f_ramax;

                        filp->f_raend = index + filp->f_ralen;
                        filp->f_rawin += filp->f_ralen;

        /*
         * The current page is not locked.
         * If we were reading ahead and,
         * if the current max read ahead size is not zero and,
         * if the current position is inside the last read-ahead IO request,
         * it is the moment to try to read ahead asynchronously.
         * We will later force an unplug of the device in order to force
         * asynchronous read IO.
         */
        else if (reada_ok && filp->f_ramax && raend >= 1 &&
                 index <= raend && index + filp->f_ralen >= raend) {
                /*
                 * Add ONE page to max_ahead in order to try to have about the same IO max size
                 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
                 * Compute the position of the last page we have tried to read in order to
                 * begin to read ahead just at the next page.
                 */
                if (raend < end_index)
                        max_ahead = filp->f_ramax + 1;

                        filp->f_rawin = filp->f_ralen;

        /*
         * Try to read ahead pages.
         * We hope that ll_rw_blk() plug/unplug, coalescence, request sorting and
         * the scheduler will work well enough for us to avoid issuing really bad
         * actual IO requests.
         */
        while (ahead < max_ahead) {
                if ((raend + ahead) >= end_index)
                if (page_cache_read(filp, raend + ahead) < 0)

        /*
         * If we tried to read ahead some pages,
         * If we tried to read ahead asynchronously,
         *   Try to force an unplug of the device in order to start an
         *   asynchronous read IO.
         * Update the read-ahead context.
         * Store the length of the current read-ahead window.
         * Double the current max read ahead size.
         *   That heuristic avoids doing large IO for files that are not really
         *   accessed sequentially.
         */
                        run_task_queue(&tq_disk);

                filp->f_ralen += ahead;
                filp->f_rawin += filp->f_ralen;
                filp->f_raend = raend + ahead + 1;

                filp->f_ramax += filp->f_ramax;

                if (filp->f_ramax > max_readahead)
                        filp->f_ramax = max_readahead;

                /*
                 * Move the pages that have already been passed
                 * to the inactive list.
                 */
                drop_behind(filp, index);

#ifdef PROFILE_READAHEAD
                profile_readahead((reada_ok == 2), filp);
/*
 * This is a generic file read routine, and uses the
 * inode->i_op->readpage() function for the actual low-level
 * stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 */
void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
        struct inode *inode = filp->f_dentry->d_inode;
        struct address_space *mapping = inode->i_mapping;
        unsigned long index, offset;
        struct page *cached_page;

        int max_readahead = get_max_readahead(inode);

        index = *ppos >> PAGE_CACHE_SHIFT;
        offset = *ppos & ~PAGE_CACHE_MASK;

        /*
         * If the current position is outside the previous read-ahead window,
         * we reset the current read-ahead context and set read ahead max to zero
         * (it will be set to just the needed value later),
         * otherwise, we assume that the file accesses are sequential enough to
         * continue read-ahead.
         */
        if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {

        /*
         * Adjust the current value of read-ahead max.
         * If the read operation stays in the first half of the page, force no
         * readahead.
         * Otherwise try to increase read ahead max just enough to do the read request.
         * Then, at least MIN_READAHEAD if read ahead is ok,
         * and at most MAX_READAHEAD in all cases.
         */
        if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
                unsigned long needed;

                needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;

                if (filp->f_ramax < needed)
                        filp->f_ramax = needed;

                if (reada_ok && filp->f_ramax < MIN_READAHEAD)
                        filp->f_ramax = MIN_READAHEAD;
                if (filp->f_ramax > max_readahead)
                        filp->f_ramax = max_readahead;
                struct page *page, **hash;
                unsigned long end_index, nr;

                end_index = inode->i_size >> PAGE_CACHE_SHIFT;
                if (index > end_index)
                nr = PAGE_CACHE_SIZE;
                if (index == end_index) {
                        nr = inode->i_size & ~PAGE_CACHE_MASK;

                /*
                 * Try to find the data in the page cache..
                 */
                hash = page_hash(mapping, index);

                spin_lock(&pagecache_lock);
                page = __find_page_nolock(mapping, index, *hash);
                        goto no_cached_page;

                page_cache_get(page);
                spin_unlock(&pagecache_lock);

                if (!Page_Uptodate(page))
                        goto page_not_up_to_date;
                generic_file_readahead(reada_ok, filp, inode, page);

                /* If users can be writing to this page using arbitrary
                 * virtual addresses, take care about potential aliasing
                 * before reading the page on the kernel side.
                 */
                if (mapping->i_mmap_shared != NULL)
                        flush_dcache_page(page);

                /*
                 * Ok, we have the page, and it's up-to-date, so
                 * now we can copy it to user space...
                 *
                 * The actor routine returns how many bytes were actually used..
                 * NOTE! This may not be the same as how much of a user buffer
                 * we filled up (we may be padding etc), so we can only update
                 * "pos" here (the actor routine has to update the user buffer
                 * pointers and the remaining count).
                 */
                nr = actor(desc, page, offset, nr);
                index += offset >> PAGE_CACHE_SHIFT;
                offset &= ~PAGE_CACHE_MASK;

                page_cache_release(page);
                if (nr && desc->count)

/*
 * Ok, the page was not immediately readable, so let's try to read ahead
 * while we're at it..
 */
page_not_up_to_date:
                generic_file_readahead(reada_ok, filp, inode, page);

                if (Page_Uptodate(page))

                /* Get exclusive access to the page ... */

                /* Did it get unhashed before we got the lock? */
                if (!page->mapping) {
                        page_cache_release(page);

                /* Did somebody else fill it already? */
                if (Page_Uptodate(page)) {

                /* ... and start the actual read. The read will unlock the page. */
                error = mapping->a_ops->readpage(filp, page);

                if (Page_Uptodate(page))

                /* Again, try some read-ahead while waiting for the page to finish.. */
                generic_file_readahead(reada_ok, filp, inode, page);
                if (Page_Uptodate(page))

                /* UHHUH! A synchronous read error occurred. Report it */
                desc->error = error;
                page_cache_release(page);
/*
 * Ok, it wasn't cached, so we need to create a new
 * page..
 *
 * We get here with the page cache lock held.
 */
                spin_unlock(&pagecache_lock);
                cached_page = page_cache_alloc();
                        desc->error = -ENOMEM;

                /*
                 * Somebody may have added the page while we
                 * dropped the page cache lock. Check for that.
                 */
                spin_lock(&pagecache_lock);
                page = __find_page_nolock(mapping, index, *hash);

                /*
                 * Ok, add the new page to the hash-queues...
                 */
                __add_to_page_cache(page, mapping, index, hash);
                spin_unlock(&pagecache_lock);

        *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;

        page_cache_free(cached_page);
        UPDATE_ATIME(inode);
static int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
        unsigned long left, count = desc->count;

        left = __copy_to_user(desc->buf, kaddr + offset, size);
                desc->error = -EFAULT;
        desc->count = count - size;
        desc->written += size;

/*
 * This is the "read()" routine for all filesystems
 * that can use the page cache directly.
 */
ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
        if (access_ok(VERIFY_WRITE, buf, count)) {
                read_descriptor_t desc;

                do_generic_file_read(filp, ppos, &desc, file_read_actor);
                        retval = desc.written;
                        retval = desc.error;

static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
        unsigned long count = desc->count;
        struct file *file = (struct file *) desc->buf;
        mm_segment_t old_fs;

        written = file->f_op->write(file, kaddr + offset, size, &file->f_pos);
                desc->error = written;
        desc->count = count - written;
        desc->written += written;
asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
        struct file * in_file, * out_file;
        struct inode * in_inode, * out_inode;

        /*
         * Get input file, and verify that it is ok..
         */
        in_file = fget(in_fd);
        if (!(in_file->f_mode & FMODE_READ))
        in_inode = in_file->f_dentry->d_inode;
        if (!in_inode->i_mapping->a_ops->readpage)
        retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);

        /*
         * Get output file, and verify that it is ok..
         */
        out_file = fget(out_fd);
        if (!(out_file->f_mode & FMODE_WRITE))
        if (!out_file->f_op || !out_file->f_op->write)
        out_inode = out_file->f_dentry->d_inode;
        retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);

                read_descriptor_t desc;
                loff_t pos = 0, *ppos;

                ppos = &in_file->f_pos;
                        if (get_user(pos, offset))

                desc.buf = (char *) out_file;

                do_generic_file_read(in_file, ppos, &desc, file_send_actor);

                retval = desc.written;
                        retval = desc.error;
                        put_user(pos, offset);
/*
 * Read-ahead and flush behind for MADV_SEQUENTIAL areas.  Since we are
 * sure this is sequential access, we don't need a flexible read-ahead
 * window size -- we can always use a large fixed size window.
 */
static void nopage_sequential_readahead(struct vm_area_struct * vma,
        unsigned long pgoff, unsigned long filesize)
        unsigned long ra_window;

        ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
        ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);

        /* vm_raend is zero if we haven't read ahead in this area yet. */
        if (vma->vm_raend == 0)
                vma->vm_raend = vma->vm_pgoff + ra_window;

        /*
         * If we've just faulted the page half-way through our window,
         * then schedule reads for the next window, and release the
         * pages in the previous window.
         */
        if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
                unsigned long start = vma->vm_pgoff + vma->vm_raend;
                unsigned long end = start + ra_window;

                if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
                        end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;

                while ((start < end) && (start < filesize)) {
                        if (read_cluster_nonblocking(vma->vm_file, start, filesize) < 0)
                        start += CLUSTER_PAGES;
                run_task_queue(&tq_disk);

                /* if we're far enough past the beginning of this area,
                   recycle pages that are in the previous window. */
                if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
                        unsigned long window = ra_window << PAGE_SHIFT;

                        end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
                        end -= window + window;
                        filemap_sync(vma, end - window, window, MS_INVALIDATE);

                vma->vm_raend += ra_window;
/*
 * filemap_nopage() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 */
struct page * filemap_nopage(struct vm_area_struct * area,
        unsigned long address, int no_share)
        struct file *file = area->vm_file;
        struct inode *inode = file->f_dentry->d_inode;
        struct address_space *mapping = inode->i_mapping;
        struct page *page, **hash, *old_page;
        unsigned long size, pgoff;

        pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;

        /*
         * An external ptracer can access pages that normally aren't
         * accessible..
         */
        size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        if ((pgoff >= size) && (area->vm_mm == current->mm))

        /*
         * Do we have something in the page cache already?
         */
        hash = page_hash(mapping, pgoff);

        page = __find_get_page(mapping, pgoff, hash);
                goto no_cached_page;

        /*
         * Ok, found a page in the page cache, now we need to check
         * that it's up-to-date.
         */
        if (!Page_Uptodate(page))
                goto page_not_uptodate;

        /*
         * Try read-ahead for sequential areas.
         */
        if (VM_SequentialReadHint(area))
                nopage_sequential_readahead(area, pgoff, size);

        /*
         * Found the page and have a reference on it, need to check sharing
         * and possibly copy it over to another page..
         */
                struct page *new_page = page_cache_alloc();

                        copy_user_highpage(new_page, old_page, address);
                        flush_page_to_ram(new_page);
                        new_page = NOPAGE_OOM;
                page_cache_release(page);

        flush_page_to_ram(old_page);
);
1500 * If the requested offset is within our file, try to read a whole
1501 * cluster of pages at once.
1503 * Otherwise, we're off the end of a privately mapped file,
1504 * so we need to map a zero page.
1506 if ((pgoff
< size
) && !VM_RandomReadHint(area
))
1507 error
= read_cluster_nonblocking(file
, pgoff
, size
);
1509 error
= page_cache_read(file
, pgoff
);
1512 * The page we want has now been added to the page cache.
1513 * In the unlikely event that someone removed it in the
1514 * meantime, we'll just come back here and read it again.
1520 * An error return from page_cache_read can result if the
1521 * system is low on memory, or a problem occurs while trying
1524 if (error
== -ENOMEM
)
1531 /* Did it get unhashed while we waited for it? */
1532 if (!page
->mapping
) {
1534 page_cache_release(page
);
1538 /* Did somebody else get it up-to-date? */
1539 if (Page_Uptodate(page
)) {
1544 if (!mapping
->a_ops
->readpage(file
, page
)) {
1546 if (Page_Uptodate(page
))
1551 * Umm, take care of errors if the page isn't up-to-date.
1552 * Try to re-read it _once_. We do this synchronously,
1553 * because there really aren't any performance issues here
1554 * and we need to check for errors.
1558 /* Somebody truncated the page on us? */
1559 if (!page
->mapping
) {
1561 page_cache_release(page
);
1565 /* Somebody else successfully read it in? */
1566 if (Page_Uptodate(page
)) {
1570 ClearPageError(page
);
1571 if (!mapping
->a_ops
->readpage(file
, page
)) {
1573 if (Page_Uptodate(page
))
1578 * Things didn't work out. Return zero to tell the
1579 * mm layer so, possibly freeing the page cache page first.
1581 page_cache_release(page
);
/* Called with mm->page_table_lock held to protect against other
 * threads/the swapper from ripping pte's out from under us.
 */
static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
        unsigned long address, unsigned int flags)
        if (pte_present(pte) && ptep_test_and_clear_dirty(ptep)) {
                struct page *page = pte_page(pte);
                flush_tlb_page(vma, address);
                set_page_dirty(page);

static inline int filemap_sync_pte_range(pmd_t * pmd,
        unsigned long address, unsigned long size,
        struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
        if (pmd_bad(*pmd)) {

        pte = pte_offset(pmd, address);
        offset += address & PMD_MASK;
        address &= ~PMD_MASK;
        end = address + size;
                error |= filemap_sync_pte(pte, vma, address + offset, flags);
                address += PAGE_SIZE;
        } while (address && (address < end));
static inline int filemap_sync_pmd_range(pgd_t * pgd,
        unsigned long address, unsigned long size,
        struct vm_area_struct *vma, unsigned int flags)
        unsigned long offset, end;

        if (pgd_bad(*pgd)) {

        pmd = pmd_offset(pgd, address);
        offset = address & PGDIR_MASK;
        address &= ~PGDIR_MASK;
        end = address + size;
        if (end > PGDIR_SIZE)
                error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
                address = (address + PMD_SIZE) & PMD_MASK;
        } while (address && (address < end));

int filemap_sync(struct vm_area_struct * vma, unsigned long address,
        size_t size, unsigned int flags)
        unsigned long end = address + size;

        /* Acquire the lock early; it may be possible to avoid dropping
         * and reacquiring it repeatedly.
         */
        spin_lock(&vma->vm_mm->page_table_lock);

        dir = pgd_offset(vma->vm_mm, address);
        flush_cache_range(vma->vm_mm, end - size, end);
                error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
        } while (address && (address < end));
        flush_tlb_range(vma->vm_mm, end - size, end);

        spin_unlock(&vma->vm_mm->page_table_lock);
/*
 * Shared mappings need to be able to do the right thing at
 * close/unmap/sync. They will also use the private file as
 * backing-store for swapping..
 */
static struct vm_operations_struct file_shared_mmap = {
        nopage:         filemap_nopage,
};

/*
 * Private mappings just need to be able to load in the map.
 *
 * (This is actually used for shared mappings as well, if we
 * know they can't ever get write permissions..)
 */
static struct vm_operations_struct file_private_mmap = {
        nopage:         filemap_nopage,
};

/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
        struct vm_operations_struct * ops;
        struct inode *inode = file->f_dentry->d_inode;

        ops = &file_private_mmap;
        if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
                if (!inode->i_mapping->a_ops->writepage)
                ops = &file_shared_mmap;
        if (!inode->i_sb || !S_ISREG(inode->i_mode))
        if (!inode->i_mapping->a_ops->readpage)
        UPDATE_ATIME(inode);
/*
 * The msync() system call.
 */
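
/*
 * Userspace view (illustrative sketch only, not part of the kernel source):
 * write back the dirty pages of a shared file mapping and wait for the
 * writeback to finish.
 *
 *      #include <sys/mman.h>
 *
 *      int flush_mapping(void *addr, size_t len)
 *      {
 *              return msync(addr, len, MS_SYNC);
 *      }
 *
 * With MS_SYNC, sys_msync() ends up in msync_interval() below, which calls
 * filemap_sync(), filemap_fdatasync(), the file's fsync() method and finally
 * filemap_fdatawait().
 */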
static int msync_interval(struct vm_area_struct * vma,
        unsigned long start, unsigned long end, int flags)
        struct file * file = vma->vm_file;
        if (file && (vma->vm_flags & VM_SHARED)) {
                error = filemap_sync(vma, start, end-start, flags);

                if (!error && (flags & MS_SYNC)) {
                        struct inode * inode = file->f_dentry->d_inode;
                        down(&inode->i_sem);
                        filemap_fdatasync(inode->i_mapping);
                        if (file->f_op && file->f_op->fsync)
                                error = file->f_op->fsync(file, file->f_dentry, 1);
                        filemap_fdatawait(inode->i_mapping);
long sys_msync(unsigned long start
, size_t len
, int flags
)
1759 struct vm_area_struct
* vma
;
1760 int unmapped_error
, error
= -EINVAL
;
1762 down(¤t
->mm
->mmap_sem
);
1763 if (start
& ~PAGE_MASK
)
1765 len
= (len
+ ~PAGE_MASK
) & PAGE_MASK
;
1769 if (flags
& ~(MS_ASYNC
| MS_INVALIDATE
| MS_SYNC
))
1775 * If the interval [start,end) covers some unmapped address ranges,
1776 * just ignore them, but return -EFAULT at the end.
1778 vma
= find_vma(current
->mm
, start
);
1781 /* Still start < end. */
1785 /* Here start < vma->vm_end. */
1786 if (start
< vma
->vm_start
) {
1787 unmapped_error
= -EFAULT
;
1788 start
= vma
->vm_start
;
1790 /* Here vma->vm_start <= start < vma->vm_end. */
1791 if (end
<= vma
->vm_end
) {
1793 error
= msync_interval(vma
, start
, end
, flags
);
1797 error
= unmapped_error
;
1800 /* Here vma->vm_start <= start < vma->vm_end < end. */
1801 error
= msync_interval(vma
, start
, vma
->vm_end
, flags
);
1804 start
= vma
->vm_end
;
1808 up(¤t
->mm
->mmap_sem
);
static inline void setup_read_behavior(struct vm_area_struct * vma,
        int behavior)
        VM_ClearReadHint(vma);
        case MADV_SEQUENTIAL:
                vma->vm_flags |= VM_SEQ_READ;
                vma->vm_flags |= VM_RAND_READ;
static long madvise_fixup_start(struct vm_area_struct * vma,
        unsigned long end, int behavior)
        struct vm_area_struct * n;

        n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
        setup_read_behavior(n, behavior);
        get_file(n->vm_file);
        if (n->vm_ops && n->vm_ops->open)

        lock_vma_mappings(vma);
        spin_lock(&vma->vm_mm->page_table_lock);
        vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
        vma->vm_start = end;
        __insert_vm_struct(current->mm, n);
        spin_unlock(&vma->vm_mm->page_table_lock);
        unlock_vma_mappings(vma);

static long madvise_fixup_end(struct vm_area_struct * vma,
        unsigned long start, int behavior)
        struct vm_area_struct * n;

        n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
        n->vm_start = start;
        n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
        setup_read_behavior(n, behavior);
        get_file(n->vm_file);
        if (n->vm_ops && n->vm_ops->open)

        lock_vma_mappings(vma);
        spin_lock(&vma->vm_mm->page_table_lock);
        vma->vm_end = start;
        __insert_vm_struct(current->mm, n);
        spin_unlock(&vma->vm_mm->page_table_lock);
        unlock_vma_mappings(vma);

static long madvise_fixup_middle(struct vm_area_struct * vma,
        unsigned long start, unsigned long end, int behavior)
        struct vm_area_struct * left, * right;

        left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
        right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
                kmem_cache_free(vm_area_cachep, left);

        left->vm_end = start;
        right->vm_start = end;
        right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;

        right->vm_raend = 0;
        atomic_add(2, &vma->vm_file->f_count);

        if (vma->vm_ops && vma->vm_ops->open) {
                vma->vm_ops->open(left);
                vma->vm_ops->open(right);

        lock_vma_mappings(vma);
        spin_lock(&vma->vm_mm->page_table_lock);
        vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
        vma->vm_start = start;
        setup_read_behavior(vma, behavior);
        __insert_vm_struct(current->mm, left);
        __insert_vm_struct(current->mm, right);
        spin_unlock(&vma->vm_mm->page_table_lock);
        unlock_vma_mappings(vma);
/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct * vma,
        unsigned long start, unsigned long end, int behavior)
        /* This caps the number of vma's this process can own */
        if (vma->vm_mm->map_count > MAX_MAP_COUNT)

        if (start == vma->vm_start) {
                if (end == vma->vm_end) {
                        setup_read_behavior(vma, behavior);
                        error = madvise_fixup_start(vma, end, behavior);
        if (end == vma->vm_end)
                error = madvise_fixup_end(vma, start, behavior);
                error = madvise_fixup_middle(vma, start, end, behavior);
/*
 * Schedule all required I/O operations, then run the disk queue
 * to make sure they are started.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct * vma,
        unsigned long start, unsigned long end)
        long error = -EBADF;
        unsigned long size, rlim_rss;

        /* Doesn't work if there's no mapped file. */
        file = vma->vm_file;
        size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
                                                        PAGE_CACHE_SHIFT;

        start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
        if (end > vma->vm_end)
        end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

        /* Make sure this doesn't exceed the process's max rss. */
        rlim_rss = current->rlim ?  current->rlim[RLIMIT_RSS].rlim_cur :
                                LONG_MAX; /* default: see resource.h */
        if ((vma->vm_mm->rss + (end - start)) > rlim_rss)

        /* round to cluster boundaries if this isn't a "random" area. */
        if (!VM_RandomReadHint(vma)) {
                start = CLUSTER_OFFSET(start);
                end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);

                while ((start < end) && (start < size)) {
                        error = read_cluster_nonblocking(file, start, size);
                        start += CLUSTER_PAGES;
                while ((start < end) && (start < size)) {
                        error = page_cache_read(file, start);

        /* Don't wait for someone else to push these requests. */
        run_task_queue(&tq_disk);
/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for refill_inactive to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * refill_inactive to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed(struct vm_area_struct * vma,
        unsigned long start, unsigned long end)
        if (vma->vm_flags & VM_LOCKED)

        flush_cache_range(vma->vm_mm, start, end);
        zap_page_range(vma->vm_mm, start, end - start);
        flush_tlb_range(vma->vm_mm, start, end);
static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
        unsigned long end, int behavior)
        long error = -EBADF;

        case MADV_SEQUENTIAL:
                error = madvise_behavior(vma, start, end, behavior);
                error = madvise_willneed(vma, start, end);
                error = madvise_dontneed(vma, start, end);
/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *              results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *              on any access, since it is unlikely that the appli-
 *              cation will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *              once, so they can be aggressively read ahead, and
 *              can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *              some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *              so the kernel can free resources associated with it.
 *
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *              "behavior" is not a valid value, or application
 *              is attempting to release locked or shared pages.
 *  -ENOMEM - addresses in the specified range are not currently
 *              mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
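
/*
 * Typical userspace usage (illustrative sketch only, not part of the kernel
 * source): tell the kernel that a large mapped file will be scanned once
 * from start to finish, so it can read ahead aggressively and drop pages
 * behind the reader.
 *
 *      #include <sys/mman.h>
 *      #include <sys/stat.h>
 *      #include <fcntl.h>
 *
 *      void *map_for_scan(const char *path, size_t *lenp)
 *      {
 *              struct stat st;
 *              void *p;
 *              int fd = open(path, O_RDONLY);
 *
 *              if (fd < 0 || fstat(fd, &st) < 0)
 *                      return NULL;
 *              p = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
 *              if (p != MAP_FAILED)
 *                      madvise(p, st.st_size, MADV_SEQUENTIAL);
 *              *lenp = st.st_size;
 *              return p == MAP_FAILED ? NULL : p;
 *      }
 */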
asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
        struct vm_area_struct * vma;
        int unmapped_error = 0;
        int error = -EINVAL;

        down(&current->mm->mmap_sem);

        if (start & ~PAGE_MASK)
        len = (len + ~PAGE_MASK) & PAGE_MASK;

        /*
         * If the interval [start,end) covers some unmapped address
         * ranges, just ignore them, but return -ENOMEM at the end.
         */
        vma = find_vma(current->mm, start);
                /* Still start < end. */
                /* Here start < vma->vm_end. */
                if (start < vma->vm_start) {
                        unmapped_error = -ENOMEM;
                        start = vma->vm_start;

                /* Here vma->vm_start <= start < vma->vm_end. */
                if (end <= vma->vm_end) {
                                error = madvise_vma(vma, start, end,
                                                    behavior);
                                error = unmapped_error;

                /* Here vma->vm_start <= start < vma->vm_end < end. */
                error = madvise_vma(vma, start, vma->vm_end, behavior);
                start = vma->vm_end;

        up(&current->mm->mmap_sem);
/*
 * Later we can get more picky about what "in core" means precisely.
 * For now, simply check to see if the page is in the page cache,
 * and is up to date; i.e. that no page-in operation would be required
 * at this time if an application were to map and access this page.
 */
static unsigned char mincore_page(struct vm_area_struct * vma,
        unsigned long pgoff)
        unsigned char present = 0;
        struct address_space * as = &vma->vm_file->f_dentry->d_inode->i_data;
        struct page * page, ** hash = page_hash(as, pgoff);

        spin_lock(&pagecache_lock);
        page = __find_page_nolock(as, pgoff, *hash);
        if ((page) && (Page_Uptodate(page)))
        spin_unlock(&pagecache_lock);
static long mincore_vma(struct vm_area_struct * vma,
        unsigned long start, unsigned long end, unsigned char * vec)
        long error, i, remaining;
        unsigned char * tmp;

        start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
        if (end > vma->vm_end)
        end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

        tmp = (unsigned char *) __get_free_page(GFP_KERNEL);

        /* (end - start) is # of pages, and also # of bytes in "vec" */
        remaining = (end - start),

        for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
                long thispiece = (remaining < PAGE_SIZE) ?
                                                remaining : PAGE_SIZE;

                while (j < thispiece)
                        tmp[j++] = mincore_page(vma, start++);

                if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {

        free_page((unsigned long) tmp);
/*
 * The mincore(2) system call.
 *
 * mincore() returns the memory residency status of the pages in the
 * current process's address space specified by [addr, addr + len).
 * The status is returned in a vector of bytes.  The least significant
 * bit of each byte is 1 if the referenced page is in memory, otherwise
 * it is zero.
 *
 * Because the status of a page can change after mincore() checks it
 * but before it returns to the application, the returned vector may
 * contain stale information.  Only locked pages are guaranteed to
 * remain in memory.
 *
 *  -EFAULT - vec points to an illegal address
 *  -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
 *              or len has a nonpositive value
 *  -ENOMEM - Addresses in the range [addr, addr + len] are
 *              invalid for the address space of this process, or
 *              specify one or more pages which are not currently
 *              mapped
 *  -EAGAIN - A kernel resource was temporarily unavailable.
 */
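
/*
 * Userspace view (illustrative sketch only, not part of the kernel source):
 * count how many pages of a mapping are currently resident.  The vector has
 * one byte per page of the range.
 *
 *      #include <sys/mman.h>
 *      #include <unistd.h>
 *      #include <stdlib.h>
 *
 *      long resident_pages(void *addr, size_t len)
 *      {
 *              long psize = sysconf(_SC_PAGESIZE);
 *              size_t pages = (len + psize - 1) / psize;
 *              unsigned char *vec = malloc(pages);
 *              long n = 0, i;
 *
 *              if (!vec)
 *                      return -1;
 *              if (mincore(addr, len, vec) < 0) {
 *                      free(vec);
 *                      return -1;
 *              }
 *              for (i = 0; i < pages; i++)
 *                      n += vec[i] & 1;
 *              free(vec);
 *              return n;
 *      }
 */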
asmlinkage long sys_mincore(unsigned long start, size_t len,
        unsigned char * vec)
        struct vm_area_struct * vma;
        int unmapped_error = 0;
        long error = -EINVAL;

        down(&current->mm->mmap_sem);

        if (start & ~PAGE_CACHE_MASK)
        len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK;

        /*
         * If the interval [start,end) covers some unmapped address
         * ranges, just ignore them, but return -ENOMEM at the end.
         */
        vma = find_vma(current->mm, start);
                /* Still start < end. */
                /* Here start < vma->vm_end. */
                if (start < vma->vm_start) {
                        unmapped_error = -ENOMEM;
                        start = vma->vm_start;

                /* Here vma->vm_start <= start < vma->vm_end. */
                if (end <= vma->vm_end) {
                                error = mincore_vma(vma, start, end,
                                                    &vec[index]);
                                error = unmapped_error;

                /* Here vma->vm_start <= start < vma->vm_end < end. */
                error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
                index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
                start = vma->vm_end;

        up(&current->mm->mmap_sem);
struct page *__read_cache_page(struct address_space *mapping,
                                unsigned long index,
                                int (*filler)(void *, struct page *),
                                void *data)
        struct page **hash = page_hash(mapping, index);
        struct page *page, *cached_page = NULL;

        page = __find_get_page(mapping, index, hash);
                cached_page = page_cache_alloc();
                        return ERR_PTR(-ENOMEM);
                if (add_to_page_cache_unique(page, mapping, index, hash))
                err = filler(data, page);
                        page_cache_release(page);
                        page = ERR_PTR(err);
        page_cache_free(cached_page);
/*
 * Read into the page cache.  If a page already exists,
 * and Page_Uptodate() is not set, try to fill the page.
 */
struct page *read_cache_page(struct address_space *mapping,
                                unsigned long index,
                                int (*filler)(void *, struct page *),
                                void *data)
        page = __read_cache_page(mapping, index, filler, data);
        if (IS_ERR(page) || Page_Uptodate(page))

        if (!page->mapping) {
                page_cache_release(page);
        if (Page_Uptodate(page)) {

        err = filler(data, page);
                page_cache_release(page);
                page = ERR_PTR(err);
* __grab_cache_page(struct address_space
*mapping
,
2382 unsigned long index
, struct page
**cached_page
)
2384 struct page
*page
, **hash
= page_hash(mapping
, index
);
2386 page
= __find_lock_page(mapping
, index
, hash
);
2388 if (!*cached_page
) {
2389 *cached_page
= page_cache_alloc();
2393 page
= *cached_page
;
2394 if (add_to_page_cache_unique(page
, mapping
, index
, hash
))
2396 *cached_page
= NULL
;
2402 * Returns locked page at given index in given cache, creating it if needed.
2405 struct page
*grab_cache_page(struct address_space
*mapping
, unsigned long index
)
2407 struct page
*cached_page
= NULL
;
2408 struct page
*page
= __grab_cache_page(mapping
,index
,&cached_page
);
2410 page_cache_free(cached_page
);
static inline void remove_suid(struct inode *inode)
        /* set S_ISGID if S_IXGRP is set, and always set S_ISUID */
        mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;

        /* were any of the uid bits set? */
        mode &= inode->i_mode;
        if (mode && !capable(CAP_FSETID)) {
                inode->i_mode &= ~mode;
                mark_inode_dirty(inode);
/*
 * Write to a file through the page cache.
 *
 * We currently put everything into the page cache prior to writing it.
 * This is not a problem when writing full pages. With partial pages,
 * however, we first have to read the data into the cache, then
 * dirty the page, and finally schedule it for writing. Alternatively, we
 * could write-through just the portion of data that would go into that
 * page, but that would kill performance for applications that write data
 * line by line, and it's prone to race conditions.
 *
 * Note that this routine doesn't try to keep track of dirty pages. Each
 * file system has to do this all by itself, unfortunately.
 */
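
/*
 * Sketch of the per-page step performed by the write loop below (error
 * handling, the short-copy case and the unlocking details are omitted; this
 * is only an illustration of the address_space_operations contract):
 *
 *      page = __grab_cache_page(mapping, index, &cached_page);
 *      mapping->a_ops->prepare_write(file, page, offset, offset + bytes);
 *      copy_from_user(page_address(page) + offset, buf, bytes);
 *      flush_dcache_page(page);
 *      mapping->a_ops->commit_write(file, page, offset, offset + bytes);
 *      page_cache_release(page);
 */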
ssize_t
generic_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
        struct inode *inode = file->f_dentry->d_inode;
        struct address_space *mapping = inode->i_mapping;
        unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
        struct page *page, *cached_page;
        unsigned long written;

        down(&inode->i_sem);

        err = file->f_error;

        if (file->f_flags & O_APPEND)
                pos = inode->i_size;

        /*
         * Check whether we've reached the file size limit.
         */
        if (limit != RLIM_INFINITY) {
                        send_sig(SIGXFSZ, current, 0);
                if (count > limit - pos) {
                        send_sig(SIGXFSZ, current, 0);
                        count = limit - pos;

        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
        mark_inode_dirty_sync(inode);

                unsigned long bytes, index, offset;

                /*
                 * Try to find the page in the cache. If it isn't there,
                 * allocate a free page.
                 */
                offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
                index = pos >> PAGE_CACHE_SHIFT;
                bytes = PAGE_CACHE_SIZE - offset;

                /*
                 * Bring in the user page that we will copy from _first_.
                 * Otherwise there's a nasty deadlock on copying from the
                 * same page as we're writing to, without it being marked
                 * up-to-date.
                 */
                { volatile unsigned char dummy;
                        __get_user(dummy, buf);
                        __get_user(dummy, buf+bytes-1);

                status = -ENOMEM;       /* we'll assign it later anyway */
                page = __grab_cache_page(mapping, index, &cached_page);

                /* We have exclusive IO access to the page.. */
                if (!PageLocked(page)) {

                status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
                kaddr = page_address(page);
                status = copy_from_user(kaddr+offset, buf, bytes);
                flush_dcache_page(page);
                status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);

                /* Mark it unlocked again and drop the page.. */
                deactivate_page(page);
                page_cache_release(page);

        page_cache_free(cached_page);

        /* For now, when the user asks for O_SYNC, we'll actually
         * provide O_DSYNC. */
        if ((status >= 0) && (file->f_flags & O_SYNC))
                status = generic_osync_inode(inode, 1); /* 1 means datasync */

        err = written ? written : status;

        ClearPageUptodate(page);
void __init page_cache_init(unsigned long mempages)
        unsigned long htable_size, order;

        htable_size = mempages;
        htable_size *= sizeof(struct page *);
        for(order = 0; (PAGE_SIZE << order) < htable_size; order++)

                unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);

                while((tmp >>= 1UL) != 0UL)

                page_hash_table = (struct page **)
                        __get_free_pages(GFP_ATOMIC, order);
        } while(page_hash_table == NULL && --order > 0);

        printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
               (1 << page_hash_bits), order, (PAGE_SIZE << order));
        if (!page_hash_table)
                panic("Failed to allocate page hash table\n");
        memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));