/*
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/malloc.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/locks.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/swapctl.h>
#include <linux/slab.h>
#include <linux/init.h>

#include <asm/pgalloc.h>
#include <asm/uaccess.h>

#include <linux/highmem.h>
/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet, though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 */
atomic_t page_cache_size = ATOMIC_INIT(0);
unsigned int page_hash_bits;
struct page **page_hash_table;

spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;

/*
 * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
 * the pagemap_lru_lock held.
 */
spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;
#define CLUSTER_PAGES		(1 << page_cluster)
#define CLUSTER_OFFSET(x)	(((x) >> page_cluster) << page_cluster)
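/*
 * Worked example (assuming page_cluster == 4): CLUSTER_PAGES is then
 * 1 << 4 == 16 pages per cluster, and CLUSTER_OFFSET(21) == (21 >> 4) << 4
 * == 16, i.e. the index of the first page of the cluster containing page 21.
 */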
void __add_page_to_hash_queue(struct page * page, struct page **p)
	atomic_inc(&page_cache_size);
	if ((page->next_hash = *p) != NULL)
		(*p)->pprev_hash = &page->next_hash;
static inline void remove_page_from_hash_queue(struct page * page)
	if (page->pprev_hash) {
		page->next_hash->pprev_hash = page->pprev_hash;
		*page->pprev_hash = page->next_hash;
		page->pprev_hash = NULL;
	atomic_dec(&page_cache_size);
static inline int sync_page(struct page *page)
	struct address_space *mapping = page->mapping;

	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
		return mapping->a_ops->sync_page(page);
/*
 * Remove a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
 * is safe.
 */
void __remove_inode_page(struct page *page)
	remove_page_from_inode_queue(page);
	remove_page_from_hash_queue(page);

void remove_inode_page(struct page *page)
	if (!PageLocked(page))

	spin_lock(&pagecache_lock);
	__remove_inode_page(page);
	spin_unlock(&pagecache_lock);
/**
 * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
 * @inode: the inode which pages we want to invalidate
 *
 * This function only removes the unlocked pages; if you want to
 * remove all the pages of one inode, you must call truncate_inode_pages.
 */
void invalidate_inode_pages(struct inode * inode)
	struct list_head *head, *curr;

	head = &inode->i_mapping->pages;

	spin_lock(&pagecache_lock);
	spin_lock(&pagemap_lru_lock);

	while (curr != head) {
		page = list_entry(curr, struct page, list);

		/* We cannot invalidate a locked page */
		if (TryLockPage(page))

		/* Neither can we invalidate something in use.. */
		if (page_count(page) != 1) {

		__lru_cache_del(page);
		__remove_inode_page(page);
		page_cache_release(page);

	spin_unlock(&pagemap_lru_lock);
	spin_unlock(&pagecache_lock);
static inline void truncate_partial_page(struct page *page, unsigned partial)
	memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);

	block_flushpage(page, partial);

static inline void truncate_complete_page(struct page *page)
	/* Leave it on the LRU if it gets converted into anonymous buffers */
	if (!page->buffers || block_flushpage(page, 0))

	/*
	 * We remove the page from the page cache _after_ we have
	 * destroyed all buffer-cache references to it. Otherwise some
	 * other process might think this inode page is not in the
	 * page cache and create a buffer-cache alias to it causing
	 * all sorts of fun problems ...
	 */
	ClearPageDirty(page);
	ClearPageUptodate(page);
	remove_inode_page(page);
	page_cache_release(page);
/**
 * truncate_inode_pages - truncate *all* the pages from an offset
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 *
 * Truncate the page cache at a set offset, removing the pages
 * that are beyond that offset (and zeroing out partial pages).
 * If any page is locked we wait for it to become unlocked.
 */
void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
	struct list_head *head, *curr;

	unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);

	start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	head = &mapping->pages;
	spin_lock(&pagecache_lock);

	while (curr != head) {
		unsigned long offset;

		page = list_entry(curr, struct page, list);

		offset = page->index;

		/* Is this one of the pages to truncate? */
		if ((offset >= start) || (partial && (offset + 1) == start)) {
			if (TryLockPage(page)) {
				page_cache_get(page);
				spin_unlock(&pagecache_lock);

				page_cache_release(page);

			page_cache_get(page);
			spin_unlock(&pagecache_lock);

			if (partial && (offset + 1) == start) {
				truncate_partial_page(page, partial);

			truncate_complete_page(page);

			page_cache_release(page);

			/*
			 * We have done things without the pagecache lock,
			 * so we'll have to repeat the scan.
			 * It's not possible to deadlock here because
			 * we are guaranteed to make progress. (ie. we have
			 * just removed a page)
			 */

	spin_unlock(&pagecache_lock);
static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
		page = page->next_hash;

		if (page->mapping != mapping)
		if (page->index == offset)

	/*
	 * Touching the page may move it to the active list.
	 * If we end up with too few inactive pages, we wake
	 * up kswapd.
	 */
	if (inactive_shortage() > inactive_target / 2 && free_shortage())
/*
 * By the time this is called, the page is locked and
 * we don't have to worry about any races any more.
 */
static int writeout_one_page(struct page *page)
	struct buffer_head *bh, *head = page->buffers;

		if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))

		bh->b_flushtime = jiffies;
		ll_rw_block(WRITE, 1, &bh);
	} while ((bh = bh->b_this_page) != head);

static int waitfor_one_page(struct page *page)
	struct buffer_head *bh, *head = page->buffers;

		if (buffer_req(bh) && !buffer_uptodate(bh))
	} while ((bh = bh->b_this_page) != head);
static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end, int (*fn)(struct page *))
	struct list_head *head, *curr;

	head = &inode->i_mapping->pages;

	spin_lock(&pagecache_lock);
	while (curr != head) {
		page = list_entry(curr, struct page, list);

		if (page->index >= end)
		if (page->index < start)

		page_cache_get(page);
		spin_unlock(&pagecache_lock);

		/* The buffers could have been free'd while we waited for the page lock */

		spin_lock(&pagecache_lock);
		curr = page->list.next;
		page_cache_release(page);
	spin_unlock(&pagecache_lock);

/*
 * Two-stage data sync: first start the IO, then go back and
 * collect the information..
 */
int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
	retval = do_buffer_fdatasync(inode, start_idx, end_idx, writeout_one_page);
	retval |= do_buffer_fdatasync(inode, start_idx, end_idx, waitfor_one_page);
/*
 * Add a page to the inode page cache.
 *
 * The caller must have locked the page and
 * set all the page flags correctly..
 */
void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
	if (!PageLocked(page))

	page_cache_get(page);
	spin_lock(&pagecache_lock);
	add_page_to_inode_queue(mapping, page);
	__add_page_to_hash_queue(page, page_hash(mapping, index));
	spin_unlock(&pagecache_lock);
/*
 * This adds a page to the page cache, starting out as locked,
 * owned by us, but unreferenced, not uptodate and with no errors.
 */
static inline void __add_to_page_cache(struct page * page,
	struct address_space *mapping, unsigned long offset,
	struct page **hash)
	if (PageLocked(page))

	flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced) | (1 << PG_arch_1));
	page->flags = flags | (1 << PG_locked);
	page_cache_get(page);
	page->index = offset;
	add_page_to_inode_queue(mapping, page);
	__add_page_to_hash_queue(page, hash);

void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
	spin_lock(&pagecache_lock);
	__add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
	spin_unlock(&pagecache_lock);
static int add_to_page_cache_unique(struct page * page,
	struct address_space *mapping, unsigned long offset,
	struct page **hash)
	spin_lock(&pagecache_lock);
	alias = __find_page_nolock(mapping, offset, *hash);

	__add_to_page_cache(page, mapping, offset, hash);

	spin_unlock(&pagecache_lock);
/*
 * This adds the requested page to the page cache if it isn't already there,
 * and schedules an I/O to read in its contents from disk.
 */
static inline int page_cache_read(struct file * file, unsigned long offset)
	struct inode *inode = file->f_dentry->d_inode;
	struct address_space *mapping = inode->i_mapping;
	struct page **hash = page_hash(mapping, offset);

	spin_lock(&pagecache_lock);
	page = __find_page_nolock(mapping, offset, *hash);
	spin_unlock(&pagecache_lock);

	page = page_cache_alloc();

	if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
		int error = mapping->a_ops->readpage(file, page);
		page_cache_release(page);

	/*
	 * We arrive here in the unlikely event that someone
	 * raced with us and added our page to the cache first.
	 */
	page_cache_free(page);
/*
 * Read in an entire cluster at once.  A cluster is usually a 64k-
 * aligned block that includes the page requested in "offset."
 */
static int read_cluster_nonblocking(struct file * file, unsigned long offset,
	unsigned long filesize)
	unsigned long pages = CLUSTER_PAGES;

	offset = CLUSTER_OFFSET(offset);
	while ((pages-- > 0) && (offset < filesize)) {
		int error = page_cache_read(file, offset);
/*
 * Wait for a page to get unlocked.
 *
 * This must be called with the caller "holding" the page,
 * ie with increased "page->count" so that the page won't
 * go away during the wait..
 */
void ___wait_on_page(struct page *page)
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	add_wait_queue(&page->wait, &wait);
		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		if (!PageLocked(page))
		run_task_queue(&tq_disk);
	} while (PageLocked(page));
	tsk->state = TASK_RUNNING;
	remove_wait_queue(&page->wait, &wait);
/*
 * Get a lock on the page, assuming we need to sleep
 */
static void __lock_page(struct page *page)
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	add_wait_queue_exclusive(&page->wait, &wait);
		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		if (PageLocked(page)) {
			run_task_queue(&tq_disk);
		if (!TryLockPage(page))
	tsk->state = TASK_RUNNING;
	remove_wait_queue(&page->wait, &wait);

/*
 * Get an exclusive lock on the page, optimistically
 * assuming it's not locked..
 */
void lock_page(struct page *page)
	if (TryLockPage(page))
/*
 * a rather lightweight function, finding and getting a reference to a
 * hashed page atomically, waiting for it if it's locked.
 */
static struct page * __find_get_page(struct address_space *mapping,
				unsigned long offset, struct page **hash)
	/*
	 * We scan the hash list read-only. Addition to and removal from
	 * the hash-list needs a held write-lock.
	 */
	spin_lock(&pagecache_lock);
	page = __find_page_nolock(mapping, offset, *hash);
		page_cache_get(page);
	spin_unlock(&pagecache_lock);

/*
 * Get the lock to a page atomically.
 */
struct page * __find_lock_page(struct address_space *mapping,
				unsigned long offset, struct page **hash)
	/*
	 * We scan the hash list read-only. Addition to and removal from
	 * the hash-list needs a held write-lock.
	 */
	spin_lock(&pagecache_lock);
	page = __find_page_nolock(mapping, offset, *hash);
		page_cache_get(page);
		spin_unlock(&pagecache_lock);

		/* Is the page still hashed? Ok, good.. */

		/* Nope: we raced. Release and try again.. */
		page_cache_release(page);
	spin_unlock(&pagecache_lock);
#define PROFILE_READAHEAD
#define DEBUG_READAHEAD

/*
 * We combine this with read-ahead to deactivate pages when we
 * think there's sequential IO going on. Note that this is
 * harmless since we don't actually evict the pages from memory
 * but just move them to the inactive list.
 *
 * - make the readahead code smarter
 * - move readahead to the VMA level so we can do the same
 */
static void drop_behind(struct file * file, unsigned long index)
	struct inode *inode = file->f_dentry->d_inode;
	struct address_space *mapping = inode->i_mapping;

	/* Nothing to drop-behind if we're on the first page. */

	if (index > file->f_rawin)
		start = index - file->f_rawin;

	/*
	 * Go backwards from index-1 and drop all pages in the
	 * readahead window. Since the readahead window may have
	 * been increased since the last time we were called, we
	 * stop when the page isn't there.
	 */
	spin_lock(&pagecache_lock);
	while (--index >= start) {
		hash = page_hash(mapping, index);
		page = __find_page_nolock(mapping, index, *hash);

		deactivate_page(page);
	spin_unlock(&pagecache_lock);
/*
 * Read-ahead profiling information
 * --------------------------------
 * Every PROFILE_MAXREADCOUNT, the following information is written
 * to the syslog:
 *   Percentage of asynchronous read-ahead.
 *   Average of read-ahead fields context value.
 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
 * to the syslog each time that function is called.
 */
#ifdef PROFILE_READAHEAD

#define PROFILE_MAXREADCOUNT 1000

static unsigned long total_reada;
static unsigned long total_async;
static unsigned long total_ramax;
static unsigned long total_ralen;
static unsigned long total_rawin;

static void profile_readahead(int async, struct file *filp)
	total_ramax += filp->f_ramax;
	total_ralen += filp->f_ralen;
	total_rawin += filp->f_rawin;

	if (total_reada > PROFILE_MAXREADCOUNT) {

		if (!(total_reada > PROFILE_MAXREADCOUNT)) {
			restore_flags(flags);

		printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
			total_ramax/total_reada,
			total_ralen/total_reada,
			total_rawin/total_reada,
			(total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
		printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
			filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);

		restore_flags(flags);

#endif  /* defined PROFILE_READAHEAD */
/*
 * Read-ahead context:
 * -------------------
 * The read ahead context fields of the "struct file" are the following:
 * - f_raend : position of the first byte after the last page we tried to
 *             read ahead.
 * - f_ramax : current read-ahead maximum size.
 * - f_ralen : length of the current IO read block we tried to read-ahead.
 * - f_rawin : length of the current read-ahead window.
 *             if last read-ahead was synchronous then
 *                 f_rawin = f_ralen
 *             otherwise (was asynchronous)
 *                 f_rawin = previous value of f_ralen + f_ralen
 *
 * MIN_READAHEAD : minimum read-ahead size used when reading ahead.
 * MAX_READAHEAD : maximum read-ahead size used when reading ahead.
 *
 * Synchronous read-ahead benefits:
 * --------------------------------
 * Using reasonable IO xfer length from peripheral devices increases system
 * performance.
 * Reasonable means, in this context, not too large but not too small.
 * The actual maximum value is:
 *	MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
 *	and 32K if defined (4K page size assumed).
 *
 * Asynchronous read-ahead benefits:
 * ---------------------------------
 * Overlapping the next read request with user process execution increases
 * system performance.
 *
 * We have to guess which further data are needed by the user process.
 * If these data are often not really needed, it's bad for system
 * performance.
 * However, we know that files are often accessed sequentially by
 * application programs and it seems that it is possible to have some good
 * strategy in that guessing.
 * We only try to read-ahead files that seem to be read sequentially.
 *
 * Asynchronous read-ahead risks:
 * ------------------------------
 * In order to maximize overlapping, we must start some asynchronous read
 * request from the device, as soon as possible.
 * We must be very careful about:
 * - The number of effective pending IO read requests.
 *   ONE seems to be the only reasonable value.
 * - The total memory pool usage for the file access stream.
 *   This maximum memory usage is implicitly 2 IO read chunks:
 *   2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
 *   64k if defined (4K page size assumed).
 */
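/*
 * Worked example (hypothetical numbers, page-index units): after a
 * synchronous read-ahead of 8 pages ending just before page 40 we would
 * have f_ralen = 8, f_rawin = 8 and f_raend = 40.  If a later access inside
 * that window triggers an asynchronous read-ahead of 8 more pages, the new
 * f_ralen is 8, f_rawin becomes the previous f_ralen plus the new one (16),
 * and f_raend advances to 48.
 */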
static inline int get_max_readahead(struct inode * inode)
	if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
		return MAX_READAHEAD;
	return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
static void generic_file_readahead(int reada_ok,
	struct file * filp, struct inode * inode,
	unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
	unsigned long index = page->index;
	unsigned long max_ahead, ahead;

	int max_readahead = get_max_readahead(inode);

	raend = filp->f_raend;

/*
 * The current page is locked.
 * If the current position is inside the previous read IO request, do not
 * try to reread previously read ahead pages.
 * Otherwise decide or not to read ahead some pages synchronously.
 * If we are not going to read ahead, set the read ahead context for this
 * page only.
 */
	if (PageLocked(page)) {
		if (!filp->f_ralen || index >= raend || index + filp->f_rawin < raend) {
			if (raend < end_index)
				max_ahead = filp->f_ramax;

			filp->f_raend = index + filp->f_ralen;
			filp->f_rawin += filp->f_ralen;
/*
 * The current page is not locked.
 * If we were reading ahead and,
 * if the current max read ahead size is not zero and,
 * if the current position is inside the last read-ahead IO request,
 * it is the moment to try to read ahead asynchronously.
 * We will later force unplug of the device in order to force asynchronous read IO.
 */
	else if (reada_ok && filp->f_ramax && raend >= 1 &&
		 index <= raend && index + filp->f_ralen >= raend) {
/*
 * Add ONE page to max_ahead in order to try to have about the same IO max size
 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
 * Compute the position of the last page we have tried to read in order to
 * begin to read ahead just at the next page.
 */
		if (raend < end_index)
			max_ahead = filp->f_ramax + 1;

			filp->f_rawin = filp->f_ralen;
/*
 * Try to read ahead pages.
 * We hope that ll_rw_blk() plug/unplug, coalescence, request sorting and the
 * scheduler will work well enough for us to avoid too bad actual IO requests.
 */
	while (ahead < max_ahead) {
		if ((raend + ahead) >= end_index)
		if (page_cache_read(filp, raend + ahead) < 0)
/*
 * If we tried to read ahead some pages,
 * If we tried to read ahead asynchronously,
 *   Try to force unplug of the device in order to start an asynchronous
 *   read IO.
 * Update the read-ahead context.
 * Store the length of the current read-ahead window.
 * Double the current max read ahead size.
 *   That heuristic avoids doing large IO for files that are not really
 *   accessed sequentially.
 */
			run_task_queue(&tq_disk);

		filp->f_ralen += ahead;
		filp->f_rawin += filp->f_ralen;
		filp->f_raend = raend + ahead + 1;

		filp->f_ramax += filp->f_ramax;

		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;

		/*
		 * Move the pages that have already been passed
		 * to the inactive list.
		 */
		drop_behind(filp, index);

#ifdef PROFILE_READAHEAD
		profile_readahead((reada_ok == 2), filp);
/*
 * This is a generic file read routine, and uses the
 * inode->i_op->readpage() function for the actual low-level
 * stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 */
void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
	struct inode *inode = filp->f_dentry->d_inode;
	struct address_space *mapping = inode->i_mapping;
	unsigned long index, offset;
	struct page *cached_page;

	int max_readahead = get_max_readahead(inode);

	index = *ppos >> PAGE_CACHE_SHIFT;
	offset = *ppos & ~PAGE_CACHE_MASK;

/*
 * If the current position is outside the previous read-ahead window,
 * we reset the current read-ahead context and set read ahead max to zero
 * (will be set to just needed value later),
 * otherwise, we assume that the file accesses are sequential enough to
 * continue read-ahead.
 */
	if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {

/*
 * Adjust the current value of read-ahead max.
 * If the read operation stays in the first half page, force no readahead.
 * Otherwise try to increase read ahead max just enough to do the read request.
 * Then, at least MIN_READAHEAD if read ahead is ok,
 * and at most MAX_READAHEAD in all cases.
 */
	if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {

		unsigned long needed;

		needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;

		if (filp->f_ramax < needed)
			filp->f_ramax = needed;

		if (reada_ok && filp->f_ramax < MIN_READAHEAD)
			filp->f_ramax = MIN_READAHEAD;
		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;

		struct page *page, **hash;
		unsigned long end_index, nr;

		end_index = inode->i_size >> PAGE_CACHE_SHIFT;
		if (index > end_index)
		nr = PAGE_CACHE_SIZE;
		if (index == end_index) {
			nr = inode->i_size & ~PAGE_CACHE_MASK;

		/*
		 * Try to find the data in the page cache..
		 */
		hash = page_hash(mapping, index);

		spin_lock(&pagecache_lock);
		page = __find_page_nolock(mapping, index, *hash);

		page_cache_get(page);
		spin_unlock(&pagecache_lock);

		if (!Page_Uptodate(page))
			goto page_not_up_to_date;
		generic_file_readahead(reada_ok, filp, inode, page);

		/* If users can be writing to this page using arbitrary
		 * virtual addresses, take care about potential aliasing
		 * before reading the page on the kernel side.
		 */
		if (mapping->i_mmap_shared != NULL)
			flush_dcache_page(page);

		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 *
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		nr = actor(desc, page, offset, nr);
		index += offset >> PAGE_CACHE_SHIFT;
		offset &= ~PAGE_CACHE_MASK;

		page_cache_release(page);
		if (nr && desc->count)

/*
 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
 */
page_not_up_to_date:
		generic_file_readahead(reada_ok, filp, inode, page);

		if (Page_Uptodate(page))

		/* Get exclusive access to the page ... */

		/* Did it get unhashed before we got the lock? */
		if (!page->mapping) {
			page_cache_release(page);

		/* Did somebody else fill it already? */
		if (Page_Uptodate(page)) {

		/* ... and start the actual read. The read will unlock the page. */
		error = mapping->a_ops->readpage(filp, page);

			if (Page_Uptodate(page))

			/* Again, try some read-ahead while waiting for the page to finish.. */
			generic_file_readahead(reada_ok, filp, inode, page);
			if (Page_Uptodate(page))

		/* UHHUH! A synchronous read error occurred. Report it */
		desc->error = error;
		page_cache_release(page);

		/*
		 * Ok, it wasn't cached, so we need to create a new
		 * page..
		 *
		 * We get here with the page cache lock held.
		 */
		spin_unlock(&pagecache_lock);
		cached_page = page_cache_alloc();
			desc->error = -ENOMEM;

		/*
		 * Somebody may have added the page while we
		 * dropped the page cache lock. Check for that.
		 */
		spin_lock(&pagecache_lock);
		page = __find_page_nolock(mapping, index, *hash);

		/*
		 * Ok, add the new page to the hash-queues...
		 */
		__add_to_page_cache(page, mapping, index, hash);
		spin_unlock(&pagecache_lock);

	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;

	page_cache_free(cached_page);
	UPDATE_ATIME(inode);
static int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
	unsigned long left, count = desc->count;

	left = __copy_to_user(desc->buf, kaddr + offset, size);

		desc->error = -EFAULT;

	desc->count = count - size;
	desc->written += size;
/*
 * This is the "read()" routine for all filesystems
 * that can use the page cache directly.
 */
ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
	if (access_ok(VERIFY_WRITE, buf, count)) {

		read_descriptor_t desc;

		do_generic_file_read(filp, ppos, &desc, file_read_actor);

		retval = desc.written;

			retval = desc.error;
static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
	unsigned long count = desc->count;
	struct file *file = (struct file *) desc->buf;
	mm_segment_t old_fs;

	written = file->f_op->write(file, kaddr + offset, size, &file->f_pos);

		desc->error = written;

	desc->count = count - written;
	desc->written += written;
asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
	struct file * in_file, * out_file;
	struct inode * in_inode, * out_inode;

	/*
	 * Get input file, and verify that it is ok..
	 */
	in_file = fget(in_fd);

	if (!(in_file->f_mode & FMODE_READ))

	in_inode = in_file->f_dentry->d_inode;

	if (!in_inode->i_mapping->a_ops->readpage)

	retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);

	/*
	 * Get output file, and verify that it is ok..
	 */
	out_file = fget(out_fd);

	if (!(out_file->f_mode & FMODE_WRITE))

	if (!out_file->f_op || !out_file->f_op->write)

	out_inode = out_file->f_dentry->d_inode;
	retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);

		read_descriptor_t desc;
		loff_t pos = 0, *ppos;

		ppos = &in_file->f_pos;

			if (get_user(pos, offset))

		desc.buf = (char *) out_file;

		do_generic_file_read(in_file, ppos, &desc, file_send_actor);

		retval = desc.written;

			retval = desc.error;

			put_user(pos, offset);
/*
 * Read-ahead and flush behind for MADV_SEQUENTIAL areas.  Since we are
 * sure this is sequential access, we don't need a flexible read-ahead
 * window size -- we can always use a large fixed size window.
 */
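/*
 * Worked example (hypothetical numbers, page_cluster == 4): if
 * get_max_readahead() returns 31 pages, the code below rounds it up with
 * CLUSTER_OFFSET(31 + 16 - 1) == CLUSTER_OFFSET(46) == 32, so the fixed
 * window covers 32 pages (two whole clusters).
 */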
static void nopage_sequential_readahead(struct vm_area_struct * vma,
	unsigned long pgoff, unsigned long filesize)
	unsigned long ra_window;

	ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
	ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);

	/* vm_raend is zero if we haven't read ahead in this area yet. */
	if (vma->vm_raend == 0)
		vma->vm_raend = vma->vm_pgoff + ra_window;

	/*
	 * If we've just faulted the page half-way through our window,
	 * then schedule reads for the next window, and release the
	 * pages in the previous window.
	 */
	if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
		unsigned long start = vma->vm_pgoff + vma->vm_raend;
		unsigned long end = start + ra_window;

		if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
			end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;

		while ((start < end) && (start < filesize)) {
			if (read_cluster_nonblocking(vma->vm_file,
							start, filesize) < 0)
			start += CLUSTER_PAGES;
		run_task_queue(&tq_disk);

		/* if we're far enough past the beginning of this area,
		   recycle pages that are in the previous window. */
		if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
			unsigned long window = ra_window << PAGE_SHIFT;

			end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
			end -= window + window;
			filemap_sync(vma, end - window, window, MS_INVALIDATE);

		vma->vm_raend += ra_window;
/*
 * filemap_nopage() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 */
struct page * filemap_nopage(struct vm_area_struct * area,
	unsigned long address, int no_share)
	struct file *file = area->vm_file;
	struct inode *inode = file->f_dentry->d_inode;
	struct address_space *mapping = inode->i_mapping;
	struct page *page, **hash, *old_page;
	unsigned long size, pgoff;

	pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;

	/*
	 * An external ptracer can access pages that normally aren't
	 * accessible..
	 */
	size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if ((pgoff >= size) && (area->vm_mm == current->mm))

	/*
	 * Do we have something in the page cache already?
	 */
	hash = page_hash(mapping, pgoff);

	page = __find_get_page(mapping, pgoff, hash);
		goto no_cached_page;

	/*
	 * Ok, found a page in the page cache, now we need to check
	 * that it's up-to-date.
	 */
	if (!Page_Uptodate(page))
		goto page_not_uptodate;

	/*
	 * Try read-ahead for sequential areas.
	 */
	if (VM_SequentialReadHint(area))
		nopage_sequential_readahead(area, pgoff, size);

	/*
	 * Found the page and have a reference on it, need to check sharing
	 * and possibly copy it over to another page..
	 */
		struct page *new_page = page_cache_alloc();

			copy_user_highpage(new_page, old_page, address);
			flush_page_to_ram(new_page);

			new_page = NOPAGE_OOM;
		page_cache_release(page);

	flush_page_to_ram(old_page);

	/*
	 * If the requested offset is within our file, try to read a whole
	 * cluster of pages at once.
	 *
	 * Otherwise, we're off the end of a privately mapped file,
	 * so we need to map a zero page.
	 */
	if ((pgoff < size) && !VM_RandomReadHint(area))
		error = read_cluster_nonblocking(file, pgoff, size);
		error = page_cache_read(file, pgoff);

	/*
	 * The page we want has now been added to the page cache.
	 * In the unlikely event that someone removed it in the
	 * meantime, we'll just come back here and read it again.
	 */

	/*
	 * An error return from page_cache_read can result if the
	 * system is low on memory, or a problem occurs while trying
	 * to schedule I/O.
	 */
	if (error == -ENOMEM)

	/* Did it get unhashed while we waited for it? */
	if (!page->mapping) {
		page_cache_release(page);

	/* Did somebody else get it up-to-date? */
	if (Page_Uptodate(page)) {

	if (!mapping->a_ops->readpage(file, page)) {
		if (Page_Uptodate(page))

	/*
	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	 */

	/* Somebody truncated the page on us? */
	if (!page->mapping) {
		page_cache_release(page);

	/* Somebody else successfully read it in? */
	if (Page_Uptodate(page)) {

	ClearPageError(page);
	if (!mapping->a_ops->readpage(file, page)) {
		if (Page_Uptodate(page))

	/*
	 * Things didn't work out. Return zero to tell the
	 * mm layer so, possibly freeing the page cache page first.
	 */
	page_cache_release(page);
/*
 * If a task terminates while we're swapping the page, the vma and
 * file could be released: try_to_swap_out has done a get_file.
 * vma/file is guaranteed to exist in the unmap/sync cases because
 * mmap_sem is held.
 *
 * The "mapping" test takes care of somebody having truncated the
 * page and thus made this write-page a no-op..
 */
static int filemap_write_page(struct page * page, int wait)
	struct address_space * mapping = page->mapping;

	if (mapping && mapping->a_ops->writepage) {
		ClearPageDirty(page);
		error = mapping->a_ops->writepage(page);

/*
 * The page cache takes care of races between somebody
 * trying to swap something out and swap something in
 * at the same time..
 */
extern void wakeup_bdflush(int);
int filemap_swapout(struct page * page, struct file *file)
/* Called with mm->page_table_lock held to protect against other
 * threads/the swapper from ripping pte's out from under us.
 */
static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
	unsigned long address, unsigned int flags)
	if (!pte_present(pte))
	if (!ptep_test_and_clear_dirty(ptep))

	flush_page_to_ram(pte_page(pte));
	flush_cache_page(vma, address);
	flush_tlb_page(vma, address);
	page = pte_page(pte);
	page_cache_get(page);
	spin_unlock(&vma->vm_mm->page_table_lock);

	error = filemap_write_page(page, 1);
	page_cache_free(page);

	spin_lock(&vma->vm_mm->page_table_lock);
static inline int filemap_sync_pte_range(pmd_t * pmd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
	if (pmd_bad(*pmd)) {

	pte = pte_offset(pmd, address);
	offset += address & PMD_MASK;
	address &= ~PMD_MASK;
	end = address + size;

		error |= filemap_sync_pte(pte, vma, address + offset, flags);
		address += PAGE_SIZE;
	} while (address && (address < end));

static inline int filemap_sync_pmd_range(pgd_t * pgd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned int flags)
	unsigned long offset, end;

	if (pgd_bad(*pgd)) {

	pmd = pmd_offset(pgd, address);
	offset = address & PGDIR_MASK;
	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)

		error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
		address = (address + PMD_SIZE) & PMD_MASK;
	} while (address && (address < end));
int filemap_sync(struct vm_area_struct * vma, unsigned long address,
	size_t size, unsigned int flags)
	unsigned long end = address + size;

	/* Acquire the lock early; it may be possible to avoid dropping
	 * and reacquiring it repeatedly.
	 */
	spin_lock(&vma->vm_mm->page_table_lock);

	dir = pgd_offset(vma->vm_mm, address);
	flush_cache_range(vma->vm_mm, end - size, end);

		error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
	} while (address && (address < end));
	flush_tlb_range(vma->vm_mm, end - size, end);

	spin_unlock(&vma->vm_mm->page_table_lock);
/*
 * Shared mappings need to be able to do the right thing at
 * close/unmap/sync. They will also use the private file as
 * backing-store for swapping..
 */
static struct vm_operations_struct file_shared_mmap = {
	nopage:		filemap_nopage,
	swapout:	filemap_swapout,

/*
 * Private mappings just need to be able to load in the map.
 *
 * (This is actually used for shared mappings as well, if we
 * know they can't ever get write permissions..)
 */
static struct vm_operations_struct file_private_mmap = {
	nopage:		filemap_nopage,
/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
	struct vm_operations_struct * ops;
	struct inode *inode = file->f_dentry->d_inode;

	ops = &file_private_mmap;
	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
		if (!inode->i_mapping->a_ops->writepage)
		ops = &file_shared_mmap;
	if (!inode->i_sb || !S_ISREG(inode->i_mode))
	if (!inode->i_mapping->a_ops->readpage)
	UPDATE_ATIME(inode);
/*
 * The msync() system call.
 */
static int msync_interval(struct vm_area_struct * vma,
	unsigned long start, unsigned long end, int flags)
	if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
		error = vma->vm_ops->sync(vma, start, end-start, flags);
		if (!error && (flags & MS_SYNC)) {
			struct file * file = vma->vm_file;
			if (file && file->f_op && file->f_op->fsync) {
				down(&file->f_dentry->d_inode->i_sem);
				error = file->f_op->fsync(file, file->f_dentry, 1);
				up(&file->f_dentry->d_inode->i_sem);
asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
	struct vm_area_struct * vma;
	int unmapped_error, error = -EINVAL;

	down(&current->mm->mmap_sem);
	if (start & ~PAGE_MASK)
	len = (len + ~PAGE_MASK) & PAGE_MASK;

	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))

	/*
	 * If the interval [start,end) covers some unmapped address ranges,
	 * just ignore them, but return -EFAULT at the end.
	 */
	vma = find_vma(current->mm, start);
		/* Still start < end. */

		/* Here start < vma->vm_end. */
		if (start < vma->vm_start) {
			unmapped_error = -EFAULT;
			start = vma->vm_start;
		/* Here vma->vm_start <= start < vma->vm_end. */
		if (end <= vma->vm_end) {
			error = msync_interval(vma, start, end, flags);
				error = unmapped_error;
		/* Here vma->vm_start <= start < vma->vm_end < end. */
		error = msync_interval(vma, start, vma->vm_end, flags);
		start = vma->vm_end;

	up(&current->mm->mmap_sem);
static inline void setup_read_behavior(struct vm_area_struct * vma,
	int behavior)
	VM_ClearReadHint(vma);
	case MADV_SEQUENTIAL:
		vma->vm_flags |= VM_SEQ_READ;
		vma->vm_flags |= VM_RAND_READ;
static long madvise_fixup_start(struct vm_area_struct * vma,
	unsigned long end, int behavior)
	struct vm_area_struct * n;

	n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);

	setup_read_behavior(n, behavior);

	get_file(n->vm_file);
	if (n->vm_ops && n->vm_ops->open)

	lock_vma_mappings(vma);
	spin_lock(&vma->vm_mm->page_table_lock);
	vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
	vma->vm_start = end;
	__insert_vm_struct(current->mm, n);
	spin_unlock(&vma->vm_mm->page_table_lock);
	unlock_vma_mappings(vma);
static long madvise_fixup_end(struct vm_area_struct * vma,
	unsigned long start, int behavior)
	struct vm_area_struct * n;

	n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);

	n->vm_start = start;
	n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
	setup_read_behavior(n, behavior);

	get_file(n->vm_file);
	if (n->vm_ops && n->vm_ops->open)

	lock_vma_mappings(vma);
	spin_lock(&vma->vm_mm->page_table_lock);
	vma->vm_end = start;
	__insert_vm_struct(current->mm, n);
	spin_unlock(&vma->vm_mm->page_table_lock);
	unlock_vma_mappings(vma);
static long madvise_fixup_middle(struct vm_area_struct * vma,
	unsigned long start, unsigned long end, int behavior)
	struct vm_area_struct * left, * right;

	left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);

	right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
		kmem_cache_free(vm_area_cachep, left);

	left->vm_end = start;
	right->vm_start = end;
	right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;

	right->vm_raend = 0;
	atomic_add(2, &vma->vm_file->f_count);

	if (vma->vm_ops && vma->vm_ops->open) {
		vma->vm_ops->open(left);
		vma->vm_ops->open(right);

	lock_vma_mappings(vma);
	spin_lock(&vma->vm_mm->page_table_lock);
	vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
	vma->vm_start = start;

	setup_read_behavior(vma, behavior);

	__insert_vm_struct(current->mm, left);
	__insert_vm_struct(current->mm, right);
	spin_unlock(&vma->vm_mm->page_table_lock);
	unlock_vma_mappings(vma);
/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
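/*
 * For example (hypothetical layout): advising MADV_SEQUENTIAL on the middle
 * of a vma that covers pages [0, 16) -- say on pages [4, 8) -- leaves three
 * areas after madvise_fixup_middle(): [0, 4) and [8, 16) with the old
 * behavior, and [4, 8) with VM_SEQ_READ set.
 */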
static long madvise_behavior(struct vm_area_struct * vma,
	unsigned long start, unsigned long end, int behavior)
	/* This caps the number of vma's this process can own */
	if (vma->vm_mm->map_count > MAX_MAP_COUNT)

	if (start == vma->vm_start) {
		if (end == vma->vm_end) {
			setup_read_behavior(vma, behavior);
			error = madvise_fixup_start(vma, end, behavior);
		if (end == vma->vm_end)
			error = madvise_fixup_end(vma, start, behavior);
		error = madvise_fixup_middle(vma, start, end, behavior);
/*
 * Schedule all required I/O operations, then run the disk queue
 * to make sure they are started.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct * vma,
	unsigned long start, unsigned long end)
	long error = -EBADF;

	unsigned long size, rlim_rss;

	/* Doesn't work if there's no mapped file. */

	file = vma->vm_file;
	size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
							PAGE_CACHE_SHIFT;

	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
	if (end > vma->vm_end)
	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

	/* Make sure this doesn't exceed the process's max rss. */

	rlim_rss = current->rlim ? current->rlim[RLIMIT_RSS].rlim_cur :
				LONG_MAX; /* default: see resource.h */
	if ((vma->vm_mm->rss + (end - start)) > rlim_rss)

	/* round to cluster boundaries if this isn't a "random" area. */
	if (!VM_RandomReadHint(vma)) {
		start = CLUSTER_OFFSET(start);
		end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);

		while ((start < end) && (start < size)) {
			error = read_cluster_nonblocking(file, start, size);
			start += CLUSTER_PAGES;

		while ((start < end) && (start < size)) {
			error = page_cache_read(file, start);

	/* Don't wait for someone else to push these requests. */
	run_task_queue(&tq_disk);
/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for refill_inactive to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * refill_inactive to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do. This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed(struct vm_area_struct * vma,
	unsigned long start, unsigned long end)
	if (vma->vm_flags & VM_LOCKED)

	flush_cache_range(vma->vm_mm, start, end);
	zap_page_range(vma->vm_mm, start, end - start);
	flush_tlb_range(vma->vm_mm, start, end);
static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
	unsigned long end, int behavior)
	long error = -EBADF;

	case MADV_SEQUENTIAL:
		error = madvise_behavior(vma, start, end, behavior);

		error = madvise_willneed(vma, start, end);

		error = madvise_dontneed(vma, start, end);
/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the appli-
 *		cation will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
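/*
 * Userspace usage sketch (illustrative only; error handling omitted):
 *
 *	void *map = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
 *	madvise(map, len, MADV_SEQUENTIAL);	(expect one linear pass)
 *	... read through the mapping ...
 *	madvise(map, len, MADV_DONTNEED);	(done with these pages)
 */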
asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
	struct vm_area_struct * vma;
	int unmapped_error = 0;
	int error = -EINVAL;

	down(&current->mm->mmap_sem);

	if (start & ~PAGE_MASK)
	len = (len + ~PAGE_MASK) & PAGE_MASK;

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 */
	vma = find_vma(current->mm, start);
		/* Still start < end. */

		/* Here start < vma->vm_end. */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;

		/* Here vma->vm_start <= start < vma->vm_end. */
		if (end <= vma->vm_end) {
			error = madvise_vma(vma, start, end, behavior);
				error = unmapped_error;

		/* Here vma->vm_start <= start < vma->vm_end < end. */
		error = madvise_vma(vma, start, vma->vm_end, behavior);
		start = vma->vm_end;

	up(&current->mm->mmap_sem);
/*
 * Later we can get more picky about what "in core" means precisely.
 * For now, simply check to see if the page is in the page cache,
 * and is up to date; i.e. that no page-in operation would be required
 * at this time if an application were to map and access this page.
 */
static unsigned char mincore_page(struct vm_area_struct * vma,
	unsigned long pgoff)
	unsigned char present = 0;
	struct address_space * as = &vma->vm_file->f_dentry->d_inode->i_data;
	struct page * page, ** hash = page_hash(as, pgoff);

	spin_lock(&pagecache_lock);
	page = __find_page_nolock(as, pgoff, *hash);
	if ((page) && (Page_Uptodate(page)))
	spin_unlock(&pagecache_lock);
static long mincore_vma(struct vm_area_struct * vma,
	unsigned long start, unsigned long end, unsigned char * vec)
	long error, i, remaining;
	unsigned char * tmp;

	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
	if (end > vma->vm_end)
	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

	tmp = (unsigned char *) __get_free_page(GFP_KERNEL);

	/* (end - start) is # of pages, and also # of bytes in "vec" */
	remaining = (end - start),

	for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {

		long thispiece = (remaining < PAGE_SIZE) ?
						remaining : PAGE_SIZE;

		while (j < thispiece)
			tmp[j++] = mincore_page(vma, start++);

		if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {

	free_page((unsigned long) tmp);
/*
 * The mincore(2) system call.
 *
 * mincore() returns the memory residency status of the pages in the
 * current process's address space specified by [addr, addr + len).
 * The status is returned in a vector of bytes.  The least significant
 * bit of each byte is 1 if the referenced page is in memory, otherwise
 * it is zero.
 *
 * Because the status of a page can change after mincore() checks it
 * but before it returns to the application, the returned vector may
 * contain stale information.  Only locked pages are guaranteed to
 * remain in memory.
 *
 *  -EFAULT - vec points to an illegal address
 *  -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
 *		or len has a nonpositive value
 *  -ENOMEM - Addresses in the range [addr, addr + len] are
 *		invalid for the address space of this process, or
 *		specify one or more pages which are not currently
 *		mapped
 *  -EAGAIN - A kernel resource was temporarily unavailable.
 */
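/*
 * Userspace usage sketch (illustrative only; error handling omitted):
 *
 *	unsigned char vec[NPAGES];
 *	if (mincore(addr, NPAGES * page_size, vec) == 0) {
 *		for (i = 0; i < NPAGES; i++)
 *			resident += vec[i] & 1;
 *	}
 */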
asmlinkage long sys_mincore(unsigned long start, size_t len,
	unsigned char * vec)
	struct vm_area_struct * vma;
	int unmapped_error = 0;
	long error = -EINVAL;

	down(&current->mm->mmap_sem);

	if (start & ~PAGE_CACHE_MASK)
	len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK;

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 */
	vma = find_vma(current->mm, start);
		/* Still start < end. */

		/* Here start < vma->vm_end. */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;

		/* Here vma->vm_start <= start < vma->vm_end. */
		if (end <= vma->vm_end) {
			error = mincore_vma(vma, start, end, &vec[index]);
				error = unmapped_error;

		/* Here vma->vm_start <= start < vma->vm_end < end. */
		error = mincore_vma(vma, start, vma->vm_end, &vec[index]);

		index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
		start = vma->vm_end;

	up(&current->mm->mmap_sem);
struct page *__read_cache_page(struct address_space *mapping,
				unsigned long index,
				int (*filler)(void *, struct page *),
				void *data)
	struct page **hash = page_hash(mapping, index);
	struct page *page, *cached_page = NULL;

	page = __find_get_page(mapping, index, hash);

		cached_page = page_cache_alloc();
			return ERR_PTR(-ENOMEM);

		if (add_to_page_cache_unique(page, mapping, index, hash))

		err = filler(data, page);
			page_cache_release(page);
			page = ERR_PTR(err);

		page_cache_free(cached_page);
/*
 * Read into the page cache. If a page already exists,
 * and Page_Uptodate() is not set, try to fill the page.
 */
struct page *read_cache_page(struct address_space *mapping,
				unsigned long index,
				int (*filler)(void *, struct page *),
				void *data)
	page = __read_cache_page(mapping, index, filler, data);
	if (IS_ERR(page) || Page_Uptodate(page))

	if (!page->mapping) {
		page_cache_release(page);

	if (Page_Uptodate(page)) {

	err = filler(data, page);
		page_cache_release(page);
		page = ERR_PTR(err);
static inline struct page * __grab_cache_page(struct address_space *mapping,
				unsigned long index, struct page **cached_page)
	struct page *page, **hash = page_hash(mapping, index);

	page = __find_lock_page(mapping, index, hash);
		if (!*cached_page) {
			*cached_page = page_cache_alloc();

		page = *cached_page;
		if (add_to_page_cache_unique(page, mapping, index, hash))
		*cached_page = NULL;

/*
 * Returns locked page at given index in given cache, creating it if needed.
 */
struct page *grab_cache_page(struct address_space *mapping, unsigned long index)
	struct page *cached_page = NULL;
	struct page *page = __grab_cache_page(mapping, index, &cached_page);

	page_cache_free(cached_page);
static inline void remove_suid(struct inode *inode)
	/* set S_ISGID if S_IXGRP is set, and always set S_ISUID */
	mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
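	/*
	 * Worked example (octal): S_ISUID = 04000, S_ISGID = 02000 and
	 * S_IXGRP = 00010, so S_ISGID/S_IXGRP = 0200.  For a mode such as
	 * 02771 (setgid + group-exec), (mode & S_IXGRP) * 0200 = 02000, and
	 * OR-ing in S_ISUID gives 06000; the "mode &= inode->i_mode" below
	 * then keeps only the set-id bits that are actually set.
	 */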
	/* was any of the uid bits set? */
	mode &= inode->i_mode;
	if (mode && !capable(CAP_FSETID)) {
		inode->i_mode &= ~mode;
		mark_inode_dirty(inode);
/*
 * Write to a file through the page cache.
 *
 * We currently put everything into the page cache prior to writing it.
 * This is not a problem when writing full pages. With partial pages,
 * however, we first have to read the data into the cache, then
 * dirty the page, and finally schedule it for writing. Alternatively, we
 * could write-through just the portion of data that would go into that
 * page, but that would kill performance for applications that write data
 * line by line, and it's prone to race conditions.
 *
 * Note that this routine doesn't try to keep track of dirty pages. Each
 * file system has to do this all by itself, unfortunately.
 */
ssize_t
generic_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
	struct inode *inode = file->f_dentry->d_inode;
	struct address_space *mapping = inode->i_mapping;
	unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;

	struct page *page, *cached_page;
	unsigned long written;

	down(&inode->i_sem);

	err = file->f_error;

	if (file->f_flags & O_APPEND)
		pos = inode->i_size;

	/*
	 * Check whether we've reached the file size limit.
	 */
	if (limit != RLIM_INFINITY) {
			send_sig(SIGXFSZ, current, 0);
		if (count > limit - pos) {
			send_sig(SIGXFSZ, current, 0);
			count = limit - pos;

	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
	mark_inode_dirty_sync(inode);

		unsigned long bytes, index, offset;

		/*
		 * Try to find the page in the cache. If it isn't there,
		 * allocate a free page.
		 */
		offset = (pos & (PAGE_CACHE_SIZE-1)); /* Within page */
		index = pos >> PAGE_CACHE_SHIFT;
		bytes = PAGE_CACHE_SIZE - offset;

		status = -ENOMEM;	/* we'll assign it later anyway */
		page = __grab_cache_page(mapping, index, &cached_page);

		/* We have exclusive IO access to the page.. */
		if (!PageLocked(page)) {

		status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);

		kaddr = page_address(page);
		status = copy_from_user(kaddr+offset, buf, bytes);
		flush_dcache_page(page);

		status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);

		/* Mark it unlocked again and drop the page.. */

		deactivate_page(page);
		page_cache_release(page);

	page_cache_free(cached_page);

	/* For now, when the user asks for O_SYNC, we'll actually
	 * provide O_DSYNC. */
	if ((status >= 0) && (file->f_flags & O_SYNC))
		status = generic_osync_inode(inode, 1); /* 1 means datasync */

	err = written ? written : status;

	ClearPageUptodate(page);

void __init page_cache_init(unsigned long mempages)
	unsigned long htable_size, order;

	htable_size = mempages;
	htable_size *= sizeof(struct page *);
	for(order = 0; (PAGE_SIZE << order) < htable_size; order++)

		unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);

		while((tmp >>= 1UL) != 0UL)

		page_hash_table = (struct page **)
			__get_free_pages(GFP_ATOMIC, order);
	} while(page_hash_table == NULL && --order > 0);

	printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
	       (1 << page_hash_bits), order, (PAGE_SIZE << order));
	if (!page_hash_table)
		panic("Failed to allocate page hash table\n");
	memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
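/*
 * Worked example (hypothetical numbers, 4K pages, 4-byte pointers): with
 * mempages = 32768 (i.e. 128 MB of RAM), htable_size = 32768 * 4 = 128 KB,
 * so the sizing loop above settles on order = 5 (PAGE_SIZE << 5 == 128 KB).
 * That leaves room for 32768 bucket pointers, so page_hash_bits ends up as
 * 15 and the printk reports 32768 hash table entries.
 */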
*));