4 * Copyright (C) 1994-1999 Linus Torvalds
8 * This file handles the generic file mmap semantics used by
9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem used to do this differently, for example)
12 #include <linux/malloc.h>
13 #include <linux/shm.h>
14 #include <linux/mman.h>
15 #include <linux/locks.h>
16 #include <linux/pagemap.h>
17 #include <linux/swap.h>
18 #include <linux/smp_lock.h>
19 #include <linux/blkdev.h>
20 #include <linux/file.h>
21 #include <linux/swapctl.h>
22 #include <linux/slab.h>
23 #include <linux/init.h>
25 #include <asm/pgtable.h>
26 #include <asm/uaccess.h>
29 * Shared mappings implemented 30.11.1994. It's not fully working yet,
32 * Shared mappings now work. 15.8.1995 Bruno.
34 * finished 'unifying' the page and buffer cache and SMP-threaded the
35 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
37 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
/*
 * Page-cache global state.
 * NOTE(review): this file is a garbled extract -- many original lines
 * are missing and logical lines are split across physical lines.
 * Comments below state only what the visible code shows.
 */
/* Number of pages currently in the page cache. */
40 atomic_t page_cache_size
= ATOMIC_INIT(0);
/* Sizing and storage of the (inode, offset) -> page hash table. */
41 unsigned int page_hash_bits
;
42 struct page
**page_hash_table
;
/* Protects the page hash chains and the per-inode page lists. */
44 spinlock_t pagecache_lock
= SPIN_LOCK_UNLOCKED
;
46 * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
47 * the pagemap_lru_lock held.
49 spinlock_t pagemap_lru_lock
= SPIN_LOCK_UNLOCKED
;
/* Read-ahead clustering: a cluster is (1 << page_cluster) cache pages. */
51 #define CLUSTER_PAGES (1 << page_cluster)
52 #define CLUSTER_SHIFT (PAGE_CACHE_SHIFT + page_cluster)
53 #define CLUSTER_BYTES (1 << CLUSTER_SHIFT)
/* Round a byte offset down to the start of its cluster. */
54 #define CLUSTER_OFFSET(x) (((x) >> CLUSTER_SHIFT) << CLUSTER_SHIFT)
/*
 * Link "page" into the hash chain anchored at *p and account it in
 * page_cache_size.  Caller is expected to hold pagecache_lock.
 * NOTE(review): the tail of this function (storing the page back into
 * *p and setting page->pprev_hash) is missing from this extract --
 * confirm against the full file.
 */
56 void __add_page_to_hash_queue(struct page
* page
, struct page
**p
)
58 atomic_inc(&page_cache_size
);
/* Head insertion into the doubly-linked hash chain. */
59 if((page
->next_hash
= *p
) != NULL
)
60 (*p
)->pprev_hash
= &page
->next_hash
;
/*
 * Unlink "page" from its hash chain (only if hashed, i.e. pprev_hash
 * is set) and decrement page_cache_size.
 * NOTE(review): line 71 dereferences page->next_hash -- the usual
 * NULL check for the chain tail is not visible in this extract;
 * verify against the full file.
 */
67 static void remove_page_from_hash_queue(struct page
* page
)
69 if(page
->pprev_hash
) {
71 page
->next_hash
->pprev_hash
= page
->pprev_hash
;
72 *page
->pprev_hash
= page
->next_hash
;
/* Mark the page unhashed so a repeated removal is a no-op. */
73 page
->pprev_hash
= NULL
;
75 atomic_dec(&page_cache_size
);
/*
 * Remove "page" from its inode's i_pages list.
 * NOTE(review): only the list-head fixup is visible here; the
 * prev/next pointer surgery is missing from this extract.
 */
78 static void remove_page_from_inode_queue(struct page
* page
)
80 struct inode
* inode
= page
->inode
;
81 struct page
*prev
, *next
;
/* If the page was the list head, advance the head past it. */
86 if (inode
->i_pages
== page
)
87 inode
->i_pages
= next
;
97 * Remove a page from the page cache and free it. Caller has to make
98 * sure the page is locked and that nobody else uses it - or that usage
/*
 * Drops "page" from both the inode queue and the hash queue under
 * pagecache_lock.  The PageLocked() check enforces the caller's
 * locking contract (its failure branch is missing in this extract).
 */
101 void remove_inode_page(struct page
*page
)
103 if (!PageLocked(page
))
106 spin_lock(&pagecache_lock
);
107 remove_page_from_inode_queue(page
);
108 remove_page_from_hash_queue(page
);
110 spin_unlock(&pagecache_lock
);
/*
 * Throw away all cached pages of an inode.  Walks the i_pages list
 * under pagecache_lock, skipping (and presumably retrying) pages that
 * are currently locked.
 * NOTE(review): the list-walk setup, retry label and per-page unlock
 * steps are missing from this extract.
 */
113 void invalidate_inode_pages(struct inode
* inode
)
119 spin_lock(&pagecache_lock
);
121 while ((page
= *p
) != NULL
) {
/* Somebody else holds the page lock: drop our lock/ref and retry. */
123 if (TryLockPage(page
)) {
124 spin_unlock(&pagecache_lock
);
126 page_cache_release(page
);
/* Diagnostic only -- count==2 means "us + the cache".  (The printk
   text contains a typo, "necesserily"; left as-is since it is a
   runtime string.) */
129 if (page_count(page
) != 2)
130 printk("hm, busy page invalidated? (not necesserily a bug)\n");
133 remove_page_from_inode_queue(page
);
134 remove_page_from_hash_queue(page
);
/* Two releases: one for our reference, one for the cache's. */
137 page_cache_release(page
);
138 page_cache_release(page
);
141 spin_unlock(&pagecache_lock
);
144 * Truncate the page cache at a set offset, removing the pages
145 * that are beyond that offset (and zeroing out partial pages).
/*
 * NOTE(review): the hash-walk setup, restart label and several
 * lock/unlock/wait steps are missing from this extract; comments
 * below describe only the visible logic.
 */
147 void truncate_inode_pages(struct inode
* inode
, unsigned long start
)
154 spin_lock(&pagecache_lock
);
156 while ((page
= *p
) != NULL
) {
157 unsigned long offset
= page
->offset
;
159 /* page wholly truncated - free it */
160 if (offset
>= start
) {
162 spin_unlock(&pagecache_lock
);
/* Destroy buffer-cache references before unhashing the page. */
166 if (!inode
->i_op
->flushpage
||
167 inode
->i_op
->flushpage(inode
, page
, 0))
171 * We remove the page from the page cache
172 * _after_ we have destroyed all buffer-cache
173 * references to it. Otherwise some other process
174 * might think this inode page is not in the
175 * page cache and creates a buffer-cache alias
176 * to it causing all sorts of fun problems ...
178 remove_inode_page(page
);
181 page_cache_release(page
);
182 page_cache_release(page
);
185 * We have done things without the pagecache lock,
186 * so we'll have to repeat the scan.
187 * It's not possible to deadlock here because
188 * we are guaranteed to make progress. (ie. we have
189 * just removed a page)
195 * there is only one partial page possible.
/* From here on "offset" is the number of bytes kept in the page. */
200 offset
= start
- offset
;
201 /* partial truncate, clear end of page */
202 if (offset
< PAGE_CACHE_SIZE
) {
203 unsigned long address
;
205 spin_unlock(&pagecache_lock
);
/* Zero the tail of the page beyond the new end-of-file. */
210 address
= page_address(page
);
211 memset((void *) (offset
+ address
), 0, PAGE_CACHE_SIZE
- offset
);
212 flush_page_to_ram(address
);
/* Let the filesystem invalidate buffers past "offset". */
214 if (inode
->i_op
->flushpage
)
215 inode
->i_op
->flushpage(inode
, page
, offset
);
217 * we have dropped the spinlock so we have to
221 page_cache_release(page
);
225 spin_unlock(&pagecache_lock
);
/*
 * Scan the LRU list and try to free page-cache / buffer / swap-cache
 * pages.  Lock ordering follows the file-top NOTE: pagemap_lru_lock
 * is dropped before pagecache_lock is taken.
 * NOTE(review): local declarations, several labels, return statements
 * and loop braces are missing from this extract.
 */
228 int shrink_mmap(int priority
, int gfp_mask
)
234 struct list_head
* page_lru
, * dispose
;
/* Higher priority (smaller divisor) scans a larger LRU fraction. */
237 count
= nr_lru_pages
/ (priority
+1);
239 spin_lock(&pagemap_lru_lock
);
241 while (count
> 0 && (page_lru
= lru_cache
.prev
) != &lru_cache
) {
242 page
= list_entry(page_lru
, struct page
, lru
);
245 dispose
= &lru_cache
;
/* Recently-referenced pages get another trip around the LRU. */
246 if (test_and_clear_bit(PG_referenced
, &page
->flags
))
247 /* Roll the page at the top of the lru list,
248 * we could also be more aggressive putting
249 * the page in the young-dispose-list, so
250 * avoiding to free young pages in each pass.
252 goto dispose_continue
;
255 /* don't account passes over not DMA pages */
256 if ((gfp_mask
& __GFP_DMA
) && !PageDMA(page
))
257 goto dispose_continue
;
258 if (!(gfp_mask
& __GFP_BIGMEM
) && PageBIGMEM(page
))
259 goto dispose_continue
;
264 if (TryLockPage(page
))
265 goto dispose_continue
;
267 /* Release the pagemap_lru lock even if the page is not yet
268 queued in any lru queue since we have just locked down
269 the page so nobody else may SMP race with us running
270 a lru_cache_del() (lru_cache_del() always run with the
271 page locked down ;). */
272 spin_unlock(&pagemap_lru_lock
);
274 /* avoid unscalable SMP locking */
275 if (!page
->buffers
&& page_count(page
) > 1)
276 goto unlock_noput_continue
;
278 /* Take the pagecache_lock spinlock held to avoid
279 other tasks to notice the page while we are looking at its
280 page count. If it's a pagecache-page we'll free it
281 in one atomic transaction after checking its page count. */
282 spin_lock(&pagecache_lock
);
284 /* avoid freeing the page while it's locked */
287 /* Is it a buffer page? */
289 spin_unlock(&pagecache_lock
);
290 if (!try_to_free_buffers(page
))
291 goto unlock_continue
;
292 /* page was locked, inode can't go away under us */
294 atomic_sub(PAGE_CACHE_SIZE
, &buffermem
);
295 goto made_buffer_progress
;
297 spin_lock(&pagecache_lock
);
301 * We can't free pages unless there's just one user
302 * (count == 2 because we added one ourselves above).
304 if (page_count(page
) != 2)
305 goto cache_unlock_continue
;
308 * Is it a page swap page? If so, we want to
309 * drop it if it is no longer used, even if it
310 * were to be marked referenced..
312 if (PageSwapCache(page
)) {
313 spin_unlock(&pagecache_lock
);
314 __delete_from_swap_cache(page
);
315 goto made_inode_progress
;
318 /* is it a page-cache page? */
322 if (!pgcache_under_min())
324 remove_page_from_inode_queue(page
);
325 remove_page_from_hash_queue(page
);
327 spin_unlock(&pagecache_lock
);
328 goto made_inode_progress
;
330 goto cache_unlock_continue
;
/* Page on the LRU but in no recognized cache: should not happen. */
334 printk(KERN_ERR
"shrink_mmap: unknown LRU page!\n");
336 cache_unlock_continue
:
337 spin_unlock(&pagecache_lock
);
341 dispose_relock_continue
:
342 /* even if the dispose list is local, a truncate_inode_page()
343 may remove a page from its queue so always
344 synchronize with the lru lock while accesing the
346 spin_lock(&pagemap_lru_lock
);
347 list_add(page_lru
, dispose
);
350 unlock_noput_continue
:
352 goto dispose_relock_continue
;
355 list_add(page_lru
, dispose
);
360 page_cache_release(page
);
361 made_buffer_progress
:
365 spin_lock(&pagemap_lru_lock
);
366 /* nr_lru_pages needs the spinlock */
/* Put surviving pages back: young at the head, old at the tail. */
370 list_splice(&young
, &lru_cache
);
371 list_splice(&old
, lru_cache
.prev
);
373 spin_unlock(&pagemap_lru_lock
);
/*
 * Walk a hash chain looking for the page at (inode, offset).  Caller
 * holds pagecache_lock.  On a hit the page is marked referenced for
 * LRU aging in shrink_mmap().
 * NOTE(review): the loop head, not-found exit and return statement
 * are missing from this extract.
 */
378 static inline struct page
* __find_page_nolock(struct inode
* inode
, unsigned long offset
, struct page
*page
)
383 page
= page
->next_hash
;
387 if (page
->inode
!= inode
)
389 if (page
->offset
== offset
)
392 set_bit(PG_referenced
, &page
->flags
);
398 * By the time this is called, the page is locked and
399 * we don't have to worry about any races any more.
/*
 * First fdatasync pass: start write-out of every dirty, up-to-date,
 * unlocked buffer attached to the page.
 * NOTE(review): the loop setup and return value are missing from
 * this extract.
 */
403 static int writeout_one_page(struct page
*page
)
405 struct buffer_head
*bh
, *head
= page
->buffers
;
/* Only dirty + uptodate + unlocked buffers are queued for WRITE. */
409 if (buffer_locked(bh
) || !buffer_dirty(bh
) || !buffer_uptodate(bh
))
413 ll_rw_block(WRITE
, 1, &bh
);
414 } while ((bh
= bh
->b_this_page
) != head
);
/*
 * Second fdatasync pass: wait for the page's buffers and flag an
 * error for any requested buffer that is still not up to date.
 * NOTE(review): the wait call and the error accumulation/return are
 * missing from this extract.
 */
418 static int waitfor_one_page(struct page
*page
)
421 struct buffer_head
*bh
, *head
= page
->buffers
;
426 if (buffer_req(bh
) && !buffer_uptodate(bh
))
428 } while ((bh
= bh
->b_this_page
) != head
);
/*
 * Apply "fn" (writeout_one_page / waitfor_one_page) to every cached
 * page of the inode whose offset lies in [start, end).
 * NOTE(review): the per-page locking, restart logic and the
 * accumulated return value are missing from this extract.
 */
432 static int do_buffer_fdatasync(struct inode
*inode
, unsigned long start
, unsigned long end
, int (*fn
)(struct page
*))
439 spin_lock(&pagecache_lock
);
440 next
= inode
->i_pages
;
442 struct page
*page
= next
;
/* Range filter on each candidate page. */
446 if (page
->offset
>= end
)
448 if (page
->offset
< start
)
452 spin_unlock(&pagecache_lock
);
455 /* The buffers could have been free'd while we waited for the page lock */
460 spin_lock(&pagecache_lock
);
462 page_cache_release(page
);
464 spin_unlock(&pagecache_lock
);
470 * Two-stage data sync: first start the IO, then go back and
471 * collect the information..
/*
 * Returns the OR of both passes' results.
 * NOTE(review): the declaration of "retval" and the final return are
 * missing from this extract.
 */
473 int generic_buffer_fdatasync(struct inode
*inode
, unsigned long start
, unsigned long end
)
477 retval
= do_buffer_fdatasync(inode
, start
, end
, writeout_one_page
);
478 retval
|= do_buffer_fdatasync(inode
, start
, end
, waitfor_one_page
);
483 * This adds a page to the page cache, starting out as locked,
484 * owned by us, referenced, but not uptodate and with no errors.
/*
 * Caller holds pagecache_lock.
 * NOTE(review): the reference-count bump and the page->inode
 * assignment are missing from this extract.
 */
486 static inline void __add_to_page_cache(struct page
* page
,
487 struct inode
* inode
, unsigned long offset
,
/* Clear stale state bits, then hand the page back locked. */
492 flags
= page
->flags
& ~((1 << PG_uptodate
) | (1 << PG_error
) | (1 << PG_referenced
));
493 page
->flags
= flags
| (1 << PG_locked
);
494 page
->owner
= current
; /* REMOVEME */
496 page
->offset
= offset
;
497 add_page_to_inode_queue(inode
, page
);
498 __add_page_to_hash_queue(page
, hash
);
502 void add_to_page_cache(struct page
* page
, struct inode
* inode
, unsigned long offset
)
504 spin_lock(&pagecache_lock
);
505 __add_to_page_cache(page
, inode
, offset
, page_hash(inode
, offset
));
506 spin_unlock(&pagecache_lock
);
/*
 * Like add_to_page_cache() but only inserts when no page for
 * (inode, offset) is already hashed.
 * NOTE(review): the handling of a found alias and the return values
 * are missing from this extract.
 */
509 int add_to_page_cache_unique(struct page
* page
,
510 struct inode
* inode
, unsigned long offset
,
516 spin_lock(&pagecache_lock
);
/* Re-check under the lock for a racing insertion. */
517 alias
= __find_page_nolock(inode
, offset
, *hash
);
521 __add_to_page_cache(page
,inode
,offset
,hash
);
525 spin_unlock(&pagecache_lock
);
530 * This adds the requested page to the page cache if it isn't already there,
531 * and schedules an I/O to read in its contents from disk.
/*
 * NOTE(review): the early-return paths after the fast-path lookup and
 * after the allocation failure are missing from this extract.
 */
533 static inline void page_cache_read(struct file
* file
, unsigned long offset
)
535 unsigned long new_page
;
536 struct inode
*inode
= file
->f_dentry
->d_inode
;
537 struct page
** hash
= page_hash(inode
, offset
);
/* Fast path: nothing to do when the page is already cached. */
540 spin_lock(&pagecache_lock
);
541 page
= __find_page_nolock(inode
, offset
, *hash
);
542 spin_unlock(&pagecache_lock
);
546 new_page
= page_cache_alloc();
549 page
= page_cache_entry(new_page
);
/* Insertion succeeded: start the read, drop our extra reference. */
551 if (!add_to_page_cache_unique(page
, inode
, offset
, hash
)) {
552 inode
->i_op
->readpage(file
, page
);
553 page_cache_release(page
);
558 * We arrive here in the unlikely event that someone
559 * raced with us and added our page to the cache first.
561 page_cache_free(new_page
);
566 * Read in an entire cluster at once. A cluster is usually a 64k-
567 * aligned block that includes the address requested in "offset."
569 static void read_cluster_nonblocking(struct file
* file
,
570 unsigned long offset
)
572 off_t filesize
= file
->f_dentry
->d_inode
->i_size
;
573 unsigned long pages
= CLUSTER_PAGES
;
575 offset
= CLUSTER_OFFSET(offset
);
576 while ((pages
-- > 0) && (offset
< filesize
)) {
577 page_cache_read(file
, offset
);
578 offset
+= PAGE_CACHE_SIZE
;
585 * Wait for a page to get unlocked.
587 * This must be called with the caller "holding" the page,
588 * ie with increased "page->count" so that the page won't
589 * go away during the wait..
/*
 * Classic sleep loop: join the page's wait queue, kick the disk
 * queue, set TASK_UNINTERRUPTIBLE, and loop until PageLocked()
 * clears.  NOTE(review): the loop head and the schedule() call are
 * missing from this extract.
 */
591 void ___wait_on_page(struct page
*page
)
593 struct task_struct
*tsk
= current
;
594 DECLARE_WAITQUEUE(wait
, tsk
);
596 add_wait_queue(&page
->wait
, &wait
);
598 run_task_queue(&tq_disk
);
599 set_task_state(tsk
, TASK_UNINTERRUPTIBLE
);
/* Re-check after setting the task state to avoid a lost wakeup. */
600 if (!PageLocked(page
))
603 } while (PageLocked(page
));
604 tsk
->state
= TASK_RUNNING
;
605 remove_wait_queue(&page
->wait
, &wait
);
/*
 * Get an exclusive lock on the page: keep attempting TryLockPage(),
 * sleeping in ___wait_on_page() each time somebody else holds it.
 */
void lock_page(struct page *page)
{
	for (;;) {
		if (!TryLockPage(page))
			return;
		___wait_on_page(page);
	}
}
619 * a rather lightweight function, finding and getting a reference to a
620 * hashed page atomically, waiting for it if it's locked.
/*
 * NOTE(review): the retry label, the reference acquisition and the
 * schedule() call inside the wait sequence are missing from this
 * extract.
 */
622 struct page
* __find_get_page (struct inode
* inode
,
623 unsigned long offset
, struct page
**hash
)
628 * We scan the hash list read-only. Addition to and removal from
629 * the hash-list needs a held write-lock.
632 spin_lock(&pagecache_lock
);
633 page
= __find_page_nolock(inode
, offset
, *hash
);
636 spin_unlock(&pagecache_lock
);
638 /* Found the page, sleep if locked. */
639 if (page
&& PageLocked(page
)) {
640 struct task_struct
*tsk
= current
;
641 DECLARE_WAITQUEUE(wait
, tsk
);
643 run_task_queue(&tq_disk
);
645 __set_task_state(tsk
, TASK_UNINTERRUPTIBLE
);
646 add_wait_queue(&page
->wait
, &wait
);
/* Re-check after queueing to avoid a lost wakeup. */
648 if (PageLocked(page
))
650 __set_task_state(tsk
, TASK_RUNNING
);
651 remove_wait_queue(&page
->wait
, &wait
);
654 * The page might have been unhashed meanwhile. It's
655 * not freed though because we hold a reference to it.
656 * If this is the case then it will be freed _here_,
657 * and we recheck the hash anyway.
659 page_cache_release(page
);
663 * It's not locked so we can return the page and we hold
670 * Get the lock to a page atomically.
/*
 * Like __find_get_page() but returns with the page locked, using
 * TryLockPage() and re-trying after sleeping.
 * NOTE(review): the retry label, reference acquisition and the
 * schedule() call are missing from this extract.
 */
672 struct page
* __find_lock_page (struct inode
* inode
,
673 unsigned long offset
, struct page
**hash
)
678 * We scan the hash list read-only. Addition to and removal from
679 * the hash-list needs a held write-lock.
682 spin_lock(&pagecache_lock
);
683 page
= __find_page_nolock(inode
, offset
, *hash
);
686 spin_unlock(&pagecache_lock
);
688 /* Found the page, sleep if locked. */
689 if (page
&& TryLockPage(page
)) {
690 struct task_struct
*tsk
= current
;
691 DECLARE_WAITQUEUE(wait
, tsk
);
693 run_task_queue(&tq_disk
);
695 __set_task_state(tsk
, TASK_UNINTERRUPTIBLE
);
696 add_wait_queue(&page
->wait
, &wait
);
/* Re-check after queueing to avoid a lost wakeup. */
698 if (PageLocked(page
))
700 __set_task_state(tsk
, TASK_RUNNING
);
701 remove_wait_queue(&page
->wait
, &wait
);
704 * The page might have been unhashed meanwhile. It's
705 * not freed though because we hold a reference to it.
706 * If this is the case then it will be freed _here_,
707 * and we recheck the hash anyway.
709 page_cache_release(page
);
713 * It's not locked so we can return the page and we hold
/* Compile-time switches for the read-ahead statistics below. */
720 #define PROFILE_READAHEAD
721 #define DEBUG_READAHEAD
725 * Read-ahead profiling information
726 * --------------------------------
727 * Every PROFILE_MAXREADCOUNT, the following information is written
729 * Percentage of asynchronous read-ahead.
730 * Average of read-ahead fields context value.
731 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
735 #ifdef PROFILE_READAHEAD
737 #define PROFILE_MAXREADCOUNT 1000
/* Accumulators sampled and reported by profile_readahead(). */
739 static unsigned long total_reada
;
740 static unsigned long total_async
;
741 static unsigned long total_ramax
;
742 static unsigned long total_ralen
;
743 static unsigned long total_rawin
;
/*
 * Accumulate per-file read-ahead statistics and print the averages
 * every PROFILE_MAXREADCOUNT samples.
 * NOTE(review): the interrupt save/disable, the counter increments
 * (total_reada/total_async) and the reset of the totals are missing
 * from this extract; restore_flags() implies interrupts are disabled
 * in the missing part -- confirm against the full file.
 */
745 static void profile_readahead(int async
, struct file
*filp
)
753 total_ramax
+= filp
->f_ramax
;
754 total_ralen
+= filp
->f_ralen
;
755 total_rawin
+= filp
->f_rawin
;
757 if (total_reada
> PROFILE_MAXREADCOUNT
) {
/* Re-check the threshold to avoid a double report. */
760 if (!(total_reada
> PROFILE_MAXREADCOUNT
)) {
761 restore_flags(flags
);
765 printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
766 total_ramax
/total_reada
,
767 total_ralen
/total_reada
,
768 total_rawin
/total_reada
,
769 (total_async
*100)/total_reada
);
770 #ifdef DEBUG_READAHEAD
771 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
772 filp
->f_ramax
, filp
->f_ralen
, filp
->f_rawin
, filp
->f_raend
);
781 restore_flags(flags
);
784 #endif /* defined PROFILE_READAHEAD */
787 * Read-ahead context:
788 * -------------------
789 * The read ahead context fields of the "struct file" are the following:
790 * - f_raend : position of the first byte after the last page we tried to
792 * - f_ramax : current read-ahead maximum size.
793 * - f_ralen : length of the current IO read block we tried to read-ahead.
794 * - f_rawin : length of the current read-ahead window.
795 * if last read-ahead was synchronous then
797 * otherwise (was asynchronous)
798 * f_rawin = previous value of f_ralen + f_ralen
802 * MIN_READAHEAD : minimum read-ahead size when read-ahead.
803 * MAX_READAHEAD : maximum read-ahead size when read-ahead.
805 * Synchronous read-ahead benefits:
806 * --------------------------------
807 * Using reasonable IO xfer length from peripheral devices increase system
809 * Reasonable means, in this context, not too large but not too small.
810 * The actual maximum value is:
811 * MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
812 * and 32K if defined (4K page size assumed).
814 * Asynchronous read-ahead benefits:
815 * ---------------------------------
816 * Overlapping next read request and user process execution increase system
821 * We have to guess which further data are needed by the user process.
822 * If these data are often not really needed, it's bad for system
824 * However, we know that files are often accessed sequentially by
825 * application programs and it seems that it is possible to have some good
826 * strategy in that guessing.
827 * We only try to read-ahead files that seems to be read sequentially.
829 * Asynchronous read-ahead risks:
830 * ------------------------------
831 * In order to maximize overlapping, we must start some asynchronous read
832 * request from the device, as soon as possible.
833 * We must be very careful about:
834 * - The number of effective pending IO read requests.
835 * ONE seems to be the only reasonable value.
836 * - The total memory pool usage for the file access stream.
837 * This maximum memory usage is implicitly 2 IO read chunks:
838 * 2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
839 * 64k if defined (4K page size assumed).
842 static inline int get_max_readahead(struct inode
* inode
)
844 if (!inode
->i_dev
|| !max_readahead
[MAJOR(inode
->i_dev
)])
845 return MAX_READAHEAD
;
846 return max_readahead
[MAJOR(inode
->i_dev
)][MINOR(inode
->i_dev
)];
/*
 * Decide whether to do synchronous or asynchronous read-ahead around
 * "ppos" and issue page_cache_read() for up to max_ahead bytes past
 * the current read-ahead end.
 * NOTE(review): the initialization of raend/ahead/max_ahead and the
 * closing braces of several branches are missing from this extract.
 */
849 static void generic_file_readahead(int reada_ok
,
850 struct file
* filp
, struct inode
* inode
,
851 unsigned long ppos
, struct page
* page
)
853 unsigned long max_ahead
, ahead
;
855 int max_readahead
= get_max_readahead(inode
);
857 raend
= filp
->f_raend
& PAGE_CACHE_MASK
;
861 * The current page is locked.
862 * If the current position is inside the previous read IO request, do not
863 * try to reread previously read ahead pages.
864 * Otherwise decide or not to read ahead some pages synchronously.
865 * If we are not going to read ahead, set the read ahead context for this
868 if (PageLocked(page
)) {
869 if (!filp
->f_ralen
|| ppos
>= raend
|| ppos
+ filp
->f_ralen
< raend
) {
871 if (raend
< inode
->i_size
)
872 max_ahead
= filp
->f_ramax
;
874 filp
->f_ralen
= PAGE_CACHE_SIZE
;
876 filp
->f_raend
= ppos
+ filp
->f_ralen
;
877 filp
->f_rawin
+= filp
->f_ralen
;
882 * The current page is not locked.
883 * If we were reading ahead and,
884 * if the current max read ahead size is not zero and,
885 * if the current position is inside the last read-ahead IO request,
886 * it is the moment to try to read ahead asynchronously.
887 * We will later force unplug device in order to force asynchronous read IO.
889 else if (reada_ok
&& filp
->f_ramax
&& raend
>= PAGE_CACHE_SIZE
&&
890 ppos
<= raend
&& ppos
+ filp
->f_ralen
>= raend
) {
892 * Add ONE page to max_ahead in order to try to have about the same IO max size
893 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
894 * Compute the position of the last page we have tried to read in order to
895 * begin to read ahead just at the next page.
897 raend
-= PAGE_CACHE_SIZE
;
898 if (raend
< inode
->i_size
)
899 max_ahead
= filp
->f_ramax
+ PAGE_CACHE_SIZE
;
902 filp
->f_rawin
= filp
->f_ralen
;
908 * Try to read ahead pages.
909 * We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the
910 * scheduler, will work enough for us to avoid too bad actuals IO requests.
913 while (ahead
< max_ahead
) {
914 ahead
+= PAGE_CACHE_SIZE
;
915 page_cache_read(filp
, raend
+ ahead
);
918 * If we tried to read ahead some pages,
919 * If we tried to read ahead asynchronously,
920 * Try to force unplug of the device in order to start an asynchronous
922 * Update the read-ahead context.
923 * Store the length of the current read-ahead window.
924 * Double the current max read ahead size.
925 * That heuristic avoid to do some large IO for files that are not really
926 * accessed sequentially.
930 run_task_queue(&tq_disk
);
933 filp
->f_ralen
+= ahead
;
934 filp
->f_rawin
+= filp
->f_ralen
;
935 filp
->f_raend
= raend
+ ahead
+ PAGE_CACHE_SIZE
;
/* Exponential window growth, clamped to the device maximum. */
937 filp
->f_ramax
+= filp
->f_ramax
;
939 if (filp
->f_ramax
> max_readahead
)
940 filp
->f_ramax
= max_readahead
;
942 #ifdef PROFILE_READAHEAD
943 profile_readahead((reada_ok
== 2), filp
);
952 * This is a generic file read routine, and uses the
953 * inode->i_op->readpage() function for the actual low-level
/*
 * Core page-cache read loop shared by generic_file_read() and
 * sys_sendfile().  NOTE(review): many lines (local declarations,
 * labels, loop braces and several error paths) are missing from this
 * extract; comments below describe only what is visible.
 */
956 * This is really ugly. But the goto's actually try to clarify some
957 * of the logic when it comes to error handling etc.
959 void do_generic_file_read(struct file
* filp
, loff_t
*ppos
, read_descriptor_t
* desc
, read_actor_t actor
)
961 struct dentry
*dentry
= filp
->f_dentry
;
962 struct inode
*inode
= dentry
->d_inode
;
963 size_t pos
, pgpos
, page_cache
;
966 int max_readahead
= get_max_readahead(inode
);
971 pgpos
= pos
& PAGE_CACHE_MASK
;
973 * If the current position is outside the previous read-ahead window,
974 * we reset the current read-ahead context and set read ahead max to zero
975 * (will be set to just needed value later),
976 * otherwise, we assume that the file accesses are sequential enough to
977 * continue read-ahead.
979 if (pgpos
> filp
->f_raend
|| pgpos
+ filp
->f_rawin
< filp
->f_raend
) {
989 * Adjust the current value of read-ahead max.
990 * If the read operation stay in the first half page, force no readahead.
991 * Otherwise try to increase read ahead max just enough to do the read request.
992 * Then, at least MIN_READAHEAD if read ahead is ok,
993 * and at most MAX_READAHEAD in all cases.
995 if (pos
+ desc
->count
<= (PAGE_CACHE_SIZE
>> 1)) {
998 unsigned long needed
;
1000 needed
= ((pos
+ desc
->count
) & PAGE_CACHE_MASK
) - pgpos
;
1002 if (filp
->f_ramax
< needed
)
1003 filp
->f_ramax
= needed
;
1005 if (reada_ok
&& filp
->f_ramax
< MIN_READAHEAD
)
1006 filp
->f_ramax
= MIN_READAHEAD
;
1007 if (filp
->f_ramax
> max_readahead
)
1008 filp
->f_ramax
= max_readahead
;
1012 struct page
*page
, **hash
;
/* Stop at end-of-file. */
1014 if (pos
>= inode
->i_size
)
1018 * Try to find the data in the page cache..
1020 hash
= page_hash(inode
, pos
& PAGE_CACHE_MASK
);
1022 spin_lock(&pagecache_lock
);
1023 page
= __find_page_nolock(inode
, pos
& PAGE_CACHE_MASK
, *hash
);
1025 goto no_cached_page
;
1028 spin_unlock(&pagecache_lock
);
1030 if (!Page_Uptodate(page
))
1031 goto page_not_up_to_date
;
1034 * Ok, we have the page, and it's up-to-date, so
1035 * now we can copy it to user space...
1038 unsigned long offset
, nr
;
1040 offset
= pos
& ~PAGE_CACHE_MASK
;
1041 nr
= PAGE_CACHE_SIZE
- offset
;
/* Clamp the copy to the end of file. */
1042 if (nr
> inode
->i_size
- pos
)
1043 nr
= inode
->i_size
- pos
;
1046 * The actor routine returns how many bytes were actually used..
1047 * NOTE! This may not be the same as how much of a user buffer
1048 * we filled up (we may be padding etc), so we can only update
1049 * "pos" here (the actor routine has to update the user buffer
1050 * pointers and the remaining count).
1052 nr
= actor(desc
, (const char *) (page_address(page
) + offset
), nr
);
1054 page_cache_release(page
);
1055 if (nr
&& desc
->count
)
1061 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
1063 page_not_up_to_date
:
1064 generic_file_readahead(reada_ok
, filp
, inode
,
1065 pos
& PAGE_CACHE_MASK
, page
);
1067 if (Page_Uptodate(page
))
1070 /* Get exclusive access to the page ... */
1072 if (Page_Uptodate(page
)) {
1078 /* ... and start the actual read. The read will unlock the page. */
1079 error
= inode
->i_op
->readpage(filp
, page
);
1082 if (Page_Uptodate(page
))
1085 /* Again, try some read-ahead while waiting for the page to finish.. */
1086 generic_file_readahead(reada_ok
, filp
, inode
,
1087 pos
& PAGE_CACHE_MASK
, page
);
1089 if (Page_Uptodate(page
))
1094 /* UHHUH! A synchronous read error occurred. Report it */
1095 desc
->error
= error
;
1096 page_cache_release(page
);
1101 * Ok, it wasn't cached, so we need to create a new
1104 * We get here with the page cache lock held.
1107 spin_unlock(&pagecache_lock
);
1108 page_cache
= page_cache_alloc();
1110 desc
->error
= -ENOMEM
;
1115 * Somebody may have added the page while we
1116 * dropped the page cache lock. Check for that.
1118 spin_lock(&pagecache_lock
);
1119 page
= __find_page_nolock(inode
, pos
& PAGE_CACHE_MASK
, *hash
);
1125 * Ok, add the new page to the hash-queues...
1127 page
= page_cache_entry(page_cache
);
1128 __add_to_page_cache(page
, inode
, pos
& PAGE_CACHE_MASK
, hash
);
1129 spin_unlock(&pagecache_lock
);
1138 page_cache_free(page_cache
);
1139 UPDATE_ATIME(inode
);
/*
 * Read actor for generic_file_read(): copy up to "size" bytes from
 * the kernel page area to the user buffer in desc->buf, recording
 * -EFAULT on a partial copy.
 * NOTE(review): the size clamping against desc->count, the buf
 * advance and the return statement are missing from this extract.
 */
1142 static int file_read_actor(read_descriptor_t
* desc
, const char *area
, unsigned long size
)
1145 unsigned long count
= desc
->count
;
1149 left
= __copy_to_user(desc
->buf
, area
, size
);
1152 desc
->error
= -EFAULT
;
1154 desc
->count
= count
- size
;
1155 desc
->written
+= size
;
1161 * This is the "read()" routine for all filesystems
1162 * that can use the page cache directly.
/*
 * Validates the user buffer, then delegates to do_generic_file_read()
 * with file_read_actor.  Returns the bytes written, or the recorded
 * error when nothing was written.
 * NOTE(review): the desc initialization and return are missing from
 * this extract.
 */
1164 ssize_t
generic_file_read(struct file
* filp
, char * buf
, size_t count
, loff_t
*ppos
)
1169 if (access_ok(VERIFY_WRITE
, buf
, count
)) {
1172 read_descriptor_t desc
;
1178 do_generic_file_read(filp
, ppos
, &desc
, file_read_actor
);
1180 retval
= desc
.written
;
1182 retval
= desc
.error
;
/*
 * Read actor for sys_sendfile(): desc->buf smuggles the output
 * "struct file *"; "size" bytes are pushed through the target's
 * f_op->write.  The old_fs local suggests a set_fs(KERNEL_DS)
 * bracket in the lines missing from this extract -- confirm.
 */
1188 static int file_send_actor(read_descriptor_t
* desc
, const char *area
, unsigned long size
)
1191 unsigned long count
= desc
->count
;
1192 struct file
*file
= (struct file
*) desc
->buf
;
1193 mm_segment_t old_fs
;
1199 written
= file
->f_op
->write(file
, area
, size
, &file
->f_pos
);
/* A failed/short write is propagated through desc->error. */
1202 desc
->error
= written
;
1205 desc
->count
= count
- written
;
1206 desc
->written
+= written
;
/*
 * sendfile(2): copy "count" bytes from in_fd to out_fd inside the
 * kernel via do_generic_file_read() with file_send_actor.
 * NOTE(review): fput() calls, error labels and some checks are
 * missing from this extract.
 */
1210 asmlinkage ssize_t
sys_sendfile(int out_fd
, int in_fd
, off_t
*offset
, size_t count
)
1213 struct file
* in_file
, * out_file
;
1214 struct inode
* in_inode
, * out_inode
;
1217 * Get input file, and verify that it is ok..
1220 in_file
= fget(in_fd
);
1223 if (!(in_file
->f_mode
& FMODE_READ
))
1226 in_inode
= in_file
->f_dentry
->d_inode
;
/* The input side must support page-cache reads. */
1229 if (!in_inode
->i_op
|| !in_inode
->i_op
->readpage
)
1231 retval
= locks_verify_area(FLOCK_VERIFY_READ
, in_inode
, in_file
, in_file
->f_pos
, count
);
1236 * Get output file, and verify that it is ok..
1239 out_file
= fget(out_fd
);
1242 if (!(out_file
->f_mode
& FMODE_WRITE
))
1245 if (!out_file
->f_op
|| !out_file
->f_op
->write
)
1247 out_inode
= out_file
->f_dentry
->d_inode
;
1250 retval
= locks_verify_area(FLOCK_VERIFY_WRITE
, out_inode
, out_file
, out_file
->f_pos
, count
);
1256 read_descriptor_t desc
;
1257 loff_t pos
= 0, *ppos
;
/* Default to the input file's own position unless "offset" given. */
1260 ppos
= &in_file
->f_pos
;
1262 if (get_user(pos
, offset
))
1269 desc
.buf
= (char *) out_file
;
1271 do_generic_file_read(in_file
, ppos
, &desc
, file_send_actor
);
1273 retval
= desc
.written
;
1275 retval
= desc
.error
;
/* Hand the updated position back to userspace when one was given. */
1277 put_user(pos
, offset
);
1289 * filemap_nopage() is invoked via the vma operations vector for a
1290 * mapped memory region to read in file data during a page fault.
1292 * The goto's are kind of ugly, but this streamlines the normal case of having
1293 * it in the page cache, and handles the special cases reasonably without
1294 * having a lot of duplicated code.
1296 * XXX - at some point, this should return unique values to indicate to
1297 * the caller whether this is EIO, OOM, or SIGBUS.
/*
 * NOTE(review): the success/no_cached_page/page_not_uptodate labels,
 * lock/unlock calls and several return statements are missing from
 * this extract; comments below describe only the visible logic.
 */
1299 static unsigned long filemap_nopage(struct vm_area_struct
* area
,
1300 unsigned long address
, int no_share
)
1302 struct file
* file
= area
->vm_file
;
1303 struct dentry
* dentry
= file
->f_dentry
;
1304 struct inode
* inode
= dentry
->d_inode
;
1305 struct page
* page
, **hash
;
1306 unsigned long old_page
, new_page
= 0;
1308 unsigned long offset
= address
- area
->vm_start
+ area
->vm_offset
;
1311 * Semantics for shared and private memory areas are different
1312 * past the end of the file. A shared mapping past the last page
1313 * of the file is an error and results in a SIGBUS, while a
1314 * private mapping just maps in a zero page.
1316 if ((offset
>= inode
->i_size
) &&
1317 (area
->vm_flags
& VM_SHARED
) && (area
->vm_mm
== current
->mm
))
1321 * Do we have something in the page cache already?
1323 hash
= page_hash(inode
, offset
);
1325 page
= __find_get_page(inode
, offset
, hash
);
1327 goto no_cached_page
;
1330 * Ok, found a page in the page cache, now we need to check
1331 * that it's up-to-date.
1333 if (!Page_Uptodate(page
))
1334 goto page_not_uptodate
;
1338 * Found the page and have a reference on it, need to check sharing
1339 * and possibly copy it over to another page..
1341 old_page
= page_address(page
);
1343 flush_page_to_ram(old_page
);
/* no_share fault path: hand back a private copy of the page. */
1347 new_page
= page_cache_alloc();
1349 copy_page(new_page
, old_page
);
1350 flush_page_to_ram(new_page
);
1352 page_cache_release(page
);
1357 * If the requested offset is within our file, try to read a whole
1358 * cluster of pages at once.
1360 * Otherwise, we're off the end of a privately mapped file,
1361 * so we need to map a zero page.
1363 if (offset
< inode
->i_size
)
1364 read_cluster_nonblocking(file
, offset
);
1366 page_cache_read(file
, offset
);
1369 * The page we want has now been added to the page cache.
1370 * In the unlikely event that someone removed it in the
1371 * meantime, we'll just come back here and read it again.
1377 if (Page_Uptodate(page
)) {
1382 if (!inode
->i_op
->readpage(file
, page
)) {
1384 if (Page_Uptodate(page
))
1389 * Umm, take care of errors if the page isn't up-to-date.
1390 * Try to re-read it _once_. We do this synchronously,
1391 * because there really aren't any performance issues here
1392 * and we need to check for errors.
1395 if (Page_Uptodate(page
)) {
1399 ClearPageError(page
);
1400 if (!inode
->i_op
->readpage(file
, page
)) {
1402 if (Page_Uptodate(page
))
1407 * Things didn't work out. Return zero to tell the
1408 * mm layer so, possibly freeing the page cache page first.
1410 page_cache_release(page
);
1412 page_cache_free(new_page
);
1417 * Tries to write a shared mapped page to its backing store. May return -EIO
1418 * if the disk is full.
/*
 * Resolves the inode's writepage op, clamps the write size to i_size
 * for regular files, and invokes writepage on the page frame that
 * backs page_addr.
 * NOTE(review): the use of "size", error returns and any surrounding
 * locking are missing from this extract.
 */
1420 static inline int do_write_page(struct inode
* inode
, struct file
* file
,
1421 const char * page_addr
, unsigned long offset
)
1425 int (*writepage
) (struct file
*, struct page
*);
1428 size
= offset
+ PAGE_SIZE
;
1429 /* refuse to extend file size.. */
1430 if (S_ISREG(inode
->i_mode
)) {
1431 if (size
> inode
->i_size
)
1432 size
= inode
->i_size
;
1433 /* Ho humm.. We should have tested for this earlier */
1439 writepage
= inode
->i_op
->writepage
;
/* Translate the kernel address back to its struct page. */
1440 page
= mem_map
+ MAP_NR(page_addr
);
1443 retval
= writepage(file
, page
);
1449 static int filemap_write_page(struct vm_area_struct
* vma
,
1450 unsigned long offset
,
1456 struct dentry
* dentry
;
1457 struct inode
* inode
;
1459 file
= vma
->vm_file
;
1460 dentry
= file
->f_dentry
;
1461 inode
= dentry
->d_inode
;
1464 * If a task terminates while we're swapping the page, the vma and
1465 * and file could be released ... increment the count to be safe.
1468 result
= do_write_page(inode
, file
, (const char *) page
, offset
);
1475 * The page cache takes care of races between somebody
1476 * trying to swap something out and swap something in
1477 * at the same time..
1479 extern void wakeup_bdflush(int);
1480 int filemap_swapout(struct vm_area_struct
* vma
, struct page
* page
)
1482 int retval
= filemap_write_page(vma
, page
->offset
, page_address(page
), 0);
1487 static inline int filemap_sync_pte(pte_t
* ptep
, struct vm_area_struct
*vma
,
1488 unsigned long address
, unsigned int flags
)
1491 unsigned long pageaddr
;
1495 if (!(flags
& MS_INVALIDATE
)) {
1496 if (!pte_present(pte
))
1498 if (!pte_dirty(pte
))
1500 flush_page_to_ram(pte_page(pte
));
1501 flush_cache_page(vma
, address
);
1502 set_pte(ptep
, pte_mkclean(pte
));
1503 flush_tlb_page(vma
, address
);
1504 pageaddr
= pte_page(pte
);
1505 page
= page_cache_entry(pageaddr
);
1510 flush_cache_page(vma
, address
);
1512 flush_tlb_page(vma
, address
);
1513 if (!pte_present(pte
)) {
1514 swap_free(pte_val(pte
));
1517 pageaddr
= pte_page(pte
);
1518 if (!pte_dirty(pte
) || flags
== MS_INVALIDATE
) {
1519 page_cache_free(pageaddr
);
1523 error
= filemap_write_page(vma
, address
- vma
->vm_start
+ vma
->vm_offset
, pageaddr
, 1);
1524 page_cache_free(pageaddr
);
1528 static inline int filemap_sync_pte_range(pmd_t
* pmd
,
1529 unsigned long address
, unsigned long size
,
1530 struct vm_area_struct
*vma
, unsigned long offset
, unsigned int flags
)
1538 if (pmd_bad(*pmd
)) {
1539 printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd
));
1543 pte
= pte_offset(pmd
, address
);
1544 offset
+= address
& PMD_MASK
;
1545 address
&= ~PMD_MASK
;
1546 end
= address
+ size
;
1551 error
|= filemap_sync_pte(pte
, vma
, address
+ offset
, flags
);
1552 address
+= PAGE_SIZE
;
1554 } while (address
< end
);
1558 static inline int filemap_sync_pmd_range(pgd_t
* pgd
,
1559 unsigned long address
, unsigned long size
,
1560 struct vm_area_struct
*vma
, unsigned int flags
)
1563 unsigned long offset
, end
;
1568 if (pgd_bad(*pgd
)) {
1569 printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd
));
1573 pmd
= pmd_offset(pgd
, address
);
1574 offset
= address
& PGDIR_MASK
;
1575 address
&= ~PGDIR_MASK
;
1576 end
= address
+ size
;
1577 if (end
> PGDIR_SIZE
)
1581 error
|= filemap_sync_pte_range(pmd
, address
, end
- address
, vma
, offset
, flags
);
1582 address
= (address
+ PMD_SIZE
) & PMD_MASK
;
1584 } while (address
< end
);
1588 static int filemap_sync(struct vm_area_struct
* vma
, unsigned long address
,
1589 size_t size
, unsigned int flags
)
1592 unsigned long end
= address
+ size
;
1595 dir
= pgd_offset(vma
->vm_mm
, address
);
1596 flush_cache_range(vma
->vm_mm
, end
- size
, end
);
1597 while (address
< end
) {
1598 error
|= filemap_sync_pmd_range(dir
, address
, end
- address
, vma
, flags
);
1599 address
= (address
+ PGDIR_SIZE
) & PGDIR_MASK
;
1602 flush_tlb_range(vma
->vm_mm
, end
- size
, end
);
1607 * This handles (potentially partial) area unmaps..
1609 static void filemap_unmap(struct vm_area_struct
*vma
, unsigned long start
, size_t len
)
1611 filemap_sync(vma
, start
, len
, MS_ASYNC
);
1615 * Shared mappings need to be able to do the right thing at
1616 * close/unmap/sync. They will also use the private file as
1617 * backing-store for swapping..
1619 static struct vm_operations_struct file_shared_mmap
= {
1620 NULL
, /* no special open */
1621 NULL
, /* no special close */
1622 filemap_unmap
, /* unmap - we need to sync the pages */
1623 NULL
, /* no special protect */
1624 filemap_sync
, /* sync */
1626 filemap_nopage
, /* nopage */
1628 filemap_swapout
/* swapout */
1632 * Private mappings just need to be able to load in the map.
1634 * (This is actually used for shared mappings as well, if we
1635 * know they can't ever get write permissions..)
1637 static struct vm_operations_struct file_private_mmap
= {
1644 filemap_nopage
, /* nopage */
1649 /* This is used for a general mmap of a disk file */
1651 int generic_file_mmap(struct file
* file
, struct vm_area_struct
* vma
)
1653 struct vm_operations_struct
* ops
;
1654 struct inode
*inode
= file
->f_dentry
->d_inode
;
1656 ops
= &file_private_mmap
;
1657 if ((vma
->vm_flags
& VM_SHARED
) && (vma
->vm_flags
& VM_MAYWRITE
)) {
1658 if (!inode
->i_op
|| !inode
->i_op
->writepage
)
1660 ops
= &file_shared_mmap
;
1662 if (!inode
->i_sb
|| !S_ISREG(inode
->i_mode
))
1664 if (!inode
->i_op
|| !inode
->i_op
->readpage
)
1666 UPDATE_ATIME(inode
);
1673 * The msync() system call.
1676 static int msync_interval(struct vm_area_struct
* vma
,
1677 unsigned long start
, unsigned long end
, int flags
)
1679 if (vma
->vm_file
&& vma
->vm_ops
&& vma
->vm_ops
->sync
) {
1681 error
= vma
->vm_ops
->sync(vma
, start
, end
-start
, flags
);
1682 if (!error
&& (flags
& MS_SYNC
)) {
1683 struct file
* file
= vma
->vm_file
;
1685 struct dentry
* dentry
= file
->f_dentry
;
1686 error
= file_fsync(file
, dentry
);
1694 asmlinkage
long sys_msync(unsigned long start
, size_t len
, int flags
)
1697 struct vm_area_struct
* vma
;
1698 int unmapped_error
, error
= -EINVAL
;
1700 down(¤t
->mm
->mmap_sem
);
1702 if (start
& ~PAGE_MASK
)
1704 len
= (len
+ ~PAGE_MASK
) & PAGE_MASK
;
1708 if (flags
& ~(MS_ASYNC
| MS_INVALIDATE
| MS_SYNC
))
1714 * If the interval [start,end) covers some unmapped address ranges,
1715 * just ignore them, but return -EFAULT at the end.
1717 vma
= find_vma(current
->mm
, start
);
1720 /* Still start < end. */
1724 /* Here start < vma->vm_end. */
1725 if (start
< vma
->vm_start
) {
1726 unmapped_error
= -EFAULT
;
1727 start
= vma
->vm_start
;
1729 /* Here vma->vm_start <= start < vma->vm_end. */
1730 if (end
<= vma
->vm_end
) {
1732 error
= msync_interval(vma
, start
, end
, flags
);
1736 error
= unmapped_error
;
1739 /* Here vma->vm_start <= start < vma->vm_end < end. */
1740 error
= msync_interval(vma
, start
, vma
->vm_end
, flags
);
1743 start
= vma
->vm_end
;
1748 up(¤t
->mm
->mmap_sem
);
1753 * Write to a file through the page cache. This is mainly for the
1754 * benefit of NFS and possibly other network-based file systems.
1756 * We currently put everything into the page cache prior to writing it.
1757 * This is not a problem when writing full pages. With partial pages,
1758 * however, we first have to read the data into the cache, then
1759 * dirty the page, and finally schedule it for writing. Alternatively, we
1760 * could write-through just the portion of data that would go into that
1761 * page, but that would kill performance for applications that write data
1762 * line by line, and it's prone to race conditions.
1764 * Note that this routine doesn't try to keep track of dirty pages. Each
1765 * file system has to do this all by itself, unfortunately.
1769 generic_file_write(struct file
*file
, const char *buf
,
1770 size_t count
, loff_t
*ppos
,
1771 writepage_t write_one_page
)
1773 struct dentry
*dentry
= file
->f_dentry
;
1774 struct inode
*inode
= dentry
->d_inode
;
1775 unsigned long pos
= *ppos
;
1776 unsigned long limit
= current
->rlim
[RLIMIT_FSIZE
].rlim_cur
;
1777 struct page
*page
, **hash
;
1778 unsigned long page_cache
= 0;
1779 unsigned long written
;
1783 err
= file
->f_error
;
1791 if (file
->f_flags
& O_APPEND
)
1792 pos
= inode
->i_size
;
1795 * Check whether we've reached the file size limit.
1799 send_sig(SIGXFSZ
, current
, 0);
1805 * Check whether to truncate the write,
1806 * and send the signal if we do.
1808 if (count
> limit
- pos
) {
1809 send_sig(SIGXFSZ
, current
, 0);
1810 count
= limit
- pos
;
1814 unsigned long bytes
, pgpos
, offset
;
1816 * Try to find the page in the cache. If it isn't there,
1817 * allocate a free page.
1819 offset
= (pos
& ~PAGE_CACHE_MASK
);
1820 pgpos
= pos
& PAGE_CACHE_MASK
;
1821 bytes
= PAGE_CACHE_SIZE
- offset
;
1825 hash
= page_hash(inode
, pgpos
);
1827 page
= __find_lock_page(inode
, pgpos
, hash
);
1830 page_cache
= page_cache_alloc();
1836 page
= page_cache_entry(page_cache
);
1837 if (add_to_page_cache_unique(page
,inode
,pgpos
,hash
))
1843 /* We have exclusive IO access to the page.. */
1844 if (!PageLocked(page
)) {
1847 if (page
->owner
!= current
) {
1852 status
= write_one_page(file
, page
, offset
, bytes
, buf
);
1859 if (pos
> inode
->i_size
)
1860 inode
->i_size
= pos
;
1862 /* Mark it unlocked again and drop the page.. */
1864 page_cache_release(page
);
1872 page_cache_free(page_cache
);
1874 err
= written
? written
: status
;
/*
 * Support routines for directory caching using the page cache.
 */

/*
 * Unlock and free a page.
 */
void put_cached_page(unsigned long addr)
{
	struct page * page = page_cache_entry(addr);

	UnlockPage(page);
	/* Count must be exactly 2: the cache's reference plus ours. */
	if (page_count(page) != 2)
		panic("put_cached_page: page count=%d\n",
			page_count(page));
	page_cache_release(page);
}
1897 void __init
page_cache_init(unsigned long memory_size
)
1899 unsigned long htable_size
, order
;
1901 htable_size
= memory_size
>> PAGE_SHIFT
;
1902 htable_size
*= sizeof(struct page
*);
1903 for(order
= 0; (PAGE_SIZE
<< order
) < htable_size
; order
++)
1907 unsigned long tmp
= (PAGE_SIZE
<< order
) / sizeof(struct page
*);
1910 while((tmp
>>= 1UL) != 0UL)
1913 page_hash_table
= (struct page
**)
1914 __get_free_pages(GFP_ATOMIC
, order
);
1915 } while(page_hash_table
== NULL
&& --order
> 0);
1917 printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
1918 (1 << page_hash_bits
), order
, (PAGE_SIZE
<< order
));
1919 if (!page_hash_table
)
1920 panic("Failed to allocate page hash table\n");
1921 memset(page_hash_table
, 0, PAGE_HASH_SIZE
* sizeof(struct page
*));