1 /*
2 * linux/mm/filemap.c
4 * Copyright (C) 1994-1999 Linus Torvalds
5 */
7 /*
8 * This file handles the generic file mmap semantics used by
9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem used to do this differently, for example)
12 #include <linux/malloc.h>
13 #include <linux/shm.h>
14 #include <linux/mman.h>
15 #include <linux/locks.h>
16 #include <linux/pagemap.h>
17 #include <linux/swap.h>
18 #include <linux/smp_lock.h>
19 #include <linux/blkdev.h>
20 #include <linux/file.h>
21 #include <linux/swapctl.h>
22 #include <linux/slab.h>
24 #include <asm/pgtable.h>
25 #include <asm/uaccess.h>
28 * Shared mappings implemented 30.11.1994. It's not fully working yet,
29 * though.
31 * Shared mappings now work. 15.8.1995 Bruno.
33 * finished 'unifying' the page and buffer cache and SMP-threaded the
34 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
37 atomic_t page_cache_size = ATOMIC_INIT(0);
38 struct page * page_hash_table[PAGE_HASH_SIZE];
40 /*
41 * Define a request structure for outstanding page write requests
42 * to the background page io daemon
45 struct pio_request
47 struct pio_request * next;
48 struct file * file;
49 unsigned long offset;
50 unsigned long page;
52 static struct pio_request *pio_first = NULL, **pio_last = &pio_first;
53 static kmem_cache_t *pio_request_cache;
54 static DECLARE_WAIT_QUEUE_HEAD(pio_wait);
56 spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
59 static inline void
60 make_pio_request(struct file *, unsigned long, unsigned long);
62 void __add_page_to_hash_queue(struct page * page, struct page **p){
63 atomic_inc(&page_cache_size);
64 if((page->next_hash = *p) != NULL)
65 (*p)->pprev_hash = &page->next_hash;
66 *p = page;
67 page->pprev_hash = p;
68 if (page->buffers)
69 PAGE_BUG(page);
72 static void remove_page_from_hash_queue(struct page * page)
74 if(page->pprev_hash) {
75 if(page->next_hash)
76 page->next_hash->pprev_hash = page->pprev_hash;
77 *page->pprev_hash = page->next_hash;
78 page->pprev_hash = NULL;
80 atomic_dec(&page_cache_size);
83 void invalidate_inode_pages(struct inode * inode)
85 struct page ** p;
86 struct page * page;
88 repeat:
89 spin_lock(&pagecache_lock);
90 p = &inode->i_pages;
91 while ((page = *p) != NULL) {
92 get_page(page);
93 if (TryLockPage(page)) {
94 spin_unlock(&pagecache_lock);
95 wait_on_page(page);
96 page_cache_release(page);
97 goto repeat;
99 if (page_count(page) != 2)
100 printk("hm, busy page invalidated? (not necessarily a bug)\n");
101 inode->i_nrpages--;
102 if ((*p = page->next) != NULL)
103 (*p)->prev = page->prev;
104 page->next = NULL;
105 page->prev = NULL;
106 remove_page_from_hash_queue(page);
107 page->inode = NULL;
108 UnlockPage(page);
109 page_cache_release(page);
110 page_cache_release(page);
113 spin_unlock(&pagecache_lock);
116 * Truncate the page cache at a set offset, removing the pages
117 * that are beyond that offset (and zeroing out partial pages).
119 void truncate_inode_pages(struct inode * inode, unsigned long start)
121 struct page ** p;
122 struct page * page;
123 int partial = 0;
125 repeat:
126 spin_lock(&pagecache_lock);
127 p = &inode->i_pages;
128 while ((page = *p) != NULL) {
129 unsigned long offset = page->offset;
131 /* page wholly truncated - free it */
132 if (offset >= start) {
133 get_page(page);
134 if (TryLockPage(page)) {
135 spin_unlock(&pagecache_lock);
136 wait_on_page(page);
137 page_cache_release(page);
138 goto repeat;
140 spin_unlock(&pagecache_lock);
142 if (inode->i_op->flushpage)
143 inode->i_op->flushpage(inode, page, 0);
146 * We remove the page from the page cache
147 * _after_ we have destroyed all buffer-cache
148 * references to it. Otherwise some other process
149 * might think this inode page is not in the
150 * page cache and create a buffer-cache alias
151 * to it, causing all sorts of fun problems ...
153 spin_lock(&pagecache_lock);
154 inode->i_nrpages--;
155 if ((*p = page->next) != NULL)
156 (*p)->prev = page->prev;
157 page->next = NULL;
158 page->prev = NULL;
159 remove_page_from_hash_queue(page);
160 page->inode = NULL;
161 spin_unlock(&pagecache_lock);
163 UnlockPage(page);
164 page_cache_release(page);
165 page_cache_release(page);
168 * We have done things without the pagecache lock,
169 * so we'll have to repeat the scan.
170 * It's not possible to deadlock here because
171 * we are guaranteed to make progress. (ie. we have
172 * just removed a page)
174 goto repeat;
176 p = &page->next;
178 * there is only one partial page possible.
180 if (partial)
181 continue;
183 offset = start - offset;
184 /* partial truncate, clear end of page */
185 if (offset < PAGE_CACHE_SIZE) {
186 unsigned long address;
187 get_page(page);
188 if (TryLockPage(page)) {
189 spin_unlock(&pagecache_lock);
190 wait_on_page(page);
191 page_cache_release(page);
192 goto repeat;
195 * It's worth dropping the write lock only at
196 * this point. We are holding the page lock
197 * so nobody can do anything bad to us.
199 spin_unlock(&pagecache_lock);
200 partial = 1;
202 address = page_address(page);
203 memset((void *) (offset + address), 0, PAGE_CACHE_SIZE - offset);
204 flush_page_to_ram(address);
206 if (inode->i_op->flushpage)
207 inode->i_op->flushpage(inode, page, offset);
209 * we have dropped the spinlock so we have to
210 * restart.
212 UnlockPage(page);
213 page_cache_release(page);
214 goto repeat;
217 spin_unlock(&pagecache_lock);
221 * Remove a page from the page cache and free it. Caller has to make
222 * sure the page is locked and that nobody else uses it - or that usage
223 * is safe.
225 void remove_inode_page(struct page *page)
227 if (!PageLocked(page))
228 PAGE_BUG(page);
230 spin_lock(&pagecache_lock);
231 remove_page_from_inode_queue(page);
232 remove_page_from_hash_queue(page);
233 page->inode = NULL;
234 spin_unlock(&pagecache_lock);
237 int shrink_mmap(int priority, int gfp_mask)
239 static unsigned long clock = 0;
240 unsigned long limit = num_physpages;
241 struct page * page;
242 int count, err;
244 count = limit >> priority;
246 page = mem_map + clock;
247 do {
248 int referenced;
250 /* This works even in the presence of PageSkip because
251 * the first two entries at the beginning of a hole will
252 * be marked, not just the first.
254 page++;
255 clock++;
256 if (clock >= max_mapnr) {
257 clock = 0;
258 page = mem_map;
260 if (PageSkip(page)) {
261 /* next_hash is overloaded for PageSkip */
262 page = page->next_hash;
263 clock = page - mem_map;
266 referenced = test_and_clear_bit(PG_referenced, &page->flags);
268 if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
269 continue;
271 if (PageLocked(page))
272 continue;
274 /* Is it a buffer page? */
275 if (page->buffers) {
276 if (buffer_under_min())
277 continue;
279 if (TryLockPage(page))
280 continue;
281 err = try_to_free_buffers(page);
282 UnlockPage(page);
284 if (!err)
285 continue;
286 goto out;
289 /* We can't free pages unless there's just one user */
290 if (page_count(page) != 1)
291 continue;
293 count--;
296 * Is it a swap cache page? If so, we want to
297 * drop it if it is no longer used, even if it
298 * were to be marked referenced..
300 if (PageSwapCache(page)) {
301 if (referenced && swap_count(page->offset) != 1)
302 continue;
303 delete_from_swap_cache(page);
304 err = 1;
305 goto out;
308 if (referenced)
309 continue;
311 /* is it a page-cache page? */
312 spin_lock(&pagecache_lock);
313 if (page->inode) {
314 if (pgcache_under_min())
315 goto unlock_continue;
316 if (TryLockPage(page))
317 goto unlock_continue;
319 if (page_count(page) == 1) {
320 remove_page_from_inode_queue(page);
321 remove_page_from_hash_queue(page);
322 page->inode = NULL;
324 spin_unlock(&pagecache_lock);
326 UnlockPage(page);
327 page_cache_release(page);
328 err = 1;
329 goto out;
330 unlock_continue:
331 spin_unlock(&pagecache_lock);
332 continue;
334 spin_unlock(&pagecache_lock);
335 } while (count > 0);
336 err = 0;
337 out:
338 return err;
341 static inline struct page * __find_page_nolock(struct inode * inode, unsigned long offset, struct page *page)
343 goto inside;
345 for (;;) {
346 page = page->next_hash;
347 inside:
348 if (!page)
349 goto not_found;
350 if (page->inode != inode)
351 continue;
352 if (page->offset == offset)
353 break;
355 not_found:
356 return page;
360 * This adds a page to the page cache, starting out as locked,
361 * owned by us, referenced, but not uptodate and with no errors.
363 static inline void __add_to_page_cache(struct page * page,
364 struct inode * inode, unsigned long offset,
365 struct page **hash)
367 unsigned long flags;
369 flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error));
370 page->flags = flags | ((1 << PG_locked) | (1 << PG_referenced));
371 page->owner = (int)current; /* REMOVEME */
372 get_page(page);
373 page->offset = offset;
374 add_page_to_inode_queue(inode, page);
375 __add_page_to_hash_queue(page, hash);
378 int add_to_page_cache_unique(struct page * page,
379 struct inode * inode, unsigned long offset,
380 struct page **hash)
382 int err;
383 struct page *alias;
385 spin_lock(&pagecache_lock);
386 alias = __find_page_nolock(inode, offset, *hash);
388 err = 1;
389 if (!alias) {
390 __add_to_page_cache(page,inode,offset,hash);
391 err = 0;
394 spin_unlock(&pagecache_lock);
395 return err;
399 * Try to read ahead in the file. "page_cache" is a potentially free page
400 * that we could use for the cache (if it is 0 we can try to create one,
401 * this is all overlapped with the IO on the previous page finishing anyway)
403 static unsigned long try_to_read_ahead(struct file * file,
404 unsigned long offset, unsigned long page_cache)
406 struct inode *inode = file->f_dentry->d_inode;
407 struct page * page;
408 struct page ** hash;
410 offset &= PAGE_CACHE_MASK;
411 switch (page_cache) {
412 case 0:
413 page_cache = page_cache_alloc();
414 if (!page_cache)
415 break;
416 default:
417 if (offset >= inode->i_size)
418 break;
419 hash = page_hash(inode, offset);
420 page = page_cache_entry(page_cache);
421 if (!add_to_page_cache_unique(page, inode, offset, hash)) {
423 * We do not have to check the return value here
424 * because it's a readahead.
426 lock_kernel();
427 inode->i_op->readpage(file, page);
428 unlock_kernel();
429 page_cache = 0;
430 page_cache_release(page);
433 return page_cache;
437 * Wait for a page to get unlocked.
439 * This must be called with the caller "holding" the page,
440 * ie with increased "page->count" so that the page won't
441 * go away during the wait..
443 void ___wait_on_page(struct page *page)
445 struct task_struct *tsk = current;
446 DECLARE_WAITQUEUE(wait, tsk);
448 add_wait_queue(&page->wait, &wait);
449 repeat:
450 tsk->state = TASK_UNINTERRUPTIBLE;
451 run_task_queue(&tq_disk);
452 if (PageLocked(page)) {
453 int left;
454 left = schedule_timeout(HZ*20);
455 if (!left)
456 PAGE_BUG(page);
457 goto repeat;
459 tsk->state = TASK_RUNNING;
460 remove_wait_queue(&page->wait, &wait);
464 * Get an exclusive lock on the page..
466 void lock_page(struct page *page)
468 if (TryLockPage(page)) {
469 struct task_struct *tsk = current;
470 DECLARE_WAITQUEUE(wait, current);
472 run_task_queue(&tq_disk);
473 add_wait_queue(&page->wait, &wait);
474 tsk->state = TASK_UNINTERRUPTIBLE;
476 while (TryLockPage(page)) {
477 run_task_queue(&tq_disk);
478 schedule();
479 tsk->state = TASK_UNINTERRUPTIBLE;
482 remove_wait_queue(&page->wait, &wait);
483 tsk->state = TASK_RUNNING;
489 * a rather lightweight function, finding and getting a reference to a
490 * hashed page atomically, waiting for it if it's locked.
492 struct page * __find_get_page (struct inode * inode,
493 unsigned long offset, struct page *page)
497 * We scan the hash list read-only. Addition to and removal from
498 * the hash-list needs a held write-lock.
500 repeat:
501 spin_lock(&pagecache_lock);
502 page = __find_page_nolock(inode, offset, page);
503 if (page)
504 get_page(page);
505 spin_unlock(&pagecache_lock);
507 /* Found the page, sleep if locked. */
508 if (page && PageLocked(page)) {
509 struct task_struct *tsk = current;
510 DECLARE_WAITQUEUE(wait, tsk);
512 add_wait_queue(&page->wait, &wait);
513 tsk->state = TASK_UNINTERRUPTIBLE;
515 run_task_queue(&tq_disk);
516 if (PageLocked(page))
517 schedule();
518 tsk->state = TASK_RUNNING;
519 remove_wait_queue(&page->wait, &wait);
522 * The page might have been unhashed meanwhile. It's
523 * not freed though because we hold a reference to it.
524 * If this is the case then it will be freed _here_,
525 * and we recheck the hash anyway.
527 page_cache_release(page);
528 goto repeat;
531 * It's not locked so we can return the page and we hold
532 * a reference to it.
534 return page;
538 * Get the lock to a page atomically.
540 struct page * __find_lock_page (struct inode * inode,
541 unsigned long offset, struct page *page)
543 int locked;
547 * We scan the hash list read-only. Addition to and removal from
548 * the hash-list needs a held write-lock.
550 repeat:
551 spin_lock(&pagecache_lock);
552 page = __find_page_nolock(inode, offset, page);
553 locked = 0;
554 if (page) {
555 get_page(page);
556 if (TryLockPage(page))
557 locked = 1;
559 spin_unlock(&pagecache_lock);
561 /* Found the page, sleep if locked. */
562 if (page && locked) {
563 struct task_struct *tsk = current;
564 DECLARE_WAITQUEUE(wait, tsk);
566 add_wait_queue(&page->wait, &wait);
567 tsk->state = TASK_UNINTERRUPTIBLE;
569 run_task_queue(&tq_disk);
570 if (PageLocked(page))
571 schedule();
572 tsk->state = TASK_RUNNING;
573 remove_wait_queue(&page->wait, &wait);
576 * The page might have been unhashed meanwhile. It's
577 * not freed though because we hold a reference to it.
578 * If this is the case then it will be freed _here_,
579 * and we recheck the hash anyway.
581 page_cache_release(page);
582 goto repeat;
585 * It's not locked so we can return the page and we hold
586 * a reference to it.
588 return page;
591 #if 0
592 #define PROFILE_READAHEAD
593 #define DEBUG_READAHEAD
594 #endif
597 * Read-ahead profiling information
598 * --------------------------------
599 * Every PROFILE_MAXREADCOUNT reads, the following information is written
600 * to the syslog:
601 * Percentage of asynchronous read-ahead.
602 * Average values of the read-ahead context fields.
603 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
604 * to the syslog.
607 #ifdef PROFILE_READAHEAD
609 #define PROFILE_MAXREADCOUNT 1000
611 static unsigned long total_reada;
612 static unsigned long total_async;
613 static unsigned long total_ramax;
614 static unsigned long total_ralen;
615 static unsigned long total_rawin;
617 static void profile_readahead(int async, struct file *filp)
619 unsigned long flags;
621 ++total_reada;
622 if (async)
623 ++total_async;
625 total_ramax += filp->f_ramax;
626 total_ralen += filp->f_ralen;
627 total_rawin += filp->f_rawin;
629 if (total_reada > PROFILE_MAXREADCOUNT) {
630 save_flags(flags);
631 cli();
632 if (!(total_reada > PROFILE_MAXREADCOUNT)) {
633 restore_flags(flags);
634 return;
637 printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
638 total_ramax/total_reada,
639 total_ralen/total_reada,
640 total_rawin/total_reada,
641 (total_async*100)/total_reada);
642 #ifdef DEBUG_READAHEAD
643 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
644 filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
645 #endif
647 total_reada = 0;
648 total_async = 0;
649 total_ramax = 0;
650 total_ralen = 0;
651 total_rawin = 0;
653 restore_flags(flags);
656 #endif /* defined PROFILE_READAHEAD */
659 * Read-ahead context:
660 * -------------------
661 * The read ahead context fields of the "struct file" are the following:
662 * - f_raend : position of the first byte after the last page we tried to
663 * read ahead.
664 * - f_ramax : current read-ahead maximum size.
665 * - f_ralen : length of the current IO read block we tried to read-ahead.
666 * - f_rawin : length of the current read-ahead window.
667 * if last read-ahead was synchronous then
668 * f_rawin = f_ralen
669 * otherwise (was asynchronous)
670 * f_rawin = previous value of f_ralen + f_ralen
672 * Read-ahead limits:
673 * ------------------
674 * MIN_READAHEAD : minimum read-ahead size when reading ahead.
675 * MAX_READAHEAD : maximum read-ahead size when reading ahead.
677 * Synchronous read-ahead benefits:
678 * --------------------------------
679 * Using a reasonable IO transfer length from peripheral devices increases system
680 * performance.
681 * Reasonable means, in this context, not too large but not too small.
682 * The actual maximum value is:
683 * MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
684 * and 32k if it is defined (4K page size assumed).
686 * Asynchronous read-ahead benefits:
687 * ---------------------------------
688 * Overlapping the next read request with user process execution increases system
689 * performance.
691 * Read-ahead risks:
692 * -----------------
693 * We have to guess which data the user process will need next.
694 * If that data is often not really needed, it hurts system
695 * performance.
696 * However, we know that files are often accessed sequentially by
697 * application programs, so it seems possible to have a reasonably good
698 * guessing strategy.
699 * We only try to read ahead files that seem to be read sequentially.
701 * Asynchronous read-ahead risks:
702 * ------------------------------
703 * In order to maximize overlapping, we must start an asynchronous read
704 * request on the device as soon as possible.
705 * We must be very careful about:
706 * - The number of effective pending IO read requests.
707 * ONE seems to be the only reasonable value.
708 * - The total memory pool usage for the file access stream.
709 * This maximum memory usage is implicitly 2 IO read chunks:
710 * 2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
711 * 64k if defined (4K page size assumed).
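/*
 * Illustrative sketch only (guarded by #if 0, never compiled): a rough
 * picture of how the read-ahead window fields described above relate to
 * each other after one read-ahead pass of "ralen" bytes.  The helper name
 * and the "async" flag are hypothetical and not part of any kernel
 * interface; the real bookkeeping lives in generic_file_readahead() below.
 */
#if 0
static void readahead_window_example(struct file *filp, int async,
				     unsigned long ralen)
{
	/* Asynchronous read-ahead widens the window over the previous
	 * chunk as well; synchronous read-ahead starts a new window. */
	filp->f_rawin = async ? filp->f_ralen + ralen : ralen;
	filp->f_ralen = ralen;
	/* f_raend is the first byte after the last page read ahead. */
	filp->f_raend += ralen;
	/* The max read-ahead size doubles after a successful pass,
	 * clamped to MAX_READAHEAD. */
	filp->f_ramax += filp->f_ramax;
	if (filp->f_ramax > MAX_READAHEAD)
		filp->f_ramax = MAX_READAHEAD;
}
#endif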
714 static inline int get_max_readahead(struct inode * inode)
716 if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
717 return MAX_READAHEAD;
718 return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
721 static inline unsigned long generic_file_readahead(int reada_ok,
722 struct file * filp, struct inode * inode,
723 unsigned long ppos, struct page * page, unsigned long page_cache)
725 unsigned long max_ahead, ahead;
726 unsigned long raend;
727 int max_readahead = get_max_readahead(inode);
729 raend = filp->f_raend & PAGE_CACHE_MASK;
730 max_ahead = 0;
733 * The current page is locked.
734 * If the current position is inside the previous read IO request, do not
735 * try to reread previously read ahead pages.
736 * Otherwise, decide whether or not to read ahead some pages synchronously.
737 * If we are not going to read ahead, set the read ahead context for this
738 * page only.
740 if (PageLocked(page)) {
741 if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
742 raend = ppos;
743 if (raend < inode->i_size)
744 max_ahead = filp->f_ramax;
745 filp->f_rawin = 0;
746 filp->f_ralen = PAGE_CACHE_SIZE;
747 if (!max_ahead) {
748 filp->f_raend = ppos + filp->f_ralen;
749 filp->f_rawin += filp->f_ralen;
754 * The current page is not locked.
755 * If we were reading ahead, and
756 * if the current max read-ahead size is not zero, and
757 * if the current position is inside the last read-ahead IO request,
758 * then this is the moment to try to read ahead asynchronously.
759 * We will later force an unplug of the device in order to start the asynchronous read IO.
761 else if (reada_ok && filp->f_ramax && raend >= PAGE_CACHE_SIZE &&
762 ppos <= raend && ppos + filp->f_ralen >= raend) {
764 * Add ONE page to max_ahead in order to try to have about the same IO max size
765 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
766 * Compute the position of the last page we have tried to read in order to
767 * begin to read ahead just at the next page.
769 raend -= PAGE_CACHE_SIZE;
770 if (raend < inode->i_size)
771 max_ahead = filp->f_ramax + PAGE_CACHE_SIZE;
773 if (max_ahead) {
774 filp->f_rawin = filp->f_ralen;
775 filp->f_ralen = 0;
776 reada_ok = 2;
780 * Try to read ahead pages.
781 * We hope that ll_rw_blk() plug/unplug, request coalescing and sorting, and the
782 * scheduler will work well enough to keep the actual IO requests from being too bad.
784 ahead = 0;
785 while (ahead < max_ahead) {
786 ahead += PAGE_CACHE_SIZE;
787 page_cache = try_to_read_ahead(filp, raend + ahead,
788 page_cache);
791 * If we tried to read ahead some pages:
792 * if we tried to read ahead asynchronously,
793 * try to force an unplug of the device in order to start the asynchronous
794 * read IO request.
795 * Then update the read-ahead context:
796 * store the length of the current read-ahead window
797 * and double the current max read-ahead size.
798 * That heuristic avoids doing large IO for files that are not really
799 * accessed sequentially.
801 if (ahead) {
802 if (reada_ok == 2) {
803 run_task_queue(&tq_disk);
806 filp->f_ralen += ahead;
807 filp->f_rawin += filp->f_ralen;
808 filp->f_raend = raend + ahead + PAGE_CACHE_SIZE;
810 filp->f_ramax += filp->f_ramax;
812 if (filp->f_ramax > max_readahead)
813 filp->f_ramax = max_readahead;
815 #ifdef PROFILE_READAHEAD
816 profile_readahead((reada_ok == 2), filp);
817 #endif
820 return page_cache;
824 * "descriptor" for what we're up to with a read.
825 * This allows us to use the same read code yet
826 * have multiple different users of the data that
827 * we read from a file.
829 * The simplest case just copies the data to user
830 * mode.
832 typedef struct {
833 size_t written;
834 size_t count;
835 char * buf;
836 int error;
837 } read_descriptor_t;
839 typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long);
842 * This is a generic file read routine, and uses the
843 * inode->i_op->readpage() function for the actual low-level
844 * stuff.
846 * This is really ugly. But the goto's actually try to clarify some
847 * of the logic when it comes to error handling etc.
849 static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
851 struct dentry *dentry = filp->f_dentry;
852 struct inode *inode = dentry->d_inode;
853 size_t pos, pgpos, page_cache;
854 int reada_ok;
855 int error;
856 int max_readahead = get_max_readahead(inode);
858 page_cache = 0;
860 pos = *ppos;
861 pgpos = pos & PAGE_CACHE_MASK;
863 * If the current position is outside the previous read-ahead window,
864 * we reset the current read-ahead context and set read ahead max to zero
865 * (it will be set to just the needed value later);
866 * otherwise, we assume that the file accesses are sequential enough to
867 * continue read-ahead.
869 if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) {
870 reada_ok = 0;
871 filp->f_raend = 0;
872 filp->f_ralen = 0;
873 filp->f_ramax = 0;
874 filp->f_rawin = 0;
875 } else {
876 reada_ok = 1;
879 * Adjust the current value of read-ahead max.
880 * If the read operation stays within the first half page, force no readahead.
881 * Otherwise, try to increase the read-ahead max just enough to satisfy the read request.
882 * Then clamp it to at least MIN_READAHEAD if read-ahead is ok,
883 * and to at most MAX_READAHEAD in all cases.
885 if (pos + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
886 filp->f_ramax = 0;
887 } else {
888 unsigned long needed;
890 needed = ((pos + desc->count) & PAGE_CACHE_MASK) - pgpos;
892 if (filp->f_ramax < needed)
893 filp->f_ramax = needed;
895 if (reada_ok && filp->f_ramax < MIN_READAHEAD)
896 filp->f_ramax = MIN_READAHEAD;
897 if (filp->f_ramax > max_readahead)
898 filp->f_ramax = max_readahead;
901 for (;;) {
902 struct page *page, **hash;
904 if (pos >= inode->i_size)
905 break;
908 * Try to find the data in the page cache..
910 hash = page_hash(inode, pos & PAGE_CACHE_MASK);
912 spin_lock(&pagecache_lock);
913 page = __find_page_nolock(inode, pos & PAGE_CACHE_MASK, *hash);
914 if (!page)
915 goto no_cached_page;
916 found_page:
917 get_page(page);
918 spin_unlock(&pagecache_lock);
920 if (!Page_Uptodate(page))
921 goto page_not_up_to_date;
922 page_ok:
924 * Ok, we have the page, and it's up-to-date, so
925 * now we can copy it to user space...
928 unsigned long offset, nr;
930 offset = pos & ~PAGE_CACHE_MASK;
931 nr = PAGE_CACHE_SIZE - offset;
932 if (nr > inode->i_size - pos)
933 nr = inode->i_size - pos;
936 * The actor routine returns how many bytes were actually used..
937 * NOTE! This may not be the same as how much of a user buffer
938 * we filled up (we may be padding etc), so we can only update
939 * "pos" here (the actor routine has to update the user buffer
940 * pointers and the remaining count).
942 nr = actor(desc, (const char *) (page_address(page) + offset), nr);
943 pos += nr;
944 page_cache_release(page);
945 if (nr && desc->count)
946 continue;
947 break;
951 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
953 page_not_up_to_date:
954 page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
956 if (Page_Uptodate(page))
957 goto page_ok;
959 /* Get exclusive access to the page ... */
960 lock_page(page);
961 if (Page_Uptodate(page)) {
962 UnlockPage(page);
963 goto page_ok;
966 read_page:
967 /* ... and start the actual read. The read will unlock the page. */
968 lock_kernel();
969 error = inode->i_op->readpage(filp, page);
970 unlock_kernel();
972 if (!error) {
973 if (Page_Uptodate(page))
974 goto page_ok;
976 /* Again, try some read-ahead while waiting for the page to finish.. */
977 page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
978 wait_on_page(page);
979 if (Page_Uptodate(page))
980 goto page_ok;
981 error = -EIO;
984 /* UHHUH! A synchronous read error occurred. Report it */
985 desc->error = error;
986 page_cache_release(page);
987 break;
989 no_cached_page:
991 * Ok, it wasn't cached, so we need to create a new
992 * page..
994 * We get here with the page cache lock held.
996 if (!page_cache) {
997 spin_unlock(&pagecache_lock);
998 page_cache = page_cache_alloc();
999 if (!page_cache) {
1000 desc->error = -ENOMEM;
1001 break;
1005 * Somebody may have added the page while we
1006 * dropped the page cache lock. Check for that.
1008 spin_lock(&pagecache_lock);
1009 page = __find_page_nolock(inode, pos & PAGE_CACHE_MASK, *hash);
1010 if (page)
1011 goto found_page;
1015 * Ok, add the new page to the hash-queues...
1017 page = page_cache_entry(page_cache);
1018 __add_to_page_cache(page, inode, pos & PAGE_CACHE_MASK, hash);
1019 spin_unlock(&pagecache_lock);
1021 page_cache = 0;
1022 goto read_page;
1025 *ppos = pos;
1026 filp->f_reada = 1;
1027 if (page_cache)
1028 page_cache_free(page_cache);
1029 UPDATE_ATIME(inode);
1032 static int file_read_actor(read_descriptor_t * desc, const char *area, unsigned long size)
1034 unsigned long left;
1035 unsigned long count = desc->count;
1037 if (size > count)
1038 size = count;
1039 left = __copy_to_user(desc->buf, area, size);
1040 if (left) {
1041 size -= left;
1042 desc->error = -EFAULT;
1044 desc->count = count - size;
1045 desc->written += size;
1046 desc->buf += size;
1047 return size;
1051 * This is the "read()" routine for all filesystems
1052 * that can use the page cache directly.
1054 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1056 ssize_t retval;
1058 unlock_kernel();
1059 retval = -EFAULT;
1060 if (access_ok(VERIFY_WRITE, buf, count)) {
1061 retval = 0;
1062 if (count) {
1063 read_descriptor_t desc;
1065 desc.written = 0;
1066 desc.count = count;
1067 desc.buf = buf;
1068 desc.error = 0;
1069 do_generic_file_read(filp, ppos, &desc, file_read_actor);
1071 retval = desc.written;
1072 if (!retval)
1073 retval = desc.error;
1076 lock_kernel();
1077 return retval;
1080 static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned long size)
1082 ssize_t written;
1083 unsigned long count = desc->count;
1084 struct file *file = (struct file *) desc->buf;
1085 mm_segment_t old_fs;
1087 if (size > count)
1088 size = count;
1089 old_fs = get_fs();
1090 set_fs(KERNEL_DS);
1091 written = file->f_op->write(file, area, size, &file->f_pos);
1092 set_fs(old_fs);
1093 if (written < 0) {
1094 desc->error = written;
1095 written = 0;
1097 desc->count = count - written;
1098 desc->written += written;
1099 return written;
1102 asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
1104 ssize_t retval;
1105 struct file * in_file, * out_file;
1106 struct inode * in_inode, * out_inode;
1108 lock_kernel();
1111 * Get input file, and verify that it is ok..
1113 retval = -EBADF;
1114 in_file = fget(in_fd);
1115 if (!in_file)
1116 goto out;
1117 if (!(in_file->f_mode & FMODE_READ))
1118 goto fput_in;
1119 retval = -EINVAL;
1120 in_inode = in_file->f_dentry->d_inode;
1121 if (!in_inode)
1122 goto fput_in;
1123 if (!in_inode->i_op || !in_inode->i_op->readpage)
1124 goto fput_in;
1125 retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
1126 if (retval)
1127 goto fput_in;
1130 * Get output file, and verify that it is ok..
1132 retval = -EBADF;
1133 out_file = fget(out_fd);
1134 if (!out_file)
1135 goto fput_in;
1136 if (!(out_file->f_mode & FMODE_WRITE))
1137 goto fput_out;
1138 retval = -EINVAL;
1139 if (!out_file->f_op || !out_file->f_op->write)
1140 goto fput_out;
1141 out_inode = out_file->f_dentry->d_inode;
1142 if (!out_inode)
1143 goto fput_out;
1144 retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
1145 if (retval)
1146 goto fput_out;
1148 retval = 0;
1149 if (count) {
1150 read_descriptor_t desc;
1151 loff_t pos = 0, *ppos;
1153 retval = -EFAULT;
1154 ppos = &in_file->f_pos;
1155 if (offset) {
1156 if (get_user(pos, offset))
1157 goto fput_out;
1158 ppos = &pos;
1161 desc.written = 0;
1162 desc.count = count;
1163 desc.buf = (char *) out_file;
1164 desc.error = 0;
1165 do_generic_file_read(in_file, ppos, &desc, file_send_actor);
1167 retval = desc.written;
1168 if (!retval)
1169 retval = desc.error;
1170 if (offset)
1171 put_user(pos, offset);
1175 fput_out:
1176 fput(out_file);
1177 fput_in:
1178 fput(in_file);
1179 out:
1180 unlock_kernel();
1181 return retval;
1185 * Semantics for shared and private memory areas are different past the end
1186 * of the file. A shared mapping past the last page of the file is an error
1187 * and results in a SIGBUS, while a private mapping just maps in a zero page.
1189 * The goto's are kind of ugly, but this streamlines the normal case of having
1190 * it in the page cache, and handles the special cases reasonably without
1191 * having a lot of duplicated code.
1193 * WSH 06/04/97: fixed a memory leak and moved the allocation of new_page
1194 * ahead of the wait if we're sure to need it.
1196 static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
1198 struct file * file = area->vm_file;
1199 struct dentry * dentry = file->f_dentry;
1200 struct inode * inode = dentry->d_inode;
1201 unsigned long offset, reada, i;
1202 struct page * page, **hash;
1203 unsigned long old_page, new_page;
1204 int error;
1206 new_page = 0;
1207 offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
1208 if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
1209 goto no_page_nolock;
1210 unlock_kernel();
1213 * Do we have something in the page cache already?
1215 hash = page_hash(inode, offset);
1216 retry_find:
1217 page = __find_get_page(inode, offset, *hash);
1218 if (!page)
1219 goto no_cached_page;
1221 found_page:
1223 * Ok, found a page in the page cache, now we need to check
1224 * that it's up-to-date. First check whether we'll need an
1225 * extra page -- better to overlap the allocation with the I/O.
1227 if (no_share && !new_page) {
1228 new_page = page_cache_alloc();
1229 if (!new_page)
1230 goto failure;
1233 wait_on_page(page);
1235 if (!Page_Uptodate(page))
1236 PAGE_BUG(page);
1238 success:
1240 * Found the page and have a reference on it, need to check sharing
1241 * and possibly copy it over to another page..
1243 old_page = page_address(page);
1244 if (!no_share) {
1246 * Ok, we can share the cached page directly.. Get rid
1247 * of any potential extra pages.
1249 if (new_page)
1250 page_cache_free(new_page);
1252 flush_page_to_ram(old_page);
1253 lock_kernel();
1254 return old_page;
1258 * No sharing ... copy to the new page.
1260 copy_page(new_page, old_page);
1261 flush_page_to_ram(new_page);
1262 page_cache_release(page);
1263 lock_kernel();
1264 return new_page;
1266 no_cached_page:
1268 * Try to read in an entire cluster at once.
1270 reada = offset;
1271 reada >>= PAGE_CACHE_SHIFT + page_cluster;
1272 reada <<= PAGE_CACHE_SHIFT + page_cluster;
1274 for (i = 1 << page_cluster; i > 0; --i, reada += PAGE_CACHE_SIZE)
1275 new_page = try_to_read_ahead(file, reada, new_page);
1277 if (!new_page)
1278 new_page = page_cache_alloc();
1279 if (!new_page)
1280 goto no_page;
1283 * While getting the above page we might have slept,
1284 * so we need to re-check the situation with the page
1285 * cache.. The page we just got may be useful if we
1286 * can't share, so don't get rid of it here.
1288 page = __find_get_page(inode, offset, *hash);
1289 if (page)
1290 goto found_page;
1293 * Now, create a new page-cache page from the page we got
1295 page = page_cache_entry(new_page);
1296 if (add_to_page_cache_unique(page, inode, offset, hash))
1297 goto retry_find;
1300 * Now it's ours and locked, we can do initial IO to it:
1302 new_page = 0;
1304 lock_kernel();
1305 error = inode->i_op->readpage(file, page);
1306 unlock_kernel();
1308 if (!error) {
1309 wait_on_page(page);
1310 if (PageError(page))
1311 goto page_read_error;
1312 goto success;
1315 page_read_error:
1317 * Umm, take care of errors if the page isn't up-to-date.
1318 * Try to re-read it _once_. We do this synchronously,
1319 * because there really aren't any performance issues here
1320 * and we need to check for errors.
1322 if (!PageLocked(page))
1323 PAGE_BUG(page);
1324 ClearPageError(page);
1325 lock_kernel();
1326 error = inode->i_op->readpage(file, page);
1327 unlock_kernel();
1328 if (error)
1329 goto failure;
1330 wait_on_page(page);
1331 if (Page_Uptodate(page))
1332 goto success;
1335 * Things didn't work out. Return zero to tell the
1336 * mm layer so, possibly freeing the page cache page first.
1338 failure:
1339 page_cache_release(page);
1340 if (new_page)
1341 page_cache_free(new_page);
1342 no_page:
1343 lock_kernel();
1344 no_page_nolock:
1345 return 0;
1349 * Tries to write a shared mapped page to its backing store. May return -EIO
1350 * if the disk is full.
1352 static inline int do_write_page(struct inode * inode, struct file * file,
1353 const char * page_addr, unsigned long offset)
1355 int retval;
1356 unsigned long size;
1357 loff_t loff = offset;
1358 int (*writepage) (struct file *, struct page *);
1359 struct page * page;
1361 size = offset + PAGE_SIZE;
1362 /* refuse to extend file size.. */
1363 if (S_ISREG(inode->i_mode)) {
1364 if (size > inode->i_size)
1365 size = inode->i_size;
1366 /* Ho humm.. We should have tested for this earlier */
1367 if (size < offset)
1368 return -EIO;
1370 size -= offset;
1371 retval = -EIO;
1372 writepage = inode->i_op->writepage;
1373 page = mem_map + MAP_NR(page_addr);
1374 lock_page(page);
1376 if (writepage) {
1377 retval = writepage(file, page);
1378 } else {
1379 mm_segment_t old_fs = get_fs();
1380 set_fs(KERNEL_DS);
1381 if (size == file->f_op->write(file, page_addr, size, &loff))
1382 retval = 0;
1383 set_fs(old_fs);
1385 UnlockPage(page);
1386 return retval;
1389 static int filemap_write_page(struct vm_area_struct * vma,
1390 unsigned long offset,
1391 unsigned long page,
1392 int wait)
1394 int result;
1395 struct file * file;
1396 struct dentry * dentry;
1397 struct inode * inode;
1399 file = vma->vm_file;
1400 dentry = file->f_dentry;
1401 inode = dentry->d_inode;
1402 if (!file->f_op->write)
1403 return -EIO;
1406 * If a task terminates while we're swapping the page, the vma
1407 * and file could be released ... increment the count to be safe.
1409 file->f_count++;
1412 * If this is a swapping operation rather than msync(), then
1413 * leave the actual IO, and the restoration of the file count,
1414 * to the kpiod thread. Just queue the request for now.
1416 if (!wait) {
1417 make_pio_request(file, offset, page);
1418 return 0;
1421 result = do_write_page(inode, file, (const char *) page, offset);
1422 fput(file);
1423 return result;
1428 * The page cache takes care of races between somebody
1429 * trying to swap something out and swap something in
1430 * at the same time..
1432 int filemap_swapout(struct vm_area_struct * vma, struct page * page)
1434 return filemap_write_page(vma, page->offset, page_address(page), 0);
1437 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
1438 unsigned long address, unsigned int flags)
1440 pte_t pte = *ptep;
1441 unsigned long pageaddr;
1442 struct page *page;
1443 int error;
1445 if (!(flags & MS_INVALIDATE)) {
1446 if (!pte_present(pte))
1447 return 0;
1448 if (!pte_dirty(pte))
1449 return 0;
1450 flush_page_to_ram(pte_page(pte));
1451 flush_cache_page(vma, address);
1452 set_pte(ptep, pte_mkclean(pte));
1453 flush_tlb_page(vma, address);
1454 pageaddr = pte_page(pte);
1455 page = page_cache_entry(pageaddr);
1456 get_page(page);
1457 } else {
1458 if (pte_none(pte))
1459 return 0;
1460 flush_cache_page(vma, address);
1461 pte_clear(ptep);
1462 flush_tlb_page(vma, address);
1463 if (!pte_present(pte)) {
1464 swap_free(pte_val(pte));
1465 return 0;
1467 pageaddr = pte_page(pte);
1468 if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
1469 page_cache_free(pageaddr);
1470 return 0;
1473 error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, pageaddr, 1);
1474 page_cache_free(pageaddr);
1475 return error;
1478 static inline int filemap_sync_pte_range(pmd_t * pmd,
1479 unsigned long address, unsigned long size,
1480 struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
1482 pte_t * pte;
1483 unsigned long end;
1484 int error;
1486 if (pmd_none(*pmd))
1487 return 0;
1488 if (pmd_bad(*pmd)) {
1489 printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
1490 pmd_clear(pmd);
1491 return 0;
1493 pte = pte_offset(pmd, address);
1494 offset += address & PMD_MASK;
1495 address &= ~PMD_MASK;
1496 end = address + size;
1497 if (end > PMD_SIZE)
1498 end = PMD_SIZE;
1499 error = 0;
1500 do {
1501 error |= filemap_sync_pte(pte, vma, address + offset, flags);
1502 address += PAGE_SIZE;
1503 pte++;
1504 } while (address < end);
1505 return error;
1508 static inline int filemap_sync_pmd_range(pgd_t * pgd,
1509 unsigned long address, unsigned long size,
1510 struct vm_area_struct *vma, unsigned int flags)
1512 pmd_t * pmd;
1513 unsigned long offset, end;
1514 int error;
1516 if (pgd_none(*pgd))
1517 return 0;
1518 if (pgd_bad(*pgd)) {
1519 printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
1520 pgd_clear(pgd);
1521 return 0;
1523 pmd = pmd_offset(pgd, address);
1524 offset = address & PGDIR_MASK;
1525 address &= ~PGDIR_MASK;
1526 end = address + size;
1527 if (end > PGDIR_SIZE)
1528 end = PGDIR_SIZE;
1529 error = 0;
1530 do {
1531 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
1532 address = (address + PMD_SIZE) & PMD_MASK;
1533 pmd++;
1534 } while (address < end);
1535 return error;
1538 static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
1539 size_t size, unsigned int flags)
1541 pgd_t * dir;
1542 unsigned long end = address + size;
1543 int error = 0;
1545 dir = pgd_offset(vma->vm_mm, address);
1546 flush_cache_range(vma->vm_mm, end - size, end);
1547 while (address < end) {
1548 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
1549 address = (address + PGDIR_SIZE) & PGDIR_MASK;
1550 dir++;
1552 flush_tlb_range(vma->vm_mm, end - size, end);
1553 return error;
1557 * This handles (potentially partial) area unmaps..
1559 static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
1561 filemap_sync(vma, start, len, MS_ASYNC);
1565 * Shared mappings need to be able to do the right thing at
1566 * close/unmap/sync. They will also use the private file as
1567 * backing-store for swapping..
1569 static struct vm_operations_struct file_shared_mmap = {
1570 NULL, /* no special open */
1571 NULL, /* no special close */
1572 filemap_unmap, /* unmap - we need to sync the pages */
1573 NULL, /* no special protect */
1574 filemap_sync, /* sync */
1575 NULL, /* advise */
1576 filemap_nopage, /* nopage */
1577 NULL, /* wppage */
1578 filemap_swapout, /* swapout */
1579 NULL, /* swapin */
1583 * Private mappings just need to be able to load in the map.
1585 * (This is actually used for shared mappings as well, if we
1586 * know they can't ever get write permissions..)
1588 static struct vm_operations_struct file_private_mmap = {
1589 NULL, /* open */
1590 NULL, /* close */
1591 NULL, /* unmap */
1592 NULL, /* protect */
1593 NULL, /* sync */
1594 NULL, /* advise */
1595 filemap_nopage, /* nopage */
1596 NULL, /* wppage */
1597 NULL, /* swapout */
1598 NULL, /* swapin */
1601 /* This is used for a general mmap of a disk file */
1603 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1605 struct vm_operations_struct * ops;
1606 struct inode *inode = file->f_dentry->d_inode;
1608 ops = &file_private_mmap;
1609 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
1610 ops = &file_shared_mmap;
1611 if (!inode->i_sb || !S_ISREG(inode->i_mode))
1612 return -EACCES;
1613 if (!inode->i_op || !inode->i_op->readpage)
1614 return -ENOEXEC;
1615 UPDATE_ATIME(inode);
1616 vma->vm_ops = ops;
1617 return 0;
1622 * The msync() system call.
1625 static int msync_interval(struct vm_area_struct * vma,
1626 unsigned long start, unsigned long end, int flags)
1628 if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
1629 int error;
1630 error = vma->vm_ops->sync(vma, start, end-start, flags);
1631 if (!error && (flags & MS_SYNC)) {
1632 struct file * file = vma->vm_file;
1633 if (file) {
1634 struct dentry * dentry = file->f_dentry;
1635 error = file_fsync(file, dentry);
1638 return error;
1640 return 0;
1643 asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
1645 unsigned long end;
1646 struct vm_area_struct * vma;
1647 int unmapped_error, error = -EINVAL;
1649 down(&current->mm->mmap_sem);
1650 lock_kernel();
1651 if (start & ~PAGE_MASK)
1652 goto out;
1653 len = (len + ~PAGE_MASK) & PAGE_MASK;
1654 end = start + len;
1655 if (end < start)
1656 goto out;
1657 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1658 goto out;
1659 error = 0;
1660 if (end == start)
1661 goto out;
1663 * If the interval [start,end) covers some unmapped address ranges,
1664 * just ignore them, but return -EFAULT at the end.
1666 vma = find_vma(current->mm, start);
1667 unmapped_error = 0;
1668 for (;;) {
1669 /* Still start < end. */
1670 error = -EFAULT;
1671 if (!vma)
1672 goto out;
1673 /* Here start < vma->vm_end. */
1674 if (start < vma->vm_start) {
1675 unmapped_error = -EFAULT;
1676 start = vma->vm_start;
1678 /* Here vma->vm_start <= start < vma->vm_end. */
1679 if (end <= vma->vm_end) {
1680 if (start < end) {
1681 error = msync_interval(vma, start, end, flags);
1682 if (error)
1683 goto out;
1685 error = unmapped_error;
1686 goto out;
1688 /* Here vma->vm_start <= start < vma->vm_end < end. */
1689 error = msync_interval(vma, start, vma->vm_end, flags);
1690 if (error)
1691 goto out;
1692 start = vma->vm_end;
1693 vma = vma->vm_next;
1695 out:
1696 unlock_kernel();
1697 up(&current->mm->mmap_sem);
1698 return error;
1702 * Write to a file through the page cache. This is mainly for the
1703 * benefit of NFS and possibly other network-based file systems.
1705 * We currently put everything into the page cache prior to writing it.
1706 * This is not a problem when writing full pages. With partial pages,
1707 * however, we first have to read the data into the cache, then
1708 * dirty the page, and finally schedule it for writing. Alternatively, we
1709 * could write-through just the portion of data that would go into that
1710 * page, but that would kill performance for applications that write data
1711 * line by line, and it's prone to race conditions.
1713 * Note that this routine doesn't try to keep track of dirty pages. Each
1714 * file system has to do this all by itself, unfortunately.
1715 * okir@monad.swb.de
1717 ssize_t
1718 generic_file_write(struct file *file, const char *buf,
1719 size_t count, loff_t *ppos,
1720 writepage_t write_one_page)
1722 struct dentry *dentry = file->f_dentry;
1723 struct inode *inode = dentry->d_inode;
1724 unsigned long pos = *ppos;
1725 unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
1726 struct page *page, **hash;
1727 unsigned long page_cache = 0;
1728 unsigned long written;
1729 long status;
1730 int err;
1732 err = file->f_error;
1733 if (err) {
1734 file->f_error = 0;
1735 goto out;
1738 written = 0;
1740 if (file->f_flags & O_APPEND)
1741 pos = inode->i_size;
1744 * Check whether we've reached the file size limit.
1746 err = -EFBIG;
1747 if (pos >= limit) {
1748 send_sig(SIGXFSZ, current, 0);
1749 goto out;
1752 status = 0;
1754 * Check whether to truncate the write,
1755 * and send the signal if we do.
1757 if (count > limit - pos) {
1758 send_sig(SIGXFSZ, current, 0);
1759 count = limit - pos;
1762 unlock_kernel();
1764 while (count) {
1765 unsigned long bytes, pgpos, offset;
1767 * Try to find the page in the cache. If it isn't there,
1768 * allocate a free page.
1770 offset = (pos & ~PAGE_CACHE_MASK);
1771 pgpos = pos & PAGE_CACHE_MASK;
1772 bytes = PAGE_CACHE_SIZE - offset;
1773 if (bytes > count)
1774 bytes = count;
1776 hash = page_hash(inode, pgpos);
1777 repeat_find:
1778 page = __find_lock_page(inode, pgpos, *hash);
1779 if (!page) {
1780 if (!page_cache) {
1781 page_cache = page_cache_alloc();
1782 if (page_cache)
1783 goto repeat_find;
1784 status = -ENOMEM;
1785 break;
1787 page = page_cache_entry(page_cache);
1788 if (add_to_page_cache_unique(page,inode,pgpos,hash))
1789 goto repeat_find;
1791 page_cache = 0;
1794 /* We have exclusive IO access to the page.. */
1795 if (!PageLocked(page)) {
1796 PAGE_BUG(page);
1797 } else {
1798 if (page->owner != (int)current) {
1799 PAGE_BUG(page);
1803 status = write_one_page(file, page, offset, bytes, buf);
1805 /* Mark it unlocked again and drop the page.. */
1806 UnlockPage(page);
1807 page_cache_release(page);
1809 if (status < 0)
1810 break;
1812 written += status;
1813 count -= status;
1814 pos += status;
1815 buf += status;
1817 *ppos = pos;
1818 if (pos > inode->i_size)
1819 inode->i_size = pos;
1821 if (page_cache)
1822 page_cache_free(page_cache);
1824 err = written ? written : status;
1825 lock_kernel();
1826 out:
1827 return err;
1831 * Support routines for directory caching using the page cache.
1835 * Unlock and free a page.
1837 void put_cached_page(unsigned long addr)
1839 struct page * page = page_cache_entry(addr);
1841 UnlockPage(page);
1842 if (page_count(page) != 2)
1843 panic("put_cached_page: page count=%d\n",
1844 page_count(page));
1845 page_cache_release(page);
1849 /* Add request for page IO to the queue */
1851 static inline void put_pio_request(struct pio_request *p)
1853 *pio_last = p;
1854 p->next = NULL;
1855 pio_last = &p->next;
1858 /* Take the first page IO request off the queue */
1860 static inline struct pio_request * get_pio_request(void)
1862 struct pio_request * p = pio_first;
1863 pio_first = p->next;
1864 if (!pio_first)
1865 pio_last = &pio_first;
1866 return p;
1869 /* Make a new page IO request and queue it to the kpiod thread */
1871 static inline void make_pio_request(struct file *file,
1872 unsigned long offset,
1873 unsigned long pageaddr)
1875 struct pio_request *p;
1876 struct page *page;
1878 page = page_cache_entry(pageaddr);
1879 get_page(page);
1882 * We need to allocate without causing any recursive IO in the
1883 * current thread's context. We might currently be swapping out
1884 * as a result of an allocation made while holding a critical
1885 * filesystem lock. To avoid deadlock, we *MUST* not reenter
1886 * the filesystem in this thread.
1888 * We can wait for kswapd to free memory, or we can try to free
1889 * pages without actually performing further IO, without fear of
1890 * deadlock. --sct
1893 while ((p = kmem_cache_alloc(pio_request_cache, GFP_BUFFER)) == NULL) {
1894 if (try_to_free_pages(__GFP_WAIT))
1895 continue;
1896 current->state = TASK_INTERRUPTIBLE;
1897 schedule_timeout(HZ/10);
1900 p->file = file;
1901 p->offset = offset;
1902 p->page = pageaddr;
1904 put_pio_request(p);
1905 wake_up(&pio_wait);
1910 * This is the only thread which is allowed to write out filemap pages
1911 * while swapping.
1913 * To avoid deadlock, it is important that we never reenter this thread.
1914 * Although recursive memory allocations within this thread may result
1915 * in more page swapping, that swapping will always be done by queuing
1916 * another IO request to the same thread: we will never actually start
1917 * that IO request until we have finished with the current one, and so
1918 * we will not deadlock.
1921 int kpiod(void * unused)
1923 struct task_struct *tsk = current;
1924 DECLARE_WAITQUEUE(wait, tsk);
1925 struct inode * inode;
1926 struct dentry * dentry;
1927 struct pio_request * p;
1929 tsk->session = 1;
1930 tsk->pgrp = 1;
1931 strcpy(tsk->comm, "kpiod");
1932 sigfillset(&tsk->blocked);
1934 * Mark this task as a memory allocator - we don't want to get caught
1935 * up in the regular mm freeing frenzy if we have to allocate memory
1936 * in order to write stuff out.
1938 tsk->flags |= PF_MEMALLOC;
1940 lock_kernel();
1942 pio_request_cache = kmem_cache_create("pio_request",
1943 sizeof(struct pio_request),
1944 0, SLAB_HWCACHE_ALIGN,
1945 NULL, NULL);
1946 if (!pio_request_cache)
1947 panic ("Could not create pio_request slab cache");
1949 while (1) {
1950 tsk->state = TASK_INTERRUPTIBLE;
1951 add_wait_queue(&pio_wait, &wait);
1952 if (!pio_first)
1953 schedule();
1954 remove_wait_queue(&pio_wait, &wait);
1955 tsk->state = TASK_RUNNING;
1957 while (pio_first) {
1958 p = get_pio_request();
1959 dentry = p->file->f_dentry;
1960 inode = dentry->d_inode;
1962 do_write_page(inode, p->file,
1963 (const char *) p->page, p->offset);
1964 fput(p->file);
1965 page_cache_free(p->page);
1966 kmem_cache_free(pio_request_cache, p);