mm/filemap.c

   1 /*
   2  *      linux/mm/filemap.c
   3  *
   4  * Copyright (C) 1994-1999  Linus Torvalds
   5  */
   6
   7 /*
   8  * This file handles the generic file mmap semantics used by
   9  * most "normal" filesystems (but you don't /have/ to use this:
  10  * the NFS filesystem used to do this differently, for example)
  11  */
  12 #include <linux/malloc.h>
  13 #include <linux/shm.h>
  14 #include <linux/mman.h>
  15 #include <linux/locks.h>
  16 #include <linux/pagemap.h>
  17 #include <linux/swap.h>
  18 #include <linux/smp_lock.h>
  19 #include <linux/blkdev.h>
  20 #include <linux/file.h>
  21 #include <linux/swapctl.h>
  22 #include <linux/slab.h>
  23 #include <linux/init.h>
  24
  25 #include <asm/pgtable.h>
  26 #include <asm/uaccess.h>
  27
  28 /*
  29  * Shared mappings implemented 30.11.1994. It's not fully working yet,
  30  * though.
  31  *
  32  * Shared mappings now work. 15.8.1995  Bruno.
  33  *
  34  * finished 'unifying' the page and buffer cache and SMP-threaded the
  35  * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
  36  */
  37
  38 atomic_t page_cache_size = ATOMIC_INIT(0);
  39 unsigned int page_hash_bits;
  40 struct page **page_hash_table;
  41
  42 spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
  43
  44
  45 void __add_page_to_hash_queue(struct page * page, struct page **p)
  46 {
  47         atomic_inc(&page_cache_size);
  48         if((page->next_hash = *p) != NULL)
  49                 (*p)->pprev_hash = &page->next_hash;
  50         *p = page;
  51         page->pprev_hash = p;
  52         if (page->buffers)
  53                 PAGE_BUG(page);
  54 }
  55
  56 static void remove_page_from_hash_queue(struct page * page)
  57 {
  58         if(page->pprev_hash) {
  59                 if(page->next_hash)
  60                         page->next_hash->pprev_hash = page->pprev_hash;
  61                 *page->pprev_hash = page->next_hash;
  62                 page->pprev_hash = NULL;
  63         }
  64         atomic_dec(&page_cache_size);
  65 }
  66
  67 static void remove_page_from_inode_queue(struct page * page)
  68 {
  69         struct inode * inode = page->inode;
  70         struct page *prev, *next;
  71
  72         inode->i_nrpages--;
  73         next = page->next;
  74         prev = page->prev;
  75         if (inode->i_pages == page)
  76                 inode->i_pages = next;
  77         if (next)
  78                 next->prev = prev;
  79         if (prev)
  80                 prev->next = next;
  81         page->next = NULL;
  82         page->prev = NULL;
  83 }
  84
  85 /*
  86  * Remove a page from the page cache and free it. Caller has to make
  87  * sure the page is locked and that nobody else uses it - or that usage
  88  * is safe.
  89  */
  90 void remove_inode_page(struct page *page)
  91 {
  92         if (!PageLocked(page))
  93                 PAGE_BUG(page);
  94
  95         spin_lock(&pagecache_lock);
  96         remove_page_from_inode_queue(page);
  97         remove_page_from_hash_queue(page);
  98         page->inode = NULL;
  99         spin_unlock(&pagecache_lock);
 100 }
 101
 102 void invalidate_inode_pages(struct inode * inode)
 103 {
 104         struct page ** p;
 105         struct page * page;
 106
 107 repeat:
 108         spin_lock(&pagecache_lock);
 109         p = &inode->i_pages;
 110         while ((page = *p) != NULL) {
 111                 get_page(page);
 112                 if (TryLockPage(page)) {
 113                         spin_unlock(&pagecache_lock);
 114                         wait_on_page(page);
 115                         page_cache_release(page);
 116                         goto repeat;
 117                 }
 118                 if (page_count(page) != 2)
 119                         printk("hm, busy page invalidated? (not necesserily a bug)\n");
 120
 121                 remove_page_from_inode_queue(page);
 122                 remove_page_from_hash_queue(page);
 123                 page->inode = NULL;
 124                 UnlockPage(page);
 125                 page_cache_release(page);
 126                 page_cache_release(page);
 127
 128         }
 129         spin_unlock(&pagecache_lock);
 130 }
 131 /*
 132  * Truncate the page cache at a set offset, removing the pages
 133  * that are beyond that offset (and zeroing out partial pages).
 134  */
 135 void truncate_inode_pages(struct inode * inode, unsigned long start)
 136 {
 137         struct page ** p;
 138         struct page * page;
 139         int partial = 0;
 140
 141 repeat:
 142         spin_lock(&pagecache_lock);
 143         p = &inode->i_pages;
 144         while ((page = *p) != NULL) {
 145                 unsigned long offset = page->offset;
 146
 147                 /* page wholly truncated - free it */
 148                 if (offset >= start) {
 149                         get_page(page);
 150                         spin_unlock(&pagecache_lock);
 151
 152                         lock_page(page);
 153
 154                         if (inode->i_op->flushpage)
 155                                 inode->i_op->flushpage(inode, page, 0);
 156
 157                         /*
 158                          * We remove the page from the page cache
 159                          * _after_ we have destroyed all buffer-cache
 160                          * references to it. Otherwise some other process
 161                          * might think this inode page is not in the
 162                          * page cache and creates a buffer-cache alias
 163                          * to it causing all sorts of fun problems ...
 164                          */
 165                         remove_inode_page(page);
 166
 167                         UnlockPage(page);
 168                         page_cache_release(page);
 169                         page_cache_release(page);
 170
 171                         /*
 172                          * We have done things without the pagecache lock,
 173                          * so we'll have to repeat the scan.
 174                          * It's not possible to deadlock here because
 175                          * we are guaranteed to make progress. (ie. we have
 176                          * just removed a page)
 177                          */
 178                         goto repeat;
 179                 }
 180                 p = &page->next;
 181                 /*
 182                  * there is only one partial page possible.
 183                  */
 184                 if (partial)
 185                         continue;
 186
 187                 offset = start - offset;
 188                 /* partial truncate, clear end of page */
 189                 if (offset < PAGE_CACHE_SIZE) {
 190                         unsigned long address;
 191                         get_page(page);
 192                         spin_unlock(&pagecache_lock);
 193
 194                         lock_page(page);
 195                         partial = 1;
 196
 197                         address = page_address(page);
 198                         memset((void *) (offset + address), 0, PAGE_CACHE_SIZE - offset);
 199                         flush_page_to_ram(address);
 200
 201                         if (inode->i_op->flushpage)
 202                                 inode->i_op->flushpage(inode, page, offset);
 203                         /*
 204                          * we have dropped the spinlock so we have to
 205                          * restart.
 206                          */
 207                         UnlockPage(page);
 208                         page_cache_release(page);
 209                         goto repeat;
 210                 }
 211         }
 212         spin_unlock(&pagecache_lock);
 213 }
 214
 215 extern atomic_t too_many_dirty_buffers;
 216
 217 int shrink_mmap(int priority, int gfp_mask)
 218 {
 219         static unsigned long clock = 0;
 220         unsigned long limit = num_physpages << 1;
 221         struct page * page;
 222         int count, users;
 223
 224         count = limit >> priority;
 225
 226         page = mem_map + clock;
 227         do {
 228                 int referenced;
 229
 230                 /* This works even in the presence of PageSkip because
 231                  * the first two entries at the beginning of a hole will
 232                  * be marked, not just the first.
 233                  */
 234                 page++;
 235                 clock++;
 236                 if (clock >= max_mapnr) {
 237                         clock = 0;
 238                         page = mem_map;
 239                 }
 240                 if (PageSkip(page)) {
 241                         /* next_hash is overloaded for PageSkip */
 242                         page = page->next_hash;
 243                         clock = page - mem_map;
 244                 }
 245
 246                 referenced = test_and_clear_bit(PG_referenced, &page->flags);
 247
 248                 if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
 249                         continue;
 250
 251                 count--;
 252
 253                 /*
 254                  * Some common cases that we just short-circuit without
 255                  * getting the locks - we need to re-check this once we
 256                  * have the lock, but that's fine.
 257                  */
 258                 users = page_count(page);
 259                 if (!users)
 260                         continue;
 261                 if (!page->buffers) {
 262                         if (!page->inode)
 263                                 continue;
 264                         if (users > 1)
 265                                 continue;
 266                 }
 267
 268                 /*
 269                  * ok, now the page looks interesting. Re-check things
 270                  * and keep the lock.
 271                  */
 272                 spin_lock(&pagecache_lock);
 273                 if (!page->inode && !page->buffers) {
 274                         spin_unlock(&pagecache_lock);
 275                         continue;
 276                 }
 277                 if (!page_count(page)) {
 278                         spin_unlock(&pagecache_lock);
 279                         BUG();
 280                         continue;
 281                 }
 282                 get_page(page);
 283                 if (TryLockPage(page)) {
 284                         spin_unlock(&pagecache_lock);
 285                         goto put_continue;
 286                 }
 287
 288                 /*
 289                  * we keep pagecache_lock locked and unlock it in
 290                  * each branch, so that the page->inode case doesnt
 291                  * have to re-grab it. Here comes the 'real' logic
 292                  * to free memory:
 293                  */
 294
 295                 /* Is it a buffer page? */
 296                 if (page->buffers) {
 297                         spin_unlock(&pagecache_lock);
 298                         if (!try_to_free_buffers(page))
 299                                 goto unlock_continue;
 300                         /* page was locked, inode can't go away under us */
 301                         if (!page->inode)
 302                         {
 303                                 atomic_sub(PAGE_CACHE_SIZE, &buffermem);
 304                                 goto made_progress;
 305                         }
 306                         spin_lock(&pagecache_lock);
 307                 }
 308
 309                 /*
 310                  * We can't free pages unless there's just one user
 311                  * (count == 2 because we added one ourselves above).
 312                  */
 313                 if (page_count(page) != 2)
 314                         goto spin_unlock_continue;
 315
 316                 /*
 317                  * Is it a page swap page? If so, we want to
 318                  * drop it if it is no longer used, even if it
 319                  * were to be marked referenced..
 320                  */
 321                 if (PageSwapCache(page)) {
 322                         spin_unlock(&pagecache_lock);
 323                         if (referenced && swap_count(page->offset) != 2)
 324                                 goto unlock_continue;
 325                         __delete_from_swap_cache(page);
 326                         page_cache_release(page);
 327                         goto made_progress;
 328                 }
 329
 330                 /* is it a page-cache page? */
 331                 if (!referenced && page->inode && !pgcache_under_min()) {
 332                         remove_page_from_inode_queue(page);
 333                         remove_page_from_hash_queue(page);
 334                         page->inode = NULL;
 335                         spin_unlock(&pagecache_lock);
 336
 337                         page_cache_release(page);
 338                         goto made_progress;
 339                 }
 340 spin_unlock_continue:
 341                 spin_unlock(&pagecache_lock);
 342 unlock_continue:
 343                 UnlockPage(page);
 344 put_continue:
 345                 put_page(page);
 346         } while (count > 0);
 347         return 0;
 348 made_progress:
 349         UnlockPage(page);
 350         put_page(page);
 351         return 1;
 352 }
 353
 354 static inline struct page * __find_page_nolock(struct inode * inode, unsigned long offset, struct page *page)
 355 {
 356         goto inside;
 357
 358         for (;;) {
 359                 page = page->next_hash;
 360 inside:
 361                 if (!page)
 362                         goto not_found;
 363                 if (page->inode != inode)
 364                         continue;
 365                 if (page->offset == offset)
 366                         break;
 367         }
 368         set_bit(PG_referenced, &page->flags);
 369 not_found:
 370         return page;
 371 }
 372
 373 /*
 374  * By the time this is called, the page is locked and
 375  * we don't have to worry about any races any more.
 376  *
 377  * Start the IO..
 378  */
 379 static int writeout_one_page(struct page *page)
 380 {
 381         struct buffer_head *bh, *head = page->buffers;
 382
 383         bh = head;
 384         do {
 385                 if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
 386                         continue;
 387
 388                 bh->b_flushtime = 0;
 389                 ll_rw_block(WRITE, 1, &bh);
 390         } while ((bh = bh->b_this_page) != head);
 391         return 0;
 392 }
 393
 394 static int waitfor_one_page(struct page *page)
 395 {
 396         int error = 0;
 397         struct buffer_head *bh, *head = page->buffers;
 398
 399         bh = head;
 400         do {
 401                 wait_on_buffer(bh);
 402                 if (buffer_req(bh) && !buffer_uptodate(bh))
 403                         error = -EIO;
 404         } while ((bh = bh->b_this_page) != head);
 405         return error;
 406 }
 407
 408 static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end, int (*fn)(struct page *))
 409 {
 410         struct page *next;
 411         int retval = 0;
 412
 413         start &= PAGE_MASK;
 414
 415         spin_lock(&pagecache_lock);
 416         next = inode->i_pages;
 417         while (next) {
 418                 struct page *page = next;
 419                 next = page->next;
 420                 if (!page->buffers)
 421                         continue;
 422                 if (page->offset >= end)
 423                         continue;
 424                 if (page->offset < start)
 425                         continue;
 426
 427                 get_page(page);
 428                 spin_unlock(&pagecache_lock);
 429                 lock_page(page);
 430
 431                 /* The buffers could have been free'd while we waited for the page lock */
 432                 if (page->buffers)
 433                         retval |= fn(page);
 434
 435                 UnlockPage(page);
 436                 spin_lock(&pagecache_lock);
 437                 next = page->next;
 438                 page_cache_release(page);
 439         }
 440         spin_unlock(&pagecache_lock);
 441
 442         return retval;
 443 }
 444
 445 /*
 446  * Two-stage data sync: first start the IO, then go back and
 447  * collect the information..
 448  */
 449 int generic_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end)
 450 {
 451         int retval;
 452
 453         retval = do_buffer_fdatasync(inode, start, end, writeout_one_page);
 454         retval |= do_buffer_fdatasync(inode, start, end, waitfor_one_page);
 455         return retval;
 456 }
 457
 458 /*
 459  * This adds a page to the page cache, starting out as locked,
 460  * owned by us, referenced, but not uptodate and with no errors.
 461  */
 462 static inline void __add_to_page_cache(struct page * page,
 463         struct inode * inode, unsigned long offset,
 464         struct page **hash)
 465 {
 466         unsigned long flags;
 467
 468         flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error));
 469         page->flags = flags |  ((1 << PG_locked) | (1 << PG_referenced));
 470         page->owner = current;  /* REMOVEME */
 471         get_page(page);
 472         page->offset = offset;
 473         add_page_to_inode_queue(inode, page);
 474         __add_page_to_hash_queue(page, hash);
 475 }
 476
 477 void add_to_page_cache(struct page * page, struct inode * inode, unsigned long offset)
 478 {
 479         spin_lock(&pagecache_lock);
 480         __add_to_page_cache(page, inode, offset, page_hash(inode, offset));
 481         spin_unlock(&pagecache_lock);
 482 }
 483
 484 int add_to_page_cache_unique(struct page * page,
 485         struct inode * inode, unsigned long offset,
 486         struct page **hash)
 487 {
 488         int err;
 489         struct page *alias;
 490
 491         spin_lock(&pagecache_lock);
 492         alias = __find_page_nolock(inode, offset, *hash);
 493
 494         err = 1;
 495         if (!alias) {
 496                 __add_to_page_cache(page,inode,offset,hash);
 497                 err = 0;
 498         }
 499
 500         spin_unlock(&pagecache_lock);
 501         return err;
 502 }
 503
 504 /*
 505  * Try to read ahead in the file. "page_cache" is a potentially free page
 506  * that we could use for the cache (if it is 0 we can try to create one,
 507  * this is all overlapped with the IO on the previous page finishing anyway)
 508  */
 509 static unsigned long try_to_read_ahead(struct file * file,
 510                                 unsigned long offset, unsigned long page_cache)
 511 {
 512         struct inode *inode = file->f_dentry->d_inode;
 513         struct page * page;
 514         struct page ** hash;
 515
 516         offset &= PAGE_CACHE_MASK;
 517         switch (page_cache) {
 518         case 0:
 519                 page_cache = page_cache_alloc();
 520                 if (!page_cache)
 521                         break;
 522         default:
 523                 if (offset >= inode->i_size)
 524                         break;
 525                 hash = page_hash(inode, offset);
 526                 page = page_cache_entry(page_cache);
 527                 if (!add_to_page_cache_unique(page, inode, offset, hash)) {
 528                         /*
 529                          * We do not have to check the return value here
 530                          * because it's a readahead.
 531                          */
 532                         inode->i_op->readpage(file, page);
 533                         page_cache = 0;
 534                         page_cache_release(page);
 535                 }
 536         }
 537         return page_cache;
 538 }
 539
 540 /*
 541  * Wait for a page to get unlocked.
 542  *
 543  * This must be called with the caller "holding" the page,
 544  * ie with increased "page->count" so that the page won't
 545  * go away during the wait..
 546  */
 547 void ___wait_on_page(struct page *page)
 548 {
 549         struct task_struct *tsk = current;
 550         DECLARE_WAITQUEUE(wait, tsk);
 551
 552         add_wait_queue(&page->wait, &wait);
 553         do {
 554                 tsk->state = TASK_UNINTERRUPTIBLE;
 555                 run_task_queue(&tq_disk);
 556                 if (!PageLocked(page))
 557                         break;
 558                 schedule();
 559         } while (PageLocked(page));
 560         tsk->state = TASK_RUNNING;
 561         remove_wait_queue(&page->wait, &wait);
 562 }
 563
 564 /*
 565  * Get an exclusive lock on the page..
 566  */
 567 void lock_page(struct page *page)
 568 {
 569         if (TryLockPage(page)) {
 570                 struct task_struct *tsk = current;
 571                 DECLARE_WAITQUEUE(wait, current);
 572
 573                 run_task_queue(&tq_disk);
 574                 add_wait_queue(&page->wait, &wait);
 575                 tsk->state = TASK_UNINTERRUPTIBLE;
 576
 577                 while (TryLockPage(page)) {
 578                         run_task_queue(&tq_disk);
 579                         schedule();
 580                         tsk->state = TASK_UNINTERRUPTIBLE;
 581                 }
 582
 583                 remove_wait_queue(&page->wait, &wait);
 584                 tsk->state = TASK_RUNNING;
 585         }
 586 }
 587
 588
 589 /*
 590  * a rather lightweight function, finding and getting a reference to a
 591  * hashed page atomically, waiting for it if it's locked.
 592  */
 593 struct page * __find_get_page (struct inode * inode,
 594                                 unsigned long offset, struct page **hash)
 595 {
 596         struct page *page;
 597
 598         /*
 599          * We scan the hash list read-only. Addition to and removal from
 600          * the hash-list needs a held write-lock.
 601          */
 602 repeat:
 603         spin_lock(&pagecache_lock);
 604         page = __find_page_nolock(inode, offset, *hash);
 605         if (page)
 606                 get_page(page);
 607         spin_unlock(&pagecache_lock);
 608
 609         /* Found the page, sleep if locked. */
 610         if (page && PageLocked(page)) {
 611                 struct task_struct *tsk = current;
 612                 DECLARE_WAITQUEUE(wait, tsk);
 613
 614                 add_wait_queue(&page->wait, &wait);
 615                 tsk->state = TASK_UNINTERRUPTIBLE;
 616
 617                 run_task_queue(&tq_disk);
 618                 if (PageLocked(page))
 619                         schedule();
 620                 tsk->state = TASK_RUNNING;
 621                 remove_wait_queue(&page->wait, &wait);
 622
 623                 /*
 624                  * The page might have been unhashed meanwhile. It's
 625                  * not freed though because we hold a reference to it.
 626                  * If this is the case then it will be freed _here_,
 627                  * and we recheck the hash anyway.
 628                  */
 629                 page_cache_release(page);
 630                 goto repeat;
 631         }
 632         /*
 633          * It's not locked so we can return the page and we hold
 634          * a reference to it.
 635          */
 636         return page;
 637 }
 638
 639 /*
 640  * Get the lock to a page atomically.
 641  */
 642 struct page * __find_lock_page (struct inode * inode,
 643                                 unsigned long offset, struct page **hash)
 644 {
 645         struct page *page;
 646
 647         /*
 648          * We scan the hash list read-only. Addition to and removal from
 649          * the hash-list needs a held write-lock.
 650          */
 651 repeat:
 652         spin_lock(&pagecache_lock);
 653         page = __find_page_nolock(inode, offset, *hash);
 654         if (page)
 655                 get_page(page);
 656         spin_unlock(&pagecache_lock);
 657
 658         /* Found the page, sleep if locked. */
 659         if (page && TryLockPage(page)) {
 660                 struct task_struct *tsk = current;
 661                 DECLARE_WAITQUEUE(wait, tsk);
 662
 663                 add_wait_queue(&page->wait, &wait);
 664                 tsk->state = TASK_UNINTERRUPTIBLE;
 665
 666                 run_task_queue(&tq_disk);
 667                 if (PageLocked(page))
 668                         schedule();
 669                 tsk->state = TASK_RUNNING;
 670                 remove_wait_queue(&page->wait, &wait);
 671
 672                 /*
 673                  * The page might have been unhashed meanwhile. It's
 674                  * not freed though because we hold a reference to it.
 675                  * If this is the case then it will be freed _here_,
 676                  * and we recheck the hash anyway.
 677                  */
 678                 page_cache_release(page);
 679                 goto repeat;
 680         }
 681         /*
 682          * It's not locked so we can return the page and we hold
 683          * a reference to it.
 684          */
 685         return page;
 686 }
 687
 688 #if 0
 689 #define PROFILE_READAHEAD
 690 #define DEBUG_READAHEAD
 691 #endif
 692
 693 /*
 694  * Read-ahead profiling information
 695  * --------------------------------
 696  * Every PROFILE_MAXREADCOUNT, the following information is written
 697  * to the syslog:
 698  *   Percentage of asynchronous read-ahead.
 699  *   Average of read-ahead fields context value.
 700  * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
 701  * to the syslog.
 702  */
 703
 704 #ifdef PROFILE_READAHEAD
 705
 706 #define PROFILE_MAXREADCOUNT 1000
 707
 708 static unsigned long total_reada;
 709 static unsigned long total_async;
 710 static unsigned long total_ramax;
 711 static unsigned long total_ralen;
 712 static unsigned long total_rawin;
 713
 714 static void profile_readahead(int async, struct file *filp)
 715 {
 716         unsigned long flags;
 717
 718         ++total_reada;
 719         if (async)
 720                 ++total_async;
 721
 722         total_ramax     += filp->f_ramax;
 723         total_ralen     += filp->f_ralen;
 724         total_rawin     += filp->f_rawin;
 725
 726         if (total_reada > PROFILE_MAXREADCOUNT) {
 727                 save_flags(flags);
 728                 cli();
 729                 if (!(total_reada > PROFILE_MAXREADCOUNT)) {
 730                         restore_flags(flags);
 731                         return;
 732                 }
 733
 734                 printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
 735                         total_ramax/total_reada,
 736                         total_ralen/total_reada,
 737                         total_rawin/total_reada,
 738                         (total_async*100)/total_reada);
 739 #ifdef DEBUG_READAHEAD
 740                 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
 741                         filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
 742 #endif
 743
 744                 total_reada     = 0;
 745                 total_async     = 0;
 746                 total_ramax     = 0;
 747                 total_ralen     = 0;
 748                 total_rawin     = 0;
 749
 750                 restore_flags(flags);
 751         }
 752 }
 753 #endif  /* defined PROFILE_READAHEAD */
 754
 755 /*
 756  * Read-ahead context:
 757  * -------------------
 758  * The read ahead context fields of the "struct file" are the following:
 759  * - f_raend : position of the first byte after the last page we tried to
 760  *             read ahead.
 761  * - f_ramax : current read-ahead maximum size.
 762  * - f_ralen : length of the current IO read block we tried to read-ahead.
 763  * - f_rawin : length of the current read-ahead window.
 764  *              if last read-ahead was synchronous then
 765  *                      f_rawin = f_ralen
 766  *              otherwise (was asynchronous)
 767  *                      f_rawin = previous value of f_ralen + f_ralen
 768  *
 769  * Read-ahead limits:
 770  * ------------------
 771  * MIN_READAHEAD   : minimum read-ahead size when read-ahead.
 772  * MAX_READAHEAD   : maximum read-ahead size when read-ahead.
 773  *
 774  * Synchronous read-ahead benefits:
 775  * --------------------------------
 776  * Using reasonable IO xfer length from peripheral devices increase system
 777  * performances.
 778  * Reasonable means, in this context, not too large but not too small.
 779  * The actual maximum value is:
 780  *      MAX_READAHEAD + PAGE_CACHE_SIZE = 76k is CONFIG_READA_SMALL is undefined
 781  *      and 32K if defined (4K page size assumed).
 782  *
 783  * Asynchronous read-ahead benefits:
 784  * ---------------------------------
 785  * Overlapping next read request and user process execution increase system
 786  * performance.
 787  *
 788  * Read-ahead risks:
 789  * -----------------
 790  * We have to guess which further data are needed by the user process.
 791  * If these data are often not really needed, it's bad for system
 792  * performances.
 793  * However, we know that files are often accessed sequentially by
 794  * application programs and it seems that it is possible to have some good
 795  * strategy in that guessing.
 796  * We only try to read-ahead files that seems to be read sequentially.
 797  *
 798  * Asynchronous read-ahead risks:
 799  * ------------------------------
 800  * In order to maximize overlapping, we must start some asynchronous read
 801  * request from the device, as soon as possible.
 802  * We must be very careful about:
 803  * - The number of effective pending IO read requests.
 804  *   ONE seems to be the only reasonable value.
 805  * - The total memory pool usage for the file access stream.
 806  *   This maximum memory usage is implicitly 2 IO read chunks:
 807  *   2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
 808  *   64k if defined (4K page size assumed).
 809  */
 810
 811 static inline int get_max_readahead(struct inode * inode)
 812 {
 813         if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
 814                 return MAX_READAHEAD;
 815         return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
 816 }
 817
 818 static inline unsigned long generic_file_readahead(int reada_ok,
 819         struct file * filp, struct inode * inode,
 820         unsigned long ppos, struct page * page, unsigned long page_cache)
 821 {
 822         unsigned long max_ahead, ahead;
 823         unsigned long raend;
 824         int max_readahead = get_max_readahead(inode);
 825
 826         raend = filp->f_raend & PAGE_CACHE_MASK;
 827         max_ahead = 0;
 828
 829 /*
 830  * The current page is locked.
 831  * If the current position is inside the previous read IO request, do not
 832  * try to reread previously read ahead pages.
 833  * Otherwise decide or not to read ahead some pages synchronously.
 834  * If we are not going to read ahead, set the read ahead context for this
 835  * page only.
 836  */
 837         if (PageLocked(page)) {
 838                 if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
 839                         raend = ppos;
 840                         if (raend < inode->i_size)
 841                                 max_ahead = filp->f_ramax;
 842                         filp->f_rawin = 0;
 843                         filp->f_ralen = PAGE_CACHE_SIZE;
 844                         if (!max_ahead) {
 845                                 filp->f_raend  = ppos + filp->f_ralen;
 846                                 filp->f_rawin += filp->f_ralen;
 847                         }
 848                 }
 849         }
 850 /*
 851  * The current page is not locked.
 852  * If we were reading ahead and,
 853  * if the current max read ahead size is not zero and,
 854  * if the current position is inside the last read-ahead IO request,
 855  *   it is the moment to try to read ahead asynchronously.
 856  * We will later force unplug device in order to force asynchronous read IO.
 857  */
 858         else if (reada_ok && filp->f_ramax && raend >= PAGE_CACHE_SIZE &&
 859                  ppos <= raend && ppos + filp->f_ralen >= raend) {
 860 /*
 861  * Add ONE page to max_ahead in order to try to have about the same IO max size
 862  * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
 863  * Compute the position of the last page we have tried to read in order to
 864  * begin to read ahead just at the next page.
 865  */
 866                 raend -= PAGE_CACHE_SIZE;
 867                 if (raend < inode->i_size)
 868                         max_ahead = filp->f_ramax + PAGE_CACHE_SIZE;
 869
 870                 if (max_ahead) {
 871                         filp->f_rawin = filp->f_ralen;
 872                         filp->f_ralen = 0;
 873                         reada_ok      = 2;
 874                 }
 875         }
 876 /*
 877  * Try to read ahead pages.
 878  * We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the
 879  * scheduler, will work enough for us to avoid too bad actuals IO requests.
 880  */
 881         ahead = 0;
 882         while (ahead < max_ahead) {
 883                 ahead += PAGE_CACHE_SIZE;
 884                 page_cache = try_to_read_ahead(filp, raend + ahead,
 885                                                 page_cache);
 886         }
 887 /*
 888  * If we tried to read ahead some pages,
 889  * If we tried to read ahead asynchronously,
 890  *   Try to force unplug of the device in order to start an asynchronous
 891  *   read IO request.
 892  * Update the read-ahead context.
 893  * Store the length of the current read-ahead window.
 894  * Double the current max read ahead size.
 895  *   That heuristic avoid to do some large IO for files that are not really
 896  *   accessed sequentially.
 897  */
 898         if (ahead) {
 899                 if (reada_ok == 2) {
 900                         run_task_queue(&tq_disk);
 901                 }
 902
 903                 filp->f_ralen += ahead;
 904                 filp->f_rawin += filp->f_ralen;
 905                 filp->f_raend = raend + ahead + PAGE_CACHE_SIZE;
 906
 907                 filp->f_ramax += filp->f_ramax;
 908
 909                 if (filp->f_ramax > max_readahead)
 910                         filp->f_ramax = max_readahead;
 911
 912 #ifdef PROFILE_READAHEAD
 913                 profile_readahead((reada_ok == 2), filp);
 914 #endif
 915         }
 916
 917         return page_cache;
 918 }
 919
 920 /*
 921  * "descriptor" for what we're up to with a read.
 922  * This allows us to use the same read code yet
 923  * have multiple different users of the data that
 924  * we read from a file.
 925  *
 926  * The simplest case just copies the data to user
 927  * mode.
 928  */
 929 typedef struct {
 930         size_t written;
 931         size_t count;
 932         char * buf;
 933         int error;
 934 } read_descriptor_t;
 935
 936 typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long);
 937
 938 /*
 939  * This is a generic file read routine, and uses the
 940  * inode->i_op->readpage() function for the actual low-level
 941  * stuff.
 942  *
 943  * This is really ugly. But the goto's actually try to clarify some
 944  * of the logic when it comes to error handling etc.
 945  */
 946 static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
 947 {
 948         struct dentry *dentry = filp->f_dentry;
 949         struct inode *inode = dentry->d_inode;
 950         size_t pos, pgpos, page_cache;
 951         int reada_ok;
 952         int error;
 953         int max_readahead = get_max_readahead(inode);
 954
 955         page_cache = 0;
 956
 957         pos = *ppos;
 958         pgpos = pos & PAGE_CACHE_MASK;
 959 /*
 960  * If the current position is outside the previous read-ahead window,
 961  * we reset the current read-ahead context and set read ahead max to zero
 962  * (will be set to just needed value later),
 963  * otherwise, we assume that the file accesses are sequential enough to
 964  * continue read-ahead.
 965  */
 966         if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) {
 967                 reada_ok = 0;
 968                 filp->f_raend = 0;
 969                 filp->f_ralen = 0;
 970                 filp->f_ramax = 0;
 971                 filp->f_rawin = 0;
 972         } else {
 973                 reada_ok = 1;
 974         }
 975 /*
 976  * Adjust the current value of read-ahead max.
 977  * If the read operation stay in the first half page, force no readahead.
 978  * Otherwise try to increase read ahead max just enough to do the read request.
 979  * Then, at least MIN_READAHEAD if read ahead is ok,
 980  * and at most MAX_READAHEAD in all cases.
 981  */
 982         if (pos + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
 983                 filp->f_ramax = 0;
 984         } else {
 985                 unsigned long needed;
 986
 987                 needed = ((pos + desc->count) & PAGE_CACHE_MASK) - pgpos;
 988
 989                 if (filp->f_ramax < needed)
 990                         filp->f_ramax = needed;
 991
 992                 if (reada_ok && filp->f_ramax < MIN_READAHEAD)
 993                                 filp->f_ramax = MIN_READAHEAD;
 994                 if (filp->f_ramax > max_readahead)
 995                         filp->f_ramax = max_readahead;
 996         }
 997
 998         for (;;) {
 999                 struct page *page, **hash;
1000
1001                 if (pos >= inode->i_size)
1002                         break;
1003
1004                 /*
1005                  * Try to find the data in the page cache..
1006                  */
1007                 hash = page_hash(inode, pos & PAGE_CACHE_MASK);
1008
1009                 spin_lock(&pagecache_lock);
1010                 page = __find_page_nolock(inode, pos & PAGE_CACHE_MASK, *hash);
1011                 if (!page)
1012                         goto no_cached_page;
1013 found_page:
1014                 get_page(page);
1015                 spin_unlock(&pagecache_lock);
1016
1017                 if (!Page_Uptodate(page))
1018                         goto page_not_up_to_date;
1019 page_ok:
1020         /*
1021          * Ok, we have the page, and it's up-to-date, so
1022          * now we can copy it to user space...
1023          */
1024         {
1025                 unsigned long offset, nr;
1026
1027                 offset = pos & ~PAGE_CACHE_MASK;
1028                 nr = PAGE_CACHE_SIZE - offset;
1029                 if (nr > inode->i_size - pos)
1030                         nr = inode->i_size - pos;
1031
1032                 /*
1033                  * The actor routine returns how many bytes were actually used..
1034                  * NOTE! This may not be the same as how much of a user buffer
1035                  * we filled up (we may be padding etc), so we can only update
1036                  * "pos" here (the actor routine has to update the user buffer
1037                  * pointers and the remaining count).
1038                  */
1039                 nr = actor(desc, (const char *) (page_address(page) + offset), nr);
1040                 pos += nr;
1041                 page_cache_release(page);
1042                 if (nr && desc->count)
1043                         continue;
1044                 break;
1045         }
1046
1047 /*
1048  * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
1049  */
1050 page_not_up_to_date:
1051                 page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
1052
1053                 if (Page_Uptodate(page))
1054                         goto page_ok;
1055
1056                 /* Get exclusive access to the page ... */
1057                 lock_page(page);
1058                 if (Page_Uptodate(page)) {
1059                         UnlockPage(page);
1060                         goto page_ok;
1061                 }
1062
1063 readpage:
1064                 /* ... and start the actual read. The read will unlock the page. */
1065                 error = inode->i_op->readpage(filp, page);
1066
1067                 if (!error) {
1068                         if (Page_Uptodate(page))
1069                                 goto page_ok;
1070
1071                         /* Again, try some read-ahead while waiting for the page to finish.. */
1072                         page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
1073                         wait_on_page(page);
1074                         if (Page_Uptodate(page))
1075                                 goto page_ok;
1076                         error = -EIO;
1077                 }
1078
1079                 /* UHHUH! A synchronous read error occurred. Report it */
1080                 desc->error = error;
1081                 page_cache_release(page);
1082                 break;
1083
1084 no_cached_page:
1085                 /*
1086                  * Ok, it wasn't cached, so we need to create a new
1087                  * page..
1088                  *
1089                  * We get here with the page cache lock held.
1090                  */
1091                 if (!page_cache) {
1092                         spin_unlock(&pagecache_lock);
1093                         page_cache = page_cache_alloc();
1094                         if (!page_cache) {
1095                                 desc->error = -ENOMEM;
1096                                 break;
1097                         }
1098
1099                         /*
1100                          * Somebody may have added the page while we
1101                          * dropped the page cache lock. Check for that.
1102                          */
1103                         spin_lock(&pagecache_lock);
1104                         page = __find_page_nolock(inode, pos & PAGE_CACHE_MASK, *hash);
1105                         if (page)
1106                                 goto found_page;
1107                 }
1108
1109                 /*
1110                  * Ok, add the new page to the hash-queues...
1111                  */
1112                 page = page_cache_entry(page_cache);
1113                 __add_to_page_cache(page, inode, pos & PAGE_CACHE_MASK, hash);
1114                 spin_unlock(&pagecache_lock);
1115
1116                 page_cache = 0;
1117                 goto readpage;
1118         }
1119
1120         *ppos = pos;
1121         filp->f_reada = 1;
1122         if (page_cache)
1123                 page_cache_free(page_cache);
1124         UPDATE_ATIME(inode);
1125 }
1126
1127 static int file_read_actor(read_descriptor_t * desc, const char *area, unsigned long size)
1128 {
1129         unsigned long left;
1130         unsigned long count = desc->count;
1131
1132         if (size > count)
1133                 size = count;
1134         left = __copy_to_user(desc->buf, area, size);
1135         if (left) {
1136                 size -= left;
1137                 desc->error = -EFAULT;
1138         }
1139         desc->count = count - size;
1140         desc->written += size;
1141         desc->buf += size;
1142         return size;
1143 }
1144
1145 /*
1146  * This is the "read()" routine for all filesystems
1147  * that can use the page cache directly.
1148  */
1149 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1150 {
1151         ssize_t retval;
1152
1153         retval = -EFAULT;
1154         if (access_ok(VERIFY_WRITE, buf, count)) {
1155                 retval = 0;
1156                 if (count) {
1157                         read_descriptor_t desc;
1158
1159                         desc.written = 0;
1160                         desc.count = count;
1161                         desc.buf = buf;
1162                         desc.error = 0;
1163                         do_generic_file_read(filp, ppos, &desc, file_read_actor);
1164
1165                         retval = desc.written;
1166                         if (!retval)
1167                                 retval = desc.error;
1168                 }
1169         }
1170         return retval;
1171 }
1172
1173 static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned long size)
1174 {
1175         ssize_t written;
1176         unsigned long count = desc->count;
1177         struct file *file = (struct file *) desc->buf;
1178         mm_segment_t old_fs;
1179
1180         if (size > count)
1181                 size = count;
1182         old_fs = get_fs();
1183         set_fs(KERNEL_DS);
1184         written = file->f_op->write(file, area, size, &file->f_pos);
1185         set_fs(old_fs);
1186         if (written < 0) {
1187                 desc->error = written;
1188                 written = 0;
1189         }
1190         desc->count = count - written;
1191         desc->written += written;
1192         return written;
1193 }
1194
1195 asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
1196 {
1197         ssize_t retval;
1198         struct file * in_file, * out_file;
1199         struct inode * in_inode, * out_inode;
1200
1201         /*
1202          * Get input file, and verify that it is ok..
1203          */
1204         retval = -EBADF;
1205         in_file = fget(in_fd);
1206         if (!in_file)
1207                 goto out;
1208         if (!(in_file->f_mode & FMODE_READ))
1209                 goto fput_in;
1210         retval = -EINVAL;
1211         in_inode = in_file->f_dentry->d_inode;
1212         if (!in_inode)
1213                 goto fput_in;
1214         if (!in_inode->i_op || !in_inode->i_op->readpage)
1215                 goto fput_in;
1216         retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
1217         if (retval)
1218                 goto fput_in;
1219
1220         /*
1221          * Get output file, and verify that it is ok..
1222          */
1223         retval = -EBADF;
1224         out_file = fget(out_fd);
1225         if (!out_file)
1226                 goto fput_in;
1227         if (!(out_file->f_mode & FMODE_WRITE))
1228                 goto fput_out;
1229         retval = -EINVAL;
1230         if (!out_file->f_op || !out_file->f_op->write)
1231                 goto fput_out;
1232         out_inode = out_file->f_dentry->d_inode;
1233         if (!out_inode)
1234                 goto fput_out;
1235         retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
1236         if (retval)
1237                 goto fput_out;
1238
1239         retval = 0;
1240         if (count) {
1241                 read_descriptor_t desc;
1242                 loff_t pos = 0, *ppos;
1243
1244                 retval = -EFAULT;
1245                 ppos = &in_file->f_pos;
1246                 if (offset) {
1247                         if (get_user(pos, offset))
1248                                 goto fput_out;
1249                         ppos = &pos;
1250                 }
1251
1252                 desc.written = 0;
1253                 desc.count = count;
1254                 desc.buf = (char *) out_file;
1255                 desc.error = 0;
1256                 do_generic_file_read(in_file, ppos, &desc, file_send_actor);
1257
1258                 retval = desc.written;
1259                 if (!retval)
1260                         retval = desc.error;
1261                 if (offset)
1262                         put_user(pos, offset);
1263         }
1264
1265 fput_out:
1266         fput(out_file);
1267 fput_in:
1268         fput(in_file);
1269 out:
1270         return retval;
1271 }
1272
1273 /*
1274  * Semantics for shared and private memory areas are different past the end
1275  * of the file. A shared mapping past the last page of the file is an error
1276  * and results in a SIGBUS, while a private mapping just maps in a zero page.
1277  *
1278  * The goto's are kind of ugly, but this streamlines the normal case of having
1279  * it in the page cache, and handles the special cases reasonably without
1280  * having a lot of duplicated code.
1281  *
1282  * WSH 06/04/97: fixed a memory leak and moved the allocation of new_page
1283  * ahead of the wait if we're sure to need it.
1284  */
1285 static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
1286 {
1287         struct file * file = area->vm_file;
1288         struct dentry * dentry = file->f_dentry;
1289         struct inode * inode = dentry->d_inode;
1290         unsigned long offset, reada, i;
1291         struct page * page, **hash;
1292         unsigned long old_page, new_page;
1293         int error;
1294
1295         new_page = 0;
1296         offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
1297         if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
1298                 goto no_page;
1299
1300         /*
1301          * Do we have something in the page cache already?
1302          */
1303         hash = page_hash(inode, offset);
1304 retry_find:
1305         page = __find_get_page(inode, offset, hash);
1306         if (!page)
1307                 goto no_cached_page;
1308
1309 found_page:
1310         /*
1311          * Ok, found a page in the page cache, now we need to check
1312          * that it's up-to-date.  First check whether we'll need an
1313          * extra page -- better to overlap the allocation with the I/O.
1314          */
1315         if (no_share && !new_page) {
1316                 new_page = page_cache_alloc();
1317                 if (!new_page)
1318                         goto failure;
1319         }
1320
1321         if (!Page_Uptodate(page)) {
1322                 lock_page(page);
1323                 if (!Page_Uptodate(page))
1324                         goto page_not_uptodate;
1325                 UnlockPage(page);
1326         }
1327
1328 success:
1329         /*
1330          * Found the page and have a reference on it, need to check sharing
1331          * and possibly copy it over to another page..
1332          */
1333         old_page = page_address(page);
1334         if (!no_share) {
1335                 /*
1336                  * Ok, we can share the cached page directly.. Get rid
1337                  * of any potential extra pages.
1338                  */
1339                 if (new_page)
1340                         page_cache_free(new_page);
1341
1342                 flush_page_to_ram(old_page);
1343                 return old_page;
1344         }
1345
1346         /*
1347          * No sharing ... copy to the new page.
1348          */
1349         copy_page(new_page, old_page);
1350         flush_page_to_ram(new_page);
1351         page_cache_release(page);
1352         return new_page;
1353
1354 no_cached_page:
1355         /*
1356          * Try to read in an entire cluster at once.
1357          */
1358         reada   = offset;
1359         reada >>= PAGE_CACHE_SHIFT + page_cluster;
1360         reada <<= PAGE_CACHE_SHIFT + page_cluster;
1361
1362         for (i = 1 << page_cluster; i > 0; --i, reada += PAGE_CACHE_SIZE)
1363                 new_page = try_to_read_ahead(file, reada, new_page);
1364
1365         if (!new_page)
1366                 new_page = page_cache_alloc();
1367         if (!new_page)
1368                 goto no_page;
1369
1370         /*
1371          * During getting the above page we might have slept,
1372          * so we need to re-check the situation with the page
1373          * cache.. The page we just got may be useful if we
1374          * can't share, so don't get rid of it here.
1375          */
1376         page = __find_get_page(inode, offset, hash);
1377         if (page)
1378                 goto found_page;
1379
1380         /*
1381          * Now, create a new page-cache page from the page we got
1382          */
1383         page = page_cache_entry(new_page);
1384         if (add_to_page_cache_unique(page, inode, offset, hash))
1385                 goto retry_find;
1386
1387         /*
1388          * Now it's ours and locked, we can do initial IO to it:
1389          */
1390         new_page = 0;
1391
1392 page_not_uptodate:
1393         error = inode->i_op->readpage(file, page);
1394
1395         if (!error) {
1396                 wait_on_page(page);
1397                 if (PageError(page))
1398                         goto page_read_error;
1399                 goto success;
1400         }
1401
1402 page_read_error:
1403         /*
1404          * Umm, take care of errors if the page isn't up-to-date.
1405          * Try to re-read it _once_. We do this synchronously,
1406          * because there really aren't any performance issues here
1407          * and we need to check for errors.
1408          */
1409         if (!PageLocked(page))
1410                 PAGE_BUG(page);
1411         ClearPageError(page);
1412         error = inode->i_op->readpage(file, page);
1413         if (error)
1414                 goto failure;
1415         wait_on_page(page);
1416         if (Page_Uptodate(page))
1417                 goto success;
1418
1419         /*
1420          * Things didn't work out. Return zero to tell the
1421          * mm layer so, possibly freeing the page cache page first.
1422          */
1423 failure:
1424         page_cache_release(page);
1425         if (new_page)
1426                 page_cache_free(new_page);
1427 no_page:
1428         return 0;
1429 }
1430
1431 /*
1432  * Tries to write a shared mapped page to its backing store. May return -EIO
1433  * if the disk is full.
1434  */
1435 static inline int do_write_page(struct inode * inode, struct file * file,
1436         const char * page_addr, unsigned long offset)
1437 {
1438         int retval;
1439         unsigned long size;
1440         int (*writepage) (struct file *, struct page *);
1441         struct page * page;
1442
1443         size = offset + PAGE_SIZE;
1444         /* refuse to extend file size.. */
1445         if (S_ISREG(inode->i_mode)) {
1446                 if (size > inode->i_size)
1447                         size = inode->i_size;
1448                 /* Ho humm.. We should have tested for this earlier */
1449                 if (size < offset)
1450                         return -EIO;
1451         }
1452         size -= offset;
1453         retval = -EIO;
1454         writepage = inode->i_op->writepage;
1455         page = mem_map + MAP_NR(page_addr);
1456         lock_page(page);
1457
1458         retval = writepage(file, page);
1459
1460         UnlockPage(page);
1461         return retval;
1462 }
1463
1464 static int filemap_write_page(struct vm_area_struct * vma,
1465                               unsigned long offset,
1466                               unsigned long page,
1467                               int wait)
1468 {
1469         int result;
1470         struct file * file;
1471         struct dentry * dentry;
1472         struct inode * inode;
1473
1474         file = vma->vm_file;
1475         dentry = file->f_dentry;
1476         inode = dentry->d_inode;
1477
1478         /*
1479          * If a task terminates while we're swapping the page, the vma and
1480          * and file could be released ... increment the count to be safe.
1481          */
1482         get_file(file);
1483         result = do_write_page(inode, file, (const char *) page, offset);
1484         fput(file);
1485         return result;
1486 }
1487
1488
1489 /*
1490  * The page cache takes care of races between somebody
1491  * trying to swap something out and swap something in
1492  * at the same time..
1493  */
1494 extern void wakeup_bdflush(int);
1495 int filemap_swapout(struct vm_area_struct * vma, struct page * page)
1496 {
1497         int retval = filemap_write_page(vma, page->offset, page_address(page), 0);
1498         wakeup_bdflush(0);
1499         return retval;
1500 }
1501
1502 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
1503         unsigned long address, unsigned int flags)
1504 {
1505         pte_t pte = *ptep;
1506         unsigned long pageaddr;
1507         struct page *page;
1508         int error;
1509
1510         if (!(flags & MS_INVALIDATE)) {
1511                 if (!pte_present(pte))
1512                         return 0;
1513                 if (!pte_dirty(pte))
1514                         return 0;
1515                 flush_page_to_ram(pte_page(pte));
1516                 flush_cache_page(vma, address);
1517                 set_pte(ptep, pte_mkclean(pte));
1518                 flush_tlb_page(vma, address);
1519                 pageaddr = pte_page(pte);
1520                 page = page_cache_entry(pageaddr);
1521                 get_page(page);
1522         } else {
1523                 if (pte_none(pte))
1524                         return 0;
1525                 flush_cache_page(vma, address);
1526                 pte_clear(ptep);
1527                 flush_tlb_page(vma, address);
1528                 if (!pte_present(pte)) {
1529                         swap_free(pte_val(pte));
1530                         return 0;
1531                 }
1532                 pageaddr = pte_page(pte);
1533                 if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
1534                         page_cache_free(pageaddr);
1535                         return 0;
1536                 }
1537         }
1538         error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, pageaddr, 1);
1539         page_cache_free(pageaddr);
1540         return error;
1541 }
1542
1543 static inline int filemap_sync_pte_range(pmd_t * pmd,
1544         unsigned long address, unsigned long size,
1545         struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
1546 {
1547         pte_t * pte;
1548         unsigned long end;
1549         int error;
1550
1551         if (pmd_none(*pmd))
1552                 return 0;
1553         if (pmd_bad(*pmd)) {
1554                 printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
1555                 pmd_clear(pmd);
1556                 return 0;
1557         }
1558         pte = pte_offset(pmd, address);
1559         offset += address & PMD_MASK;
1560         address &= ~PMD_MASK;
1561         end = address + size;
1562         if (end > PMD_SIZE)
1563                 end = PMD_SIZE;
1564         error = 0;
1565         do {
1566                 error |= filemap_sync_pte(pte, vma, address + offset, flags);
1567                 address += PAGE_SIZE;
1568                 pte++;
1569         } while (address < end);
1570         return error;
1571 }
1572
1573 static inline int filemap_sync_pmd_range(pgd_t * pgd,
1574         unsigned long address, unsigned long size,
1575         struct vm_area_struct *vma, unsigned int flags)
1576 {
1577         pmd_t * pmd;
1578         unsigned long offset, end;
1579         int error;
1580
1581         if (pgd_none(*pgd))
1582                 return 0;
1583         if (pgd_bad(*pgd)) {
1584                 printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
1585                 pgd_clear(pgd);
1586                 return 0;
1587         }
1588         pmd = pmd_offset(pgd, address);
1589         offset = address & PGDIR_MASK;
1590         address &= ~PGDIR_MASK;
1591         end = address + size;
1592         if (end > PGDIR_SIZE)
1593                 end = PGDIR_SIZE;
1594         error = 0;
1595         do {
1596                 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
1597                 address = (address + PMD_SIZE) & PMD_MASK;
1598                 pmd++;
1599         } while (address < end);
1600         return error;
1601 }
1602
1603 static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
1604         size_t size, unsigned int flags)
1605 {
1606         pgd_t * dir;
1607         unsigned long end = address + size;
1608         int error = 0;
1609
1610         dir = pgd_offset(vma->vm_mm, address);
1611         flush_cache_range(vma->vm_mm, end - size, end);
1612         while (address < end) {
1613                 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
1614                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
1615                 dir++;
1616         }
1617         flush_tlb_range(vma->vm_mm, end - size, end);
1618         return error;
1619 }
1620
1621 /*
1622  * This handles (potentially partial) area unmaps..
1623  */
1624 static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
1625 {
1626         filemap_sync(vma, start, len, MS_ASYNC);
1627 }
1628
1629 /*
1630  * Shared mappings need to be able to do the right thing at
1631  * close/unmap/sync. They will also use the private file as
1632  * backing-store for swapping..
1633  */
1634 static struct vm_operations_struct file_shared_mmap = {
1635         NULL,                   /* no special open */
1636         NULL,                   /* no special close */
1637         filemap_unmap,          /* unmap - we need to sync the pages */
1638         NULL,                   /* no special protect */
1639         filemap_sync,           /* sync */
1640         NULL,                   /* advise */
1641         filemap_nopage,         /* nopage */
1642         NULL,                   /* wppage */
1643         filemap_swapout         /* swapout */
1644 };
1645
1646 /*
1647  * Private mappings just need to be able to load in the map.
1648  *
1649  * (This is actually used for shared mappings as well, if we
1650  * know they can't ever get write permissions..)
1651  */
1652 static struct vm_operations_struct file_private_mmap = {
1653         NULL,                   /* open */
1654         NULL,                   /* close */
1655         NULL,                   /* unmap */
1656         NULL,                   /* protect */
1657         NULL,                   /* sync */
1658         NULL,                   /* advise */
1659         filemap_nopage,         /* nopage */
1660         NULL,                   /* wppage */
1661         NULL                    /* swapout */
1662 };
1663
1664 /* This is used for a general mmap of a disk file */
1665
1666 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1667 {
1668         struct vm_operations_struct * ops;
1669         struct inode *inode = file->f_dentry->d_inode;
1670
1671         ops = &file_private_mmap;
1672         if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1673                 if (!inode->i_op || !inode->i_op->writepage)
1674                         return -EINVAL;
1675                 ops = &file_shared_mmap;
1676         }
1677         if (!inode->i_sb || !S_ISREG(inode->i_mode))
1678                 return -EACCES;
1679         if (!inode->i_op || !inode->i_op->readpage)
1680                 return -ENOEXEC;
1681         UPDATE_ATIME(inode);
1682         vma->vm_ops = ops;
1683         return 0;
1684 }
1685
1686
1687 /*
1688  * The msync() system call.
1689  */
1690
1691 static int msync_interval(struct vm_area_struct * vma,
1692         unsigned long start, unsigned long end, int flags)
1693 {
1694         if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
1695                 int error;
1696                 error = vma->vm_ops->sync(vma, start, end-start, flags);
1697                 if (!error && (flags & MS_SYNC)) {
1698                         struct file * file = vma->vm_file;
1699                         if (file) {
1700                                 struct dentry * dentry = file->f_dentry;
1701                                 error = file_fsync(file, dentry);
1702                         }
1703                 }
1704                 return error;
1705         }
1706         return 0;
1707 }
1708
1709 asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
1710 {
1711         unsigned long end;
1712         struct vm_area_struct * vma;
1713         int unmapped_error, error = -EINVAL;
1714
1715         down(&current->mm->mmap_sem);
1716         lock_kernel();
1717         if (start & ~PAGE_MASK)
1718                 goto out;
1719         len = (len + ~PAGE_MASK) & PAGE_MASK;
1720         end = start + len;
1721         if (end < start)
1722                 goto out;
1723         if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1724                 goto out;
1725         error = 0;
1726         if (end == start)
1727                 goto out;
1728         /*
1729          * If the interval [start,end) covers some unmapped address ranges,
1730          * just ignore them, but return -EFAULT at the end.
1731          */
1732         vma = find_vma(current->mm, start);
1733         unmapped_error = 0;
1734         for (;;) {
1735                 /* Still start < end. */
1736                 error = -EFAULT;
1737                 if (!vma)
1738                         goto out;
1739                 /* Here start < vma->vm_end. */
1740                 if (start < vma->vm_start) {
1741                         unmapped_error = -EFAULT;
1742                         start = vma->vm_start;
1743                 }
1744                 /* Here vma->vm_start <= start < vma->vm_end. */
1745                 if (end <= vma->vm_end) {
1746                         if (start < end) {
1747                                 error = msync_interval(vma, start, end, flags);
1748                                 if (error)
1749                                         goto out;
1750                         }
1751                         error = unmapped_error;
1752                         goto out;
1753                 }
1754                 /* Here vma->vm_start <= start < vma->vm_end < end. */
1755                 error = msync_interval(vma, start, vma->vm_end, flags);
1756                 if (error)
1757                         goto out;
1758                 start = vma->vm_end;
1759                 vma = vma->vm_next;
1760         }
1761 out:
1762         unlock_kernel();
1763         up(&current->mm->mmap_sem);
1764         return error;
1765 }
1766
1767 /*
1768  * Write to a file through the page cache. This is mainly for the
1769  * benefit of NFS and possibly other network-based file systems.
1770  *
1771  * We currently put everything into the page cache prior to writing it.
1772  * This is not a problem when writing full pages. With partial pages,
1773  * however, we first have to read the data into the cache, then
1774  * dirty the page, and finally schedule it for writing. Alternatively, we
1775  * could write-through just the portion of data that would go into that
1776  * page, but that would kill performance for applications that write data
1777  * line by line, and it's prone to race conditions.
1778  *
1779  * Note that this routine doesn't try to keep track of dirty pages. Each
1780  * file system has to do this all by itself, unfortunately.
1781  *                                                      okir@monad.swb.de
1782  */
1783 ssize_t
1784 generic_file_write(struct file *file, const char *buf,
1785                    size_t count, loff_t *ppos,
1786                    writepage_t write_one_page)
1787 {
1788         struct dentry   *dentry = file->f_dentry;
1789         struct inode    *inode = dentry->d_inode;
1790         unsigned long   pos = *ppos;
1791         unsigned long   limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
1792         struct page     *page, **hash;
1793         unsigned long   page_cache = 0;
1794         unsigned long   written;
1795         long            status;
1796         int             err;
1797
1798         err = file->f_error;
1799         if (err) {
1800                 file->f_error = 0;
1801                 goto out;
1802         }
1803
1804         written = 0;
1805
1806         if (file->f_flags & O_APPEND)
1807                 pos = inode->i_size;
1808
1809         /*
1810          * Check whether we've reached the file size limit.
1811          */
1812         err = -EFBIG;
1813         if (pos >= limit) {
1814                 send_sig(SIGXFSZ, current, 0);
1815                 goto out;
1816         }
1817
1818         status  = 0;
1819         /*
1820          * Check whether to truncate the write,
1821          * and send the signal if we do.
1822          */
1823         if (count > limit - pos) {
1824                 send_sig(SIGXFSZ, current, 0);
1825                 count = limit - pos;
1826         }
1827
1828         while (count) {
1829                 unsigned long bytes, pgpos, offset;
1830                 /*
1831                  * Try to find the page in the cache. If it isn't there,
1832                  * allocate a free page.
1833                  */
1834                 offset = (pos & ~PAGE_CACHE_MASK);
1835                 pgpos = pos & PAGE_CACHE_MASK;
1836                 bytes = PAGE_CACHE_SIZE - offset;
1837                 if (bytes > count)
1838                         bytes = count;
1839
1840                 hash = page_hash(inode, pgpos);
1841 repeat_find:
1842                 page = __find_lock_page(inode, pgpos, hash);
1843                 if (!page) {
1844                         if (!page_cache) {
1845                                 page_cache = page_cache_alloc();
1846                                 if (page_cache)
1847                                         goto repeat_find;
1848                                 status = -ENOMEM;
1849                                 break;
1850                         }
1851                         page = page_cache_entry(page_cache);
1852                         if (add_to_page_cache_unique(page,inode,pgpos,hash))
1853                                 goto repeat_find;
1854
1855                         page_cache = 0;
1856                 }
1857
1858                 /* We have exclusive IO access to the page.. */
1859                 if (!PageLocked(page)) {
1860                         PAGE_BUG(page);
1861                 } else {
1862                         if (page->owner != current) {
1863                                 PAGE_BUG(page);
1864                         }
1865                 }
1866
1867                 status = write_one_page(file, page, offset, bytes, buf);
1868
1869                 /* Mark it unlocked again and drop the page.. */
1870                 UnlockPage(page);
1871                 page_cache_release(page);
1872
1873                 if (status < 0)
1874                         break;
1875
1876                 written += status;
1877                 count -= status;
1878                 pos += status;
1879                 buf += status;
1880         }
1881         *ppos = pos;
1882         if (pos > inode->i_size)
1883                 inode->i_size = pos;
1884
1885         if (page_cache)
1886                 page_cache_free(page_cache);
1887
1888         err = written ? written : status;
1889 out:
1890         return err;
1891 }
1892
1893 /*
1894  * Support routines for directory caching using the page cache.
1895  */
1896
1897 /*
1898  * Unlock and free a page.
1899  */
1900 void put_cached_page(unsigned long addr)
1901 {
1902         struct page * page = page_cache_entry(addr);
1903
1904         UnlockPage(page);
1905         if (page_count(page) != 2)
1906                 panic("put_cached_page: page count=%d\n",
1907                         page_count(page));
1908         page_cache_release(page);
1909 }
1910
1911 void __init page_cache_init(unsigned long memory_size)
1912 {
1913         unsigned long htable_size, order;
1914
1915         htable_size = memory_size >> PAGE_SHIFT;
1916         htable_size *= sizeof(struct page *);
1917         for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
1918                 ;
1919
1920         do {
1921                 unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
1922
1923                 page_hash_bits = 0;
1924                 while((tmp >>= 1UL) != 0UL)
1925                         page_hash_bits++;
1926
1927                 page_hash_table = (struct page **)
1928                         __get_free_pages(GFP_ATOMIC, order);
1929         } while(page_hash_table == NULL && --order > 0);
1930
1931         printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
1932                (1 << page_hash_bits), order, (PAGE_SIZE << order));
1933         if (!page_hash_table)
1934                 panic("Failed to allocate page hash table\n");
1935         memset(page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
1936 }