 * Copyright (C) 1994-1999  Linus Torvalds
 *
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
#include <linux/malloc.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/locks.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/swapctl.h>
#include <linux/slab.h>
#include <linux/init.h>

#include <asm/pgtable.h>
#include <asm/uaccess.h>
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
atomic_t page_cache_size = ATOMIC_INIT(0);
unsigned int page_hash_bits;
struct page **page_hash_table;

spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
void __add_page_to_hash_queue(struct page * page, struct page **p)
	atomic_inc(&page_cache_size);
	if ((page->next_hash = *p) != NULL)
		(*p)->pprev_hash = &page->next_hash;
static void remove_page_from_hash_queue(struct page * page)
	if (page->pprev_hash) {
		page->next_hash->pprev_hash = page->pprev_hash;
		*page->pprev_hash = page->next_hash;
		page->pprev_hash = NULL;
	atomic_dec(&page_cache_size);
static void remove_page_from_inode_queue(struct page * page)
	struct inode * inode = page->inode;
	struct page *prev, *next;

	if (inode->i_pages == page)
		inode->i_pages = next;
 * Remove a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
void remove_inode_page(struct page *page)
	if (!PageLocked(page))
	spin_lock(&pagecache_lock);
	remove_page_from_inode_queue(page);
	remove_page_from_hash_queue(page);
	spin_unlock(&pagecache_lock);
void invalidate_inode_pages(struct inode * inode)
	spin_lock(&pagecache_lock);
	while ((page = *p) != NULL) {
		if (TryLockPage(page)) {
			spin_unlock(&pagecache_lock);
			page_cache_release(page);
		if (page_count(page) != 2)
			printk("hm, busy page invalidated? (not necessarily a bug)\n");
		remove_page_from_inode_queue(page);
		remove_page_from_hash_queue(page);
		page_cache_release(page);
		page_cache_release(page);
	spin_unlock(&pagecache_lock);
 * Truncate the page cache at a set offset, removing the pages
 * that are beyond that offset (and zeroing out partial pages).
void truncate_inode_pages(struct inode * inode, unsigned long start)
	spin_lock(&pagecache_lock);
	while ((page = *p) != NULL) {
		unsigned long offset = page->offset;

		/* page wholly truncated - free it */
		if (offset >= start) {
			spin_unlock(&pagecache_lock);
			if (inode->i_op->flushpage)
				inode->i_op->flushpage(inode, page, 0);
			 * We remove the page from the page cache
			 * _after_ we have destroyed all buffer-cache
			 * references to it. Otherwise some other process
			 * might think this inode page is not in the
			 * page cache and create a buffer-cache alias
			 * to it, causing all sorts of fun problems ...
			remove_inode_page(page);
			page_cache_release(page);
			page_cache_release(page);
			 * We have done things without the pagecache lock,
			 * so we'll have to repeat the scan.
			 * It's not possible to deadlock here because
			 * we are guaranteed to make progress. (ie. we have
			 * just removed a page)

		 * there is only one partial page possible.
		offset = start - offset;
		/* partial truncate, clear end of page */
		if (offset < PAGE_CACHE_SIZE) {
			unsigned long address;

			spin_unlock(&pagecache_lock);
			address = page_address(page);
			memset((void *) (offset + address), 0, PAGE_CACHE_SIZE - offset);
			flush_page_to_ram(address);
			if (inode->i_op->flushpage)
				inode->i_op->flushpage(inode, page, offset);
			 * we have dropped the spinlock so we have to
			page_cache_release(page);

	spin_unlock(&pagecache_lock);
extern atomic_t too_many_dirty_buffers;

int shrink_mmap(int priority, int gfp_mask)
	static unsigned long clock = 0;
	unsigned long limit = num_physpages << 1;

	count = limit >> priority;

	page = mem_map + clock;

		/* This works even in the presence of PageSkip because
		 * the first two entries at the beginning of a hole will
		 * be marked, not just the first.
		if (clock >= max_mapnr) {
		if (PageSkip(page)) {
			/* next_hash is overloaded for PageSkip */
			page = page->next_hash;
			clock = page - mem_map;

		referenced = test_and_clear_bit(PG_referenced, &page->flags);

		if ((gfp_mask & __GFP_DMA) && !PageDMA(page))

		 * Some common cases that we just short-circuit without
		 * getting the locks - we need to re-check this once we
		 * have the lock, but that's fine.
		users = page_count(page);
		if (!page->buffers) {

		 * ok, now the page looks interesting. Re-check things
		spin_lock(&pagecache_lock);
		if (!page->inode && !page->buffers) {
			spin_unlock(&pagecache_lock);
		if (!page_count(page)) {
			spin_unlock(&pagecache_lock);
		if (TryLockPage(page)) {
			spin_unlock(&pagecache_lock);

		 * we keep pagecache_lock locked and unlock it in
		 * each branch, so that the page->inode case doesn't
		 * have to re-grab it. Here comes the 'real' logic
	/* Is it a buffer page? */
		int mem = page->inode ? 0 : PAGE_CACHE_SIZE;
		spin_unlock(&pagecache_lock);
		if (!try_to_free_buffers(page))
			goto unlock_continue;
		atomic_sub(mem, &buffermem);
		spin_lock(&pagecache_lock);

	 * We can't free pages unless there's just one user
	 * (count == 2 because we added one ourselves above).
	if (page_count(page) != 2)
		goto spin_unlock_continue;

	 * Is it a swap-cache page? If so, we want to
	 * drop it if it is no longer used, even if it
	 * were to be marked referenced..
	if (PageSwapCache(page)) {
		spin_unlock(&pagecache_lock);
		if (referenced && swap_count(page->offset) != 2)
			goto unlock_continue;
		__delete_from_swap_cache(page);
		page_cache_release(page);

	/* is it a page-cache page? */
	if (!referenced && page->inode && !pgcache_under_min()) {
		remove_page_from_inode_queue(page);
		remove_page_from_hash_queue(page);
		spin_unlock(&pagecache_lock);
		page_cache_release(page);

spin_unlock_continue:
	spin_unlock(&pagecache_lock);
static inline struct page * __find_page_nolock(struct inode * inode, unsigned long offset, struct page *page)
		page = page->next_hash;
		if (page->inode != inode)
		if (page->offset == offset)

	set_bit(PG_referenced, &page->flags);
 * By the time this is called, the page is locked and
 * we don't have to worry about any races any more.
static int writeout_one_page(struct page *page)
	struct buffer_head *bh, *head = page->buffers;

		if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
		ll_rw_block(WRITE, 1, &bh);
	} while ((bh = bh->b_this_page) != head);

static int waitfor_one_page(struct page *page)
	struct buffer_head *bh, *head = page->buffers;

		if (buffer_req(bh) && !buffer_uptodate(bh))
	} while ((bh = bh->b_this_page) != head);
static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end, int (*fn)(struct page *))
	spin_lock(&pagecache_lock);
	next = inode->i_pages;
		struct page *page = next;

		if (page->offset >= end)
		if (page->offset < start)

		spin_unlock(&pagecache_lock);

		/* The buffers could have been free'd while we waited for the page lock */

		spin_lock(&pagecache_lock);
		page_cache_release(page);
	spin_unlock(&pagecache_lock);
 * Two-stage data sync: first start the IO, then go back and
 * collect the information..
int generic_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end)
	retval = do_buffer_fdatasync(inode, start, end, writeout_one_page);
	retval |= do_buffer_fdatasync(inode, start, end, waitfor_one_page);
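/*
 * Illustrative sketch, not part of the original file: a filesystem that
 * wants "write out and then wait for the cached pages of this inode"
 * semantics in its fsync/fdatasync method could simply wrap the helper
 * above.  The function name is made up for the example; passing ~0UL as
 * the end offset just means "no upper bound".
 */
static int example_fdatasync_whole_file(struct inode *inode)
{
	/* pass 1 starts the IO, pass 2 waits for it (see above) */
	return generic_buffer_fdatasync(inode, 0, ~0UL);
}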
 * This adds a page to the page cache, starting out as locked,
 * owned by us, referenced, but not uptodate and with no errors.
static inline void __add_to_page_cache(struct page * page,
	struct inode * inode, unsigned long offset,

	flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error));
	page->flags = flags | ((1 << PG_locked) | (1 << PG_referenced));
	page->owner = current;	/* REMOVEME */
	page->offset = offset;
	add_page_to_inode_queue(inode, page);
	__add_page_to_hash_queue(page, hash);

void add_to_page_cache(struct page * page, struct inode * inode, unsigned long offset)
	spin_lock(&pagecache_lock);
	__add_to_page_cache(page, inode, offset, page_hash(inode, offset));
	spin_unlock(&pagecache_lock);
int add_to_page_cache_unique(struct page * page,
	struct inode * inode, unsigned long offset,

	spin_lock(&pagecache_lock);
	alias = __find_page_nolock(inode, offset, *hash);
		__add_to_page_cache(page, inode, offset, hash);
	spin_unlock(&pagecache_lock);
 * Try to read ahead in the file. "page_cache" is a potentially free page
 * that we could use for the cache (if it is 0 we can try to create one,
 * this is all overlapped with the IO on the previous page finishing anyway)
static unsigned long try_to_read_ahead(struct file * file,
	unsigned long offset, unsigned long page_cache)
	struct inode *inode = file->f_dentry->d_inode;

	offset &= PAGE_CACHE_MASK;
	switch (page_cache) {
		page_cache = page_cache_alloc();
		if (offset >= inode->i_size)
		hash = page_hash(inode, offset);
		page = page_cache_entry(page_cache);
		if (!add_to_page_cache_unique(page, inode, offset, hash)) {
			 * We do not have to check the return value here
			 * because it's a readahead.
			inode->i_op->readpage(file, page);
			page_cache_release(page);
 * Wait for a page to get unlocked.
 *
 * This must be called with the caller "holding" the page,
 * ie with increased "page->count" so that the page won't
 * go away during the wait..
void ___wait_on_page(struct page *page)
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	add_wait_queue(&page->wait, &wait);
		tsk->state = TASK_UNINTERRUPTIBLE;
		run_task_queue(&tq_disk);
		if (!PageLocked(page))
	} while (PageLocked(page));
	tsk->state = TASK_RUNNING;
	remove_wait_queue(&page->wait, &wait);
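/*
 * Illustrative sketch, not part of the original file: what a caller of
 * the routine above has to do.  It pins the page with an extra reference
 * first, exactly as the comment above requires, so the page cannot be
 * freed while we sleep; the function name is made up for the example.
 */
static void example_wait_for_page(struct page *page)
{
	atomic_inc(&page->count);		/* "hold" the page */
	if (PageLocked(page))
		___wait_on_page(page);		/* sleep until PG_locked clears */
	page_cache_release(page);		/* drop the extra reference */
}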
 * Get an exclusive lock on the page..
void lock_page(struct page *page)
	if (TryLockPage(page)) {
		struct task_struct *tsk = current;
		DECLARE_WAITQUEUE(wait, current);

		run_task_queue(&tq_disk);
		add_wait_queue(&page->wait, &wait);
		tsk->state = TASK_UNINTERRUPTIBLE;

		while (TryLockPage(page)) {
			run_task_queue(&tq_disk);
			tsk->state = TASK_UNINTERRUPTIBLE;

		remove_wait_queue(&page->wait, &wait);
		tsk->state = TASK_RUNNING;
 * a rather lightweight function, finding and getting a reference to a
 * hashed page atomically, waiting for it if it's locked.
struct page * __find_get_page (struct inode * inode,
	unsigned long offset, struct page **hash)
	 * We scan the hash list read-only. Addition to and removal from
	 * the hash-list needs a held write-lock.
	spin_lock(&pagecache_lock);
	page = __find_page_nolock(inode, offset, *hash);
	spin_unlock(&pagecache_lock);

	/* Found the page, sleep if locked. */
	if (page && PageLocked(page)) {
		struct task_struct *tsk = current;
		DECLARE_WAITQUEUE(wait, tsk);

		add_wait_queue(&page->wait, &wait);
		tsk->state = TASK_UNINTERRUPTIBLE;
		run_task_queue(&tq_disk);
		if (PageLocked(page))
		tsk->state = TASK_RUNNING;
		remove_wait_queue(&page->wait, &wait);

		 * The page might have been unhashed meanwhile. It's
		 * not freed though because we hold a reference to it.
		 * If this is the case then it will be freed _here_,
		 * and we recheck the hash anyway.
		page_cache_release(page);

	 * It's not locked so we can return the page and we hold
 * Get the lock to a page atomically.
struct page * __find_lock_page (struct inode * inode,
	unsigned long offset, struct page **hash)
	 * We scan the hash list read-only. Addition to and removal from
	 * the hash-list needs a held write-lock.
	spin_lock(&pagecache_lock);
	page = __find_page_nolock(inode, offset, *hash);
	spin_unlock(&pagecache_lock);

	/* Found the page, sleep if locked. */
	if (page && TryLockPage(page)) {
		struct task_struct *tsk = current;
		DECLARE_WAITQUEUE(wait, tsk);

		add_wait_queue(&page->wait, &wait);
		tsk->state = TASK_UNINTERRUPTIBLE;
		run_task_queue(&tq_disk);
		if (PageLocked(page))
		tsk->state = TASK_RUNNING;
		remove_wait_queue(&page->wait, &wait);

		 * The page might have been unhashed meanwhile. It's
		 * not freed though because we hold a reference to it.
		 * If this is the case then it will be freed _here_,
		 * and we recheck the hash anyway.
		page_cache_release(page);

	 * It's not locked so we can return the page and we hold
#define PROFILE_READAHEAD
#define DEBUG_READAHEAD

 * Read-ahead profiling information
 * --------------------------------
 * Every PROFILE_MAXREADCOUNT, the following information is written
 *   Percentage of asynchronous read-ahead.
 *   Average of the read-ahead context field values.
 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written

#ifdef PROFILE_READAHEAD

#define PROFILE_MAXREADCOUNT 1000

static unsigned long total_reada;
static unsigned long total_async;
static unsigned long total_ramax;
static unsigned long total_ralen;
static unsigned long total_rawin;

static void profile_readahead(int async, struct file *filp)
	total_ramax += filp->f_ramax;
	total_ralen += filp->f_ralen;
	total_rawin += filp->f_rawin;

	if (total_reada > PROFILE_MAXREADCOUNT) {
		if (!(total_reada > PROFILE_MAXREADCOUNT)) {
			restore_flags(flags);

		printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
			total_ramax/total_reada,
			total_ralen/total_reada,
			total_rawin/total_reada,
			(total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
		printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
			filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);

		restore_flags(flags);

#endif /* defined PROFILE_READAHEAD */
 * Read-ahead context:
 * -------------------
 * The read ahead context fields of the "struct file" are the following:
 * - f_raend : position of the first byte after the last page we tried to
 * - f_ramax : current read-ahead maximum size.
 * - f_ralen : length of the current IO read block we tried to read-ahead.
 * - f_rawin : length of the current read-ahead window.
 *		if last read-ahead was synchronous then
 *			f_rawin = f_ralen
 *		otherwise (was asynchronous)
 *			f_rawin = previous value of f_ralen + f_ralen
 *
 * MIN_READAHEAD : minimum read-ahead size when read-ahead.
 * MAX_READAHEAD : maximum read-ahead size when read-ahead.
 *
 * Synchronous read-ahead benefits:
 * --------------------------------
 * Using reasonable IO xfer length from peripheral devices increases system
 * Reasonable means, in this context, not too large but not too small.
 * The actual maximum value is:
 *	MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
 *	and 32K if defined (4K page size assumed).
 *
 * Asynchronous read-ahead benefits:
 * ---------------------------------
 * Overlapping the next read request and user process execution increases system
 * We have to guess which further data are needed by the user process.
 * If these data are often not really needed, it's bad for system
 * However, we know that files are often accessed sequentially by
 * application programs and it seems that it is possible to have some good
 * strategy in that guessing.
 * We only try to read ahead files that seem to be read sequentially.
 *
 * Asynchronous read-ahead risks:
 * ------------------------------
 * In order to maximize overlapping, we must start some asynchronous read
 * request from the device, as soon as possible.
 * We must be very careful about:
 * - The number of effective pending IO read requests.
 *   ONE seems to be the only reasonable value.
 * - The total memory pool usage for the file access stream.
 *   This maximum memory usage is implicitly 2 IO read chunks:
 *	2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
 *	64k if defined (4K page size assumed).
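/*
 * Illustrative sketch, not part of the original file: the f_ralen/f_rawin
 * book-keeping described above, written out as plain C.  The fields are
 * the real struct file read-ahead fields; the helper itself and its
 * "sync" flag are made up for the example.
 */
static void example_update_readahead_window(struct file *filp,
					    unsigned long ralen, int sync)
{
	if (sync) {
		/* synchronous read-ahead: the window is just this block */
		filp->f_ralen = ralen;
		filp->f_rawin = ralen;
	} else {
		/* asynchronous: the window also spans the previous block */
		filp->f_rawin = filp->f_ralen + ralen;
		filp->f_ralen = ralen;
	}
}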
static inline int get_max_readahead(struct inode * inode)
	if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
		return MAX_READAHEAD;
	return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
static inline unsigned long generic_file_readahead(int reada_ok,
	struct file * filp, struct inode * inode,
	unsigned long ppos, struct page * page, unsigned long page_cache)
	unsigned long max_ahead, ahead;

	int max_readahead = get_max_readahead(inode);

	raend = filp->f_raend & PAGE_CACHE_MASK;

	 * The current page is locked.
	 * If the current position is inside the previous read IO request, do not
	 * try to reread previously read ahead pages.
	 * Otherwise decide whether or not to read ahead some pages synchronously.
	 * If we are not going to read ahead, set the read ahead context for this
	if (PageLocked(page)) {
		if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
			if (raend < inode->i_size)
				max_ahead = filp->f_ramax;
			filp->f_ralen = PAGE_CACHE_SIZE;
			filp->f_raend = ppos + filp->f_ralen;
			filp->f_rawin += filp->f_ralen;

	 * The current page is not locked.
	 * If we were reading ahead and,
	 * if the current max read ahead size is not zero and,
	 * if the current position is inside the last read-ahead IO request,
	 * it is the moment to try to read ahead asynchronously.
	 * We will later force unplug of the device in order to force asynchronous read IO.
	else if (reada_ok && filp->f_ramax && raend >= PAGE_CACHE_SIZE &&
		 ppos <= raend && ppos + filp->f_ralen >= raend) {
		 * Add ONE page to max_ahead in order to try to have about the same IO max size
		 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
		 * Compute the position of the last page we have tried to read in order to
		 * begin to read ahead just at the next page.
		raend -= PAGE_CACHE_SIZE;
		if (raend < inode->i_size)
			max_ahead = filp->f_ramax + PAGE_CACHE_SIZE;

		filp->f_rawin = filp->f_ralen;

	 * Try to read ahead pages.
	 * We hope that ll_rw_blk() plug/unplug, coalescence, request sorting and the
	 * scheduler will work well enough for us to avoid overly bad actual IO requests.
	while (ahead < max_ahead) {
		ahead += PAGE_CACHE_SIZE;
		page_cache = try_to_read_ahead(filp, raend + ahead,

	 * If we tried to read ahead some pages,
	 * If we tried to read ahead asynchronously,
	 * Try to force unplug of the device in order to start an asynchronous
	 * Update the read-ahead context.
	 * Store the length of the current read-ahead window.
	 * Double the current max read ahead size.
	 * That heuristic avoids doing large IO for files that are not really
	 * accessed sequentially.
		run_task_queue(&tq_disk);

		filp->f_ralen += ahead;
		filp->f_rawin += filp->f_ralen;
		filp->f_raend = raend + ahead + PAGE_CACHE_SIZE;

		filp->f_ramax += filp->f_ramax;

		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;

#ifdef PROFILE_READAHEAD
		profile_readahead((reada_ok == 2), filp);
 * "descriptor" for what we're up to with a read.
 * This allows us to use the same read code yet
 * have multiple different users of the data that
 * we read from a file.
 *
 * The simplest case just copies the data to user
typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long);
 * This is a generic file read routine, and uses the
 * inode->i_op->readpage() function for the actual low-level
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
	struct dentry *dentry = filp->f_dentry;
	struct inode *inode = dentry->d_inode;
	size_t pos, pgpos, page_cache;

	int max_readahead = get_max_readahead(inode);

	pgpos = pos & PAGE_CACHE_MASK;
	 * If the current position is outside the previous read-ahead window,
	 * we reset the current read-ahead context and set read ahead max to zero
	 * (will be set to the just-needed value later),
	 * otherwise, we assume that the file accesses are sequential enough to
	 * continue read-ahead.
	if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) {

	 * Adjust the current value of read-ahead max.
	 * If the read operation stays in the first half page, force no readahead.
	 * Otherwise try to increase read ahead max just enough to do the read request.
	 * Then, at least MIN_READAHEAD if read ahead is ok,
	 * and at most MAX_READAHEAD in all cases.
	if (pos + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
		unsigned long needed;

		needed = ((pos + desc->count) & PAGE_CACHE_MASK) - pgpos;

		if (filp->f_ramax < needed)
			filp->f_ramax = needed;

		if (reada_ok && filp->f_ramax < MIN_READAHEAD)
			filp->f_ramax = MIN_READAHEAD;
		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;
		struct page *page, **hash;

		if (pos >= inode->i_size)

		 * Try to find the data in the page cache..
		hash = page_hash(inode, pos & PAGE_CACHE_MASK);

		spin_lock(&pagecache_lock);
		page = __find_page_nolock(inode, pos & PAGE_CACHE_MASK, *hash);
			goto no_cached_page;

		spin_unlock(&pagecache_lock);

		if (!Page_Uptodate(page))
			goto page_not_up_to_date;

	 * Ok, we have the page, and it's up-to-date, so
	 * now we can copy it to user space...
		unsigned long offset, nr;

		offset = pos & ~PAGE_CACHE_MASK;
		nr = PAGE_CACHE_SIZE - offset;
		if (nr > inode->i_size - pos)
			nr = inode->i_size - pos;

		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		nr = actor(desc, (const char *) (page_address(page) + offset), nr);

		page_cache_release(page);
		if (nr && desc->count)
 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
page_not_up_to_date:
	page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);

	if (Page_Uptodate(page))

	/* Get exclusive access to the page ... */

	if (Page_Uptodate(page)) {

	/* ... and start the actual read. The read will unlock the page. */
	error = inode->i_op->readpage(filp, page);

	if (Page_Uptodate(page))

	/* Again, try some read-ahead while waiting for the page to finish.. */
	page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
	if (Page_Uptodate(page))

	/* UHHUH! A synchronous read error occurred. Report it */
	desc->error = error;
	page_cache_release(page);

 * Ok, it wasn't cached, so we need to create a new
 *
 * We get here with the page cache lock held.
	spin_unlock(&pagecache_lock);
	page_cache = page_cache_alloc();
		desc->error = -ENOMEM;

	 * Somebody may have added the page while we
	 * dropped the page cache lock. Check for that.
	spin_lock(&pagecache_lock);
	page = __find_page_nolock(inode, pos & PAGE_CACHE_MASK, *hash);

	 * Ok, add the new page to the hash-queues...
	page = page_cache_entry(page_cache);
	__add_to_page_cache(page, inode, pos & PAGE_CACHE_MASK, hash);
	spin_unlock(&pagecache_lock);

	page_cache_free(page_cache);
	UPDATE_ATIME(inode);
static int file_read_actor(read_descriptor_t * desc, const char *area, unsigned long size)
	unsigned long count = desc->count;

	left = __copy_to_user(desc->buf, area, size);
		desc->error = -EFAULT;
	desc->count = count - size;
	desc->written += size;
 * This is the "read()" routine for all filesystems
 * that can use the page cache directly.
ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
	if (access_ok(VERIFY_WRITE, buf, count)) {
		read_descriptor_t desc;

		do_generic_file_read(filp, ppos, &desc, file_read_actor);
			retval = desc.written;
			retval = desc.error;
static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned long size)
	unsigned long count = desc->count;
	struct file *file = (struct file *) desc->buf;
	mm_segment_t old_fs;

	written = file->f_op->write(file, area, size, &file->f_pos);
		desc->error = written;
	desc->count = count - written;
	desc->written += written;
asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
	struct file * in_file, * out_file;
	struct inode * in_inode, * out_inode;

	 * Get input file, and verify that it is ok..
	in_file = fget(in_fd);
	if (!(in_file->f_mode & FMODE_READ))
	in_inode = in_file->f_dentry->d_inode;
	if (!in_inode->i_op || !in_inode->i_op->readpage)
	retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);

	 * Get output file, and verify that it is ok..
	out_file = fget(out_fd);
	if (!(out_file->f_mode & FMODE_WRITE))
	if (!out_file->f_op || !out_file->f_op->write)
	out_inode = out_file->f_dentry->d_inode;
	retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);

		read_descriptor_t desc;
		loff_t pos = 0, *ppos;

		ppos = &in_file->f_pos;
		if (get_user(pos, offset))

		desc.buf = (char *) out_file;

		do_generic_file_read(in_file, ppos, &desc, file_send_actor);

		retval = desc.written;
			retval = desc.error;

		put_user(pos, offset);
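/*
 * Illustrative user-space usage, not part of the original file: the
 * syscall above is what the C library's sendfile(2) wrapper reaches.
 * The descriptors and the count are made up for the example; the #if 0
 * only marks this as an example, it is not kernel code.
 */
#if 0
	off_t pos = 0;
	ssize_t sent = sendfile(out_fd, in_fd, &pos, count);
	/* "pos" comes back advanced, via the put_user() above */
#endif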
 * Semantics for shared and private memory areas are different past the end
 * of the file. A shared mapping past the last page of the file is an error
 * and results in a SIGBUS, while a private mapping just maps in a zero page.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 *
 * WSH 06/04/97: fixed a memory leak and moved the allocation of new_page
 * ahead of the wait if we're sure to need it.
static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
	struct file * file = area->vm_file;
	struct dentry * dentry = file->f_dentry;
	struct inode * inode = dentry->d_inode;
	unsigned long offset, reada, i;
	struct page * page, **hash;
	unsigned long old_page, new_page;

	offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
	if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)

	 * Do we have something in the page cache already?
	hash = page_hash(inode, offset);
	page = __find_get_page(inode, offset, hash);
		goto no_cached_page;

	 * Ok, found a page in the page cache, now we need to check
	 * that it's up-to-date. First check whether we'll need an
	 * extra page -- better to overlap the allocation with the I/O.
	if (no_share && !new_page) {
		new_page = page_cache_alloc();

	if (!Page_Uptodate(page)) {
		if (!Page_Uptodate(page))
			goto page_not_uptodate;

	 * Found the page and have a reference on it, need to check sharing
	 * and possibly copy it over to another page..
	old_page = page_address(page);

		 * Ok, we can share the cached page directly.. Get rid
		 * of any potential extra pages.
			page_cache_free(new_page);

		flush_page_to_ram(old_page);

	 * No sharing ... copy to the new page.
	copy_page(new_page, old_page);
	flush_page_to_ram(new_page);
	page_cache_release(page);
	 * Try to read in an entire cluster at once.
	reada >>= PAGE_CACHE_SHIFT + page_cluster;
	reada <<= PAGE_CACHE_SHIFT + page_cluster;

	for (i = 1 << page_cluster; i > 0; --i, reada += PAGE_CACHE_SIZE)
		new_page = try_to_read_ahead(file, reada, new_page);

		new_page = page_cache_alloc();

	 * During getting the above page we might have slept,
	 * so we need to re-check the situation with the page
	 * cache.. The page we just got may be useful if we
	 * can't share, so don't get rid of it here.
	page = __find_get_page(inode, offset, hash);

	 * Now, create a new page-cache page from the page we got
	page = page_cache_entry(new_page);
	if (add_to_page_cache_unique(page, inode, offset, hash))

	 * Now it's ours and locked, we can do initial IO to it:
	error = inode->i_op->readpage(file, page);

		if (PageError(page))
			goto page_read_error;

	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	if (!PageLocked(page))
	ClearPageError(page);
	error = inode->i_op->readpage(file, page);

	if (Page_Uptodate(page))

	 * Things didn't work out. Return zero to tell the
	 * mm layer so, possibly freeing the page cache page first.
	page_cache_release(page);
	page_cache_free(new_page);
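/*
 * Illustrative user-space view of the semantics documented above
 * filemap_nopage(), not part of the original file.  "fd" refers to a
 * one-byte file, so the second page of each mapping lies past EOF; the
 * names are made up for the example and the #if 0 only marks it as an
 * example, it is not kernel code.
 */
#if 0
	char *shared  = mmap(NULL, 2*PAGE_SIZE, PROT_READ, MAP_SHARED,  fd, 0);
	char *private = mmap(NULL, 2*PAGE_SIZE, PROT_READ, MAP_PRIVATE, fd, 0);

	c = private[PAGE_SIZE];	/* private mapping: reads a zero page */
	c = shared[PAGE_SIZE];	/* shared mapping past EOF: SIGBUS */
#endif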
 * Tries to write a shared mapped page to its backing store. May return -EIO
 * if the disk is full.
static inline int do_write_page(struct inode * inode, struct file * file,
	const char * page_addr, unsigned long offset)
	int (*writepage) (struct file *, struct page *);

	size = offset + PAGE_SIZE;
	/* refuse to extend file size.. */
	if (S_ISREG(inode->i_mode)) {
		if (size > inode->i_size)
			size = inode->i_size;
		/* Ho humm.. We should have tested for this earlier */

	writepage = inode->i_op->writepage;
	page = mem_map + MAP_NR(page_addr);

	retval = writepage(file, page);
static int filemap_write_page(struct vm_area_struct * vma,
	unsigned long offset,
	struct dentry * dentry;
	struct inode * inode;

	file = vma->vm_file;
	dentry = file->f_dentry;
	inode = dentry->d_inode;

	 * If a task terminates while we're swapping the page, the vma and
	 * file could be released ... increment the count to be safe.
	result = do_write_page(inode, file, (const char *) page, offset);
 * The page cache takes care of races between somebody
 * trying to swap something out and swap something in
 * at the same time..
extern void wakeup_bdflush(int);
int filemap_swapout(struct vm_area_struct * vma, struct page * page)
	int retval = filemap_write_page(vma, page->offset, page_address(page), 0);
static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
	unsigned long address, unsigned int flags)
	unsigned long pageaddr;

	if (!(flags & MS_INVALIDATE)) {
		if (!pte_present(pte))
		if (!pte_dirty(pte))
		flush_page_to_ram(pte_page(pte));
		flush_cache_page(vma, address);
		set_pte(ptep, pte_mkclean(pte));
		flush_tlb_page(vma, address);
		pageaddr = pte_page(pte);
		page = page_cache_entry(pageaddr);
		flush_cache_page(vma, address);
		flush_tlb_page(vma, address);
		if (!pte_present(pte)) {
			swap_free(pte_val(pte));
		pageaddr = pte_page(pte);
		if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
			page_cache_free(pageaddr);
	error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, pageaddr, 1);
	page_cache_free(pageaddr);
static inline int filemap_sync_pte_range(pmd_t * pmd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
	if (pmd_bad(*pmd)) {
		printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
	pte = pte_offset(pmd, address);
	offset += address & PMD_MASK;
	address &= ~PMD_MASK;
	end = address + size;
		error |= filemap_sync_pte(pte, vma, address + offset, flags);
		address += PAGE_SIZE;
	} while (address < end);
static inline int filemap_sync_pmd_range(pgd_t * pgd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned int flags)
	unsigned long offset, end;

	if (pgd_bad(*pgd)) {
		printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
	pmd = pmd_offset(pgd, address);
	offset = address & PGDIR_MASK;
	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
		address = (address + PMD_SIZE) & PMD_MASK;
	} while (address < end);
static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
	size_t size, unsigned int flags)
	unsigned long end = address + size;

	dir = pgd_offset(vma->vm_mm, address);
	flush_cache_range(vma->vm_mm, end - size, end);
	while (address < end) {
		error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
	flush_tlb_range(vma->vm_mm, end - size, end);
 * This handles (potentially partial) area unmaps..
static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
	filemap_sync(vma, start, len, MS_ASYNC);
 * Shared mappings need to be able to do the right thing at
 * close/unmap/sync. They will also use the private file as
 * backing-store for swapping..
static struct vm_operations_struct file_shared_mmap = {
	NULL,			/* no special open */
	NULL,			/* no special close */
	filemap_unmap,		/* unmap - we need to sync the pages */
	NULL,			/* no special protect */
	filemap_sync,		/* sync */
	filemap_nopage,		/* nopage */
	filemap_swapout		/* swapout */

 * Private mappings just need to be able to load in the map.
 *
 * (This is actually used for shared mappings as well, if we
 * know they can't ever get write permissions..)
static struct vm_operations_struct file_private_mmap = {
	filemap_nopage,		/* nopage */
/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
	struct vm_operations_struct * ops;
	struct inode *inode = file->f_dentry->d_inode;

	ops = &file_private_mmap;
	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
		if (!inode->i_op || !inode->i_op->writepage)
		ops = &file_shared_mmap;
	if (!inode->i_sb || !S_ISREG(inode->i_mode))
	if (!inode->i_op || !inode->i_op->readpage)
	UPDATE_ATIME(inode);
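/*
 * Illustrative sketch, not part of the original file: roughly how a
 * "normal" filesystem plugs the generic routines into its
 * file_operations, so that read() and mmap() go through the page cache
 * and the filesystem only supplies readpage()/writepage().  It uses
 * gcc's labelled-element initialiser syntax for brevity; the struct
 * name and the fsync method are made up for the example.
 */
#if 0
static struct file_operations example_file_operations = {
	read:	generic_file_read,	/* served from the page cache */
	mmap:	generic_file_mmap,	/* faults handled by filemap_nopage */
	fsync:	example_fsync,		/* filesystem-specific */
};
#endif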
 * The msync() system call.
static int msync_interval(struct vm_area_struct * vma,
	unsigned long start, unsigned long end, int flags)
	if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
		error = vma->vm_ops->sync(vma, start, end-start, flags);
		if (!error && (flags & MS_SYNC)) {
			struct file * file = vma->vm_file;
			struct dentry * dentry = file->f_dentry;
			error = file_fsync(file, dentry);
asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
	struct vm_area_struct * vma;
	int unmapped_error, error = -EINVAL;

	down(&current->mm->mmap_sem);
	if (start & ~PAGE_MASK)
	len = (len + ~PAGE_MASK) & PAGE_MASK;
	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))

	 * If the interval [start,end) covers some unmapped address ranges,
	 * just ignore them, but return -EFAULT at the end.
	vma = find_vma(current->mm, start);
		/* Still start < end. */
		/* Here start < vma->vm_end. */
		if (start < vma->vm_start) {
			unmapped_error = -EFAULT;
			start = vma->vm_start;
		/* Here vma->vm_start <= start < vma->vm_end. */
		if (end <= vma->vm_end) {
				error = msync_interval(vma, start, end, flags);
				error = unmapped_error;
		/* Here vma->vm_start <= start < vma->vm_end < end. */
		error = msync_interval(vma, start, vma->vm_end, flags);
		start = vma->vm_end;
	up(&current->mm->mmap_sem);
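/*
 * Illustrative user-space usage, not part of the original file: after
 * writing through a shared file mapping, a process asks for the dirty
 * pages to be written back.  MS_SYNC makes the call wait until the pages
 * (and, via file_fsync() above, the metadata) have been written.  The
 * #if 0 only marks this as an example, it is not kernel code.
 */
#if 0
	char *map = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	map[0] = 'x';				/* dirty the first page */
	if (msync(map, len, MS_SYNC) < 0)	/* write it back and wait */
		perror("msync");
#endif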
 * Write to a file through the page cache. This is mainly for the
 * benefit of NFS and possibly other network-based file systems.
 *
 * We currently put everything into the page cache prior to writing it.
 * This is not a problem when writing full pages. With partial pages,
 * however, we first have to read the data into the cache, then
 * dirty the page, and finally schedule it for writing. Alternatively, we
 * could write-through just the portion of data that would go into that
 * page, but that would kill performance for applications that write data
 * line by line, and it's prone to race conditions.
 *
 * Note that this routine doesn't try to keep track of dirty pages. Each
 * file system has to do this all by itself, unfortunately.
generic_file_write(struct file *file, const char *buf,
	size_t count, loff_t *ppos,
	writepage_t write_one_page)
	struct dentry *dentry = file->f_dentry;
	struct inode *inode = dentry->d_inode;
	unsigned long pos = *ppos;
	unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
	struct page *page, **hash;
	unsigned long page_cache = 0;
	unsigned long written;

	err = file->f_error;

	if (file->f_flags & O_APPEND)
		pos = inode->i_size;

	 * Check whether we've reached the file size limit.
		send_sig(SIGXFSZ, current, 0);

	 * Check whether to truncate the write,
	 * and send the signal if we do.
	if (count > limit - pos) {
		send_sig(SIGXFSZ, current, 0);
		count = limit - pos;
		unsigned long bytes, pgpos, offset;
		 * Try to find the page in the cache. If it isn't there,
		 * allocate a free page.
		offset = (pos & ~PAGE_CACHE_MASK);
		pgpos = pos & PAGE_CACHE_MASK;
		bytes = PAGE_CACHE_SIZE - offset;

		hash = page_hash(inode, pgpos);

		page = __find_lock_page(inode, pgpos, hash);
			page_cache = page_cache_alloc();

			page = page_cache_entry(page_cache);
			if (add_to_page_cache_unique(page, inode, pgpos, hash))

		/* We have exclusive IO access to the page.. */
		if (!PageLocked(page)) {
			if (page->owner != current) {

		status = write_one_page(file, page, offset, bytes, buf);

		/* Mark it unlocked again and drop the page.. */
		page_cache_release(page);

		if (pos > inode->i_size)
			inode->i_size = pos;

	page_cache_free(page_cache);

	err = written ? written : status;
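/*
 * Illustrative sketch, not part of the original file: the shape of a
 * write_one_page callback, with the argument order taken from the
 * write_one_page() call above (file, locked page, offset within the
 * page, byte count, user buffer) and the return value assumed to be the
 * byte count or a negative error.  A real filesystem would also bring
 * the rest of a partial page up to date and hand the data to the block
 * layer; the function name is made up for the example.
 */
static long example_write_one_page(struct file *file, struct page *page,
	unsigned long offset, unsigned long bytes, const char *buf)
{
	/* copy the user data into the page cache page */
	if (copy_from_user((void *) (page_address(page) + offset), buf, bytes))
		return -EFAULT;
	/* ... schedule the page for write-out here ... */
	return bytes;
}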
 * Support routines for directory caching using the page cache.

 * Unlock and free a page.
void put_cached_page(unsigned long addr)
	struct page * page = page_cache_entry(addr);

	if (page_count(page) != 2)
		panic("put_cached_page: page count=%d\n",
	page_cache_release(page);
void __init page_cache_init(unsigned long memory_size)
	unsigned long htable_size, order;

	htable_size = memory_size >> PAGE_SHIFT;
	htable_size *= sizeof(struct page *);
	for (order = 0; (PAGE_SIZE << order) < htable_size; order++)

		unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);

		while ((tmp >>= 1UL) != 0UL)

		page_hash_table = (struct page **)
			__get_free_pages(GFP_ATOMIC, order);
	} while (page_hash_table == NULL && --order > 0);

	printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
	       (1 << page_hash_bits), order, (PAGE_SIZE << order));
	if (!page_hash_table)
		panic("Failed to allocate page hash table\n");
	memset(page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
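/*
 * Worked example, not part of the original file, assuming 64 MB of
 * memory, 4 kB pages and 4-byte pointers: memory_size >> PAGE_SHIFT is
 * 16384 pages, so htable_size is 64 kB and the sizing loop stops at
 * order 4 (PAGE_SIZE << 4 == 64 kB).  That table holds 16384 hash
 * pointers, so the shift loop (whose counting body is elided above)
 * leaves page_hash_bits at 14.
 */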