Linux 2.1.89-4: mm/filemap.c
1 /*
2 * linux/mm/filemap.c
4 * Copyright (C) 1994, 1995 Linus Torvalds
5 */
7 /*
8 * This file handles the generic file mmap semantics used by
9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem does this differently, for example)
11 */
12 #include <linux/stat.h>
13 #include <linux/sched.h>
14 #include <linux/kernel.h>
15 #include <linux/mm.h>
16 #include <linux/shm.h>
17 #include <linux/errno.h>
18 #include <linux/mman.h>
19 #include <linux/string.h>
20 #include <linux/malloc.h>
21 #include <linux/fs.h>
22 #include <linux/locks.h>
23 #include <linux/pagemap.h>
24 #include <linux/swap.h>
25 #include <linux/smp.h>
26 #include <linux/smp_lock.h>
27 #include <linux/blkdev.h>
28 #include <linux/file.h>
30 #include <asm/system.h>
31 #include <asm/pgtable.h>
32 #include <asm/uaccess.h>
35 * Shared mappings implemented 30.11.1994. It's not fully working yet,
36 * though.
38 * Shared mappings now work. 15.8.1995 Bruno.
41 unsigned long page_cache_size = 0;
42 struct page * page_hash_table[PAGE_HASH_SIZE];
45 * Simple routines for both non-shared and shared mappings.
48 #define release_page(page) __free_page((page))
51 * Invalidate the pages of an inode, removing all pages that aren't
52 * locked down (those are sure to be up-to-date anyway, so we shouldn't
53 * invalidate them).
55 void invalidate_inode_pages(struct inode * inode)
57 struct page ** p;
58 struct page * page;
60 p = &inode->i_pages;
61 while ((page = *p) != NULL) {
62 if (PageLocked(page)) {
63 p = &page->next;
64 continue;
66 inode->i_nrpages--;
67 if ((*p = page->next) != NULL)
68 (*p)->prev = page->prev;
69 page->next = NULL;
70 page->prev = NULL;
71 remove_page_from_hash_queue(page);
72 page->inode = NULL;
73 __free_page(page);
74 continue;
79 * Truncate the page cache at a set offset, removing the pages
80 * that are beyond that offset (and zeroing out partial pages).
82 void truncate_inode_pages(struct inode * inode, unsigned long start)
84 struct page ** p;
85 struct page * page;
87 repeat:
88 p = &inode->i_pages;
89 while ((page = *p) != NULL) {
90 unsigned long offset = page->offset;
92 /* page wholly truncated - free it */
93 if (offset >= start) {
94 if (PageLocked(page)) {
95 wait_on_page(page);
96 goto repeat;
98 inode->i_nrpages--;
99 if ((*p = page->next) != NULL)
100 (*p)->prev = page->prev;
101 page->next = NULL;
102 page->prev = NULL;
103 remove_page_from_hash_queue(page);
104 page->inode = NULL;
105 __free_page(page);
106 continue;
108 p = &page->next;
109 offset = start - offset;
110 /* partial truncate, clear end of page */
111 if (offset < PAGE_SIZE) {
112 unsigned long address = page_address(page);
113 memset((void *) (offset + address), 0, PAGE_SIZE - offset);
114 flush_page_to_ram(address);
119 int shrink_mmap(int priority, int gfp_mask)
121 static unsigned long clock = 0;
122 struct page * page;
123 unsigned long limit = num_physpages;
124 struct buffer_head *tmp, *bh;
125 int count_max, count_min;
127 count_max = (limit<<1) >> (priority>>1);
128 count_min = (limit<<1) >> (priority);
130 page = mem_map + clock;
131 do {
132 count_max--;
133 if (page->inode || page->buffers)
134 count_min--;
136 if (PageLocked(page))
137 goto next;
138 if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
139 goto next;
140 /* First of all, regenerate the page's referenced bit
141 from any buffers in the page */
142 bh = page->buffers;
143 if (bh) {
144 tmp = bh;
145 do {
146 if (buffer_touched(tmp)) {
147 clear_bit(BH_Touched, &tmp->b_state);
148 set_bit(PG_referenced, &page->flags);
150 tmp = tmp->b_this_page;
151 } while (tmp != bh);
154 /* We can't throw away shared pages, but we do mark
155 them as referenced. This relies on the fact that
156 no page is currently in both the page cache and the
157 buffer cache; we'd have to modify the following
158 test to allow for that case. */
160 switch (atomic_read(&page->count)) {
161 case 1:
162 /* If it has been referenced recently, don't free it */
163 if (test_and_clear_bit(PG_referenced, &page->flags))
164 break;
166 /* is it a swap-cache or page-cache page? */
167 if (page->inode) {
168 if (PageSwapCache(page)) {
169 delete_from_swap_cache(page);
170 return 1;
172 remove_page_from_hash_queue(page);
173 remove_page_from_inode_queue(page);
174 __free_page(page);
175 return 1;
178 /* is it a buffer cache page? */
179 if ((gfp_mask & __GFP_IO) && bh && try_to_free_buffer(bh, &bh, 6))
180 return 1;
181 break;
183 default:
184 /* more than one user: we can't throw it away */
185 set_bit(PG_referenced, &page->flags);
186 /* fall through */
187 case 0:
188 /* nothing */
190 next:
191 page++;
192 clock++;
193 if (clock >= limit) {
194 clock = 0;
195 page = mem_map;
197 } while (count_max > 0 && count_min > 0);
198 return 0;
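/*
 * Editor's note, not in the original: a worked example of the scan budget
 * above.  With num_physpages = 4096 and priority = 6, count_max =
 * (4096<<1)>>3 = 1024 and count_min = (4096<<1)>>6 = 128, so one call
 * examines at most 1024 mem_map entries and gives up after seeing 128
 * pages that are in the page or buffer cache, unless it manages to free
 * a page first (in which case it returns 1 immediately).
 */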
202 * This is called from try_to_swap_out() when we try to get rid of some
203 * pages.. If we're unmapping the last occurrence of this page, we also
204 * free it from the page hash-queues etc, as we don't want to keep it
205 * in-core unnecessarily.
207 unsigned long page_unuse(unsigned long page)
209 struct page * p = mem_map + MAP_NR(page);
210 int count = atomic_read(&p->count);
212 if (count != 2)
213 return count;
214 if (!p->inode)
215 return count;
216 if (PageSwapCache(p))
217 panic ("Doing a normal page_unuse of a swap cache page");
218 remove_page_from_hash_queue(p);
219 remove_page_from_inode_queue(p);
220 free_page(page);
221 return 1;
225 * Update a page cache copy, when we're doing a "write()" system call
226 * See also "update_vm_cache()".
228 void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count)
230 unsigned long offset, len;
232 offset = (pos & ~PAGE_MASK);
233 pos = pos & PAGE_MASK;
234 len = PAGE_SIZE - offset;
235 do {
236 struct page * page;
238 if (len > count)
239 len = count;
240 page = find_page(inode, pos);
241 if (page) {
242 wait_on_page(page);
243 memcpy((void *) (offset + page_address(page)), buf, len);
244 release_page(page);
246 count -= len;
247 buf += len;
248 len = PAGE_SIZE;
249 offset = 0;
250 pos += PAGE_SIZE;
251 } while (count);
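/*
 * Editor's illustration (not in the original): how the loop above splits a
 * write that straddles a page boundary, assuming PAGE_SIZE = 4096.  For
 * pos = 5000 and count = 5000, the first pass copies len = 4096 - 904 =
 * 3192 bytes at offset 904 into the cached page at file offset 4096; the
 * second pass copies the remaining 1808 bytes at offset 0 into the page
 * at file offset 8192, and the loop stops because count reaches zero.
 */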
254 static inline void add_to_page_cache(struct page * page,
255 struct inode * inode, unsigned long offset,
256 struct page **hash)
258 atomic_inc(&page->count);
259 page->flags &= ~((1 << PG_uptodate) | (1 << PG_error));
260 page->offset = offset;
261 add_page_to_inode_queue(inode, page);
262 __add_page_to_hash_queue(page, hash);
266 * Try to read ahead in the file. "page_cache" is a potentially free page
267 * that we could use for the cache (if it is 0 we can try to create one,
268 * this is all overlapped with the IO on the previous page finishing anyway)
270 static unsigned long try_to_read_ahead(struct file * file,
271 unsigned long offset, unsigned long page_cache)
273 struct inode *inode = file->f_dentry->d_inode;
274 struct page * page;
275 struct page ** hash;
277 offset &= PAGE_MASK;
278 switch (page_cache) {
279 case 0:
280 page_cache = __get_free_page(GFP_KERNEL);
281 if (!page_cache)
282 break;
283 default:
284 if (offset >= inode->i_size)
285 break;
286 hash = page_hash(inode, offset);
287 page = __find_page(inode, offset, *hash);
288 if (!page) {
290 * Ok, add the new page to the hash-queues...
292 page = mem_map + MAP_NR(page_cache);
293 add_to_page_cache(page, inode, offset, hash);
294 inode->i_op->readpage(file, page);
295 page_cache = 0;
297 release_page(page);
299 return page_cache;
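/*
 * Editor's note on the calling convention above (not in the original):
 * "page_cache" is either 0 or the address of a spare free page supplied by
 * the caller.  If the target page is missing from the cache, the spare is
 * turned into the new page-cache page and readpage() is started on it, so
 * 0 is returned to tell the caller that its spare was consumed; otherwise
 * the untouched spare is handed back for reuse on the next call, which is
 * how generic_file_readahead() recycles one spare page across a whole burst.
 */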
303 * Wait for IO to complete on a locked page.
305 * This must be called with the caller "holding" the page,
306 * ie with increased "page->count" so that the page won't
307 * go away during the wait..
309 void __wait_on_page(struct page *page)
311 struct task_struct *tsk = current;
312 struct wait_queue wait;
314 wait.task = tsk;
315 add_wait_queue(&page->wait, &wait);
316 repeat:
317 tsk->state = TASK_UNINTERRUPTIBLE;
318 run_task_queue(&tq_disk);
319 if (PageLocked(page)) {
320 schedule();
321 goto repeat;
323 tsk->state = TASK_RUNNING;
324 remove_wait_queue(&page->wait, &wait);
327 #if 0
328 #define PROFILE_READAHEAD
329 #define DEBUG_READAHEAD
330 #endif
333 * Read-ahead profiling information
334 * --------------------------------
335 * Every PROFILE_MAXREADCOUNT reads, the following information is written
336 * to the syslog:
337 * the percentage of asynchronous read-ahead, and
338 * the average values of the read-ahead context fields.
339 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
340 * to the syslog.
343 #ifdef PROFILE_READAHEAD
345 #define PROFILE_MAXREADCOUNT 1000
347 static unsigned long total_reada;
348 static unsigned long total_async;
349 static unsigned long total_ramax;
350 static unsigned long total_ralen;
351 static unsigned long total_rawin;
353 static void profile_readahead(int async, struct file *filp)
355 unsigned long flags;
357 ++total_reada;
358 if (async)
359 ++total_async;
361 total_ramax += filp->f_ramax;
362 total_ralen += filp->f_ralen;
363 total_rawin += filp->f_rawin;
365 if (total_reada > PROFILE_MAXREADCOUNT) {
366 save_flags(flags);
367 cli();
368 if (!(total_reada > PROFILE_MAXREADCOUNT)) {
369 restore_flags(flags);
370 return;
373 printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
374 total_ramax/total_reada,
375 total_ralen/total_reada,
376 total_rawin/total_reada,
377 (total_async*100)/total_reada);
378 #ifdef DEBUG_READAHEAD
379 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
380 filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
381 #endif
383 total_reada = 0;
384 total_async = 0;
385 total_ramax = 0;
386 total_ralen = 0;
387 total_rawin = 0;
389 restore_flags(flags);
392 #endif /* defined PROFILE_READAHEAD */
395 * Read-ahead context:
396 * -------------------
397 * The read ahead context fields of the "struct file" are the following:
398 * - f_raend : position of the first byte after the last page we tried to
399 * read ahead.
400 * - f_ramax : current read-ahead maximum size.
401 * - f_ralen : length of the current IO read block we tried to read-ahead.
402 * - f_rawin : length of the current read-ahead window.
403 * if last read-ahead was synchronous then
404 * f_rawin = f_ralen
405 * otherwise (was asynchronous)
406 * f_rawin = previous value of f_ralen + f_ralen
408 * Read-ahead limits:
409 * ------------------
410 * MIN_READAHEAD : minimum read-ahead size when read-ahead is active.
411 * MAX_READAHEAD : maximum read-ahead size when read-ahead is active.
413 * Synchronous read-ahead benefits:
414 * --------------------------------
415 * Using a reasonable IO transfer length when talking to peripheral devices
416 * increases system performance.
417 * Reasonable means, in this context, not too large but not too small.
418 * The actual maximum value is:
419 * MAX_READAHEAD + PAGE_SIZE = 76k if CONFIG_READA_SMALL is undefined
420 * and 32K if defined (4K page size assumed).
422 * Asynchronous read-ahead benefits:
423 * ---------------------------------
424 * Overlapping the next read request with user process execution increases
425 * system performance.
427 * Read-ahead risks:
428 * -----------------
429 * We have to guess which further data the user process will need.
430 * If these data are often not really needed, it's bad for system
431 * performance.
432 * However, we know that files are often accessed sequentially by
433 * application programs, and it seems possible to find a reasonably
434 * good strategy for that guessing.
435 * We only try to read ahead files that seem to be read sequentially.
437 * Asynchronous read-ahead risks:
438 * ------------------------------
439 * In order to maximize overlapping, we must start an asynchronous read
440 * request on the device as soon as possible.
441 * We must be very careful about:
442 * - The number of effective pending IO read requests.
443 * ONE seems to be the only reasonable value.
444 * - The total memory pool usage for the file access stream.
445 * This maximum memory usage is implicitly 2 IO read chunks:
446 * 2*(MAX_READAHEAD + PAGE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
447 * 64k if defined (4K page size assumed).
448 */
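/*
 * Editor's illustration of the window bookkeeping described above (not in
 * the original): if the last synchronous read-ahead covered 16k, then
 * f_ralen = f_rawin = 16k, with f_raend just past that IO.  If the next
 * read-ahead is asynchronous and covers 32k, then f_ralen becomes 32k and
 * f_rawin becomes 16k + 32k = 48k, i.e. the window spans both bursts.
 */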
450 static inline int get_max_readahead(struct inode * inode)
452 if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
453 return MAX_READAHEAD;
454 return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
457 static inline unsigned long generic_file_readahead(int reada_ok,
458 struct file * filp, struct inode * inode,
459 unsigned long ppos, struct page * page, unsigned long page_cache)
461 unsigned long max_ahead, ahead;
462 unsigned long raend;
463 int max_readahead = get_max_readahead(inode);
465 raend = filp->f_raend & PAGE_MASK;
466 max_ahead = 0;
469 * The current page is locked.
470 * If the current position is inside the previous read IO request, do not
471 * try to reread previously read ahead pages.
472 * Otherwise, decide whether or not to read ahead some pages synchronously.
473 * If we are not going to read ahead, set the read ahead context for this
474 * page only.
476 if (PageLocked(page)) {
477 if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
478 raend = ppos;
479 if (raend < inode->i_size)
480 max_ahead = filp->f_ramax;
481 filp->f_rawin = 0;
482 filp->f_ralen = PAGE_SIZE;
483 if (!max_ahead) {
484 filp->f_raend = ppos + filp->f_ralen;
485 filp->f_rawin += filp->f_ralen;
490 * The current page is not locked.
491 * If we were reading ahead and,
492 * if the current max read ahead size is not zero and,
493 * if the current position is inside the last read-ahead IO request,
494 * it is the moment to try to read ahead asynchronously.
495 * We will later force an unplug of the device in order to start the asynchronous read IO.
497 else if (reada_ok && filp->f_ramax && raend >= PAGE_SIZE &&
498 ppos <= raend && ppos + filp->f_ralen >= raend) {
500 * Add ONE page to max_ahead in order to try to have about the same IO max size
501 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_SIZE.
502 * Compute the position of the last page we have tried to read in order to
503 * begin to read ahead just at the next page.
505 raend -= PAGE_SIZE;
506 if (raend < inode->i_size)
507 max_ahead = filp->f_ramax + PAGE_SIZE;
509 if (max_ahead) {
510 filp->f_rawin = filp->f_ralen;
511 filp->f_ralen = 0;
512 reada_ok = 2;
516 * Try to read ahead pages.
517 * We hope that ll_rw_blk() plug/unplug, request coalescing and sorting, and
518 * the scheduler will do well enough for us to avoid really bad actual IO requests.
520 ahead = 0;
521 while (ahead < max_ahead) {
522 ahead += PAGE_SIZE;
523 page_cache = try_to_read_ahead(filp, raend + ahead,
524 page_cache);
527 * If we tried to read some pages ahead, then:
528 * If we tried to read ahead asynchronously,
529 * try to force an unplug of the device in order to start the asynchronous
530 * read IO request.
531 * Update the read-ahead context:
532 * store the length of the current read-ahead window,
533 * and double the current max read ahead size.
534 * That heuristic avoids doing large IO for files that are not really
535 * accessed sequentially.
537 if (ahead) {
538 if (reada_ok == 2) {
539 run_task_queue(&tq_disk);
542 filp->f_ralen += ahead;
543 filp->f_rawin += filp->f_ralen;
544 filp->f_raend = raend + ahead + PAGE_SIZE;
546 filp->f_ramax += filp->f_ramax;
548 if (filp->f_ramax > max_readahead)
549 filp->f_ramax = max_readahead;
551 #ifdef PROFILE_READAHEAD
552 profile_readahead((reada_ok == 2), filp);
553 #endif
556 return page_cache;
561 * This is a generic file read routine, and uses the
562 * inode->i_op->readpage() function for the actual low-level
563 * stuff.
565 * This is really ugly. But the goto's actually try to clarify some
566 * of the logic when it comes to error handling etc.
569 ssize_t generic_file_read(struct file * filp, char * buf,
570 size_t count, loff_t *ppos)
572 struct dentry *dentry = filp->f_dentry;
573 struct inode *inode = dentry->d_inode;
574 ssize_t error, read;
575 size_t pos, pgpos, page_cache;
576 int reada_ok;
577 int max_readahead = get_max_readahead(inode);
579 if (!access_ok(VERIFY_WRITE, buf, count))
580 return -EFAULT;
581 if (!count)
582 return 0;
583 error = 0;
584 read = 0;
585 page_cache = 0;
587 pos = *ppos;
588 pgpos = pos & PAGE_MASK;
590 * If the current position is outside the previous read-ahead window,
591 * we reset the current read-ahead context and set read ahead max to zero
592 * (it will be set to just the needed value later),
593 * otherwise, we assume that the file accesses are sequential enough to
594 * continue read-ahead.
596 if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) {
597 reada_ok = 0;
598 filp->f_raend = 0;
599 filp->f_ralen = 0;
600 filp->f_ramax = 0;
601 filp->f_rawin = 0;
602 } else {
603 reada_ok = 1;
606 * Adjust the current value of read-ahead max.
607 * If the read operation stays within the first half page, force no readahead.
608 * Otherwise try to increase read ahead max just enough to do the read request.
609 * Then clamp it to at least MIN_READAHEAD if read-ahead is ok,
610 * and to at most MAX_READAHEAD in all cases.
612 if (pos + count <= (PAGE_SIZE >> 1)) {
613 filp->f_ramax = 0;
614 } else {
615 unsigned long needed;
617 needed = ((pos + count) & PAGE_MASK) - pgpos;
619 if (filp->f_ramax < needed)
620 filp->f_ramax = needed;
622 if (reada_ok && filp->f_ramax < MIN_READAHEAD)
623 filp->f_ramax = MIN_READAHEAD;
624 if (filp->f_ramax > max_readahead)
625 filp->f_ramax = max_readahead;
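/*
 * Editor's note (worked example, not in the original): with PAGE_SIZE =
 * 4096, a read of count = 10000 bytes at pos = 0 does not fit in the
 * first half page, so needed = ((0 + 10000) & PAGE_MASK) - pgpos = 8192
 * and f_ramax is raised to at least 8192, then to at least MIN_READAHEAD
 * if we are in read-ahead mode, and finally capped at max_readahead.
 */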
628 for (;;) {
629 struct page *page, **hash;
631 if (pos >= inode->i_size)
632 break;
635 * Try to find the data in the page cache..
637 hash = page_hash(inode, pos & PAGE_MASK);
638 page = __find_page(inode, pos & PAGE_MASK, *hash);
639 if (!page)
640 goto no_cached_page;
642 found_page:
644 * Try to read ahead only if the current page is filled or being filled.
645 * Otherwise, if we were reading ahead, decrease max read ahead size to
646 * the minimum value.
647 * In this context, that seems to happen only on a read error or if
648 * the page has been rewritten.
650 if (PageUptodate(page) || PageLocked(page))
651 page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_MASK, page, page_cache);
652 else if (reada_ok && filp->f_ramax > MIN_READAHEAD)
653 filp->f_ramax = MIN_READAHEAD;
655 wait_on_page(page);
657 if (!PageUptodate(page))
658 goto page_read_error;
660 success:
662 * Ok, we have the page, it's up-to-date and ok,
663 * so now we can finally copy it to user space...
666 unsigned long offset, nr;
668 offset = pos & ~PAGE_MASK;
669 nr = PAGE_SIZE - offset;
670 if (nr > count)
671 nr = count;
672 if (nr > inode->i_size - pos)
673 nr = inode->i_size - pos;
674 nr -= copy_to_user(buf, (void *) (page_address(page) + offset), nr);
675 release_page(page);
676 error = -EFAULT;
677 if (!nr)
678 break;
679 buf += nr;
680 pos += nr;
681 read += nr;
682 count -= nr;
683 if (count)
684 continue;
685 break;
688 no_cached_page:
690 * Ok, it wasn't cached, so we need to create a new
691 * page..
693 if (!page_cache) {
694 page_cache = __get_free_page(GFP_KERNEL);
696 * That could have slept, so go around to the
697 * very beginning..
699 if (page_cache)
700 continue;
701 error = -ENOMEM;
702 break;
706 * Ok, add the new page to the hash-queues...
708 page = mem_map + MAP_NR(page_cache);
709 page_cache = 0;
710 add_to_page_cache(page, inode, pos & PAGE_MASK, hash);
713 * Error handling is tricky. If we get a read error,
714 * the cached page stays in the cache (but uptodate=0),
715 * and the next process that accesses it will try to
716 * re-read it. This is needed for NFS etc, where the
717 * identity of the reader can decide if we can read the
718 * page or not..
721 * We have to read the page.
722 * If we were reading ahead, we had previously tried to read this page.
723 * That means that the page has probably been removed from the cache before
724 * the application process needs it, or has been rewritten.
725 * Decrease max readahead size to the minimum value in that situation.
727 if (reada_ok && filp->f_ramax > MIN_READAHEAD)
728 filp->f_ramax = MIN_READAHEAD;
730 error = inode->i_op->readpage(filp, page);
731 if (!error)
732 goto found_page;
733 release_page(page);
734 break;
736 page_read_error:
738 * We found the page, but it wasn't up-to-date.
739 * Try to re-read it _once_. We do this synchronously,
740 * because this happens only if there were errors.
742 error = inode->i_op->readpage(filp, page);
743 if (!error) {
744 wait_on_page(page);
745 if (PageUptodate(page) && !PageError(page))
746 goto success;
747 error = -EIO; /* Some unspecified error occurred.. */
749 release_page(page);
750 break;
753 *ppos = pos;
754 filp->f_reada = 1;
755 if (page_cache)
756 free_page(page_cache);
757 UPDATE_ATIME(inode);
758 if (!read)
759 read = error;
760 return read;
764 * Semantics for shared and private memory areas are different past the end
765 * of the file. A shared mapping past the last page of the file is an error
766 * and results in a SIGBUS, while a private mapping just maps in a zero page.
768 * The goto's are kind of ugly, but this streamlines the normal case of having
769 * it in the page cache, and handles the special cases reasonably without
770 * having a lot of duplicated code.
772 * WSH 06/04/97: fixed a memory leak and moved the allocation of new_page
773 * ahead of the wait if we're sure to need it.
775 static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
777 struct file * file = area->vm_file;
778 struct dentry * dentry = file->f_dentry;
779 struct inode * inode = dentry->d_inode;
780 unsigned long offset;
781 struct page * page, **hash;
782 unsigned long old_page, new_page;
784 new_page = 0;
785 offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
786 if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
787 goto no_page;
790 * Do we have something in the page cache already?
792 hash = page_hash(inode, offset);
793 page = __find_page(inode, offset, *hash);
794 if (!page)
795 goto no_cached_page;
797 found_page:
799 * Ok, found a page in the page cache, now we need to check
800 * that it's up-to-date. First check whether we'll need an
801 * extra page -- better to overlap the allocation with the I/O.
803 if (no_share && !new_page) {
804 new_page = __get_free_page(GFP_KERNEL);
805 if (!new_page)
806 goto failure;
809 if (PageLocked(page))
810 goto page_locked_wait;
811 if (!PageUptodate(page))
812 goto page_read_error;
814 success:
816 * Found the page, need to check sharing and possibly
817 * copy it over to another page..
819 old_page = page_address(page);
820 if (!no_share) {
822 * Ok, we can share the cached page directly.. Get rid
823 * of any potential extra pages.
825 if (new_page)
826 free_page(new_page);
828 flush_page_to_ram(old_page);
829 return old_page;
833 * No sharing ... copy to the new page.
835 copy_page(new_page, old_page);
836 flush_page_to_ram(new_page);
837 release_page(page);
838 return new_page;
840 no_cached_page:
841 new_page = __get_free_page(GFP_KERNEL);
842 if (!new_page)
843 goto no_page;
846 * During getting the above page we might have slept,
847 * so we need to re-check the situation with the page
848 * cache.. The page we just got may be useful if we
849 * can't share, so don't get rid of it here.
851 page = find_page(inode, offset);
852 if (page)
853 goto found_page;
856 * Now, create a new page-cache page from the page we got
858 page = mem_map + MAP_NR(new_page);
859 new_page = 0;
860 add_to_page_cache(page, inode, offset, hash);
862 if (inode->i_op->readpage(file, page) != 0)
863 goto failure;
866 * Do a very limited read-ahead if appropriate
868 if (PageLocked(page))
869 new_page = try_to_read_ahead(file, offset + PAGE_SIZE, 0);
870 goto found_page;
872 page_locked_wait:
873 __wait_on_page(page);
874 if (PageUptodate(page))
875 goto success;
877 page_read_error:
879 * Umm, take care of errors if the page isn't up-to-date.
880 * Try to re-read it _once_. We do this synchronously,
881 * because there really aren't any performance issues here
882 * and we need to check for errors.
884 if (inode->i_op->readpage(file, page) != 0)
885 goto failure;
886 wait_on_page(page);
887 if (PageError(page))
888 goto failure;
889 if (PageUptodate(page))
890 goto success;
893 * Uhhuh.. Things didn't work out. Return zero to tell the
894 * mm layer so, possibly freeing the page cache page first.
896 failure:
897 release_page(page);
898 if (new_page)
899 free_page(new_page);
900 no_page:
901 return 0;
905 * Tries to write a shared mapped page to its backing store. May return -EIO
906 * if the disk is full.
908 static inline int do_write_page(struct inode * inode, struct file * file,
909 const char * page, unsigned long offset)
911 int retval;
912 unsigned long size;
913 mm_segment_t old_fs;
915 size = offset + PAGE_SIZE;
916 /* refuse to extend file size.. */
917 if (S_ISREG(inode->i_mode)) {
918 if (size > inode->i_size)
919 size = inode->i_size;
920 /* Ho humm.. We should have tested for this earlier */
921 if (size < offset)
922 return -EIO;
924 size -= offset;
925 old_fs = get_fs();
926 set_fs(KERNEL_DS);
927 retval = -EIO;
928 if (size == file->f_op->write(file, (const char *) page,
929 size, &file->f_pos))
930 retval = 0;
931 set_fs(old_fs);
932 return retval;
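/*
 * Editor's note (not in the original): the set_fs(KERNEL_DS) dance above
 * is what lets us call f_op->write() with a kernel-mapped page as the
 * buffer: the write path's user-access checks are widened to accept
 * kernel addresses for the duration of the call, and the old segment
 * limit is restored before returning.
 */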
935 static int filemap_write_page(struct vm_area_struct * vma,
936 unsigned long offset,
937 unsigned long page)
939 int result;
940 struct file * file;
941 struct dentry * dentry;
942 struct inode * inode;
943 struct buffer_head * bh;
945 bh = mem_map[MAP_NR(page)].buffers;
946 if (bh) {
947 /* whee.. just mark the buffer heads dirty */
948 struct buffer_head * tmp = bh;
949 do {
951 * WSH: There's a race here: mark_buffer_dirty()
952 * could block, and the buffers aren't pinned down.
954 mark_buffer_dirty(tmp, 0);
955 tmp = tmp->b_this_page;
956 } while (tmp != bh);
957 return 0;
960 file = vma->vm_file;
961 dentry = file->f_dentry;
962 inode = dentry->d_inode;
963 if (!file->f_op->write)
964 return -EIO;
967 * If a task terminates while we're swapping the page, the vma
968 * and file could be released ... increment the count to be safe.
970 file->f_count++;
971 down(&inode->i_sem);
972 result = do_write_page(inode, file, (const char *) page, offset);
973 up(&inode->i_sem);
974 fput(file);
975 return result;
980 * Swapping to a shared file: while we're busy writing out the page
981 * (and the page still exists in memory), we save the page information
982 * in the page table, so that "filemap_swapin()" can re-use the page
983 * immediately if it is called while we're busy swapping it out..
985 * Once we've written it all out, we mark the page entry "empty", which
986 * will result in a normal page-in (instead of a swap-in) from the now
987 * up-to-date disk file.
989 int filemap_swapout(struct vm_area_struct * vma,
990 unsigned long offset,
991 pte_t *page_table)
993 int error;
994 unsigned long page = pte_page(*page_table);
995 unsigned long entry = SWP_ENTRY(SHM_SWP_TYPE, MAP_NR(page));
997 flush_cache_page(vma, (offset + vma->vm_start - vma->vm_offset));
998 set_pte(page_table, __pte(entry));
999 flush_tlb_page(vma, (offset + vma->vm_start - vma->vm_offset));
1000 error = filemap_write_page(vma, offset, page);
1001 if (pte_val(*page_table) == entry)
1002 pte_clear(page_table);
1003 return error;
1007 * filemap_swapin() is called only if we have something in the page
1008 * tables that is non-zero (but not present), which we know to be the
1009 * page index of a page that is busy being swapped out (see above).
1010 * So we just use it directly..
1012 static pte_t filemap_swapin(struct vm_area_struct * vma,
1013 unsigned long offset,
1014 unsigned long entry)
1016 unsigned long page = SWP_OFFSET(entry);
1018 atomic_inc(&mem_map[page].count);
1019 page = (page << PAGE_SHIFT) + PAGE_OFFSET;
1020 return mk_pte(page,vma->vm_page_prot);
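/*
 * Editor's illustration of how swapout and swapin cooperate (not in the
 * original): filemap_swapout() leaves SWP_ENTRY(SHM_SWP_TYPE, MAP_NR(page))
 * in the pte while the page is being written out.  If the process faults
 * on it before the write-out finishes, filemap_swapin() rebuilds a present
 * pte directly from that map number ((page << PAGE_SHIFT) + PAGE_OFFSET)
 * without any IO.  If nothing faults it back in, the pte is cleared once
 * the write completes and the next access becomes an ordinary page-in
 * through filemap_nopage().
 */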
1024 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
1025 unsigned long address, unsigned int flags)
1027 pte_t pte = *ptep;
1028 unsigned long page;
1029 int error;
1031 if (!(flags & MS_INVALIDATE)) {
1032 if (!pte_present(pte))
1033 return 0;
1034 if (!pte_dirty(pte))
1035 return 0;
1036 flush_page_to_ram(pte_page(pte));
1037 flush_cache_page(vma, address);
1038 set_pte(ptep, pte_mkclean(pte));
1039 flush_tlb_page(vma, address);
1040 page = pte_page(pte);
1041 atomic_inc(&mem_map[MAP_NR(page)].count);
1042 } else {
1043 if (pte_none(pte))
1044 return 0;
1045 flush_cache_page(vma, address);
1046 pte_clear(ptep);
1047 flush_tlb_page(vma, address);
1048 if (!pte_present(pte)) {
1049 swap_free(pte_val(pte));
1050 return 0;
1052 page = pte_page(pte);
1053 if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
1054 free_page(page);
1055 return 0;
1058 error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
1059 free_page(page);
1060 return error;
1063 static inline int filemap_sync_pte_range(pmd_t * pmd,
1064 unsigned long address, unsigned long size,
1065 struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
1067 pte_t * pte;
1068 unsigned long end;
1069 int error;
1071 if (pmd_none(*pmd))
1072 return 0;
1073 if (pmd_bad(*pmd)) {
1074 printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
1075 pmd_clear(pmd);
1076 return 0;
1078 pte = pte_offset(pmd, address);
1079 offset += address & PMD_MASK;
1080 address &= ~PMD_MASK;
1081 end = address + size;
1082 if (end > PMD_SIZE)
1083 end = PMD_SIZE;
1084 error = 0;
1085 do {
1086 error |= filemap_sync_pte(pte, vma, address + offset, flags);
1087 address += PAGE_SIZE;
1088 pte++;
1089 } while (address < end);
1090 return error;
1093 static inline int filemap_sync_pmd_range(pgd_t * pgd,
1094 unsigned long address, unsigned long size,
1095 struct vm_area_struct *vma, unsigned int flags)
1097 pmd_t * pmd;
1098 unsigned long offset, end;
1099 int error;
1101 if (pgd_none(*pgd))
1102 return 0;
1103 if (pgd_bad(*pgd)) {
1104 printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
1105 pgd_clear(pgd);
1106 return 0;
1108 pmd = pmd_offset(pgd, address);
1109 offset = address & PGDIR_MASK;
1110 address &= ~PGDIR_MASK;
1111 end = address + size;
1112 if (end > PGDIR_SIZE)
1113 end = PGDIR_SIZE;
1114 error = 0;
1115 do {
1116 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
1117 address = (address + PMD_SIZE) & PMD_MASK;
1118 pmd++;
1119 } while (address < end);
1120 return error;
1123 static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
1124 size_t size, unsigned int flags)
1126 pgd_t * dir;
1127 unsigned long end = address + size;
1128 int error = 0;
1130 dir = pgd_offset(vma->vm_mm, address);
1131 flush_cache_range(vma->vm_mm, end - size, end);
1132 while (address < end) {
1133 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
1134 address = (address + PGDIR_SIZE) & PGDIR_MASK;
1135 dir++;
1137 flush_tlb_range(vma->vm_mm, end - size, end);
1138 return error;
1142 * This handles (potentially partial) area unmaps..
1144 static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
1146 filemap_sync(vma, start, len, MS_ASYNC);
1150 * Shared mappings need to be able to do the right thing at
1151 * close/unmap/sync. They will also use the private file as
1152 * backing-store for swapping..
1154 static struct vm_operations_struct file_shared_mmap = {
1155 NULL, /* no special open */
1156 NULL, /* no special close */
1157 filemap_unmap, /* unmap - we need to sync the pages */
1158 NULL, /* no special protect */
1159 filemap_sync, /* sync */
1160 NULL, /* advise */
1161 filemap_nopage, /* nopage */
1162 NULL, /* wppage */
1163 filemap_swapout, /* swapout */
1164 filemap_swapin, /* swapin */
1168 * Private mappings just need to be able to load in the map.
1170 * (This is actually used for shared mappings as well, if we
1171 * know they can't ever get write permissions..)
1173 static struct vm_operations_struct file_private_mmap = {
1174 NULL, /* open */
1175 NULL, /* close */
1176 NULL, /* unmap */
1177 NULL, /* protect */
1178 NULL, /* sync */
1179 NULL, /* advise */
1180 filemap_nopage, /* nopage */
1181 NULL, /* wppage */
1182 NULL, /* swapout */
1183 NULL, /* swapin */
1186 /* This is used for a general mmap of a disk file */
1188 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1190 struct vm_operations_struct * ops;
1191 struct inode *inode = file->f_dentry->d_inode;
1193 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1194 ops = &file_shared_mmap;
1195 /* share_page() can only guarantee proper page sharing if
1196 * the offsets are all page aligned. */
1197 if (vma->vm_offset & (PAGE_SIZE - 1))
1198 return -EINVAL;
1199 } else {
1200 ops = &file_private_mmap;
1201 if (vma->vm_offset & (inode->i_sb->s_blocksize - 1))
1202 return -EINVAL;
1204 if (!inode->i_sb || !S_ISREG(inode->i_mode))
1205 return -EACCES;
1206 if (!inode->i_op || !inode->i_op->readpage)
1207 return -ENOEXEC;
1208 UPDATE_ATIME(inode);
1209 vma->vm_file = file;
1210 file->f_count++;
1211 vma->vm_ops = ops;
1212 return 0;
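/*
 * Editor's note (not in the original): as the checks above show, all a
 * filesystem needs in order to use this generic mmap (and the generic
 * read path) is a working inode->i_op->readpage(); generic_file_write()
 * additionally requires i_op->updatepage().  Disk filesystems in this
 * kernel series (ext2, for instance) are expected to point their
 * file_operations mmap entry straight at generic_file_mmap().
 */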
1217 * The msync() system call.
1220 static int msync_interval(struct vm_area_struct * vma,
1221 unsigned long start, unsigned long end, int flags)
1223 if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
1224 int error;
1225 error = vma->vm_ops->sync(vma, start, end-start, flags);
1226 if (!error && (flags & MS_SYNC)) {
1227 struct file * file = vma->vm_file;
1228 if (file) {
1229 struct dentry * dentry = file->f_dentry;
1230 struct inode * inode = dentry->d_inode;
1231 down(&inode->i_sem);
1232 error = file_fsync(file, dentry);
1233 up(&inode->i_sem);
1236 return error;
1238 return 0;
1241 asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
1243 unsigned long end;
1244 struct vm_area_struct * vma;
1245 int unmapped_error, error = -EINVAL;
1247 lock_kernel();
1248 if (start & ~PAGE_MASK)
1249 goto out;
1250 len = (len + ~PAGE_MASK) & PAGE_MASK;
1251 end = start + len;
1252 if (end < start)
1253 goto out;
1254 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1255 goto out;
1256 error = 0;
1257 if (end == start)
1258 goto out;
1260 * If the interval [start,end) covers some unmapped address ranges,
1261 * just ignore them, but return -EFAULT at the end.
1263 vma = find_vma(current->mm, start);
1264 unmapped_error = 0;
1265 for (;;) {
1266 /* Still start < end. */
1267 error = -EFAULT;
1268 if (!vma)
1269 goto out;
1270 /* Here start < vma->vm_end. */
1271 if (start < vma->vm_start) {
1272 unmapped_error = -EFAULT;
1273 start = vma->vm_start;
1275 /* Here vma->vm_start <= start < vma->vm_end. */
1276 if (end <= vma->vm_end) {
1277 if (start < end) {
1278 error = msync_interval(vma, start, end, flags);
1279 if (error)
1280 goto out;
1282 error = unmapped_error;
1283 goto out;
1285 /* Here vma->vm_start <= start < vma->vm_end < end. */
1286 error = msync_interval(vma, start, vma->vm_end, flags);
1287 if (error)
1288 goto out;
1289 start = vma->vm_end;
1290 vma = vma->vm_next;
1292 out:
1293 unlock_kernel();
1294 return error;
1298 * Write to a file through the page cache. This is mainly for the
1299 * benefit of NFS and possibly other network-based file systems.
1301 * We currently put everything into the page cache prior to writing it.
1302 * This is not a problem when writing full pages. With partial pages,
1303 * however, we first have to read the data into the cache, then
1304 * dirty the page, and finally schedule it for writing. Alternatively, we
1305 * could write-through just the portion of data that would go into that
1306 * page, but that would kill performance for applications that write data
1307 * line by line, and it's prone to race conditions.
1309 * Note that this routine doesn't try to keep track of dirty pages. Each
1310 * file system has to do this all by itself, unfortunately.
1311 * okir@monad.swb.de
1313 ssize_t
1314 generic_file_write(struct file *file, const char *buf,
1315 size_t count, loff_t *ppos)
1317 struct dentry *dentry = file->f_dentry;
1318 struct inode *inode = dentry->d_inode;
1319 struct page *page, **hash;
1320 unsigned long page_cache = 0;
1321 unsigned long pgpos, offset;
1322 unsigned long bytes, written;
1323 unsigned long pos;
1324 long status, sync, didread;
1326 if (!inode->i_op || !inode->i_op->updatepage)
1327 return -EIO;
1329 sync = file->f_flags & O_SYNC;
1330 pos = *ppos;
1331 written = 0;
1332 status = 0;
1334 if (file->f_flags & O_APPEND)
1335 pos = inode->i_size;
1337 while (count) {
1339 * Try to find the page in the cache. If it isn't there,
1340 * allocate a free page.
1342 offset = (pos & ~PAGE_MASK);
1343 pgpos = pos & PAGE_MASK;
1345 if ((bytes = PAGE_SIZE - offset) > count)
1346 bytes = count;
1348 hash = page_hash(inode, pgpos);
1349 if (!(page = __find_page(inode, pgpos, *hash))) {
1350 if (!page_cache) {
1351 page_cache = __get_free_page(GFP_KERNEL);
1352 if (page_cache)
1353 continue;
1354 status = -ENOMEM;
1355 break;
1357 page = mem_map + MAP_NR(page_cache);
1358 add_to_page_cache(page, inode, pgpos, hash);
1359 page_cache = 0;
1363 * Note: setting of the PG_locked bit is handled
1364 * below the i_op->xxx interface.
1366 didread = 0;
1367 page_wait:
1368 wait_on_page(page);
1369 if (PageUptodate(page))
1370 goto do_update_page;
1373 * The page is not up-to-date ... if we're writing less
1374 * than a full page of data, we may have to read it first.
1375 * But if the page is past the current end of file, we must
1376 * clear it before updating.
1378 if (bytes < PAGE_SIZE) {
1379 if (pgpos < inode->i_size) {
1380 status = -EIO;
1381 if (didread >= 2)
1382 goto done_with_page;
1383 status = inode->i_op->readpage(file, page);
1384 if (status < 0)
1385 goto done_with_page;
1386 didread++;
1387 goto page_wait;
1388 } else {
1389 /* Must clear for partial writes */
1390 memset((void *) page_address(page), 0,
1391 PAGE_SIZE);
1395 * N.B. We should defer setting PG_uptodate at least until
1396 * the data is copied. A failure in i_op->updatepage() could
1397 * leave the page with garbage data.
1399 set_bit(PG_uptodate, &page->flags);
1401 do_update_page:
1402 /* Alright, the page is there. Now update it. */
1403 status = inode->i_op->updatepage(file, page, buf,
1404 offset, bytes, sync);
1405 done_with_page:
1406 __free_page(page);
1407 if (status < 0)
1408 break;
1410 written += status;
1411 count -= status;
1412 pos += status;
1413 buf += status;
1415 *ppos = pos;
1416 if (pos > inode->i_size)
1417 inode->i_size = pos;
1419 if (page_cache)
1420 free_page(page_cache);
1421 return written ? written : status;
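/*
 * Editor's illustration of the partial-page write path above (not in the
 * original), assuming PAGE_SIZE = 4096: writing 100 bytes at pos = 4146
 * (offset 50 into the page at pgpos = 4096) with that page absent from
 * the cache first reads the page in via readpage() when pgpos is below
 * i_size, and only then lets updatepage() merge the 100 bytes; if pgpos
 * were at or beyond i_size, the page would simply be cleared instead of
 * read.
 */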
1425 * Support routines for directory caching using the page cache.
1429 * Finds the page at the specified offset, installing a new page
1430 * if requested. The count is incremented and the page is locked.
1432 * Note: we don't have to worry about races here, as the caller
1433 * is holding the inode semaphore.
1435 unsigned long get_cached_page(struct inode * inode, unsigned long offset,
1436 int new)
1438 struct page * page;
1439 struct page ** hash;
1440 unsigned long page_cache = 0;
1442 hash = page_hash(inode, offset);
1443 page = __find_page(inode, offset, *hash);
1444 if (!page) {
1445 if (!new)
1446 goto out;
1447 page_cache = get_free_page(GFP_KERNEL);
1448 if (!page_cache)
1449 goto out;
1450 page = mem_map + MAP_NR(page_cache);
1451 add_to_page_cache(page, inode, offset, hash);
1453 if (atomic_read(&page->count) != 2)
1454 printk(KERN_ERR "get_cached_page: page count=%d\n",
1455 atomic_read(&page->count));
1456 if (test_bit(PG_locked, &page->flags))
1457 printk(KERN_ERR "get_cached_page: page already locked!\n");
1458 set_bit(PG_locked, &page->flags);
1459 page_cache = page_address(page);
1461 out:
1462 return page_cache;
1466 * Unlock and free a page.
1468 void put_cached_page(unsigned long addr)
1470 struct page * page = mem_map + MAP_NR(addr);
1472 if (!test_bit(PG_locked, &page->flags))
1473 printk("put_cached_page: page not locked!\n");
1474 if (atomic_read(&page->count) != 2)
1475 printk("put_cached_page: page count=%d\n",
1476 atomic_read(&page->count));
1477 clear_bit(PG_locked, &page->flags);
1478 wake_up(&page->wait);
1479 __free_page(page);
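#if 0
/*
 * Editor's illustration, not part of the original file: roughly how a
 * filesystem could use the two helpers above to touch a directory page.
 * The caller is expected to hold inode->i_sem; zeroing the page is just
 * a stand-in for a real modification, and the function name is made up.
 */
static int example_clear_dir_page(struct inode * dir, unsigned long offset)
{
	unsigned long addr;

	addr = get_cached_page(dir, offset & PAGE_MASK, 1);
	if (!addr)
		return -ENOMEM;
	memset((void *) addr, 0, PAGE_SIZE);	/* the actual modification */
	put_cached_page(addr);			/* unlocks and drops our reference */
	return 0;
}
#endif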