/*
 *	linux/mm/filemap.c
 *
 * Copyright (C) 1994, 1995  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/malloc.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/locks.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/swapctl.h>

#include <asm/pgtable.h>
#include <asm/uaccess.h>

/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 */

unsigned long page_cache_size = 0;
struct page * page_hash_table[PAGE_HASH_SIZE];

/*
 * Simple routines for both non-shared and shared mappings.
 */

#define release_page(page) __free_page((page))

/*
 * Invalidate the pages of an inode, removing all pages that aren't
 * locked down (those are sure to be up-to-date anyway, so we shouldn't
 * invalidate them).
 */
void invalidate_inode_pages(struct inode * inode)
{
	struct page ** p;
	struct page * page;

	p = &inode->i_pages;
	while ((page = *p) != NULL) {
		if (PageLocked(page)) {
			p = &page->next;
			continue;
		}
		inode->i_nrpages--;
		if ((*p = page->next) != NULL)
			(*p)->prev = page->prev;
		page->next = NULL;
		page->prev = NULL;
		remove_page_from_hash_queue(page);
		page->inode = NULL;
		__free_page(page);
		continue;
	}
}

/*
 * Truncate the page cache at a set offset, removing the pages
 * that are beyond that offset (and zeroing out partial pages).
 */
void truncate_inode_pages(struct inode * inode, unsigned long start)
{
	struct page ** p;
	struct page * page;

repeat:
	p = &inode->i_pages;
	while ((page = *p) != NULL) {
		unsigned long offset = page->offset;

		/* page wholly truncated - free it */
		if (offset >= start) {
			if (PageLocked(page)) {
				wait_on_page(page);
				goto repeat;
			}
			inode->i_nrpages--;
			if ((*p = page->next) != NULL)
				(*p)->prev = page->prev;
			page->next = NULL;
			page->prev = NULL;
			remove_page_from_hash_queue(page);
			page->inode = NULL;
			__free_page(page);
			continue;
		}
		p = &page->next;
		offset = start - offset;
		/* partial truncate, clear end of page */
		if (offset < PAGE_SIZE) {
			unsigned long address = page_address(page);
			memset((void *) (offset + address), 0, PAGE_SIZE - offset);
			flush_page_to_ram(address);
		}
	}
}

/*
 * Remove a page from the page cache and free it.
 */
void remove_inode_page(struct page *page)
{
	remove_page_from_hash_queue(page);
	remove_page_from_inode_queue(page);
	__free_page(page);
}

int shrink_mmap(int priority, int gfp_mask)
{
	static unsigned long clock = 0;
	unsigned long limit = num_physpages;
	struct page * page;
	int count;

	count = limit >> priority;

	page = mem_map + clock;
	do {
		int referenced;

		/* This works even in the presence of PageSkip because
		 * the first two entries at the beginning of a hole will
		 * be marked, not just the first.
		 */
		page++;
		clock++;
		if (clock >= max_mapnr) {
			clock = 0;
			page = mem_map;
		}
		if (PageSkip(page)) {
			/* next_hash is overloaded for PageSkip */
			page = page->next_hash;
			clock = page - mem_map;
		}

		referenced = test_and_clear_bit(PG_referenced, &page->flags);

		if (PageLocked(page))
			continue;

		if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
			continue;

		/* We can't free pages unless there's just one user */
		if (atomic_read(&page->count) != 1)
			continue;

		count--;

		/*
		 * Is it a page swap page? If so, we want to
		 * drop it if it is no longer used, even if it
		 * were to be marked referenced..
		 */
		if (PageSwapCache(page)) {
			if (referenced && swap_count(page->offset) != 1)
				continue;
			delete_from_swap_cache(page);
			return 1;
		}

		if (referenced)
			continue;

		/* Is it a buffer page? */
		if (page->buffers) {
			if (buffer_under_min())
				continue;
			if (!try_to_free_buffers(page))
				continue;
			return 1;
		}

		/* is it a page-cache page? */
		if (page->inode) {
			if (pgcache_under_min())
				continue;
			remove_inode_page(page);
			return 1;
		}

	} while (count > 0);
	return 0;
}

/*
 * Update a page cache copy, when we're doing a "write()" system call
 * See also "update_vm_cache()".
 */
void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count)
{
	unsigned long offset, len;

	offset = (pos & ~PAGE_MASK);
	pos = pos & PAGE_MASK;
	len = PAGE_SIZE - offset;
	do {
		struct page * page;

		if (len > count)
			len = count;
		page = find_page(inode, pos);
		if (page) {
			wait_on_page(page);
			memcpy((void *) (offset + page_address(page)), buf, len);
			release_page(page);
		}
		count -= len;
		buf += len;
		len = PAGE_SIZE;
		offset = 0;
		pos += PAGE_SIZE;
	} while (count);
}

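/*
 * Illustrative sketch (not part of the original file): how a network
 * filesystem that sends write data straight to its server might use
 * update_vm_cache() above to keep any cached pages coherent with what
 * was just written.  All examplefs_* names are hypothetical; note that
 * update_vm_cache() does a plain memcpy(), so the buffer passed in must
 * be directly addressable from this context.
 */
static ssize_t examplefs_file_write(struct file *file, const char *buf,
				    size_t count, loff_t *ppos)
{
	struct inode *inode = file->f_dentry->d_inode;
	ssize_t result;

	/* Hypothetical helper that transmits the data to the server. */
	result = examplefs_send_write(inode, buf, count, *ppos);
	if (result > 0) {
		/* Mirror the newly written bytes into the page cache. */
		update_vm_cache(inode, *ppos, buf, result);
		*ppos += result;
	}
	return result;
}
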
static inline void add_to_page_cache(struct page * page,
	struct inode * inode, unsigned long offset,
	struct page **hash)
{
	atomic_inc(&page->count);
	page->flags = (page->flags & ~((1 << PG_uptodate) | (1 << PG_error))) | (1 << PG_referenced);
	page->offset = offset;
	add_page_to_inode_queue(inode, page);
	__add_page_to_hash_queue(page, hash);
}

/*
 * Try to read ahead in the file. "page_cache" is a potentially free page
 * that we could use for the cache (if it is 0 we can try to create one,
 * this is all overlapped with the IO on the previous page finishing anyway)
 */
static unsigned long try_to_read_ahead(struct file * file,
				unsigned long offset, unsigned long page_cache)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct page * page;
	struct page ** hash;

	offset &= PAGE_MASK;
	switch (page_cache) {
	case 0:
		page_cache = __get_free_page(GFP_USER);
		if (!page_cache)
			break;
	default:
		if (offset >= inode->i_size)
			break;
		hash = page_hash(inode, offset);
		page = __find_page(inode, offset, *hash);
		if (!page) {
			/*
			 * Ok, add the new page to the hash-queues...
			 */
			page = mem_map + MAP_NR(page_cache);
			add_to_page_cache(page, inode, offset, hash);
			inode->i_op->readpage(file, page);
			page_cache = 0;
		}
		release_page(page);
	}
	return page_cache;
}

/*
 * Wait for IO to complete on a locked page.
 *
 * This must be called with the caller "holding" the page,
 * ie with increased "page->count" so that the page won't
 * go away during the wait..
 */
void __wait_on_page(struct page *page)
{
	struct task_struct *tsk = current;
	struct wait_queue wait;

	wait.task = tsk;
	add_wait_queue(&page->wait, &wait);
repeat:
	tsk->state = TASK_UNINTERRUPTIBLE;
	run_task_queue(&tq_disk);
	if (PageLocked(page)) {
		schedule();
		goto repeat;
	}
	tsk->state = TASK_RUNNING;
	remove_wait_queue(&page->wait, &wait);
}

#if 0
#define PROFILE_READAHEAD
#define DEBUG_READAHEAD
#endif

/*
 * Read-ahead profiling information
 * --------------------------------
 * Every PROFILE_MAXREADCOUNT, the following information is written
 * to the syslog:
 *   Percentage of asynchronous read-ahead.
 *   Average of read-ahead fields context value.
 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
 * to the syslog.
 */

#ifdef PROFILE_READAHEAD

#define PROFILE_MAXREADCOUNT 1000

static unsigned long total_reada;
static unsigned long total_async;
static unsigned long total_ramax;
static unsigned long total_ralen;
static unsigned long total_rawin;

static void profile_readahead(int async, struct file *filp)
{
	unsigned long flags;

	++total_reada;
	if (async)
		++total_async;

	total_ramax += filp->f_ramax;
	total_ralen += filp->f_ralen;
	total_rawin += filp->f_rawin;

	if (total_reada > PROFILE_MAXREADCOUNT) {
		save_flags(flags);
		cli();
		if (!(total_reada > PROFILE_MAXREADCOUNT)) {
			restore_flags(flags);
			return;
		}

		printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
			total_ramax/total_reada,
			total_ralen/total_reada,
			total_rawin/total_reada,
			(total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
		printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
			filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
#endif

		total_reada = 0;
		total_async = 0;
		total_ramax = 0;
		total_ralen = 0;
		total_rawin = 0;

		restore_flags(flags);
	}
}
#endif  /* defined PROFILE_READAHEAD */

/*
 * Read-ahead context:
 * -------------------
 * The read ahead context fields of the "struct file" are the following:
 * - f_raend : position of the first byte after the last page we tried to
 *	       read ahead.
 * - f_ramax : current read-ahead maximum size.
 * - f_ralen : length of the current IO read block we tried to read-ahead.
 * - f_rawin : length of the current read-ahead window.
 *		if the last read-ahead was synchronous then
 *			f_rawin = f_ralen
 *		otherwise (it was asynchronous)
 *			f_rawin = previous value of f_ralen + f_ralen
 *
 * Read-ahead limits:
 * ------------------
 * MIN_READAHEAD   : minimum read-ahead size when doing read-ahead.
 * MAX_READAHEAD   : maximum read-ahead size when doing read-ahead.
 *
 * Synchronous read-ahead benefits:
 * --------------------------------
 * Using a reasonable IO xfer length for peripheral devices increases system
 * performance.
 * Reasonable means, in this context, not too large but not too small.
 * The actual maximum value is:
 *	MAX_READAHEAD + PAGE_SIZE = 76k if CONFIG_READA_SMALL is undefined,
 *	and 32K if defined (4K page size assumed).
 *
 * Asynchronous read-ahead benefits:
 * ---------------------------------
 * Overlapping the next read request with user process execution increases
 * system performance.
 *
 * Read-ahead risks:
 * -----------------
 * We have to guess which further data are needed by the user process.
 * If these data are often not really needed, it's bad for system
 * performance.
 * However, we know that files are often accessed sequentially by
 * application programs, and it seems that it is possible to have some good
 * strategy in that guessing.
 * We only try to read-ahead files that seem to be read sequentially.
 *
 * Asynchronous read-ahead risks:
 * ------------------------------
 * In order to maximize overlapping, we must start some asynchronous read
 * request from the device as soon as possible.
 * We must be very careful about:
 * - The number of effective pending IO read requests.
 *   ONE seems to be the only reasonable value.
 * - The total memory pool usage for the file access stream.
 *   This maximum memory usage is implicitly 2 IO read chunks:
 *   2*(MAX_READAHEAD + PAGE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
 *   64k if defined (4K page size assumed).
 */

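/*
 * Worked example (added for illustration, not in the original file),
 * assuming a 4K page size: suppose the previous read-ahead left
 * f_ralen = 32k.  If the next read-ahead is asynchronous and ends with
 * f_ralen = 64k, then f_rawin = previous f_ralen + new f_ralen
 * = 32k + 64k = 96k.  Had that read-ahead been synchronous instead,
 * f_rawin would simply equal the new f_ralen, i.e. 64k.
 */
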
static inline int get_max_readahead(struct inode * inode)
{
	if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
		return MAX_READAHEAD;
	return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
}

static inline unsigned long generic_file_readahead(int reada_ok,
	struct file * filp, struct inode * inode,
	unsigned long ppos, struct page * page, unsigned long page_cache)
{
	unsigned long max_ahead, ahead;
	unsigned long raend;
	int max_readahead = get_max_readahead(inode);

	raend = filp->f_raend & PAGE_MASK;
	max_ahead = 0;

/*
 * The current page is locked.
 * If the current position is inside the previous read IO request, do not
 * try to reread previously read ahead pages.
 * Otherwise decide whether or not to read ahead some pages synchronously.
 * If we are not going to read ahead, set the read ahead context for this
 * page only.
 */
	if (PageLocked(page)) {
		if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
			raend = ppos;
			if (raend < inode->i_size)
				max_ahead = filp->f_ramax;
			filp->f_rawin = 0;
			filp->f_ralen = PAGE_SIZE;
			if (!max_ahead) {
				filp->f_raend = ppos + filp->f_ralen;
				filp->f_rawin += filp->f_ralen;
			}
		}
	}
/*
 * The current page is not locked.
 * If we were reading ahead and,
 * if the current max read ahead size is not zero and,
 * if the current position is inside the last read-ahead IO request,
 *   it is the moment to try to read ahead asynchronously.
 * We will later force unplug of the device in order to force asynchronous
 * read IO.
 */
	else if (reada_ok && filp->f_ramax && raend >= PAGE_SIZE &&
		 ppos <= raend && ppos + filp->f_ralen >= raend) {
/*
 * Add ONE page to max_ahead in order to try to have about the same IO max size
 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_SIZE.
 * Compute the position of the last page we have tried to read in order to
 * begin to read ahead just at the next page.
 */
		raend -= PAGE_SIZE;
		if (raend < inode->i_size)
			max_ahead = filp->f_ramax + PAGE_SIZE;

		if (max_ahead) {
			filp->f_rawin = filp->f_ralen;
			filp->f_ralen = 0;
			reada_ok = 2;
		}
	}
/*
 * Try to read ahead pages.
 * We hope that ll_rw_blk() plug/unplug, coalescence, request sorting and the
 * scheduler will work well enough for us to avoid too many bad actual IO
 * requests.
 */
	ahead = 0;
	while (ahead < max_ahead) {
		ahead += PAGE_SIZE;
		page_cache = try_to_read_ahead(filp, raend + ahead,
						page_cache);
	}
/*
 * If we tried to read ahead some pages,
 * if we tried to read ahead asynchronously,
 *   try to force unplug of the device in order to start an asynchronous
 *   read IO request.
 * Update the read-ahead context.
 * Store the length of the current read-ahead window.
 * Double the current max read ahead size.
 *   That heuristic avoids doing large IO for files that are not really
 *   accessed sequentially.
 */
	if (ahead) {
		if (reada_ok == 2) {
			run_task_queue(&tq_disk);
		}

		filp->f_ralen += ahead;
		filp->f_rawin += filp->f_ralen;
		filp->f_raend = raend + ahead + PAGE_SIZE;

		filp->f_ramax += filp->f_ramax;

		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;

#ifdef PROFILE_READAHEAD
		profile_readahead((reada_ok == 2), filp);
#endif
	}

	return page_cache;
}

536 * "descriptor" for what we're up to with a read.
537 * This allows us to use the same read code yet
538 * have multiple different users of the data that
539 * we read from a file.
541 * The simplest case just copies the data to user
542 * mode.
544 typedef struct {
545 size_t written;
546 size_t count;
547 char * buf;
548 int error;
549 } read_descriptor_t;
551 typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long);
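/*
 * Illustrative sketch (not part of the original file): a third possible
 * actor besides file_read_actor and file_send_actor below.  It consumes
 * the data without copying it anywhere, e.g. to build a trivial
 * checksum; the name and the checksum variable are hypothetical.  Every
 * actor must honour the same contract: use at most "size" bytes,
 * account for them in the descriptor, and return how many were used.
 */
static unsigned long example_checksum;

static int checksum_actor(read_descriptor_t * desc, const char *area, unsigned long size)
{
	unsigned long i;
	unsigned long count = desc->count;

	if (size > count)
		size = count;
	for (i = 0; i < size; i++)
		example_checksum += (unsigned char) area[i];
	desc->count = count - size;
	desc->written += size;
	/* desc->buf is unused by this actor */
	return size;
}
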
/*
 * This is a generic file read routine, and uses the
 * inode->i_op->readpage() function for the actual low-level
 * stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 */
static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
{
	struct dentry *dentry = filp->f_dentry;
	struct inode *inode = dentry->d_inode;
	size_t pos, pgpos, page_cache;
	int reada_ok;
	int max_readahead = get_max_readahead(inode);

	page_cache = 0;

	pos = *ppos;
	pgpos = pos & PAGE_MASK;
/*
 * If the current position is outside the previous read-ahead window,
 * we reset the current read-ahead context and set read ahead max to zero
 * (it will be set to just the needed value later),
 * otherwise, we assume that the file accesses are sequential enough to
 * continue read-ahead.
 */
	if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) {
		reada_ok = 0;
		filp->f_raend = 0;
		filp->f_ralen = 0;
		filp->f_ramax = 0;
		filp->f_rawin = 0;
	} else {
		reada_ok = 1;
	}
/*
 * Adjust the current value of read-ahead max.
 * If the read operation stays in the first half of the page, force no readahead.
 * Otherwise try to increase read ahead max just enough to do the read request.
 * Then, at least MIN_READAHEAD if read ahead is ok,
 * and at most MAX_READAHEAD in all cases.
 */
	if (pos + desc->count <= (PAGE_SIZE >> 1)) {
		filp->f_ramax = 0;
	} else {
		unsigned long needed;

		needed = ((pos + desc->count) & PAGE_MASK) - pgpos;

		if (filp->f_ramax < needed)
			filp->f_ramax = needed;

		if (reada_ok && filp->f_ramax < MIN_READAHEAD)
			filp->f_ramax = MIN_READAHEAD;
		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;
	}

	for (;;) {
		struct page *page, **hash;

		if (pos >= inode->i_size)
			break;

		/*
		 * Try to find the data in the page cache..
		 */
		hash = page_hash(inode, pos & PAGE_MASK);
		page = __find_page(inode, pos & PAGE_MASK, *hash);
		if (!page)
			goto no_cached_page;

found_page:
/*
 * Try to read ahead only if the current page is filled or being filled.
 * Otherwise, if we were reading ahead, decrease max read ahead size to
 * the minimum value.
 * In this context, that seems to happen only on some read error or if
 * the page has been rewritten.
 */
		if (PageUptodate(page) || PageLocked(page))
			page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_MASK, page, page_cache);
		else if (reada_ok && filp->f_ramax > MIN_READAHEAD)
			filp->f_ramax = MIN_READAHEAD;

		wait_on_page(page);

		if (!PageUptodate(page))
			goto page_read_error;

success:
		/*
		 * Ok, we have the page, it's up-to-date and ok,
		 * so now we can finally copy it to user space...
		 */
		{
			unsigned long offset, nr;

			offset = pos & ~PAGE_MASK;
			nr = PAGE_SIZE - offset;
			if (nr > inode->i_size - pos)
				nr = inode->i_size - pos;

			/*
			 * The actor routine returns how many bytes were actually used..
			 * NOTE! This may not be the same as how much of a user buffer
			 * we filled up (we may be padding etc), so we can only update
			 * "pos" here (the actor routine has to update the user buffer
			 * pointers and the remaining count).
			 */
			nr = actor(desc, (const char *) (page_address(page) + offset), nr);
			pos += nr;
			release_page(page);
			if (nr && desc->count)
				continue;
			break;
		}

no_cached_page:
		/*
		 * Ok, it wasn't cached, so we need to create a new
		 * page..
		 */
		if (!page_cache) {
			page_cache = __get_free_page(GFP_USER);
			/*
			 * That could have slept, so go around to the
			 * very beginning..
			 */
			if (page_cache)
				continue;
			desc->error = -ENOMEM;
			break;
		}

		/*
		 * Ok, add the new page to the hash-queues...
		 */
		page = mem_map + MAP_NR(page_cache);
		page_cache = 0;
		add_to_page_cache(page, inode, pos & PAGE_MASK, hash);

		/*
		 * Error handling is tricky. If we get a read error,
		 * the cached page stays in the cache (but uptodate=0),
		 * and the next process that accesses it will try to
		 * re-read it. This is needed for NFS etc, where the
		 * identity of the reader can decide if we can read the
		 * page or not..
		 */
/*
 * We have to read the page.
 * If we were reading ahead, we had previously tried to read this page.
 * That means that the page has probably been removed from the cache before
 * the application process needs it, or has been rewritten.
 * Decrease max readahead size to the minimum value in that situation.
 */
		if (reada_ok && filp->f_ramax > MIN_READAHEAD)
			filp->f_ramax = MIN_READAHEAD;

		{
			int error = inode->i_op->readpage(filp, page);
			if (!error)
				goto found_page;
			desc->error = error;
			release_page(page);
			break;
		}

page_read_error:
		/*
		 * We found the page, but it wasn't up-to-date.
		 * Try to re-read it _once_. We do this synchronously,
		 * because this happens only if there were errors.
		 */
		{
			int error = inode->i_op->readpage(filp, page);
			if (!error) {
				wait_on_page(page);
				if (PageUptodate(page) && !PageError(page))
					goto success;
				error = -EIO; /* Some unspecified error occurred.. */
			}
			desc->error = error;
			release_page(page);
			break;
		}
	}

	*ppos = pos;
	filp->f_reada = 1;
	if (page_cache)
		free_page(page_cache);
	UPDATE_ATIME(inode);
}

static int file_read_actor(read_descriptor_t * desc, const char *area, unsigned long size)
{
	unsigned long left;
	unsigned long count = desc->count;

	if (size > count)
		size = count;
	left = __copy_to_user(desc->buf, area, size);
	if (left) {
		size -= left;
		desc->error = -EFAULT;
	}
	desc->count = count - size;
	desc->written += size;
	desc->buf += size;
	return size;
}

/*
 * This is the "read()" routine for all filesystems
 * that can use the page cache directly.
 */
ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
{
	ssize_t retval;

	retval = -EFAULT;
	if (access_ok(VERIFY_WRITE, buf, count)) {
		retval = 0;
		if (count) {
			read_descriptor_t desc;

			desc.written = 0;
			desc.count = count;
			desc.buf = buf;
			desc.error = 0;
			do_generic_file_read(filp, ppos, &desc, file_read_actor);

			retval = desc.written;
			if (!retval)
				retval = desc.error;
		}
	}
	return retval;
}

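/*
 * Illustrative sketch (not part of the original file): a filesystem that
 * can use the page cache directly simply points its file_operations read
 * method at generic_file_read above (and typically its mmap method at
 * generic_file_mmap, defined later in this file).  Designated
 * initializers are used here only for brevity; 2.2-era code filled the
 * structure positionally.  The examplefs_ name is hypothetical.
 */
static struct file_operations examplefs_file_operations = {
	.read	= generic_file_read,	/* page-cache based read */
	.mmap	= generic_file_mmap,	/* defined later in this file */
	/* remaining methods (write, open, release, ...) are filesystem specific */
};
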
static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned long size)
{
	ssize_t written;
	unsigned long count = desc->count;
	struct file *file = (struct file *) desc->buf;
	struct inode *inode = file->f_dentry->d_inode;
	mm_segment_t old_fs;

	if (size > count)
		size = count;
	down(&inode->i_sem);
	old_fs = get_fs();
	set_fs(KERNEL_DS);
	written = file->f_op->write(file, area, size, &file->f_pos);
	set_fs(old_fs);
	up(&inode->i_sem);
	if (written < 0) {
		desc->error = written;
		written = 0;
	}
	desc->count = count - written;
	desc->written += written;
	return written;
}

asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
{
	ssize_t retval;
	struct file * in_file, * out_file;
	struct inode * in_inode, * out_inode;

	lock_kernel();

	/*
	 * Get input file, and verify that it is ok..
	 */
	retval = -EBADF;
	in_file = fget(in_fd);
	if (!in_file)
		goto out;
	if (!(in_file->f_mode & FMODE_READ))
		goto fput_in;
	retval = -EINVAL;
	in_inode = in_file->f_dentry->d_inode;
	if (!in_inode)
		goto fput_in;
	if (!in_inode->i_op || !in_inode->i_op->readpage)
		goto fput_in;
	retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
	if (retval)
		goto fput_in;

	/*
	 * Get output file, and verify that it is ok..
	 */
	retval = -EBADF;
	out_file = fget(out_fd);
	if (!out_file)
		goto fput_in;
	if (!(out_file->f_mode & FMODE_WRITE))
		goto fput_out;
	retval = -EINVAL;
	if (!out_file->f_op || !out_file->f_op->write)
		goto fput_out;
	out_inode = out_file->f_dentry->d_inode;
	if (!out_inode)
		goto fput_out;
	retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
	if (retval)
		goto fput_out;

	retval = 0;
	if (count) {
		read_descriptor_t desc;
		loff_t pos = 0, *ppos;

		retval = -EFAULT;
		ppos = &in_file->f_pos;
		if (offset) {
			if (get_user(pos, offset))
				goto fput_out;
			ppos = &pos;
		}

		desc.written = 0;
		desc.count = count;
		desc.buf = (char *) out_file;
		desc.error = 0;
		do_generic_file_read(in_file, ppos, &desc, file_send_actor);

		retval = desc.written;
		if (!retval)
			retval = desc.error;
		if (offset)
			put_user(pos, offset);
	}

fput_out:
	fput(out_file);
fput_in:
	fput(in_file);
out:
	unlock_kernel();
	return retval;
}

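/*
 * Illustrative user-space sketch (not part of the original file): how the
 * system call implemented above is typically used to copy a regular file
 * to a socket.  in_fd must refer to something with a readpage operation,
 * while out_fd only needs a write method.  The #if 0 keeps this
 * user-space fragment out of the kernel build.
 */
#if 0	/* user-space example, not kernel code */
#include <sys/sendfile.h>

static int send_whole_file(int sock_fd, int file_fd, size_t file_size)
{
	off_t offset = 0;

	while ((size_t) offset < file_size) {
		ssize_t n = sendfile(sock_fd, file_fd, &offset,
				     file_size - (size_t) offset);
		if (n <= 0)
			return -1;	/* error (or unexpected EOF); see errno */
	}
	return 0;
}
#endif
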
/*
 * Semantics for shared and private memory areas are different past the end
 * of the file. A shared mapping past the last page of the file is an error
 * and results in a SIGBUS, while a private mapping just maps in a zero page.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 *
 * WSH 06/04/97: fixed a memory leak and moved the allocation of new_page
 * ahead of the wait if we're sure to need it.
 */
static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
{
	struct file * file = area->vm_file;
	struct dentry * dentry = file->f_dentry;
	struct inode * inode = dentry->d_inode;
	unsigned long offset, reada, i;
	struct page * page, **hash;
	unsigned long old_page, new_page;

	new_page = 0;
	offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
	if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
		goto no_page;

	/*
	 * Do we have something in the page cache already?
	 */
	hash = page_hash(inode, offset);
	page = __find_page(inode, offset, *hash);
	if (!page)
		goto no_cached_page;

found_page:
	/*
	 * Ok, found a page in the page cache, now we need to check
	 * that it's up-to-date. First check whether we'll need an
	 * extra page -- better to overlap the allocation with the I/O.
	 */
	if (no_share && !new_page) {
		new_page = __get_free_page(GFP_USER);
		if (!new_page)
			goto failure;
	}

	if (PageLocked(page))
		goto page_locked_wait;
	if (!PageUptodate(page))
		goto page_read_error;

success:
	/*
	 * Found the page, need to check sharing and possibly
	 * copy it over to another page..
	 */
	old_page = page_address(page);
	if (!no_share) {
		/*
		 * Ok, we can share the cached page directly.. Get rid
		 * of any potential extra pages.
		 */
		if (new_page)
			free_page(new_page);

		flush_page_to_ram(old_page);
		return old_page;
	}

	/*
	 * No sharing ... copy to the new page.
	 */
	copy_page(new_page, old_page);
	flush_page_to_ram(new_page);
	release_page(page);
	return new_page;

no_cached_page:
	/*
	 * Try to read in an entire cluster at once.
	 */
	reada = offset;
	reada >>= PAGE_SHIFT + page_cluster;
	reada <<= PAGE_SHIFT + page_cluster;

	for (i = 1 << page_cluster; i > 0; --i, reada += PAGE_SIZE)
		new_page = try_to_read_ahead(file, reada, new_page);

	if (!new_page)
		new_page = __get_free_page(GFP_USER);
	if (!new_page)
		goto no_page;

	/*
	 * During getting the above page we might have slept,
	 * so we need to re-check the situation with the page
	 * cache.. The page we just got may be useful if we
	 * can't share, so don't get rid of it here.
	 */
	page = find_page(inode, offset);
	if (page)
		goto found_page;

	/*
	 * Now, create a new page-cache page from the page we got
	 */
	page = mem_map + MAP_NR(new_page);
	new_page = 0;
	add_to_page_cache(page, inode, offset, hash);

	if (inode->i_op->readpage(file, page) != 0)
		goto failure;

	goto found_page;

page_locked_wait:
	__wait_on_page(page);
	if (PageUptodate(page))
		goto success;

page_read_error:
	/*
	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	 */
	if (inode->i_op->readpage(file, page) != 0)
		goto failure;
	wait_on_page(page);
	if (PageError(page))
		goto failure;
	if (PageUptodate(page))
		goto success;

	/*
	 * Things didn't work out. Return zero to tell the
	 * mm layer so, possibly freeing the page cache page first.
	 */
failure:
	release_page(page);
	if (new_page)
		free_page(new_page);
no_page:
	return 0;
}

/*
 * Tries to write a shared mapped page to its backing store. May return -EIO
 * if the disk is full.
 */
static inline int do_write_page(struct inode * inode, struct file * file,
	const char * page, unsigned long offset)
{
	int retval;
	unsigned long size;
	loff_t loff = offset;
	mm_segment_t old_fs;

	size = offset + PAGE_SIZE;
	/* refuse to extend file size.. */
	if (S_ISREG(inode->i_mode)) {
		if (size > inode->i_size)
			size = inode->i_size;
		/* Ho humm.. We should have tested for this earlier */
		if (size < offset)
			return -EIO;
	}
	size -= offset;
	old_fs = get_fs();
	set_fs(KERNEL_DS);
	retval = -EIO;
	if (size == file->f_op->write(file, (const char *) page, size, &loff))
		retval = 0;
	set_fs(old_fs);
	return retval;
}

static int filemap_write_page(struct vm_area_struct * vma,
	unsigned long offset,
	unsigned long page)
{
	int result;
	struct file * file;
	struct dentry * dentry;
	struct inode * inode;

	file = vma->vm_file;
	dentry = file->f_dentry;
	inode = dentry->d_inode;
	if (!file->f_op->write)
		return -EIO;

	/*
	 * If a task terminates while we're swapping the page, the vma and
	 * file could be released ... increment the count to be safe.
	 */
	file->f_count++;
	down(&inode->i_sem);
	result = do_write_page(inode, file, (const char *) page, offset);
	up(&inode->i_sem);
	fput(file);
	return result;
}

/*
 * The page cache takes care of races between somebody
 * trying to swap something out and swap something in
 * at the same time..
 */
int filemap_swapout(struct vm_area_struct * vma, struct page * page)
{
	return filemap_write_page(vma, page->offset, page_address(page));
}

static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
	unsigned long address, unsigned int flags)
{
	pte_t pte = *ptep;
	unsigned long page;
	int error;

	if (!(flags & MS_INVALIDATE)) {
		if (!pte_present(pte))
			return 0;
		if (!pte_dirty(pte))
			return 0;
		flush_page_to_ram(pte_page(pte));
		flush_cache_page(vma, address);
		set_pte(ptep, pte_mkclean(pte));
		flush_tlb_page(vma, address);
		page = pte_page(pte);
		atomic_inc(&mem_map[MAP_NR(page)].count);
	} else {
		if (pte_none(pte))
			return 0;
		flush_cache_page(vma, address);
		pte_clear(ptep);
		flush_tlb_page(vma, address);
		if (!pte_present(pte)) {
			swap_free(pte_val(pte));
			return 0;
		}
		page = pte_page(pte);
		if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
			free_page(page);
			return 0;
		}
	}
	error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
	free_page(page);
	return error;
}

static inline int filemap_sync_pte_range(pmd_t * pmd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
{
	pte_t * pte;
	unsigned long end;
	int error;

	if (pmd_none(*pmd))
		return 0;
	if (pmd_bad(*pmd)) {
		printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
		pmd_clear(pmd);
		return 0;
	}
	pte = pte_offset(pmd, address);
	offset += address & PMD_MASK;
	address &= ~PMD_MASK;
	end = address + size;
	if (end > PMD_SIZE)
		end = PMD_SIZE;
	error = 0;
	do {
		error |= filemap_sync_pte(pte, vma, address + offset, flags);
		address += PAGE_SIZE;
		pte++;
	} while (address < end);
	return error;
}

static inline int filemap_sync_pmd_range(pgd_t * pgd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned int flags)
{
	pmd_t * pmd;
	unsigned long offset, end;
	int error;

	if (pgd_none(*pgd))
		return 0;
	if (pgd_bad(*pgd)) {
		printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
		pgd_clear(pgd);
		return 0;
	}
	pmd = pmd_offset(pgd, address);
	offset = address & PGDIR_MASK;
	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		end = PGDIR_SIZE;
	error = 0;
	do {
		error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return error;
}

static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
	size_t size, unsigned int flags)
{
	pgd_t * dir;
	unsigned long end = address + size;
	int error = 0;

	dir = pgd_offset(vma->vm_mm, address);
	flush_cache_range(vma->vm_mm, end - size, end);
	while (address < end) {
		error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		dir++;
	}
	flush_tlb_range(vma->vm_mm, end - size, end);
	return error;
}

/*
 * This handles (potentially partial) area unmaps..
 */
static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
{
	filemap_sync(vma, start, len, MS_ASYNC);
}

/*
 * Shared mappings need to be able to do the right thing at
 * close/unmap/sync. They will also use the private file as
 * backing-store for swapping..
 */
static struct vm_operations_struct file_shared_mmap = {
	NULL,			/* no special open */
	NULL,			/* no special close */
	filemap_unmap,		/* unmap - we need to sync the pages */
	NULL,			/* no special protect */
	filemap_sync,		/* sync */
	NULL,			/* advise */
	filemap_nopage,		/* nopage */
	NULL,			/* wppage */
	filemap_swapout,	/* swapout */
	NULL,			/* swapin */
};

/*
 * Private mappings just need to be able to load in the map.
 *
 * (This is actually used for shared mappings as well, if we
 * know they can't ever get write permissions..)
 */
static struct vm_operations_struct file_private_mmap = {
	NULL,			/* open */
	NULL,			/* close */
	NULL,			/* unmap */
	NULL,			/* protect */
	NULL,			/* sync */
	NULL,			/* advise */
	filemap_nopage,		/* nopage */
	NULL,			/* wppage */
	NULL,			/* swapout */
	NULL,			/* swapin */
};

/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	struct vm_operations_struct * ops;
	struct inode *inode = file->f_dentry->d_inode;

	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
		ops = &file_shared_mmap;
		/* share_page() can only guarantee proper page sharing if
		 * the offsets are all page aligned. */
		if (vma->vm_offset & (PAGE_SIZE - 1))
			return -EINVAL;
	} else {
		ops = &file_private_mmap;
		if (vma->vm_offset & (inode->i_sb->s_blocksize - 1))
			return -EINVAL;
	}
	if (!inode->i_sb || !S_ISREG(inode->i_mode))
		return -EACCES;
	if (!inode->i_op || !inode->i_op->readpage)
		return -ENOEXEC;
	UPDATE_ATIME(inode);
	vma->vm_file = file;
	file->f_count++;
	vma->vm_ops = ops;
	return 0;
}

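/*
 * Illustrative user-space sketch (not part of the original file): mapping a
 * regular file with a writable shared mapping, the case routed to
 * file_shared_mmap above.  generic_file_mmap() returns -EACCES for
 * non-regular files, -ENOEXEC if the filesystem has no readpage(), and
 * -EINVAL if a writable shared mapping uses a non-page-aligned offset.
 */
#if 0	/* user-space example, not kernel code */
#include <sys/mman.h>

static char *map_shared_rw(int fd, size_t len)
{
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	return (p == MAP_FAILED) ? NULL : (char *) p;
}
#endif
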
/*
 * The msync() system call.
 */

static int msync_interval(struct vm_area_struct * vma,
	unsigned long start, unsigned long end, int flags)
{
	if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
		int error;
		error = vma->vm_ops->sync(vma, start, end-start, flags);
		if (!error && (flags & MS_SYNC)) {
			struct file * file = vma->vm_file;
			if (file) {
				struct dentry * dentry = file->f_dentry;
				struct inode * inode = dentry->d_inode;
				down(&inode->i_sem);
				error = file_fsync(file, dentry);
				up(&inode->i_sem);
			}
		}
		return error;
	}
	return 0;
}

asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
{
	unsigned long end;
	struct vm_area_struct * vma;
	int unmapped_error, error = -EINVAL;

	down(&current->mm->mmap_sem);
	lock_kernel();
	if (start & ~PAGE_MASK)
		goto out;
	len = (len + ~PAGE_MASK) & PAGE_MASK;
	end = start + len;
	if (end < start)
		goto out;
	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
		goto out;
	error = 0;
	if (end == start)
		goto out;
	/*
	 * If the interval [start,end) covers some unmapped address ranges,
	 * just ignore them, but return -EFAULT at the end.
	 */
	vma = find_vma(current->mm, start);
	unmapped_error = 0;
	for (;;) {
		/* Still start < end. */
		error = -EFAULT;
		if (!vma)
			goto out;
		/* Here start < vma->vm_end. */
		if (start < vma->vm_start) {
			unmapped_error = -EFAULT;
			start = vma->vm_start;
		}
		/* Here vma->vm_start <= start < vma->vm_end. */
		if (end <= vma->vm_end) {
			if (start < end) {
				error = msync_interval(vma, start, end, flags);
				if (error)
					goto out;
			}
			error = unmapped_error;
			goto out;
		}
		/* Here vma->vm_start <= start < vma->vm_end < end. */
		error = msync_interval(vma, start, vma->vm_end, flags);
		if (error)
			goto out;
		start = vma->vm_end;
		vma = vma->vm_next;
	}
out:
	unlock_kernel();
	up(&current->mm->mmap_sem);
	return error;
}

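/*
 * Illustrative user-space sketch (not part of the original file): flushing a
 * modified shared file mapping with the system call implemented above.
 * The start address must be page aligned (otherwise -EINVAL), and MS_SYNC
 * additionally fsyncs the file via msync_interval().
 */
#if 0	/* user-space example, not kernel code */
#include <sys/mman.h>

static int flush_mapping(void *page_aligned_start, size_t len)
{
	return msync(page_aligned_start, len, MS_SYNC);
}
#endif
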
/*
 * Write to a file through the page cache. This is mainly for the
 * benefit of NFS and possibly other network-based file systems.
 *
 * We currently put everything into the page cache prior to writing it.
 * This is not a problem when writing full pages. With partial pages,
 * however, we first have to read the data into the cache, then
 * dirty the page, and finally schedule it for writing. Alternatively, we
 * could write-through just the portion of data that would go into that
 * page, but that would kill performance for applications that write data
 * line by line, and it's prone to race conditions.
 *
 * Note that this routine doesn't try to keep track of dirty pages. Each
 * file system has to do this all by itself, unfortunately.
 *							okir@monad.swb.de
 */
ssize_t
generic_file_write(struct file *file, const char *buf,
		   size_t count, loff_t *ppos)
{
	struct dentry *dentry = file->f_dentry;
	struct inode *inode = dentry->d_inode;
	unsigned long pos = *ppos;
	unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
	struct page *page, **hash;
	unsigned long page_cache = 0;
	unsigned long written;
	long status, sync;

	if (!inode->i_op || !inode->i_op->updatepage)
		return -EIO;

	sync = file->f_flags & O_SYNC;
	written = 0;

	if (file->f_flags & O_APPEND)
		pos = inode->i_size;

	/*
	 * Check whether we've reached the file size limit.
	 */
	status = -EFBIG;
	if (pos >= limit) {
		send_sig(SIGXFSZ, current, 0);
		goto out;
	}

	status = 0;
	/*
	 * Check whether to truncate the write,
	 * and send the signal if we do.
	 */
	if (count > limit - pos) {
		send_sig(SIGXFSZ, current, 0);
		count = limit - pos;
	}

	while (count) {
		unsigned long bytes, pgpos, offset;
		/*
		 * Try to find the page in the cache. If it isn't there,
		 * allocate a free page.
		 */
		offset = (pos & ~PAGE_MASK);
		pgpos = pos & PAGE_MASK;
		bytes = PAGE_SIZE - offset;
		if (bytes > count)
			bytes = count;

		hash = page_hash(inode, pgpos);
		page = __find_page(inode, pgpos, *hash);
		if (!page) {
			if (!page_cache) {
				page_cache = __get_free_page(GFP_USER);
				if (page_cache)
					continue;
				status = -ENOMEM;
				break;
			}
			page = mem_map + MAP_NR(page_cache);
			add_to_page_cache(page, inode, pgpos, hash);
			page_cache = 0;
		}

		/* Get exclusive IO access to the page.. */
		wait_on_page(page);
		set_bit(PG_locked, &page->flags);

		/*
		 * Do the real work.. If the writer ends up delaying the write,
		 * the writer needs to increment the page use counts until he
		 * is done with the page.
		 */
		bytes -= copy_from_user((u8*)page_address(page) + offset, buf, bytes);
		status = -EFAULT;
		if (bytes)
			status = inode->i_op->updatepage(file, page, offset, bytes, sync);

		/* Mark it unlocked again and drop the page.. */
		clear_bit(PG_locked, &page->flags);
		wake_up(&page->wait);
		__free_page(page);

		if (status < 0)
			break;

		written += status;
		count -= status;
		pos += status;
		buf += status;
	}
	*ppos = pos;
	if (pos > inode->i_size)
		inode->i_size = pos;

	if (page_cache)
		free_page(page_cache);
out:
	return written ? written : status;
}

/*
 * Support routines for directory caching using the page cache.
 */

/*
 * Finds the page at the specified offset, installing a new page
 * if requested. The count is incremented and the page is locked.
 *
 * Note: we don't have to worry about races here, as the caller
 * is holding the inode semaphore.
 */
unsigned long get_cached_page(struct inode * inode, unsigned long offset,
				int new)
{
	struct page * page;
	struct page ** hash;
	unsigned long page_cache = 0;

	hash = page_hash(inode, offset);
	page = __find_page(inode, offset, *hash);
	if (!page) {
		if (!new)
			goto out;
		page_cache = get_free_page(GFP_USER);
		if (!page_cache)
			goto out;
		page = mem_map + MAP_NR(page_cache);
		add_to_page_cache(page, inode, offset, hash);
	}
	if (atomic_read(&page->count) != 2)
		printk(KERN_ERR "get_cached_page: page count=%d\n",
			atomic_read(&page->count));
	if (test_bit(PG_locked, &page->flags))
		printk(KERN_ERR "get_cached_page: page already locked!\n");
	set_bit(PG_locked, &page->flags);
	page_cache = page_address(page);

out:
	return page_cache;
}

/*
 * Unlock and free a page.
 */
void put_cached_page(unsigned long addr)
{
	struct page * page = mem_map + MAP_NR(addr);

	if (!test_bit(PG_locked, &page->flags))
		printk("put_cached_page: page not locked!\n");
	if (atomic_read(&page->count) != 2)
		printk("put_cached_page: page count=%d\n",
			atomic_read(&page->count));
	clear_bit(PG_locked, &page->flags);
	wake_up(&page->wait);
	__free_page(page);
}

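/*
 * Illustrative sketch (not part of the original file): the intended usage
 * pattern for the two helpers above.  The caller must already hold the
 * inode semaphore, as noted in the get_cached_page() comment; the
 * examplefs_ name is hypothetical.
 */
static int examplefs_touch_dir_page(struct inode * dir, unsigned long offset)
{
	unsigned long addr;

	down(&dir->i_sem);
	addr = get_cached_page(dir, offset, 1);	/* install a page if missing */
	if (addr) {
		/* ... modify the directory block at "addr" here ... */
		put_cached_page(addr);		/* unlock and drop our reference */
	}
	up(&dir->i_sem);
	return addr ? 0 : -ENOMEM;
}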