1 /*
2 * linux/mm/filemap.c
4 * Copyright (C) 1994, 1995 Linus Torvalds
5 */
7 /*
8 * This file handles the generic file mmap semantics used by
9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem used to do this differently, for example)
12 #include <linux/malloc.h>
13 #include <linux/shm.h>
14 #include <linux/mman.h>
15 #include <linux/locks.h>
16 #include <linux/pagemap.h>
17 #include <linux/swap.h>
18 #include <linux/smp_lock.h>
19 #include <linux/blkdev.h>
20 #include <linux/file.h>
21 #include <linux/swapctl.h>
23 #include <asm/pgtable.h>
24 #include <asm/uaccess.h>
27 * Shared mappings implemented 30.11.1994. It's not fully working yet,
28 * though.
30 * Shared mappings now work. 15.8.1995 Bruno.
33 unsigned long page_cache_size = 0;
34 struct page * page_hash_table[PAGE_HASH_SIZE];
37 * Simple routines for both non-shared and shared mappings.
40 #define release_page(page) __free_page((page))
43 * Invalidate the pages of an inode, removing all pages that aren't
44 * locked down (those are sure to be up-to-date anyway, so we shouldn't
45 * invalidate them).
47 void invalidate_inode_pages(struct inode * inode)
49 struct page ** p;
50 struct page * page;
52 p = &inode->i_pages;
53 while ((page = *p) != NULL) {
54 if (PageLocked(page)) {
55 p = &page->next;
56 continue;
58 inode->i_nrpages--;
59 if ((*p = page->next) != NULL)
60 (*p)->prev = page->prev;
61 page->next = NULL;
62 page->prev = NULL;
63 remove_page_from_hash_queue(page);
64 page->inode = NULL;
65 __free_page(page);
66 continue;
71 * Truncate the page cache at a set offset, removing the pages
72 * that are beyond that offset (and zeroing out partial pages).
74 void truncate_inode_pages(struct inode * inode, unsigned long start)
76 struct page ** p;
77 struct page * page;
79 repeat:
80 p = &inode->i_pages;
81 while ((page = *p) != NULL) {
82 unsigned long offset = page->offset;
84 /* page wholly truncated - free it */
85 if (offset >= start) {
86 if (PageLocked(page)) {
87 wait_on_page(page);
88 goto repeat;
90 inode->i_nrpages--;
91 if ((*p = page->next) != NULL)
92 (*p)->prev = page->prev;
93 page->next = NULL;
94 page->prev = NULL;
95 remove_page_from_hash_queue(page);
96 page->inode = NULL;
97 __free_page(page);
98 continue;
100 p = &page->next;
101 offset = start - offset;
102 /* partial truncate, clear end of page */
103 if (offset < PAGE_SIZE) {
104 unsigned long address = page_address(page);
105 memset((void *) (offset + address), 0, PAGE_SIZE - offset);
106 flush_page_to_ram(address);
112 * Remove a page from the page cache and free it.
114 void remove_inode_page(struct page *page)
116 remove_page_from_hash_queue(page);
117 remove_page_from_inode_queue(page);
118 __free_page(page);
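/*
 * Try to reclaim one page for the allocator: sweep the mem_map "clock"
 * hand over up to (num_physpages << 1) >> priority entries and free the
 * first single-user, unreferenced page found in the swap cache, the
 * buffer cache or the page cache.  Returns 1 if a page was reclaimed,
 * 0 otherwise.
 */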
121 int shrink_mmap(int priority, int gfp_mask)
123 static unsigned long clock = 0;
124 unsigned long limit = num_physpages;
125 struct page * page;
126 int count;
128 count = (limit << 1) >> priority;
130 page = mem_map + clock;
131 do {
132 int referenced;
134 /* This works even in the presence of PageSkip because
135 * the first two entries at the beginning of a hole will
136 * be marked, not just the first.
138 page++;
139 clock++;
140 if (clock >= max_mapnr) {
141 clock = 0;
142 page = mem_map;
144 if (PageSkip(page)) {
145 /* next_hash is overloaded for PageSkip */
146 page = page->next_hash;
147 clock = page - mem_map;
150 count--;
151 referenced = test_and_clear_bit(PG_referenced, &page->flags);
153 if (PageLocked(page))
154 continue;
156 if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
157 continue;
159 /* We can't free pages unless there's just one user */
160 if (atomic_read(&page->count) != 1)
161 continue;
164 * Is it a swap-cache page? If so, we want to
165 * drop it if it is no longer used, even if it
166 * has been marked referenced..
168 if (PageSwapCache(page)) {
169 if (referenced && swap_count(page->offset) != 1)
170 continue;
171 delete_from_swap_cache(page);
172 return 1;
175 if (referenced)
176 continue;
178 /* Is it a buffer page? */
179 if (page->buffers) {
180 if (buffer_under_min())
181 continue;
182 if (!try_to_free_buffers(page))
183 continue;
184 return 1;
187 /* is it a page-cache page? */
188 if (page->inode) {
189 if (pgcache_under_min())
190 continue;
191 remove_inode_page(page);
192 return 1;
195 } while (count > 0);
196 return 0;
200 * Update a page-cache copy when we're doing a "write()" system call,
201 * so that the page cache stays coherent with the newly written data.
203 void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count)
205 unsigned long offset, len;
207 offset = (pos & ~PAGE_MASK);
208 pos = pos & PAGE_MASK;
209 len = PAGE_SIZE - offset;
210 do {
211 struct page * page;
213 if (len > count)
214 len = count;
215 page = find_page(inode, pos);
216 if (page) {
217 wait_on_page(page);
218 memcpy((void *) (offset + page_address(page)), buf, len);
219 release_page(page);
221 count -= len;
222 buf += len;
223 len = PAGE_SIZE;
224 offset = 0;
225 pos += PAGE_SIZE;
226 } while (count);
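/*
 * Add a freshly allocated page to the page cache for the given inode
 * and offset: bump its use count, clear the uptodate/error bits, mark
 * it referenced and link it into the inode and hash queues.
 */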
229 static inline void add_to_page_cache(struct page * page,
230 struct inode * inode, unsigned long offset,
231 struct page **hash)
233 atomic_inc(&page->count);
234 page->flags = (page->flags & ~((1 << PG_uptodate) | (1 << PG_error))) | (1 << PG_referenced);
235 page->offset = offset;
236 add_page_to_inode_queue(inode, page);
237 __add_page_to_hash_queue(page, hash);
241 * Try to read ahead in the file. "page_cache" is a potentially free page
242 * that we could use for the cache (if it is 0 we can try to create one,
243 * this is all overlapped with the IO on the previous page finishing anyway)
245 static unsigned long try_to_read_ahead(struct file * file,
246 unsigned long offset, unsigned long page_cache)
248 struct inode *inode = file->f_dentry->d_inode;
249 struct page * page;
250 struct page ** hash;
252 offset &= PAGE_MASK;
253 switch (page_cache) {
254 case 0:
255 page_cache = __get_free_page(GFP_USER);
256 if (!page_cache)
257 break;
258 default:
259 if (offset >= inode->i_size)
260 break;
261 hash = page_hash(inode, offset);
262 page = __find_page(inode, offset, *hash);
263 if (!page) {
265 * Ok, add the new page to the hash-queues...
267 page = mem_map + MAP_NR(page_cache);
268 add_to_page_cache(page, inode, offset, hash);
269 inode->i_op->readpage(file, page);
270 page_cache = 0;
272 release_page(page);
274 return page_cache;
278 * Wait for IO to complete on a locked page.
280 * This must be called with the caller "holding" the page,
281 * ie with increased "page->count" so that the page won't
282 * go away during the wait..
284 void __wait_on_page(struct page *page)
286 struct task_struct *tsk = current;
287 struct wait_queue wait;
289 wait.task = tsk;
290 add_wait_queue(&page->wait, &wait);
291 repeat:
292 tsk->state = TASK_UNINTERRUPTIBLE;
293 run_task_queue(&tq_disk);
294 if (PageLocked(page)) {
295 schedule();
296 goto repeat;
298 tsk->state = TASK_RUNNING;
299 remove_wait_queue(&page->wait, &wait);
302 #if 0
303 #define PROFILE_READAHEAD
304 #define DEBUG_READAHEAD
305 #endif
308 * Read-ahead profiling information
309 * --------------------------------
310 * Every PROFILE_MAXREADCOUNT reads, the following information is written
311 * to the syslog:
312 * Percentage of asynchronous read-ahead.
313 * Average values of the read-ahead context fields.
314 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
315 * to the syslog.
318 #ifdef PROFILE_READAHEAD
320 #define PROFILE_MAXREADCOUNT 1000
322 static unsigned long total_reada;
323 static unsigned long total_async;
324 static unsigned long total_ramax;
325 static unsigned long total_ralen;
326 static unsigned long total_rawin;
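/*
 * Accumulate read-ahead statistics and dump the averages to the syslog
 * once every PROFILE_MAXREADCOUNT profiled reads.
 */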
328 static void profile_readahead(int async, struct file *filp)
330 unsigned long flags;
332 ++total_reada;
333 if (async)
334 ++total_async;
336 total_ramax += filp->f_ramax;
337 total_ralen += filp->f_ralen;
338 total_rawin += filp->f_rawin;
340 if (total_reada > PROFILE_MAXREADCOUNT) {
341 save_flags(flags);
342 cli();
343 if (!(total_reada > PROFILE_MAXREADCOUNT)) {
344 restore_flags(flags);
345 return;
348 printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
349 total_ramax/total_reada,
350 total_ralen/total_reada,
351 total_rawin/total_reada,
352 (total_async*100)/total_reada);
353 #ifdef DEBUG_READAHEAD
354 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
355 filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
356 #endif
358 total_reada = 0;
359 total_async = 0;
360 total_ramax = 0;
361 total_ralen = 0;
362 total_rawin = 0;
364 restore_flags(flags);
367 #endif /* defined PROFILE_READAHEAD */
370 * Read-ahead context:
371 * -------------------
372 * The read ahead context fields of the "struct file" are the following:
373 * - f_raend : position of the first byte after the last page we tried to
374 * read ahead.
375 * - f_ramax : current read-ahead maximum size.
376 * - f_ralen : length of the current IO read block we tried to read-ahead.
377 * - f_rawin : length of the current read-ahead window.
378 * if last read-ahead was synchronous then
379 * f_rawin = f_ralen
380 * otherwise (was asynchronous)
381 * f_rawin = previous value of f_ralen + f_ralen
383 * Read-ahead limits:
384 * ------------------
385 * MIN_READAHEAD : minimum read-ahead size when reading ahead.
386 * MAX_READAHEAD : maximum read-ahead size when reading ahead.
388 * Synchronous read-ahead benefits:
389 * --------------------------------
390 * Using a reasonable IO transfer length from peripheral devices increases
391 * system performance.
392 * Reasonable means, in this context, not too large but not too small.
393 * The actual maximum value is:
394 * MAX_READAHEAD + PAGE_SIZE = 76k if CONFIG_READA_SMALL is undefined
395 * and 32k if defined (4K page size assumed).
397 * Asynchronous read-ahead benefits:
398 * ---------------------------------
399 * Overlapping the next read request with user process execution increases
400 * system performance.
402 * Read-ahead risks:
403 * -----------------
404 * We have to guess which further data are needed by the user process.
405 * If these data are often not really needed, it's bad for system
406 * performance.
407 * However, we know that files are often accessed sequentially by
408 * application programs, so it seems possible to have a reasonably good
409 * strategy for that guessing.
410 * We only try to read ahead files that seem to be read sequentially.
412 * Asynchronous read-ahead risks:
413 * ------------------------------
414 * In order to maximize overlapping, we must start an asynchronous read
415 * request on the device as soon as possible.
416 * We must be very careful about:
417 * - The number of effective pending IO read requests.
418 * ONE seems to be the only reasonable value.
419 * - The total memory pool usage for the file access stream.
420 * This maximum memory usage is implicitly 2 IO read chunks:
421 * 2*(MAX_READAHEAD + PAGE_SIZE) = 152k if CONFIG_READA_SMALL is undefined,
422 * 64k if defined (4K page size assumed).
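/*
 * Return the maximum read-ahead size for this inode's device: the
 * per-device value from max_readahead[][] if one is registered,
 * otherwise the MAX_READAHEAD default.
 */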
425 static inline int get_max_readahead(struct inode * inode)
427 if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
428 return MAX_READAHEAD;
429 return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
432 static inline unsigned long generic_file_readahead(int reada_ok,
433 struct file * filp, struct inode * inode,
434 unsigned long ppos, struct page * page, unsigned long page_cache)
436 unsigned long max_ahead, ahead;
437 unsigned long raend;
438 int max_readahead = get_max_readahead(inode);
440 raend = filp->f_raend & PAGE_MASK;
441 max_ahead = 0;
444 * The current page is locked.
445 * If the current position is inside the previous read IO request, do not
446 * try to reread previously read ahead pages.
447 * Otherwise decide whether or not to read ahead some pages synchronously.
448 * If we are not going to read ahead, set the read ahead context for this
449 * page only.
451 if (PageLocked(page)) {
452 if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
453 raend = ppos;
454 if (raend < inode->i_size)
455 max_ahead = filp->f_ramax;
456 filp->f_rawin = 0;
457 filp->f_ralen = PAGE_SIZE;
458 if (!max_ahead) {
459 filp->f_raend = ppos + filp->f_ralen;
460 filp->f_rawin += filp->f_ralen;
465 * The current page is not locked.
466 * If we were reading ahead, and the current max read-ahead size is not
467 * zero, and the current position is inside the last read-ahead IO
468 * request, this is the moment to try to read ahead asynchronously.
469 * We will later force an unplug of the device in order to start the
470 * asynchronous read IO.
472 else if (reada_ok && filp->f_ramax && raend >= PAGE_SIZE &&
473 ppos <= raend && ppos + filp->f_ralen >= raend) {
475 * Add ONE page to max_ahead in order to try to have about the same IO max size
476 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_SIZE.
477 * Compute the position of the last page we have tried to read in order to
478 * begin to read ahead just at the next page.
480 raend -= PAGE_SIZE;
481 if (raend < inode->i_size)
482 max_ahead = filp->f_ramax + PAGE_SIZE;
484 if (max_ahead) {
485 filp->f_rawin = filp->f_ralen;
486 filp->f_ralen = 0;
487 reada_ok = 2;
491 * Try to read ahead pages.
492 * We hope that ll_rw_blk() plug/unplug, coalescence, request sorting and the
493 * scheduler will do a good enough job for us to avoid really bad IO requests.
495 ahead = 0;
496 while (ahead < max_ahead) {
497 ahead += PAGE_SIZE;
498 page_cache = try_to_read_ahead(filp, raend + ahead,
499 page_cache);
502 * If we tried to read ahead some pages, and
503 * if we tried to read ahead asynchronously,
504 * try to force an unplug of the device in order to start the asynchronous
505 * read IO request.
506 * Update the read-ahead context:
507 * store the length of the current read-ahead window and
508 * double the current max read-ahead size.
509 * This heuristic avoids doing large IO for files that are not really
510 * accessed sequentially.
512 if (ahead) {
513 if (reada_ok == 2) {
514 run_task_queue(&tq_disk);
517 filp->f_ralen += ahead;
518 filp->f_rawin += filp->f_ralen;
519 filp->f_raend = raend + ahead + PAGE_SIZE;
521 filp->f_ramax += filp->f_ramax;
523 if (filp->f_ramax > max_readahead)
524 filp->f_ramax = max_readahead;
526 #ifdef PROFILE_READAHEAD
527 profile_readahead((reada_ok == 2), filp);
528 #endif
531 return page_cache;
535 * "descriptor" for what we're up to with a read.
536 * This allows us to use the same read code yet
537 * have multiple different users of the data that
538 * we read from a file.
540 * The simplest case just copies the data to user
541 * mode.
543 typedef struct {
544 size_t written;
545 size_t count;
546 char * buf;
547 int error;
548 } read_descriptor_t;
550 typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long);
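/*
 * Illustrative sketch only (not part of the original file, and the name
 * "null_read_actor" is made up): a minimal actor that merely accounts
 * for the bytes it is handed without copying them anywhere.  It shows
 * the contract an actor must honour: consume at most desc->count bytes,
 * update the descriptor, and return how many bytes were actually used.
 * file_read_actor() and file_send_actor() below are the real users.
 */
#if 0
static int null_read_actor(read_descriptor_t * desc, const char *area, unsigned long size)
{
	unsigned long count = desc->count;

	/* never consume more than the descriptor still asks for */
	if (size > count)
		size = count;
	desc->count = count - size;
	desc->written += size;
	return size;
}
#endif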
553 * This is a generic file read routine, and uses the
554 * inode->i_op->readpage() function for the actual low-level
555 * stuff.
557 * This is really ugly. But the goto's actually try to clarify some
558 * of the logic when it comes to error handling etc.
560 static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
562 struct dentry *dentry = filp->f_dentry;
563 struct inode *inode = dentry->d_inode;
564 size_t pos, pgpos, page_cache;
565 int reada_ok;
566 int max_readahead = get_max_readahead(inode);
568 page_cache = 0;
570 pos = *ppos;
571 pgpos = pos & PAGE_MASK;
573 * If the current position is outside the previous read-ahead window,
574 * we reset the current read-ahead context and set read ahead max to zero
575 * (it will be set to just the needed value later),
576 * otherwise, we assume that the file accesses are sequential enough to
577 * continue read-ahead.
579 if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) {
580 reada_ok = 0;
581 filp->f_raend = 0;
582 filp->f_ralen = 0;
583 filp->f_ramax = 0;
584 filp->f_rawin = 0;
585 } else {
586 reada_ok = 1;
589 * Adjust the current value of read-ahead max.
590 * If the read operation stays within the first half page, force no readahead.
591 * Otherwise try to increase the read-ahead max just enough to do the read request.
592 * Then, use at least MIN_READAHEAD if read-ahead is ok,
593 * and at most max_readahead in all cases.
595 if (pos + desc->count <= (PAGE_SIZE >> 1)) {
596 filp->f_ramax = 0;
597 } else {
598 unsigned long needed;
600 needed = ((pos + desc->count) & PAGE_MASK) - pgpos;
602 if (filp->f_ramax < needed)
603 filp->f_ramax = needed;
605 if (reada_ok && filp->f_ramax < MIN_READAHEAD)
606 filp->f_ramax = MIN_READAHEAD;
607 if (filp->f_ramax > max_readahead)
608 filp->f_ramax = max_readahead;
611 for (;;) {
612 struct page *page, **hash;
614 if (pos >= inode->i_size)
615 break;
618 * Try to find the data in the page cache..
620 hash = page_hash(inode, pos & PAGE_MASK);
621 page = __find_page(inode, pos & PAGE_MASK, *hash);
622 if (!page)
623 goto no_cached_page;
625 found_page:
627 * Try to read ahead only if the current page is filled or being filled.
628 * Otherwise, if we were reading ahead, decrease max read ahead size to
629 * the minimum value.
630 * In this context, that seems to happen only on a read error or if
631 * the page has been rewritten.
633 if (PageUptodate(page) || PageLocked(page))
634 page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_MASK, page, page_cache);
635 else if (reada_ok && filp->f_ramax > MIN_READAHEAD)
636 filp->f_ramax = MIN_READAHEAD;
638 wait_on_page(page);
640 if (!PageUptodate(page))
641 goto page_read_error;
643 success:
645 * Ok, we have the page, it's up-to-date and ok,
646 * so now we can finally copy it to user space...
649 unsigned long offset, nr;
651 offset = pos & ~PAGE_MASK;
652 nr = PAGE_SIZE - offset;
653 if (nr > inode->i_size - pos)
654 nr = inode->i_size - pos;
657 * The actor routine returns how many bytes were actually used..
658 * NOTE! This may not be the same as how much of a user buffer
659 * we filled up (we may be padding etc), so we can only update
660 * "pos" here (the actor routine has to update the user buffer
661 * pointers and the remaining count).
663 nr = actor(desc, (const char *) (page_address(page) + offset), nr);
664 pos += nr;
665 release_page(page);
666 if (nr && desc->count)
667 continue;
668 break;
671 no_cached_page:
673 * Ok, it wasn't cached, so we need to create a new
674 * page..
676 if (!page_cache) {
677 page_cache = __get_free_page(GFP_USER);
679 * That could have slept, so go around to the
680 * very beginning..
682 if (page_cache)
683 continue;
684 desc->error = -ENOMEM;
685 break;
689 * Ok, add the new page to the hash-queues...
691 page = mem_map + MAP_NR(page_cache);
692 page_cache = 0;
693 add_to_page_cache(page, inode, pos & PAGE_MASK, hash);
696 * Error handling is tricky. If we get a read error,
697 * the cached page stays in the cache (but uptodate=0),
698 * and the next process that accesses it will try to
699 * re-read it. This is needed for NFS etc, where the
700 * identity of the reader can decide if we can read the
701 * page or not..
704 * We have to read the page.
705 * If we were reading ahead, we had previously tried to read this page.
706 * That means the page has probably been removed from the cache before
707 * the application process needed it, or has been rewritten.
708 * Decrease max readahead size to the minimum value in that situation.
710 if (reada_ok && filp->f_ramax > MIN_READAHEAD)
711 filp->f_ramax = MIN_READAHEAD;
714 int error = inode->i_op->readpage(filp, page);
715 if (!error)
716 goto found_page;
717 desc->error = error;
718 release_page(page);
719 break;
722 page_read_error:
724 * We found the page, but it wasn't up-to-date.
725 * Try to re-read it _once_. We do this synchronously,
726 * because this happens only if there were errors.
729 int error = inode->i_op->readpage(filp, page);
730 if (!error) {
731 wait_on_page(page);
732 if (PageUptodate(page) && !PageError(page))
733 goto success;
734 error = -EIO; /* Some unspecified error occurred.. */
736 desc->error = error;
737 release_page(page);
738 break;
742 *ppos = pos;
743 filp->f_reada = 1;
744 if (page_cache)
745 free_page(page_cache);
746 UPDATE_ATIME(inode);
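/*
 * Read actor for generic_file_read(): copy the data into the user
 * buffer held in the descriptor, flagging -EFAULT if part of the copy
 * fails.
 */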
749 static int file_read_actor(read_descriptor_t * desc, const char *area, unsigned long size)
751 unsigned long left;
752 unsigned long count = desc->count;
754 if (size > count)
755 size = count;
756 left = __copy_to_user(desc->buf, area, size);
757 if (left) {
758 size -= left;
759 desc->error = -EFAULT;
761 desc->count = count - size;
762 desc->written += size;
763 desc->buf += size;
764 return size;
768 * This is the "read()" routine for all filesystems
769 * that can use the page cache directly.
771 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
773 ssize_t retval;
775 retval = -EFAULT;
776 if (access_ok(VERIFY_WRITE, buf, count)) {
777 retval = 0;
778 if (count) {
779 read_descriptor_t desc;
781 desc.written = 0;
782 desc.count = count;
783 desc.buf = buf;
784 desc.error = 0;
785 do_generic_file_read(filp, ppos, &desc, file_read_actor);
787 retval = desc.written;
788 if (!retval)
789 retval = desc.error;
792 return retval;
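/*
 * Read actor for sys_sendfile(): the descriptor's "buf" field actually
 * holds the output struct file, to which the data is written under
 * KERNEL_DS with the output inode's semaphore held.
 */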
795 static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned long size)
797 ssize_t written;
798 unsigned long count = desc->count;
799 struct file *file = (struct file *) desc->buf;
800 struct inode *inode = file->f_dentry->d_inode;
801 mm_segment_t old_fs;
803 if (size > count)
804 size = count;
805 down(&inode->i_sem);
806 old_fs = get_fs();
807 set_fs(KERNEL_DS);
808 written = file->f_op->write(file, area, size, &file->f_pos);
809 set_fs(old_fs);
810 up(&inode->i_sem);
811 if (written < 0) {
812 desc->error = written;
813 written = 0;
815 desc->count = count - written;
816 desc->written += written;
817 return written;
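/*
 * sys_sendfile(out_fd, in_fd, offset, count): copy up to "count" bytes
 * from the page cache of in_fd to out_fd without bouncing the data
 * through user space.  If "offset" is non-NULL it supplies the starting
 * position and receives the updated one; otherwise in_fd's own file
 * position is used and advanced.
 */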
820 asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
822 ssize_t retval;
823 struct file * in_file, * out_file;
824 struct inode * in_inode, * out_inode;
826 lock_kernel();
829 * Get input file, and verify that it is ok..
831 retval = -EBADF;
832 in_file = fget(in_fd);
833 if (!in_file)
834 goto out;
835 if (!(in_file->f_mode & FMODE_READ))
836 goto fput_in;
837 retval = -EINVAL;
838 in_inode = in_file->f_dentry->d_inode;
839 if (!in_inode)
840 goto fput_in;
841 if (!in_inode->i_op || !in_inode->i_op->readpage)
842 goto fput_in;
843 retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
844 if (retval)
845 goto fput_in;
848 * Get output file, and verify that it is ok..
850 retval = -EBADF;
851 out_file = fget(out_fd);
852 if (!out_file)
853 goto fput_in;
854 if (!(out_file->f_mode & FMODE_WRITE))
855 goto fput_out;
856 retval = -EINVAL;
857 if (!out_file->f_op || !out_file->f_op->write)
858 goto fput_out;
859 out_inode = out_file->f_dentry->d_inode;
860 if (!out_inode)
861 goto fput_out;
862 retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
863 if (retval)
864 goto fput_out;
866 retval = 0;
867 if (count) {
868 read_descriptor_t desc;
869 loff_t pos = 0, *ppos;
871 retval = -EFAULT;
872 ppos = &in_file->f_pos;
873 if (offset) {
874 if (get_user(pos, offset))
875 goto fput_out;
876 ppos = &pos;
879 desc.written = 0;
880 desc.count = count;
881 desc.buf = (char *) out_file;
882 desc.error = 0;
883 do_generic_file_read(in_file, ppos, &desc, file_send_actor);
885 retval = desc.written;
886 if (!retval)
887 retval = desc.error;
888 if (offset)
889 put_user(pos, offset);
893 fput_out:
894 fput(out_file);
895 fput_in:
896 fput(in_file);
897 out:
898 unlock_kernel();
899 return retval;
903 * Semantics for shared and private memory areas are different past the end
904 * of the file. A shared mapping past the last page of the file is an error
905 * and results in a SIGBUS, while a private mapping just maps in a zero page.
907 * The goto's are kind of ugly, but this streamlines the normal case of having
908 * it in the page cache, and handles the special cases reasonably without
909 * having a lot of duplicated code.
911 * WSH 06/04/97: fixed a memory leak and moved the allocation of new_page
912 * ahead of the wait if we're sure to need it.
914 static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
916 struct file * file = area->vm_file;
917 struct dentry * dentry = file->f_dentry;
918 struct inode * inode = dentry->d_inode;
919 unsigned long offset, reada, i;
920 struct page * page, **hash;
921 unsigned long old_page, new_page;
923 new_page = 0;
924 offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
925 if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
926 goto no_page;
929 * Do we have something in the page cache already?
931 hash = page_hash(inode, offset);
932 page = __find_page(inode, offset, *hash);
933 if (!page)
934 goto no_cached_page;
936 found_page:
938 * Ok, found a page in the page cache, now we need to check
939 * that it's up-to-date. First check whether we'll need an
940 * extra page -- better to overlap the allocation with the I/O.
942 if (no_share && !new_page) {
943 new_page = __get_free_page(GFP_USER);
944 if (!new_page)
945 goto failure;
948 if (PageLocked(page))
949 goto page_locked_wait;
950 if (!PageUptodate(page))
951 goto page_read_error;
953 success:
955 * Found the page, need to check sharing and possibly
956 * copy it over to another page..
958 old_page = page_address(page);
959 if (!no_share) {
961 * Ok, we can share the cached page directly.. Get rid
962 * of any potential extra pages.
964 if (new_page)
965 free_page(new_page);
967 flush_page_to_ram(old_page);
968 return old_page;
972 * No sharing ... copy to the new page.
974 copy_page(new_page, old_page);
975 flush_page_to_ram(new_page);
976 release_page(page);
977 return new_page;
979 no_cached_page:
981 * Try to read in an entire cluster at once.
983 reada = offset;
984 reada >>= PAGE_SHIFT + page_cluster;
985 reada <<= PAGE_SHIFT + page_cluster;
987 for (i = 1 << page_cluster; i > 0; --i, reada += PAGE_SIZE)
988 new_page = try_to_read_ahead(file, reada, new_page);
990 if (!new_page)
991 new_page = __get_free_page(GFP_USER);
992 if (!new_page)
993 goto no_page;
996 * During getting the above page we might have slept,
997 * so we need to re-check the situation with the page
998 * cache.. The page we just got may be useful if we
999 * can't share, so don't get rid of it here.
1001 page = find_page(inode, offset);
1002 if (page)
1003 goto found_page;
1006 * Now, create a new page-cache page from the page we got
1008 page = mem_map + MAP_NR(new_page);
1009 new_page = 0;
1010 add_to_page_cache(page, inode, offset, hash);
1012 if (inode->i_op->readpage(file, page) != 0)
1013 goto failure;
1015 goto found_page;
1017 page_locked_wait:
1018 __wait_on_page(page);
1019 if (PageUptodate(page))
1020 goto success;
1022 page_read_error:
1024 * Umm, take care of errors if the page isn't up-to-date.
1025 * Try to re-read it _once_. We do this synchronously,
1026 * because there really aren't any performance issues here
1027 * and we need to check for errors.
1029 if (inode->i_op->readpage(file, page) != 0)
1030 goto failure;
1031 wait_on_page(page);
1032 if (PageError(page))
1033 goto failure;
1034 if (PageUptodate(page))
1035 goto success;
1038 * Things didn't work out. Return zero to tell the
1039 * mm layer so, possibly freeing the page cache page first.
1041 failure:
1042 release_page(page);
1043 if (new_page)
1044 free_page(new_page);
1045 no_page:
1046 return 0;
1050 * Tries to write a shared mapped page to its backing store. May return -EIO
1051 * if the disk is full.
1053 static inline int do_write_page(struct inode * inode, struct file * file,
1054 const char * page, unsigned long offset)
1056 int retval;
1057 unsigned long size;
1058 loff_t loff = offset;
1059 mm_segment_t old_fs;
1061 size = offset + PAGE_SIZE;
1062 /* refuse to extend file size.. */
1063 if (S_ISREG(inode->i_mode)) {
1064 if (size > inode->i_size)
1065 size = inode->i_size;
1066 /* Ho humm.. We should have tested for this earlier */
1067 if (size < offset)
1068 return -EIO;
1070 size -= offset;
1071 old_fs = get_fs();
1072 set_fs(KERNEL_DS);
1073 retval = -EIO;
1074 if (size == file->f_op->write(file, (const char *) page, size, &loff))
1075 retval = 0;
1076 set_fs(old_fs);
1077 return retval;
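/*
 * Write one page of a shared mapping back to the file, taking an extra
 * reference on the file and holding the inode semaphore across the
 * write so the file can't be released under us.
 */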
1080 static int filemap_write_page(struct vm_area_struct * vma,
1081 unsigned long offset,
1082 unsigned long page)
1084 int result;
1085 struct file * file;
1086 struct dentry * dentry;
1087 struct inode * inode;
1089 file = vma->vm_file;
1090 dentry = file->f_dentry;
1091 inode = dentry->d_inode;
1092 if (!file->f_op->write)
1093 return -EIO;
1096 * If a task terminates while we're swapping the page, the vma
1097 * and file could be released ... increment the count to be safe.
1099 file->f_count++;
1100 down(&inode->i_sem);
1101 result = do_write_page(inode, file, (const char *) page, offset);
1102 up(&inode->i_sem);
1103 fput(file);
1104 return result;
1109 * The page cache takes care of races between somebody
1110 * trying to swap something out and swap something in
1111 * at the same time..
1113 int filemap_swapout(struct vm_area_struct * vma, struct page * page)
1115 return filemap_write_page(vma, page->offset, page_address(page));
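/*
 * Sync or invalidate a single pte of a shared mapping.  For a plain
 * sync, dirty present ptes are cleaned and their pages written back.
 * For MS_INVALIDATE the mapping is dropped as well, and the page is
 * written back only if it was dirty and more than a pure invalidate
 * was requested.
 */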
1118 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
1119 unsigned long address, unsigned int flags)
1121 pte_t pte = *ptep;
1122 unsigned long page;
1123 int error;
1125 if (!(flags & MS_INVALIDATE)) {
1126 if (!pte_present(pte))
1127 return 0;
1128 if (!pte_dirty(pte))
1129 return 0;
1130 flush_page_to_ram(pte_page(pte));
1131 flush_cache_page(vma, address);
1132 set_pte(ptep, pte_mkclean(pte));
1133 flush_tlb_page(vma, address);
1134 page = pte_page(pte);
1135 atomic_inc(&mem_map[MAP_NR(page)].count);
1136 } else {
1137 if (pte_none(pte))
1138 return 0;
1139 flush_cache_page(vma, address);
1140 pte_clear(ptep);
1141 flush_tlb_page(vma, address);
1142 if (!pte_present(pte)) {
1143 swap_free(pte_val(pte));
1144 return 0;
1146 page = pte_page(pte);
1147 if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
1148 free_page(page);
1149 return 0;
1152 error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
1153 free_page(page);
1154 return error;
1157 static inline int filemap_sync_pte_range(pmd_t * pmd,
1158 unsigned long address, unsigned long size,
1159 struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
1161 pte_t * pte;
1162 unsigned long end;
1163 int error;
1165 if (pmd_none(*pmd))
1166 return 0;
1167 if (pmd_bad(*pmd)) {
1168 printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
1169 pmd_clear(pmd);
1170 return 0;
1172 pte = pte_offset(pmd, address);
1173 offset += address & PMD_MASK;
1174 address &= ~PMD_MASK;
1175 end = address + size;
1176 if (end > PMD_SIZE)
1177 end = PMD_SIZE;
1178 error = 0;
1179 do {
1180 error |= filemap_sync_pte(pte, vma, address + offset, flags);
1181 address += PAGE_SIZE;
1182 pte++;
1183 } while (address < end);
1184 return error;
1187 static inline int filemap_sync_pmd_range(pgd_t * pgd,
1188 unsigned long address, unsigned long size,
1189 struct vm_area_struct *vma, unsigned int flags)
1191 pmd_t * pmd;
1192 unsigned long offset, end;
1193 int error;
1195 if (pgd_none(*pgd))
1196 return 0;
1197 if (pgd_bad(*pgd)) {
1198 printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
1199 pgd_clear(pgd);
1200 return 0;
1202 pmd = pmd_offset(pgd, address);
1203 offset = address & PGDIR_MASK;
1204 address &= ~PGDIR_MASK;
1205 end = address + size;
1206 if (end > PGDIR_SIZE)
1207 end = PGDIR_SIZE;
1208 error = 0;
1209 do {
1210 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
1211 address = (address + PMD_SIZE) & PMD_MASK;
1212 pmd++;
1213 } while (address < end);
1214 return error;
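/*
 * Walk the page tables covering [address, address+size) of a shared
 * mapping and sync or invalidate every pte in the range, flushing the
 * caches and the TLB around the walk.
 */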
1217 static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
1218 size_t size, unsigned int flags)
1220 pgd_t * dir;
1221 unsigned long end = address + size;
1222 int error = 0;
1224 dir = pgd_offset(vma->vm_mm, address);
1225 flush_cache_range(vma->vm_mm, end - size, end);
1226 while (address < end) {
1227 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
1228 address = (address + PGDIR_SIZE) & PGDIR_MASK;
1229 dir++;
1231 flush_tlb_range(vma->vm_mm, end - size, end);
1232 return error;
1236 * This handles (potentially partial) area unmaps..
1238 static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
1240 filemap_sync(vma, start, len, MS_ASYNC);
1244 * Shared mappings need to be able to do the right thing at
1245 * close/unmap/sync. They will also use the private file as
1246 * backing-store for swapping..
1248 static struct vm_operations_struct file_shared_mmap = {
1249 NULL, /* no special open */
1250 NULL, /* no special close */
1251 filemap_unmap, /* unmap - we need to sync the pages */
1252 NULL, /* no special protect */
1253 filemap_sync, /* sync */
1254 NULL, /* advise */
1255 filemap_nopage, /* nopage */
1256 NULL, /* wppage */
1257 filemap_swapout, /* swapout */
1258 NULL, /* swapin */
1262 * Private mappings just need to be able to load in the map.
1264 * (This is actually used for shared mappings as well, if we
1265 * know they can't ever get write permissions..)
1267 static struct vm_operations_struct file_private_mmap = {
1268 NULL, /* open */
1269 NULL, /* close */
1270 NULL, /* unmap */
1271 NULL, /* protect */
1272 NULL, /* sync */
1273 NULL, /* advise */
1274 filemap_nopage, /* nopage */
1275 NULL, /* wppage */
1276 NULL, /* swapout */
1277 NULL, /* swapin */
1280 /* This is used for a general mmap of a disk file */
1282 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1284 struct vm_operations_struct * ops;
1285 struct inode *inode = file->f_dentry->d_inode;
1287 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1288 ops = &file_shared_mmap;
1289 /* share_page() can only guarantee proper page sharing if
1290 * the offsets are all page aligned. */
1291 if (vma->vm_offset & (PAGE_SIZE - 1))
1292 return -EINVAL;
1293 } else {
1294 ops = &file_private_mmap;
1295 if (vma->vm_offset & (inode->i_sb->s_blocksize - 1))
1296 return -EINVAL;
1298 if (!inode->i_sb || !S_ISREG(inode->i_mode))
1299 return -EACCES;
1300 if (!inode->i_op || !inode->i_op->readpage)
1301 return -ENOEXEC;
1302 UPDATE_ATIME(inode);
1303 vma->vm_file = file;
1304 file->f_count++;
1305 vma->vm_ops = ops;
1306 return 0;
1311 * The msync() system call.
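/*
 * Sync a single vma interval; for MS_SYNC also fsync the backing file
 * so the data really is on disk before we return.
 */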
1314 static int msync_interval(struct vm_area_struct * vma,
1315 unsigned long start, unsigned long end, int flags)
1317 if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
1318 int error;
1319 error = vma->vm_ops->sync(vma, start, end-start, flags);
1320 if (!error && (flags & MS_SYNC)) {
1321 struct file * file = vma->vm_file;
1322 if (file) {
1323 struct dentry * dentry = file->f_dentry;
1324 struct inode * inode = dentry->d_inode;
1325 down(&inode->i_sem);
1326 error = file_fsync(file, dentry);
1327 up(&inode->i_sem);
1330 return error;
1332 return 0;
1335 asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
1337 unsigned long end;
1338 struct vm_area_struct * vma;
1339 int unmapped_error, error = -EINVAL;
1341 down(&current->mm->mmap_sem);
1342 lock_kernel();
1343 if (start & ~PAGE_MASK)
1344 goto out;
1345 len = (len + ~PAGE_MASK) & PAGE_MASK;
1346 end = start + len;
1347 if (end < start)
1348 goto out;
1349 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1350 goto out;
1351 error = 0;
1352 if (end == start)
1353 goto out;
1355 * If the interval [start,end) covers some unmapped address ranges,
1356 * just ignore them, but return -EFAULT at the end.
1358 vma = find_vma(current->mm, start);
1359 unmapped_error = 0;
1360 for (;;) {
1361 /* Still start < end. */
1362 error = -EFAULT;
1363 if (!vma)
1364 goto out;
1365 /* Here start < vma->vm_end. */
1366 if (start < vma->vm_start) {
1367 unmapped_error = -EFAULT;
1368 start = vma->vm_start;
1370 /* Here vma->vm_start <= start < vma->vm_end. */
1371 if (end <= vma->vm_end) {
1372 if (start < end) {
1373 error = msync_interval(vma, start, end, flags);
1374 if (error)
1375 goto out;
1377 error = unmapped_error;
1378 goto out;
1380 /* Here vma->vm_start <= start < vma->vm_end < end. */
1381 error = msync_interval(vma, start, vma->vm_end, flags);
1382 if (error)
1383 goto out;
1384 start = vma->vm_end;
1385 vma = vma->vm_next;
1387 out:
1388 unlock_kernel();
1389 up(&current->mm->mmap_sem);
1390 return error;
1394 * Write to a file through the page cache. This is mainly for the
1395 * benefit of NFS and possibly other network-based file systems.
1397 * We currently put everything into the page cache prior to writing it.
1398 * This is not a problem when writing full pages. With partial pages,
1399 * however, we first have to read the data into the cache, then
1400 * dirty the page, and finally schedule it for writing. Alternatively, we
1401 * could write-through just the portion of data that would go into that
1402 * page, but that would kill performance for applications that write data
1403 * line by line, and it's prone to race conditions.
1405 * Note that this routine doesn't try to keep track of dirty pages. Each
1406 * file system has to do this all by itself, unfortunately.
1407 * okir@monad.swb.de
1409 ssize_t
1410 generic_file_write(struct file *file, const char *buf,
1411 size_t count, loff_t *ppos)
1413 struct dentry *dentry = file->f_dentry;
1414 struct inode *inode = dentry->d_inode;
1415 unsigned long pos = *ppos;
1416 unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
1417 struct page *page, **hash;
1418 unsigned long page_cache = 0;
1419 unsigned long written;
1420 long status, sync;
1422 if (!inode->i_op || !inode->i_op->updatepage)
1423 return -EIO;
1425 sync = file->f_flags & O_SYNC;
1426 written = 0;
1428 if (file->f_flags & O_APPEND)
1429 pos = inode->i_size;
1432 * Check whether we've reached the file size limit.
1434 status = -EFBIG;
1435 if (pos >= limit) {
1436 send_sig(SIGXFSZ, current, 0);
1437 goto out;
1440 status = 0;
1442 * Check whether to truncate the write,
1443 * and send the signal if we do.
1445 if (count > limit - pos) {
1446 send_sig(SIGXFSZ, current, 0);
1447 count = limit - pos;
1450 while (count) {
1451 unsigned long bytes, pgpos, offset;
1453 * Try to find the page in the cache. If it isn't there,
1454 * allocate a free page.
1456 offset = (pos & ~PAGE_MASK);
1457 pgpos = pos & PAGE_MASK;
1458 bytes = PAGE_SIZE - offset;
1459 if (bytes > count)
1460 bytes = count;
1462 hash = page_hash(inode, pgpos);
1463 page = __find_page(inode, pgpos, *hash);
1464 if (!page) {
1465 if (!page_cache) {
1466 page_cache = __get_free_page(GFP_USER);
1467 if (page_cache)
1468 continue;
1469 status = -ENOMEM;
1470 break;
1472 page = mem_map + MAP_NR(page_cache);
1473 add_to_page_cache(page, inode, pgpos, hash);
1474 page_cache = 0;
1477 /* Get exclusive IO access to the page.. */
1478 wait_on_page(page);
1479 set_bit(PG_locked, &page->flags);
1482 * Do the real work.. If the writer ends up delaying the write,
1483 * the writer needs to increment the page use counts until he
1484 * is done with the page.
1486 bytes -= copy_from_user((u8*)page_address(page) + offset, buf, bytes);
1487 status = -EFAULT;
1488 if (bytes)
1489 status = inode->i_op->updatepage(file, page, offset, bytes, sync);
1491 /* Mark it unlocked again and drop the page.. */
1492 clear_bit(PG_locked, &page->flags);
1493 wake_up(&page->wait);
1494 __free_page(page);
1496 if (status < 0)
1497 break;
1499 written += status;
1500 count -= status;
1501 pos += status;
1502 buf += status;
1504 *ppos = pos;
1505 if (pos > inode->i_size)
1506 inode->i_size = pos;
1508 if (page_cache)
1509 free_page(page_cache);
1510 out:
1511 return written ? written : status;
1515 * Support routines for directory caching using the page cache.
1519 * Finds the page at the specified offset, installing a new page
1520 * if requested. The count is incremented and the page is locked.
1522 * Note: we don't have to worry about races here, as the caller
1523 * is holding the inode semaphore.
1525 unsigned long get_cached_page(struct inode * inode, unsigned long offset,
1526 int new)
1528 struct page * page;
1529 struct page ** hash;
1530 unsigned long page_cache = 0;
1532 hash = page_hash(inode, offset);
1533 page = __find_page(inode, offset, *hash);
1534 if (!page) {
1535 if (!new)
1536 goto out;
1537 page_cache = get_free_page(GFP_USER);
1538 if (!page_cache)
1539 goto out;
1540 page = mem_map + MAP_NR(page_cache);
1541 add_to_page_cache(page, inode, offset, hash);
1543 if (atomic_read(&page->count) != 2)
1544 printk(KERN_ERR "get_cached_page: page count=%d\n",
1545 atomic_read(&page->count));
1546 if (test_bit(PG_locked, &page->flags))
1547 printk(KERN_ERR "get_cached_page: page already locked!\n");
1548 set_bit(PG_locked, &page->flags);
1549 page_cache = page_address(page);
1551 out:
1552 return page_cache;
1556 * Unlock and free a page.
1558 void put_cached_page(unsigned long addr)
1560 struct page * page = mem_map + MAP_NR(addr);
1562 if (!test_bit(PG_locked, &page->flags))
1563 printk("put_cached_page: page not locked!\n");
1564 if (atomic_read(&page->count) != 2)
1565 printk("put_cached_page: page count=%d\n",
1566 atomic_read(&page->count));
1567 clear_bit(PG_locked, &page->flags);
1568 wake_up(&page->wait);
1569 __free_page(page);