/*
 *	linux/mm/filemap.c
 *
 * Copyright (C) 1994, 1995  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/malloc.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/locks.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/swapctl.h>
#include <linux/slab.h>

#include <asm/pgtable.h>
#include <asm/uaccess.h>
/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 */

unsigned long page_cache_size = 0;
struct page * page_hash_table[PAGE_HASH_SIZE];

/*
 * Define a request structure for outstanding page write requests
 * to the background page io daemon
 */

struct pio_request
{
	struct pio_request *	next;
	struct file *		file;
	unsigned long		offset;
	unsigned long		page;
};
static struct pio_request *pio_first = NULL, **pio_last = &pio_first;
static kmem_cache_t *pio_request_cache;
static struct wait_queue *pio_wait = NULL;

static inline void
make_pio_request(struct file *, unsigned long, unsigned long);
/*
 * Invalidate the pages of an inode, removing all pages that aren't
 * locked down (those are sure to be up-to-date anyway, so we shouldn't
 * invalidate them).
 */
void invalidate_inode_pages(struct inode * inode)
{
	struct page ** p;
	struct page * page;

	p = &inode->i_pages;
	while ((page = *p) != NULL) {
		if (PageLocked(page)) {
			p = &page->next;
			continue;
		}
		inode->i_nrpages--;
		if ((*p = page->next) != NULL)
			(*p)->prev = page->prev;
		page->next = NULL;
		page->prev = NULL;
		remove_page_from_hash_queue(page);
		page->inode = NULL;
		page_cache_release(page);
		continue;
	}
}
/*
 * Truncate the page cache at a set offset, removing the pages
 * that are beyond that offset (and zeroing out partial pages).
 */
void truncate_inode_pages(struct inode * inode, unsigned long start)
{
	struct page ** p;
	struct page * page;

repeat:
	p = &inode->i_pages;
	while ((page = *p) != NULL) {
		unsigned long offset = page->offset;

		/* page wholly truncated - free it */
		if (offset >= start) {
			if (PageLocked(page)) {
				wait_on_page(page);
				goto repeat;
			}
			inode->i_nrpages--;
			if ((*p = page->next) != NULL)
				(*p)->prev = page->prev;
			page->next = NULL;
			page->prev = NULL;
			remove_page_from_hash_queue(page);
			page->inode = NULL;
			page_cache_release(page);
			continue;
		}
		p = &page->next;
		offset = start - offset;
		/* partial truncate, clear end of page */
		if (offset < PAGE_CACHE_SIZE) {
			unsigned long address = page_address(page);
			memset((void *) (offset + address), 0, PAGE_CACHE_SIZE - offset);
			flush_page_to_ram(address);
		}
	}
}
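
/*
 * Worked example for the partial-truncate path above (illustrative numbers,
 * assuming PAGE_CACHE_SIZE == 4096): truncating to start == 6144 leaves the
 * page at offset 4096 in place, but "offset = start - offset" gives 2048,
 * so bytes 2048..4095 of that page are zeroed.  A page at offset 8192 is
 * wholly beyond the new size and is unhashed and released instead.
 */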
/*
 * Remove a page from the page cache and free it.
 */
void remove_inode_page(struct page *page)
{
	remove_page_from_hash_queue(page);
	remove_page_from_inode_queue(page);
	page_cache_release(page);
}
int shrink_mmap(int priority, int gfp_mask)
{
	static unsigned long clock = 0;
	unsigned long limit = num_physpages;
	struct page * page;
	int count;

	count = limit >> priority;

	page = mem_map + clock;
	do {
		int referenced;

		/* This works even in the presence of PageSkip because
		 * the first two entries at the beginning of a hole will
		 * be marked, not just the first.
		 */
		page++;
		clock++;
		if (clock >= max_mapnr) {
			clock = 0;
			page = mem_map;
		}
		if (PageSkip(page)) {
			/* next_hash is overloaded for PageSkip */
			page = page->next_hash;
			clock = page - mem_map;
		}

		referenced = test_and_clear_bit(PG_referenced, &page->flags);

		if (PageLocked(page))
			continue;

		if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
			continue;

		/* We can't free pages unless there's just one user */
		if (atomic_read(&page->count) != 1)
			continue;

		count--;

		/*
		 * Is it a page swap page? If so, we want to
		 * drop it if it is no longer used, even if it
		 * were to be marked referenced..
		 */
		if (PageSwapCache(page)) {
			if (referenced && swap_count(page->offset) != 1)
				continue;
			delete_from_swap_cache(page);
			return 1;
		}

		if (referenced)
			continue;

		/* Is it a buffer page? */
		if (page->buffers) {
			if (buffer_under_min())
				continue;
			if (!try_to_free_buffers(page))
				continue;
			return 1;
		}

		/* is it a page-cache page? */
		if (page->inode) {
			if (pgcache_under_min())
				continue;
			remove_inode_page(page);
			return 1;
		}

	} while (count > 0);
	return 0;
}
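
/*
 * Note on the scan budget above (illustrative numbers only): since
 * "count = limit >> priority", a polite caller with num_physpages == 32768
 * and priority 6 examines at most 512 pages per call, while priority 0 is
 * allowed to sweep the clock over the whole of mem_map before giving up.
 */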
/*
 * Update a page cache copy, when we're doing a "write()" system call
 * See also "update_vm_cache()".
 */
void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count)
{
	unsigned long offset, len;

	offset = (pos & ~PAGE_CACHE_MASK);
	pos = pos & PAGE_CACHE_MASK;
	len = PAGE_CACHE_SIZE - offset;
	do {
		struct page * page;

		if (len > count)
			len = count;
		page = find_page(inode, pos);
		if (page) {
			wait_on_page(page);
			memcpy((void *) (offset + page_address(page)), buf, len);
			page_cache_release(page);
		}
		count -= len;
		buf += len;
		len = PAGE_CACHE_SIZE;
		offset = 0;
		pos += PAGE_CACHE_SIZE;
	} while (count);
}
static inline void add_to_page_cache(struct page * page,
	struct inode * inode, unsigned long offset,
	struct page **hash)
{
	atomic_inc(&page->count);
	page->flags = (page->flags & ~((1 << PG_uptodate) | (1 << PG_error))) | (1 << PG_referenced);
	page->offset = offset;
	add_page_to_inode_queue(inode, page);
	__add_page_to_hash_queue(page, hash);
}
/*
 * Try to read ahead in the file. "page_cache" is a potentially free page
 * that we could use for the cache (if it is 0 we can try to create one,
 * this is all overlapped with the IO on the previous page finishing anyway)
 */
static unsigned long try_to_read_ahead(struct file * file,
				unsigned long offset, unsigned long page_cache)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct page * page;
	struct page ** hash;

	offset &= PAGE_CACHE_MASK;
	switch (page_cache) {
	case 0:
		page_cache = page_cache_alloc();
		if (!page_cache)
			break;
	default:
		if (offset >= inode->i_size)
			break;
		hash = page_hash(inode, offset);
		page = __find_page(inode, offset, *hash);
		if (!page) {
			/*
			 * Ok, add the new page to the hash-queues...
			 */
			page = page_cache_entry(page_cache);
			add_to_page_cache(page, inode, offset, hash);
			inode->i_op->readpage(file, page);
			page_cache = 0;
		}
		page_cache_release(page);
	}
	return page_cache;
}
/*
 * Wait for IO to complete on a locked page.
 *
 * This must be called with the caller "holding" the page,
 * ie with increased "page->count" so that the page won't
 * go away during the wait..
 */
void __wait_on_page(struct page *page)
{
	struct task_struct *tsk = current;
	struct wait_queue wait;

	wait.task = tsk;
	add_wait_queue(&page->wait, &wait);
repeat:
	tsk->state = TASK_UNINTERRUPTIBLE;
	run_task_queue(&tq_disk);
	if (PageLocked(page)) {
		schedule();
		goto repeat;
	}
	tsk->state = TASK_RUNNING;
	remove_wait_queue(&page->wait, &wait);
}
#if 0
#define PROFILE_READAHEAD
#define DEBUG_READAHEAD
#endif

/*
 * Read-ahead profiling information
 * --------------------------------
 * Every PROFILE_MAXREADCOUNT, the following information is written
 * to the syslog:
 *   Percentage of asynchronous read-ahead.
 *   Average of read-ahead fields context value.
 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
 * to the syslog.
 */
#ifdef PROFILE_READAHEAD

#define PROFILE_MAXREADCOUNT 1000

static unsigned long total_reada;
static unsigned long total_async;
static unsigned long total_ramax;
static unsigned long total_ralen;
static unsigned long total_rawin;

static void profile_readahead(int async, struct file *filp)
{
	unsigned long flags;

	++total_reada;
	if (async)
		++total_async;

	total_ramax	+= filp->f_ramax;
	total_ralen	+= filp->f_ralen;
	total_rawin	+= filp->f_rawin;

	if (total_reada > PROFILE_MAXREADCOUNT) {
		save_flags(flags);
		cli();
		if (!(total_reada > PROFILE_MAXREADCOUNT)) {
			restore_flags(flags);
			return;
		}

		printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
			total_ramax/total_reada,
			total_ralen/total_reada,
			total_rawin/total_reada,
			(total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
		printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
			filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
#endif

		total_reada	= 0;
		total_async	= 0;
		total_ramax	= 0;
		total_ralen	= 0;
		total_rawin	= 0;

		restore_flags(flags);
	}
}
#endif  /* defined PROFILE_READAHEAD */
/*
 * Read-ahead context:
 * -------------------
 * The read-ahead context fields of the "struct file" are the following:
 * - f_raend : position of the first byte after the last page we tried to
 *	       read ahead.
 * - f_ramax : current read-ahead maximum size.
 * - f_ralen : length of the current IO read block we tried to read-ahead.
 * - f_rawin : length of the current read-ahead window.
 *		if last read-ahead was synchronous then
 *			f_rawin = f_ralen
 *		otherwise (was asynchronous)
 *			f_rawin = previous value of f_ralen + f_ralen
 *
 * Read-ahead limits:
 * ------------------
 * MIN_READAHEAD   : minimum read-ahead size when reading ahead.
 * MAX_READAHEAD   : maximum read-ahead size when reading ahead.
 *
 * Synchronous read-ahead benefits:
 * --------------------------------
 * Using a reasonable IO transfer length from peripheral devices increases
 * system performance.
 * Reasonable means, in this context, not too large but not too small.
 * The actual maximum value is:
 *	MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
 *	and 32K if defined (4K page size assumed).
 *
 * Asynchronous read-ahead benefits:
 * ---------------------------------
 * Overlapping the next read request with user process execution increases
 * system performance.
 *
 * Read-ahead risks:
 * -----------------
 * We have to guess which further data are needed by the user process.
 * If these data are often not really needed, it's bad for system
 * performance.
 * However, we know that files are often accessed sequentially by
 * application programs and it seems that it is possible to have some good
 * strategy in that guessing.
 * We only try to read ahead files that seem to be read sequentially.
 *
 * Asynchronous read-ahead risks:
 * ------------------------------
 * In order to maximize overlapping, we must start some asynchronous read
 * request from the device, as soon as possible.
 * We must be very careful about:
 * - The number of effective pending IO read requests.
 *   ONE seems to be the only reasonable value.
 * - The total memory pool usage for the file access stream.
 *   This maximum memory usage is implicitly 2 IO read chunks:
 *   2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is
 *   undefined, 64k if defined (4K page size assumed).
 */
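
/*
 * Illustrative example of the bookkeeping above (4K page cache size, numbers
 * made up): after a synchronous read-ahead of 16K, f_ralen = f_rawin = 16K
 * and f_raend sits just past that block.  If the next read-ahead is
 * asynchronous with f_ralen = 32K, then f_rawin = 16K + 32K = 48K, i.e. the
 * window covers both the block being consumed and the block in flight, and
 * do_generic_file_read() treats any access falling inside that window
 * (pgpos <= f_raend and pgpos + f_rawin >= f_raend) as sequential.
 */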
static inline int get_max_readahead(struct inode * inode)
{
	if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
		return MAX_READAHEAD;
	return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
}
static inline unsigned long generic_file_readahead(int reada_ok,
	struct file * filp, struct inode * inode,
	unsigned long ppos, struct page * page, unsigned long page_cache)
{
	unsigned long max_ahead, ahead;
	unsigned long raend;
	int max_readahead = get_max_readahead(inode);

	raend = filp->f_raend & PAGE_CACHE_MASK;
	max_ahead = 0;

/*
 * The current page is locked.
 * If the current position is inside the previous read IO request, do not
 * try to reread previously read ahead pages.
 * Otherwise decide whether or not to read ahead some pages synchronously.
 * If we are not going to read ahead, set the read ahead context for this
 * page only.
 */
	if (PageLocked(page)) {
		if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
			raend = ppos;
			if (raend < inode->i_size)
				max_ahead = filp->f_ramax;
			filp->f_rawin = 0;
			filp->f_ralen = PAGE_CACHE_SIZE;
			if (!max_ahead) {
				filp->f_raend  = ppos + filp->f_ralen;
				filp->f_rawin += filp->f_ralen;
			}
		}
	}
/*
 * The current page is not locked.
 * If we were reading ahead and,
 * if the current max read ahead size is not zero and,
 * if the current position is inside the last read-ahead IO request,
 *   it is the moment to try to read ahead asynchronously.
 * We will later force an unplug of the device in order to force asynchronous
 * read IO.
 */
	else if (reada_ok && filp->f_ramax && raend >= PAGE_CACHE_SIZE &&
		 ppos <= raend && ppos + filp->f_ralen >= raend) {
/*
 * Add ONE page to max_ahead in order to try to have about the same IO max size
 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
 * Compute the position of the last page we have tried to read in order to
 * begin to read ahead just at the next page.
 */
		raend -= PAGE_CACHE_SIZE;
		if (raend < inode->i_size)
			max_ahead = filp->f_ramax + PAGE_CACHE_SIZE;

		if (max_ahead) {
			filp->f_rawin = filp->f_ralen;
			filp->f_ralen = 0;
			reada_ok      = 2;
		}
	}
/*
 * Try to read ahead pages.
 * We hope that ll_rw_blk() plug/unplug, coalescence, request sorting and the
 * scheduler will work well enough for us to avoid issuing too many bad actual
 * IO requests.
 */
	ahead = 0;
	while (ahead < max_ahead) {
		ahead += PAGE_CACHE_SIZE;
		page_cache = try_to_read_ahead(filp, raend + ahead,
						page_cache);
	}
/*
 * If we tried to read ahead some pages,
 * If we tried to read ahead asynchronously,
 *   Try to force an unplug of the device in order to start an asynchronous
 *   read IO request.
 * Update the read-ahead context.
 * Store the length of the current read-ahead window.
 * Double the current max read ahead size.
 *   This heuristic avoids doing large IO for files that are not really
 *   accessed sequentially.
 */
	if (ahead) {
		if (reada_ok == 2) {
			run_task_queue(&tq_disk);
		}

		filp->f_ralen += ahead;
		filp->f_rawin += filp->f_ralen;
		filp->f_raend = raend + ahead + PAGE_CACHE_SIZE;

		filp->f_ramax += filp->f_ramax;

		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;

#ifdef PROFILE_READAHEAD
		profile_readahead((reada_ok == 2), filp);
#endif
	}

	return page_cache;
}
/*
 * "descriptor" for what we're up to with a read.
 * This allows us to use the same read code yet
 * have multiple different users of the data that
 * we read from a file.
 *
 * The simplest case just copies the data to user
 * mode.
 */
typedef struct {
	size_t written;
	size_t count;
	char * buf;
	int error;
} read_descriptor_t;

typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long);
/*
 * This is a generic file read routine, and uses the
 * inode->i_op->readpage() function for the actual low-level
 * stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 */
static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
{
	struct dentry *dentry = filp->f_dentry;
	struct inode *inode = dentry->d_inode;
	size_t pos, pgpos, page_cache;
	int reada_ok;
	int max_readahead = get_max_readahead(inode);

	page_cache = 0;

	pos = *ppos;
	pgpos = pos & PAGE_CACHE_MASK;
/*
 * If the current position is outside the previous read-ahead window,
 * we reset the current read-ahead context and set read ahead max to zero
 * (will be set to just needed value later),
 * otherwise, we assume that the file accesses are sequential enough to
 * continue read-ahead.
 */
	if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) {
		reada_ok = 0;
		filp->f_raend = 0;
		filp->f_ralen = 0;
		filp->f_ramax = 0;
		filp->f_rawin = 0;
	} else {
		reada_ok = 1;
	}
/*
 * Adjust the current value of read-ahead max.
 * If the read operation stays within the first half page, force no readahead.
 * Otherwise try to increase read ahead max just enough to do the read request.
 * Then, at least MIN_READAHEAD if read ahead is ok,
 * and at most MAX_READAHEAD in all cases.
 */
	if (pos + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
		filp->f_ramax = 0;
	} else {
		unsigned long needed;

		needed = ((pos + desc->count) & PAGE_CACHE_MASK) - pgpos;

		if (filp->f_ramax < needed)
			filp->f_ramax = needed;

		if (reada_ok && filp->f_ramax < MIN_READAHEAD)
				filp->f_ramax = MIN_READAHEAD;
		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;
	}

	for (;;) {
		struct page *page, **hash;

		if (pos >= inode->i_size)
			break;

		/*
		 * Try to find the data in the page cache..
		 */
		hash = page_hash(inode, pos & PAGE_CACHE_MASK);
		page = __find_page(inode, pos & PAGE_CACHE_MASK, *hash);
		if (!page)
			goto no_cached_page;

found_page:
/*
 * Try to read ahead only if the current page is filled or being filled.
 * Otherwise, if we were reading ahead, decrease max read ahead size to
 * the minimum value.
 * In this context, that seems to happen only on some read error or if
 * the page has been rewritten.
 */
		if (PageUptodate(page) || PageLocked(page))
			page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
		else if (reada_ok && filp->f_ramax > MIN_READAHEAD)
				filp->f_ramax = MIN_READAHEAD;

		wait_on_page(page);

		if (!PageUptodate(page))
			goto page_read_error;

success:
		/*
		 * Ok, we have the page, it's up-to-date and ok,
		 * so now we can finally copy it to user space...
		 */
	{
		unsigned long offset, nr;

		offset = pos & ~PAGE_CACHE_MASK;
		nr = PAGE_CACHE_SIZE - offset;
		if (nr > inode->i_size - pos)
			nr = inode->i_size - pos;

		/*
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		nr = actor(desc, (const char *) (page_address(page) + offset), nr);
		pos += nr;
		page_cache_release(page);
		if (nr && desc->count)
			continue;
		break;
	}

no_cached_page:
		/*
		 * Ok, it wasn't cached, so we need to create a new
		 * page..
		 */
		if (!page_cache) {
			page_cache = page_cache_alloc();
			/*
			 * That could have slept, so go around to the
			 * very beginning..
			 */
			if (page_cache)
				continue;
			desc->error = -ENOMEM;
			break;
		}

		/*
		 * Ok, add the new page to the hash-queues...
		 */
		page = page_cache_entry(page_cache);
		page_cache = 0;
		add_to_page_cache(page, inode, pos & PAGE_CACHE_MASK, hash);

		/*
		 * Error handling is tricky. If we get a read error,
		 * the cached page stays in the cache (but uptodate=0),
		 * and the next process that accesses it will try to
		 * re-read it. This is needed for NFS etc, where the
		 * identity of the reader can decide if we can read the
		 * page or not..
		 */
/*
 * We have to read the page.
 * If we were reading ahead, we had previously tried to read this page.
 * That means that the page has probably been removed from the cache before
 * the application process needs it, or has been rewritten.
 * Decrease max readahead size to the minimum value in that situation.
 */
		if (reada_ok && filp->f_ramax > MIN_READAHEAD)
			filp->f_ramax = MIN_READAHEAD;

		{
			int error = inode->i_op->readpage(filp, page);
			if (!error)
				goto found_page;
			desc->error = error;
			page_cache_release(page);
			break;
		}

page_read_error:
		/*
		 * We found the page, but it wasn't up-to-date.
		 * Try to re-read it _once_. We do this synchronously,
		 * because this happens only if there were errors.
		 */
		{
			int error = inode->i_op->readpage(filp, page);
			if (!error) {
				wait_on_page(page);
				if (PageUptodate(page) && !PageError(page))
					goto success;
				error = -EIO; /* Some unspecified error occurred.. */
			}
			desc->error = error;
			page_cache_release(page);
			break;
		}
	}

	*ppos = pos;
	filp->f_reada = 1;
	if (page_cache)
		page_cache_free(page_cache);
	UPDATE_ATIME(inode);
}
static int file_read_actor(read_descriptor_t * desc, const char *area, unsigned long size)
{
	unsigned long left;
	unsigned long count = desc->count;

	if (size > count)
		size = count;
	left = __copy_to_user(desc->buf, area, size);
	if (left) {
		size -= left;
		desc->error = -EFAULT;
	}
	desc->count = count - size;
	desc->written += size;
	desc->buf += size;
	return size;
}
/*
 * This is the "read()" routine for all filesystems
 * that can use the page cache directly.
 */
ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
{
	ssize_t retval;

	retval = -EFAULT;
	if (access_ok(VERIFY_WRITE, buf, count)) {
		retval = 0;
		if (count) {
			read_descriptor_t desc;

			desc.written = 0;
			desc.count = count;
			desc.buf = buf;
			desc.error = 0;
			do_generic_file_read(filp, ppos, &desc, file_read_actor);

			retval = desc.written;
			if (!retval)
				retval = desc.error;
		}
	}
	return retval;
}
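
#if 0
/*
 * Illustrative sketch only (kept out of the build, not part of this file):
 * a filesystem that provides an i_op->readpage() routine can typically point
 * its file operations straight at the generic page-cache routines above.
 * "example_fops" and "example_file_write" are hypothetical names; the GNU
 * named-initializer syntax is used here to avoid spelling out every field.
 */
static struct file_operations example_fops = {
	read:	generic_file_read,	/* read() through the page cache */
	mmap:	generic_file_mmap,	/* defined later in this file */
	write:	example_file_write,	/* the filesystem supplies its own write */
};
#endif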
static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned long size)
{
	ssize_t written;
	unsigned long count = desc->count;
	struct file *file = (struct file *) desc->buf;
	struct inode *inode = file->f_dentry->d_inode;
	mm_segment_t old_fs;

	if (size > count)
		size = count;
	down(&inode->i_sem);
	old_fs = get_fs();
	set_fs(KERNEL_DS);
	written = file->f_op->write(file, area, size, &file->f_pos);
	set_fs(old_fs);
	up(&inode->i_sem);
	if (written < 0) {
		desc->error = written;
		written = 0;
	}
	desc->count = count - written;
	desc->written += written;
	return written;
}
asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
{
	ssize_t retval;
	struct file * in_file, * out_file;
	struct inode * in_inode, * out_inode;

	lock_kernel();

	/*
	 * Get input file, and verify that it is ok..
	 */
	retval = -EBADF;
	in_file = fget(in_fd);
	if (!in_file)
		goto out;
	if (!(in_file->f_mode & FMODE_READ))
		goto fput_in;
	retval = -EINVAL;
	in_inode = in_file->f_dentry->d_inode;
	if (!in_inode)
		goto fput_in;
	if (!in_inode->i_op || !in_inode->i_op->readpage)
		goto fput_in;
	retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
	if (retval)
		goto fput_in;

	/*
	 * Get output file, and verify that it is ok..
	 */
	retval = -EBADF;
	out_file = fget(out_fd);
	if (!out_file)
		goto fput_in;
	if (!(out_file->f_mode & FMODE_WRITE))
		goto fput_out;
	retval = -EINVAL;
	if (!out_file->f_op || !out_file->f_op->write)
		goto fput_out;
	out_inode = out_file->f_dentry->d_inode;
	if (!out_inode)
		goto fput_out;
	retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
	if (retval)
		goto fput_out;

	retval = 0;
	if (count) {
		read_descriptor_t desc;
		loff_t pos = 0, *ppos;

		retval = -EFAULT;
		ppos = &in_file->f_pos;
		if (offset) {
			if (get_user(pos, offset))
				goto fput_out;
			ppos = &pos;
		}

		desc.written = 0;
		desc.count = count;
		desc.buf = (char *) out_file;
		desc.error = 0;
		do_generic_file_read(in_file, ppos, &desc, file_send_actor);

		retval = desc.written;
		if (!retval)
			retval = desc.error;
		if (offset)
			put_user(pos, offset);
	}

fput_out:
	fput(out_file);
fput_in:
	fput(in_file);
out:
	unlock_kernel();
	return retval;
}
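
#if 0
/*
 * Illustrative user-space sketch (not kernel code, kept out of the build):
 * copying a whole file to a socket through sys_sendfile() above.  Assumes a
 * libc wrapper with the same calling convention as the system call, i.e.
 * sendfile(out_fd, in_fd, &offset, count), where *offset is advanced past
 * the bytes that were read from in_fd.
 */
#include <sys/sendfile.h>
#include <sys/stat.h>

static int send_whole_file(int sock_fd, int file_fd)
{
	struct stat st;
	off_t offset = 0;

	if (fstat(file_fd, &st) < 0)
		return -1;
	while (offset < st.st_size) {
		ssize_t sent = sendfile(sock_fd, file_fd, &offset,
					st.st_size - offset);
		if (sent <= 0)
			return -1;	/* error, or the file shrank */
	}
	return 0;
}
#endif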
/*
 * Semantics for shared and private memory areas are different past the end
 * of the file. A shared mapping past the last page of the file is an error
 * and results in a SIGBUS, while a private mapping just maps in a zero page.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 *
 * WSH 06/04/97: fixed a memory leak and moved the allocation of new_page
 * ahead of the wait if we're sure to need it.
 */
static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
{
	struct file * file = area->vm_file;
	struct dentry * dentry = file->f_dentry;
	struct inode * inode = dentry->d_inode;
	unsigned long offset, reada, i;
	struct page * page, **hash;
	unsigned long old_page, new_page;

	new_page = 0;
	offset = (address - area->vm_start + area->vm_offset) & PAGE_MASK;
	if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
		goto no_page;

	/*
	 * Do we have something in the page cache already?
	 */
	hash = page_hash(inode, offset);
	page = __find_page(inode, offset, *hash);
	if (!page)
		goto no_cached_page;

found_page:
	/*
	 * Ok, found a page in the page cache, now we need to check
	 * that it's up-to-date.  First check whether we'll need an
	 * extra page -- better to overlap the allocation with the I/O.
	 */
	if (no_share && !new_page) {
		new_page = page_cache_alloc();
		if (!new_page)
			goto failure;
	}

	if (PageLocked(page))
		goto page_locked_wait;
	if (!PageUptodate(page))
		goto page_read_error;

success:
	/*
	 * Found the page, need to check sharing and possibly
	 * copy it over to another page..
	 */
	old_page = page_address(page);
	if (!no_share) {
		/*
		 * Ok, we can share the cached page directly.. Get rid
		 * of any potential extra pages.
		 */
		if (new_page)
			page_cache_free(new_page);

		flush_page_to_ram(old_page);
		return old_page;
	}

	/*
	 * No sharing ... copy to the new page.
	 */
	copy_page(new_page, old_page);
	flush_page_to_ram(new_page);
	page_cache_release(page);
	return new_page;

no_cached_page:
	/*
	 * Try to read in an entire cluster at once.
	 */
	reada   = offset;
	reada >>= PAGE_CACHE_SHIFT + page_cluster;
	reada <<= PAGE_CACHE_SHIFT + page_cluster;

	for (i = 1 << page_cluster; i > 0; --i, reada += PAGE_CACHE_SIZE)
		new_page = try_to_read_ahead(file, reada, new_page);

	if (!new_page)
		new_page = page_cache_alloc();
	if (!new_page)
		goto no_page;

	/*
	 * During getting the above page we might have slept,
	 * so we need to re-check the situation with the page
	 * cache.. The page we just got may be useful if we
	 * can't share, so don't get rid of it here.
	 */
	page = find_page(inode, offset);
	if (page)
		goto found_page;

	/*
	 * Now, create a new page-cache page from the page we got
	 */
	page = page_cache_entry(new_page);
	new_page = 0;
	add_to_page_cache(page, inode, offset, hash);

	if (inode->i_op->readpage(file, page) != 0)
		goto failure;

	goto found_page;

page_locked_wait:
	__wait_on_page(page);
	if (PageUptodate(page))
		goto success;

page_read_error:
	/*
	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	 */
	if (inode->i_op->readpage(file, page) != 0)
		goto failure;
	wait_on_page(page);
	if (PageError(page))
		goto failure;
	if (PageUptodate(page))
		goto success;

	/*
	 * Things didn't work out. Return zero to tell the
	 * mm layer so, possibly freeing the page cache page first.
	 */
failure:
	page_cache_release(page);
	if (new_page)
		page_cache_free(new_page);
no_page:
	return 0;
}
/*
 * Tries to write a shared mapped page to its backing store. May return -EIO
 * if the disk is full.
 */
static inline int do_write_page(struct inode * inode, struct file * file,
	const char * page, unsigned long offset)
{
	int retval;
	unsigned long size;
	loff_t loff = offset;
	mm_segment_t old_fs;

	size = offset + PAGE_SIZE;
	/* refuse to extend file size.. */
	if (S_ISREG(inode->i_mode)) {
		if (size > inode->i_size)
			size = inode->i_size;
		/* Ho humm.. We should have tested for this earlier */
		if (size < offset)
			return -EIO;
	}
	size -= offset;
	old_fs = get_fs();
	set_fs(KERNEL_DS);
	retval = -EIO;
	if (size == file->f_op->write(file, (const char *) page, size, &loff))
		retval = 0;
	set_fs(old_fs);
	return retval;
}
static int filemap_write_page(struct vm_area_struct * vma,
	unsigned long offset,
	unsigned long page,
	int wait)
{
	int result;
	struct file * file;
	struct dentry * dentry;
	struct inode * inode;

	file = vma->vm_file;
	dentry = file->f_dentry;
	inode = dentry->d_inode;
	if (!file->f_op->write)
		return -EIO;

	/*
	 * If a task terminates while we're swapping the page, the vma and
	 * file could be released ... increment the count to be safe.
	 */
	file->f_count++;

	/*
	 * If this is a swapping operation rather than msync(), then
	 * leave the actual IO, and the restoration of the file count,
	 * to the kpiod thread.  Just queue the request for now.
	 */
	if (!wait) {
		make_pio_request(file, offset, page);
		return 0;
	}

	down(&inode->i_sem);
	result = do_write_page(inode, file, (const char *) page, offset);
	up(&inode->i_sem);
	fput(file);
	return result;
}
/*
 * The page cache takes care of races between somebody
 * trying to swap something out and swap something in
 * at the same time..
 */
int filemap_swapout(struct vm_area_struct * vma, struct page * page)
{
	return filemap_write_page(vma, page->offset, page_address(page), 0);
}
static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
	unsigned long address, unsigned int flags)
{
	pte_t pte = *ptep;
	unsigned long page;
	int error;

	if (!(flags & MS_INVALIDATE)) {
		if (!pte_present(pte))
			return 0;
		if (!pte_dirty(pte))
			return 0;
		flush_page_to_ram(pte_page(pte));
		flush_cache_page(vma, address);
		set_pte(ptep, pte_mkclean(pte));
		flush_tlb_page(vma, address);
		page = pte_page(pte);
		atomic_inc(&page_cache_entry(page)->count);
	} else {
		if (pte_none(pte))
			return 0;
		flush_cache_page(vma, address);
		pte_clear(ptep);
		flush_tlb_page(vma, address);
		if (!pte_present(pte)) {
			swap_free(pte_val(pte));
			return 0;
		}
		page = pte_page(pte);
		if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
			page_cache_free(page);
			return 0;
		}
	}
	error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page, 1);
	page_cache_free(page);
	return error;
}
static inline int filemap_sync_pte_range(pmd_t * pmd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
{
	pte_t * pte;
	unsigned long end;
	int error;

	if (pmd_none(*pmd))
		return 0;
	if (pmd_bad(*pmd)) {
		printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
		pmd_clear(pmd);
		return 0;
	}
	pte = pte_offset(pmd, address);
	offset += address & PMD_MASK;
	address &= ~PMD_MASK;
	end = address + size;
	if (end > PMD_SIZE)
		end = PMD_SIZE;
	error = 0;
	do {
		error |= filemap_sync_pte(pte, vma, address + offset, flags);
		address += PAGE_SIZE;
		pte++;
	} while (address < end);
	return error;
}
static inline int filemap_sync_pmd_range(pgd_t * pgd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned int flags)
{
	pmd_t * pmd;
	unsigned long offset, end;
	int error;

	if (pgd_none(*pgd))
		return 0;
	if (pgd_bad(*pgd)) {
		printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
		pgd_clear(pgd);
		return 0;
	}
	pmd = pmd_offset(pgd, address);
	offset = address & PGDIR_MASK;
	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		end = PGDIR_SIZE;
	error = 0;
	do {
		error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return error;
}
static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
	size_t size, unsigned int flags)
{
	pgd_t * dir;
	unsigned long end = address + size;
	int error = 0;

	dir = pgd_offset(vma->vm_mm, address);
	flush_cache_range(vma->vm_mm, end - size, end);
	while (address < end) {
		error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		dir++;
	}
	flush_tlb_range(vma->vm_mm, end - size, end);
	return error;
}
/*
 * This handles (potentially partial) area unmaps..
 */
static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
{
	filemap_sync(vma, start, len, MS_ASYNC);
}

/*
 * Shared mappings need to be able to do the right thing at
 * close/unmap/sync. They will also use the private file as
 * backing-store for swapping..
 */
static struct vm_operations_struct file_shared_mmap = {
	NULL,			/* no special open */
	NULL,			/* no special close */
	filemap_unmap,		/* unmap - we need to sync the pages */
	NULL,			/* no special protect */
	filemap_sync,		/* sync */
	NULL,			/* advise */
	filemap_nopage,		/* nopage */
	NULL,			/* wppage */
	filemap_swapout,	/* swapout */
	NULL,			/* swapin */
};
/*
 * Private mappings just need to be able to load in the map.
 *
 * (This is actually used for shared mappings as well, if we
 * know they can't ever get write permissions..)
 */
static struct vm_operations_struct file_private_mmap = {
	NULL,			/* open */
	NULL,			/* close */
	NULL,			/* unmap */
	NULL,			/* protect */
	NULL,			/* sync */
	NULL,			/* advise */
	filemap_nopage,		/* nopage */
	NULL,			/* wppage */
	NULL,			/* swapout */
	NULL,			/* swapin */
};
/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	struct vm_operations_struct * ops;
	struct inode *inode = file->f_dentry->d_inode;

	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
		ops = &file_shared_mmap;
		/* share_page() can only guarantee proper page sharing if
		 * the offsets are all page aligned. */
		if (vma->vm_offset & (PAGE_SIZE - 1))
			return -EINVAL;
	} else {
		ops = &file_private_mmap;
		if (inode->i_op && inode->i_op->bmap &&
		    (vma->vm_offset & (inode->i_sb->s_blocksize - 1)))
			return -EINVAL;
	}
	if (!inode->i_sb || !S_ISREG(inode->i_mode))
		return -EACCES;
	if (!inode->i_op || !inode->i_op->readpage)
		return -ENOEXEC;
	UPDATE_ATIME(inode);
	vma->vm_ops = ops;
	return 0;
}
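
#if 0
/*
 * Illustrative user-space sketch (kept out of the build) of the semantics
 * described above filemap_nopage(): with this kernel, touching a MAP_SHARED
 * mapping beyond the last page of the file raises SIGBUS, while the same
 * access through a MAP_PRIVATE mapping just reads back a zero page.  Sizes
 * are assumed page aligned with map_size > file_size; error checking omitted.
 */
#include <sys/mman.h>

static void demo_nopage_semantics(int fd, size_t file_size, size_t map_size)
{
	char *shared  = mmap(0, map_size, PROT_READ, MAP_SHARED, fd, 0);
	char *private = mmap(0, map_size, PROT_READ, MAP_PRIVATE, fd, 0);

	(void) shared[0];		/* inside the file: both work */
	(void) private[0];

	(void) private[file_size];	/* past EOF: reads 0 from a zero page */
	/* (void) shared[file_size]; */	/* past EOF: would raise SIGBUS */
}
#endif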
/*
 * The msync() system call.
 */

static int msync_interval(struct vm_area_struct * vma,
	unsigned long start, unsigned long end, int flags)
{
	if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
		int error;
		error = vma->vm_ops->sync(vma, start, end-start, flags);
		if (!error && (flags & MS_SYNC)) {
			struct file * file = vma->vm_file;
			if (file) {
				struct dentry * dentry = file->f_dentry;
				struct inode * inode = dentry->d_inode;
				down(&inode->i_sem);
				error = file_fsync(file, dentry);
				up(&inode->i_sem);
			}
		}
		return error;
	}
	return 0;
}
asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
{
	unsigned long end;
	struct vm_area_struct * vma;
	int unmapped_error, error = -EINVAL;

	down(&current->mm->mmap_sem);
	lock_kernel();
	if (start & ~PAGE_MASK)
		goto out;
	len = (len + ~PAGE_MASK) & PAGE_MASK;
	end = start + len;
	if (end < start)
		goto out;
	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
		goto out;
	error = 0;
	if (end == start)
		goto out;
	/*
	 * If the interval [start,end) covers some unmapped address ranges,
	 * just ignore them, but return -EFAULT at the end.
	 */
	vma = find_vma(current->mm, start);
	unmapped_error = 0;
	for (;;) {
		/* Still start < end. */
		error = -EFAULT;
		if (!vma)
			goto out;
		/* Here start < vma->vm_end. */
		if (start < vma->vm_start) {
			unmapped_error = -EFAULT;
			start = vma->vm_start;
		}
		/* Here vma->vm_start <= start < vma->vm_end. */
		if (end <= vma->vm_end) {
			if (start < end) {
				error = msync_interval(vma, start, end, flags);
				if (error)
					goto out;
			}
			error = unmapped_error;
			goto out;
		}
		/* Here vma->vm_start <= start < vma->vm_end < end. */
		error = msync_interval(vma, start, vma->vm_end, flags);
		if (error)
			goto out;
		start = vma->vm_end;
		vma = vma->vm_next;
	}
out:
	unlock_kernel();
	up(&current->mm->mmap_sem);
	return error;
}
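
#if 0
/*
 * Illustrative user-space sketch (kept out of the build): pushing changes
 * made through a shared mapping back to the file.  MS_SYNC ends up in
 * file_fsync() via msync_interval() above, MS_ASYNC only schedules the
 * write-out, and MS_INVALIDATE is handled in filemap_sync_pte().  The start
 * address must be page aligned, exactly as sys_msync() checks.
 */
#include <sys/mman.h>
#include <string.h>

static int update_and_flush(char *map, size_t maplen, const char *text)
{
	size_t len = strlen(text) + 1;

	if (len > maplen)
		return -1;
	memcpy(map, text, len);
	return msync(map, maplen, MS_SYNC);
}
#endif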
/*
 * Write to a file through the page cache. This is mainly for the
 * benefit of NFS and possibly other network-based file systems.
 *
 * We currently put everything into the page cache prior to writing it.
 * This is not a problem when writing full pages. With partial pages,
 * however, we first have to read the data into the cache, then
 * dirty the page, and finally schedule it for writing. Alternatively, we
 * could write-through just the portion of data that would go into that
 * page, but that would kill performance for applications that write data
 * line by line, and it's prone to race conditions.
 *
 * Note that this routine doesn't try to keep track of dirty pages. Each
 * file system has to do this all by itself, unfortunately.
 *							okir@monad.swb.de
 */
ssize_t
generic_file_write(struct file *file, const char *buf,
		   size_t count, loff_t *ppos)
{
	struct dentry *dentry = file->f_dentry;
	struct inode *inode = dentry->d_inode;
	unsigned long pos = *ppos;
	unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
	struct page *page, **hash;
	unsigned long page_cache = 0;
	unsigned long written;
	long status, sync;

	if (!inode->i_op || !inode->i_op->updatepage)
		return -EIO;

	if (file->f_error) {
		int error = file->f_error;
		file->f_error = 0;
		return error;
	}

	sync    = file->f_flags & O_SYNC;
	written = 0;

	if (file->f_flags & O_APPEND)
		pos = inode->i_size;

	/*
	 * Check whether we've reached the file size limit.
	 */
	status = -EFBIG;
	if (pos >= limit) {
		send_sig(SIGXFSZ, current, 0);
		goto out;
	}

	status = 0;
	/*
	 * Check whether to truncate the write,
	 * and send the signal if we do.
	 */
	if (count > limit - pos) {
		send_sig(SIGXFSZ, current, 0);
		count = limit - pos;
	}

	while (count) {
		unsigned long bytes, pgpos, offset;
		/*
		 * Try to find the page in the cache. If it isn't there,
		 * allocate a free page.
		 */
		offset = (pos & ~PAGE_CACHE_MASK);
		pgpos = pos & PAGE_CACHE_MASK;
		bytes = PAGE_CACHE_SIZE - offset;
		if (bytes > count)
			bytes = count;

		hash = page_hash(inode, pgpos);
		page = __find_page(inode, pgpos, *hash);
		if (!page) {
			if (!page_cache) {
				page_cache = page_cache_alloc();
				if (page_cache)
					continue;
				status = -ENOMEM;
				break;
			}
			page = page_cache_entry(page_cache);
			add_to_page_cache(page, inode, pgpos, hash);
			page_cache = 0;
		}

		/* Get exclusive IO access to the page.. */
		wait_on_page(page);
		set_bit(PG_locked, &page->flags);

		/*
		 * Do the real work.. If the writer ends up delaying the write,
		 * the writer needs to increment the page use counts until he
		 * is done with the page.
		 */
		bytes -= copy_from_user((u8*)page_address(page) + offset, buf, bytes);
		status = -EFAULT;
		if (bytes)
			status = inode->i_op->updatepage(file, page, offset, bytes, sync);

		/* Mark it unlocked again and drop the page.. */
		clear_bit(PG_locked, &page->flags);
		wake_up(&page->wait);
		page_cache_release(page);

		if (status < 0)
			break;

		written += status;
		count -= status;
		pos += status;
		buf += status;
	}
	*ppos = pos;
	if (pos > inode->i_size)
		inode->i_size = pos;

	if (page_cache)
		page_cache_free(page_cache);
out:
	return written ? written : status;
}
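
/*
 * Worked example for the loop above (illustrative, assuming PAGE_CACHE_SIZE
 * == 4096): a write of 10000 bytes at pos 5000 is split into three passes:
 * offset 904 / bytes 3192 into the page at 4096, then offset 0 / bytes 4096
 * into the page at 8192, then offset 0 / bytes 2712 into the page at 12288,
 * for 10000 bytes in total.
 */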
/*
 * Support routines for directory caching using the page cache.
 */

/*
 * Finds the page at the specified offset, installing a new page
 * if requested.  The count is incremented and the page is locked.
 *
 * Note: we don't have to worry about races here, as the caller
 * is holding the inode semaphore.
 */
unsigned long get_cached_page(struct inode * inode, unsigned long offset,
				int new)
{
	struct page * page;
	struct page ** hash;
	unsigned long page_cache = 0;

	hash = page_hash(inode, offset);
	page = __find_page(inode, offset, *hash);
	if (!page) {
		if (!new)
			goto out;
		page_cache = page_cache_alloc();
		if (!page_cache)
			goto out;
		clear_page(page_cache);
		page = page_cache_entry(page_cache);
		add_to_page_cache(page, inode, offset, hash);
	}
	if (atomic_read(&page->count) != 2)
		printk(KERN_ERR "get_cached_page: page count=%d\n",
			atomic_read(&page->count));
	if (test_bit(PG_locked, &page->flags))
		printk(KERN_ERR "get_cached_page: page already locked!\n");
	set_bit(PG_locked, &page->flags);
	page_cache = page_address(page);

out:
	return page_cache;
}
/*
 * Unlock and free a page.
 */
void put_cached_page(unsigned long addr)
{
	struct page * page = page_cache_entry(addr);

	if (!test_bit(PG_locked, &page->flags))
		printk("put_cached_page: page not locked!\n");
	if (atomic_read(&page->count) != 2)
		printk("put_cached_page: page count=%d\n",
			atomic_read(&page->count));
	clear_bit(PG_locked, &page->flags);
	wake_up(&page->wait);
	page_cache_release(page);
}
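
#if 0
/*
 * Illustrative sketch only (kept out of the build, hypothetical helper name):
 * the intended pairing of get_cached_page()/put_cached_page() for a
 * filesystem that keeps directory blocks in the page cache.  The caller must
 * hold inode->i_sem, as noted above get_cached_page().
 */
static int example_touch_dir_block(struct inode * dir, unsigned long offset)
{
	unsigned long block = get_cached_page(dir, offset & PAGE_CACHE_MASK, 1);

	if (!block)
		return -ENOMEM;
	/* ... read or update directory entries stored at "block" ... */
	put_cached_page(block);
	return 0;
}
#endif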
/* Add request for page IO to the queue */

static inline void put_pio_request(struct pio_request *p)
{
	*pio_last = p;
	p->next = NULL;
	pio_last = &p->next;
}

/* Take the first page IO request off the queue */

static inline struct pio_request * get_pio_request(void)
{
	struct pio_request * p = pio_first;
	pio_first = p->next;
	if (!pio_first)
		pio_last = &pio_first;
	return p;
}
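
/*
 * Illustration of the queue bookkeeping above: pio_last always points at the
 * location holding the final "next" link.  Empty queue: pio_first == NULL,
 * pio_last == &pio_first.  After queuing A: pio_first == A, pio_last ==
 * &A->next.  After queuing B: A->next == B, pio_last == &B->next.  This lets
 * put_pio_request() append in O(1) without walking the list.
 */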
/* Make a new page IO request and queue it to the kpiod thread */

static inline void make_pio_request(struct file *file,
				    unsigned long offset,
				    unsigned long page)
{
	struct pio_request *p;

	atomic_inc(&page_cache_entry(page)->count);

	/*
	 * We need to allocate without causing any recursive IO in the
	 * current thread's context.  We might currently be swapping out
	 * as a result of an allocation made while holding a critical
	 * filesystem lock.  To avoid deadlock, we *MUST* not reenter
	 * the filesystem in this thread.
	 *
	 * We can wait for kswapd to free memory, or we can try to free
	 * pages without actually performing further IO, without fear of
	 * deadlock.  --sct
	 */

	while ((p = kmem_cache_alloc(pio_request_cache, GFP_BUFFER)) == NULL) {
		if (try_to_free_pages(__GFP_WAIT))
			continue;
		current->state = TASK_INTERRUPTIBLE;
		schedule_timeout(HZ/10);
	}

	p->file   = file;
	p->offset = offset;
	p->page   = page;

	put_pio_request(p);
	wake_up(&pio_wait);
}
/*
 * This is the only thread which is allowed to write out filemap pages
 * while swapping.
 *
 * To avoid deadlock, it is important that we never reenter this thread.
 * Although recursive memory allocations within this thread may result
 * in more page swapping, that swapping will always be done by queuing
 * another IO request to the same thread: we will never actually start
 * that IO request until we have finished with the current one, and so
 * we will not deadlock.
 */

int kpiod(void * unused)
{
	struct task_struct *tsk = current;
	struct wait_queue wait = { tsk, };
	struct inode * inode;
	struct dentry * dentry;
	struct pio_request * p;

	tsk->session = 1;
	tsk->pgrp = 1;
	strcpy(tsk->comm, "kpiod");
	sigfillset(&tsk->blocked);
	init_waitqueue(&pio_wait);
	/*
	 * Mark this task as a memory allocator - we don't want to get caught
	 * up in the regular mm freeing frenzy if we have to allocate memory
	 * in order to write stuff out.
	 */
	tsk->flags |= PF_MEMALLOC;

	lock_kernel();

	pio_request_cache = kmem_cache_create("pio_request",
					      sizeof(struct pio_request),
					      0, SLAB_HWCACHE_ALIGN,
					      NULL, NULL);
	if (!pio_request_cache)
		panic ("Could not create pio_request slab cache");

	while (1) {
		tsk->state = TASK_INTERRUPTIBLE;
		add_wait_queue(&pio_wait, &wait);
		if (!pio_first)
			schedule();
		remove_wait_queue(&pio_wait, &wait);
		tsk->state = TASK_RUNNING;

		while (pio_first) {
			p = get_pio_request();
			dentry = p->file->f_dentry;
			inode = dentry->d_inode;

			down(&inode->i_sem);
			do_write_page(inode, p->file,
				      (const char *) p->page, p->offset);
			up(&inode->i_sem);
			fput(p->file);
			page_cache_free(p->page);
			kmem_cache_free(pio_request_cache, p);