Linux 2.1.89-4: mm/filemap.c
1 /*
2 * linux/mm/filemap.c
4 * Copyright (C) 1994, 1995 Linus Torvalds
5 */
7 /*
8 * This file handles the generic file mmap semantics used by
9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem does this differently, for example)
11 */
12 #include <linux/stat.h>
13 #include <linux/sched.h>
14 #include <linux/kernel.h>
15 #include <linux/mm.h>
16 #include <linux/shm.h>
17 #include <linux/errno.h>
18 #include <linux/mman.h>
19 #include <linux/string.h>
20 #include <linux/malloc.h>
21 #include <linux/fs.h>
22 #include <linux/locks.h>
23 #include <linux/pagemap.h>
24 #include <linux/swap.h>
25 #include <linux/smp.h>
26 #include <linux/smp_lock.h>
27 #include <linux/blkdev.h>
28 #include <linux/file.h>
30 #include <asm/system.h>
31 #include <asm/pgtable.h>
32 #include <asm/uaccess.h>
35 * Shared mappings implemented 30.11.1994. It's not fully working yet,
36 * though.
38 * Shared mappings now work. 15.8.1995 Bruno.
41 unsigned long page_cache_size = 0;
42 struct page * page_hash_table[PAGE_HASH_SIZE];
45 * Simple routines for both non-shared and shared mappings.
48 #define release_page(page) __free_page((page))
51 * Invalidate the pages of an inode, removing all pages that aren't
52 * locked down (those are sure to be up-to-date anyway, so we shouldn't
53 * invalidate them).
55 void invalidate_inode_pages(struct inode * inode)
57 struct page ** p;
58 struct page * page;
60 p = &inode->i_pages;
61 while ((page = *p) != NULL) {
62 if (PageLocked(page)) {
63 p = &page->next;
64 continue;
66 inode->i_nrpages--;
67 if ((*p = page->next) != NULL)
68 (*p)->prev = page->prev;
69 page->next = NULL;
70 page->prev = NULL;
71 remove_page_from_hash_queue(page);
72 page->inode = NULL;
73 __free_page(page);
74 continue;
79 * Truncate the page cache at a set offset, removing the pages
80 * that are beyond that offset (and zeroing out partial pages).
82 void truncate_inode_pages(struct inode * inode, unsigned long start)
84 struct page ** p;
85 struct page * page;
87 repeat:
88 p = &inode->i_pages;
89 while ((page = *p) != NULL) {
90 unsigned long offset = page->offset;
92 /* page wholly truncated - free it */
93 if (offset >= start) {
94 if (PageLocked(page)) {
95 wait_on_page(page);
96 goto repeat;
98 inode->i_nrpages--;
99 if ((*p = page->next) != NULL)
100 (*p)->prev = page->prev;
101 page->next = NULL;
102 page->prev = NULL;
103 remove_page_from_hash_queue(page);
104 page->inode = NULL;
105 __free_page(page);
106 continue;
108 p = &page->next;
109 offset = start - offset;
110 /* partial truncate, clear end of page */
111 if (offset < PAGE_SIZE) {
112 unsigned long address = page_address(page);
113 memset((void *) (offset + address), 0, PAGE_SIZE - offset);
114 flush_page_to_ram(address);
119 int shrink_mmap(int priority, int gfp_mask)
121 static unsigned long clock = 0;
122 struct page * page;
123 unsigned long limit = num_physpages;
124 struct buffer_head *tmp, *bh;
125 int count_max, count_min;
127 count_max = (limit<<1) >> (priority>>1);
128 count_min = (limit<<1) >> (priority);
130 page = mem_map + clock;
131 do {
132 count_max--;
133 if (page->inode || page->buffers)
134 count_min--;
136 if (PageLocked(page))
137 goto next;
138 if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
139 goto next;
140 /* First of all, regenerate the page's referenced bit
141 from any buffers in the page */
142 bh = page->buffers;
143 if (bh) {
144 tmp = bh;
145 do {
146 if (buffer_touched(tmp)) {
147 clear_bit(BH_Touched, &tmp->b_state);
148 set_bit(PG_referenced, &page->flags);
150 tmp = tmp->b_this_page;
151 } while (tmp != bh);
154 /* We can't throw away shared pages, but we do mark
155 them as referenced. This relies on the fact that
156 no page is currently in both the page cache and the
157 buffer cache; we'd have to modify the following
158 test to allow for that case. */
160 switch (atomic_read(&page->count)) {
161 case 1:
162 /* If it has been referenced recently, don't free it */
163 if (test_and_clear_bit(PG_referenced, &page->flags))
164 break;
166 /* is it a swap-cache or page-cache page? */
167 if (page->inode) {
168 if (PageSwapCache(page)) {
169 delete_from_swap_cache(page);
170 return 1;
172 remove_page_from_hash_queue(page);
173 remove_page_from_inode_queue(page);
174 __free_page(page);
175 return 1;
178 /* is it a buffer cache page? */
179 if ((gfp_mask & __GFP_IO) && bh && try_to_free_buffer(bh, &bh, 6))
180 return 1;
181 break;
183 default:
184 /* more than one user: we can't throw it away */
185 set_bit(PG_referenced, &page->flags);
186 /* fall through */
187 case 0:
188 /* nothing */
190 next:
191 page++;
192 clock++;
193 if (clock >= limit) {
194 clock = 0;
195 page = mem_map;
197 } while (count_max > 0 && count_min > 0);
198 return 0;
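/*
 * Editor's note, not in the original: a worked example of the scan budget
 * above.  With num_physpages = 4096 and priority = 6, count_max =
 * (4096<<1)>>3 = 1024 and count_min = (4096<<1)>>6 = 128, so one call
 * examines at most 1024 mem_map entries and gives up after seeing 128
 * pages that are in the page or buffer cache, unless it manages to free
 * a page first (in which case it returns 1 immediately).
 */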
202 * This is called from try_to_swap_out() when we try to get rid of some
203 * pages.. If we're unmapping the last occurrence of this page, we also
204 * free it from the page hash-queues etc, as we don't want to keep it
205 * in-core unnecessarily.
207 unsigned long page_unuse(unsigned long page)
209 struct page * p = mem_map + MAP_NR(page);
210 int count = atomic_read(&p->count);
212 if (count != 2)
213 return count;
214 if (!p->inode)
215 return count;
216 if (PageSwapCache(p))
217 panic ("Doing a normal page_unuse of a swap cache page");
218 remove_page_from_hash_queue(p);
219 remove_page_from_inode_queue(p);
220 free_page(page);
221 return 1;
225 * Update a page cache copy, when we're doing a "write()" system call
226 * See also "update_vm_cache()".
228 void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count)
230 unsigned long offset, len;
232 offset = (pos & ~PAGE_MASK);
233 pos = pos & PAGE_MASK;
234 len = PAGE_SIZE - offset;
235 do {
236 struct page * page;
238 if (len > count)
239 len = count;
240 page = find_page(inode, pos);
241 if (page) {
242 wait_on_page(page);
243 memcpy((void *) (offset + page_address(page)), buf, len);
244 release_page(page);
246 count -= len;
247 buf += len;
248 len = PAGE_SIZE;
249 offset = 0;
250 pos += PAGE_SIZE;
251 } while (count);
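/*
 * Editor's illustration (not in the original): how the loop above splits a
 * write that straddles a page boundary, assuming PAGE_SIZE = 4096.  For
 * pos = 5000 and count = 5000, the first pass copies len = 4096 - 904 =
 * 3192 bytes at offset 904 into the cached page at file offset 4096; the
 * second pass copies the remaining 1808 bytes at offset 0 into the page
 * at file offset 8192, and the loop stops because count reaches zero.
 */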
254 static inline void add_to_page_cache(struct page * page,
255 struct inode * inode, unsigned long offset,
256 struct page **hash)
258 atomic_inc(&page->count);
259 page->flags &= ~((1 << PG_uptodate) | (1 << PG_error));
260 page->offset = offset;
261 add_page_to_inode_queue(inode, page);
262 __add_page_to_hash_queue(page, hash);
266 * Try to read ahead in the file. "page_cache" is a potentially free page
267 * that we could use for the cache (if it is 0 we can try to create one,
268 * this is all overlapped with the IO on the previous page finishing anyway)
270 static unsigned long try_to_read_ahead(struct file * file,
271 unsigned long offset, unsigned long page_cache)
273 struct inode *inode = file->f_dentry->d_inode;
274 struct page * page;
275 struct page ** hash;
277 offset &= PAGE_MASK;
278 switch (page_cache) {
279 case 0:
280 page_cache = __get_free_page(GFP_KERNEL);
281 if (!page_cache)
282 break;
283 default:
284 if (offset >= inode->i_size)
285 break;
286 hash = page_hash(inode, offset);
287 page = __find_page(inode, offset, *hash);
288 if (!page) {
290 * Ok, add the new page to the hash-queues...
292 page = mem_map + MAP_NR(page_cache);
293 add_to_page_cache(page, inode, offset, hash);
294 inode->i_op->readpage(file, page);
295 page_cache = 0;
297 release_page(page);
299 return page_cache;
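/*
 * Editor's note on the calling convention above (not in the original):
 * "page_cache" is either 0 or the address of a spare free page supplied by
 * the caller.  If the target page is missing from the cache, the spare is
 * turned into the new page-cache page and readpage() is started on it, so
 * 0 is returned to tell the caller that its spare was consumed; otherwise
 * the untouched spare is handed back for reuse on the next call, which is
 * how generic_file_readahead() recycles one spare page across a whole burst.
 */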
303 * Wait for IO to complete on a locked page.
305 * This must be called with the caller "holding" the page,
306 * ie with increased "page->count" so that the page won't
307 * go away during the wait..
309 void __wait_on_page(struct page *page)
311 struct task_struct *tsk = current;
312 struct wait_queue wait;
314 wait.task = tsk;
315 add_wait_queue(&page->wait, &wait);
316 repeat:
317 tsk->state = TASK_UNINTERRUPTIBLE;
318 run_task_queue(&tq_disk);
319 if (PageLocked(page)) {
320 schedule();
321 goto repeat;
323 tsk->state = TASK_RUNNING;
324 remove_wait_queue(&page->wait, &wait);
327 #if 0
328 #define PROFILE_READAHEAD
329 #define DEBUG_READAHEAD
330 #endif
333 * Read-ahead profiling information
334 * --------------------------------
335 * Every PROFILE_MAXREADCOUNT reads, the following information is written
336 * to the syslog:
337 * the percentage of asynchronous read-ahead, and
338 * the average values of the read-ahead context fields.
339 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
340 * to the syslog.
343 #ifdef PROFILE_READAHEAD
345 #define PROFILE_MAXREADCOUNT 1000
347 static unsigned long total_reada;
348 static unsigned long total_async;
349 static unsigned long total_ramax;
350 static unsigned long total_ralen;
351 static unsigned long total_rawin;
353 static void profile_readahead(int async, struct file *filp)
355 unsigned long flags;
357 ++total_reada;
358 if (async)
359 ++total_async;
361 total_ramax += filp->f_ramax;
362 total_ralen += filp->f_ralen;
363 total_rawin += filp->f_rawin;
365 if (total_reada > PROFILE_MAXREADCOUNT) {
366 save_flags(flags);
367 cli();
368 if (!(total_reada > PROFILE_MAXREADCOUNT)) {
369 restore_flags(flags);
370 return;
373 printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
374 total_ramax/total_reada,
375 total_ralen/total_reada,
376 total_rawin/total_reada,
377 (total_async*100)/total_reada);
378 #ifdef DEBUG_READAHEAD
379 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
380 filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
381 #endif
383 total_reada = 0;
384 total_async = 0;
385 total_ramax = 0;
386 total_ralen = 0;
387 total_rawin = 0;
389 restore_flags(flags);
392 #endif /* defined PROFILE_READAHEAD */
395 * Read-ahead context:
396 * -------------------
397 * The read ahead context fields of the "struct file" are the following:
398 * - f_raend : position of the first byte after the last page we tried to
399 * read ahead.
400 * - f_ramax : current read-ahead maximum size.
401 * - f_ralen : length of the current IO read block we tried to read-ahead.
402 * - f_rawin : length of the current read-ahead window.
403 * if last read-ahead was synchronous then
404 * f_rawin = f_ralen
405 * otherwise (was asynchronous)
406 * f_rawin = previous value of f_ralen + f_ralen
408 * Read-ahead limits:
409 * ------------------
410 * MIN_READAHEAD : minimum read-ahead size when read-ahead is active.
411 * MAX_READAHEAD : maximum read-ahead size when read-ahead is active.
413 * Synchronous read-ahead benefits:
414 * --------------------------------
415 * Using a reasonable IO transfer length when talking to peripheral devices
416 * increases system performance.
417 * Reasonable means, in this context, not too large but not too small.
418 * The actual maximum value is:
419 * MAX_READAHEAD + PAGE_SIZE = 76k if CONFIG_READA_SMALL is undefined
420 * and 32K if defined (4K page size assumed).
422 * Asynchronous read-ahead benefits:
423 * ---------------------------------
424 * Overlapping the next read request with user process execution increases
425 * system performance.
427 * Read-ahead risks:
428 * -----------------
429 * We have to guess which further data the user process will need.
430 * If these data are often not really needed, it's bad for system
431 * performance.
432 * However, we know that files are often accessed sequentially by
433 * application programs, and it seems possible to find a reasonably
434 * good strategy for that guessing.
435 * We only try to read ahead files that seem to be read sequentially.
437 * Asynchronous read-ahead risks:
438 * ------------------------------
439 * In order to maximize overlapping, we must start an asynchronous read
440 * request on the device as soon as possible.
441 * We must be very careful about:
442 * - The number of effective pending IO read requests.
443 * ONE seems to be the only reasonable value.
444 * - The total memory pool usage for the file access stream.
445 * This maximum memory usage is implicitly 2 IO read chunks:
446 * 2*(MAX_READAHEAD + PAGE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
447 * 64k if defined (4K page size assumed).
448 */
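/*
 * Editor's illustration of the window bookkeeping described above (not in
 * the original): if the last synchronous read-ahead covered 16k, then
 * f_ralen = f_rawin = 16k, with f_raend just past that IO.  If the next
 * read-ahead is asynchronous and covers 32k, then f_ralen becomes 32k and
 * f_rawin becomes 16k + 32k = 48k, i.e. the window spans both bursts.
 */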
450 static inline int get_max_readahead(struct inode * inode)
452 if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
453 return MAX_READAHEAD;
454 return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
457 static inline unsigned long generic_file_readahead(int reada_ok,
458 struct file * filp, struct inode * inode,
459 unsigned long ppos, struct page * page, unsigned long page_cache)
461 unsigned long max_ahead, ahead;
462 unsigned long raend;
463 int max_readahead = get_max_readahead(inode);
465 raend = filp->f_raend & PAGE_MASK;
466 max_ahead = 0;
469 * The current page is locked.
470 * If the current position is inside the previous read IO request, do not
471 * try to reread previously read ahead pages.
472 * Otherwise, decide whether or not to read ahead some pages synchronously.
473 * If we are not going to read ahead, set the read ahead context for this
474 * page only.
476 if (PageLocked(page)) {
477 if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
478 raend = ppos;
479 if (raend < inode->i_size)
480 max_ahead = filp->f_ramax;
481 filp->f_rawin = 0;
482 filp->f_ralen = PAGE_SIZE;
483 if (!max_ahead) {
484 filp->f_raend = ppos + filp->f_ralen;
485 filp->f_rawin += filp->f_ralen;
490 * The current page is not locked.
491 * If we were reading ahead and,
492 * if the current max read ahead size is not zero and,
493 * if the current position is inside the last read-ahead IO request,
494 * it is the moment to try to read ahead asynchronously.
495 * We will later force an unplug of the device in order to start the asynchronous read IO.
497 else if (reada_ok && filp->f_ramax && raend >= PAGE_SIZE &&
498 ppos <= raend && ppos + filp->f_ralen >= raend) {
500 * Add ONE page to max_ahead in order to try to have about the same IO max size
501 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_SIZE.
502 * Compute the position of the last page we have tried to read in order to
503 * begin to read ahead just at the next page.
505 raend -= PAGE_SIZE;
506 if (raend < inode->i_size)
507 max_ahead = filp->f_ramax + PAGE_SIZE;
509 if (max_ahead) {
510 filp->f_rawin = filp->f_ralen;
511 filp->f_ralen = 0;
512 reada_ok = 2;
516 * Try to read ahead pages.
517 * We hope that ll_rw_blk() plug/unplug, request coalescing and sorting, and
518 * the scheduler will do well enough for us to avoid really bad actual IO requests.
520 ahead = 0;
521 while (ahead < max_ahead) {
522 ahead += PAGE_SIZE;
523 page_cache = try_to_read_ahead(filp, raend + ahead,
524 page_cache);
527 * If we tried to read some pages ahead, then:
528 * If we tried to read ahead asynchronously,
529 * try to force an unplug of the device in order to start the asynchronous
530 * read IO request.
531 * Update the read-ahead context:
532 * store the length of the current read-ahead window,
533 * and double the current max read ahead size.
534 * That heuristic avoids doing large IO for files that are not really
535 * accessed sequentially.
537 if (ahead) {
538 if (reada_ok == 2) {
539 run_task_queue(&tq_disk);
542 filp->f_ralen += ahead;
543 filp->f_rawin += filp->f_ralen;
544 filp->f_raend = raend + ahead + PAGE_SIZE;
546 filp->f_ramax += filp->f_ramax;
548 if (filp->f_ramax > max_readahead)
549 filp->f_ramax = max_readahead;
551 #ifdef PROFILE_READAHEAD
552 profile_readahead((reada_ok == 2), filp);
553 #endif
556 return page_cache;
561 * This is a generic file read routine, and uses the
562 * inode->i_op->readpage() function for the actual low-level
563 * stuff.
565 * This is really ugly. But the goto's actually try to clarify some
566 * of the logic when it comes to error handling etc.
569 ssize_t generic_file_read(struct file * filp, char * buf,
570 size_t count, loff_t *ppos)
572 struct dentry *dentry = filp->f_dentry;
573 struct inode *inode = dentry->d_inode;
574 ssize_t error, read;
575 size_t pos, pgpos, page_cache;
576 int reada_ok;
577 int max_readahead = get_max_readahead(inode);
579 if (!access_ok(VERIFY_WRITE, buf, count))
580 return -EFAULT;
581 if (!count)
582 return 0;
583 error = 0;
584 read = 0;
585 page_cache = 0;
587 pos = *ppos;
588 pgpos = pos & PAGE_MASK;
590 * If the current position is outside the previous read-ahead window,
591 * we reset the current read-ahead context and set read ahead max to zero
592 * (it will be set to just the needed value later),
593 * otherwise, we assume that the file accesses are sequential enough to
594 * continue read-ahead.
596 if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) {
597 reada_ok = 0;
598 filp->f_raend = 0;
599 filp->f_ralen = 0;
600 filp->f_ramax = 0;
601 filp->f_rawin = 0;
602 } else {
603 reada_ok = 1;
606 * Adjust the current value of read-ahead max.
607 * If the read operation stays within the first half page, force no readahead.
608 * Otherwise try to increase read ahead max just enough to do the read request.
609 * Then clamp it to at least MIN_READAHEAD if read-ahead is ok,
610 * and to at most MAX_READAHEAD in all cases.
612 if (pos + count <= (PAGE_SIZE >> 1)) {
613 filp->f_ramax = 0;
614 } else {
615 unsigned long needed;
617 needed = ((pos + count) & PAGE_MASK) - pgpos;
619 if (filp->f_ramax < needed)
620 filp->f_ramax = needed;
622 if (reada_ok && filp->f_ramax < MIN_READAHEAD)
623 filp->f_ramax = MIN_READAHEAD;
624 if (filp->f_ramax > max_readahead)
625 filp->f_ramax = max_readahead;
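/*
 * Editor's note (worked example, not in the original): with PAGE_SIZE =
 * 4096, a read of count = 10000 bytes at pos = 0 does not fit in the
 * first half page, so needed = ((0 + 10000) & PAGE_MASK) - pgpos = 8192
 * and f_ramax is raised to at least 8192, then to at least MIN_READAHEAD
 * if we are in read-ahead mode, and finally capped at max_readahead.
 */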
628 for (;;) {
629 struct page *page, **hash;
631 if (pos >= inode->i_size)
632 break;
635 * Try to find the data in the page cache..
637 hash = page_hash(inode, pos & PAGE_MASK);
638 page = __find_page(inode, pos & PAGE_MASK, *hash);
639 if (!page)
640 goto no_cached_page;
642 found_page:
644 * Try to read ahead only if the current page is filled or being filled.
645 * Otherwise, if we were reading ahead, decrease max read ahead size to
646 * the minimum value.
647 * In this context, that seems to happen only on a read error or if
648 * the page has been rewritten.
650 if (PageUptodate(page) || PageLocked(page))
651 page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_MASK, page, page_cache);
652 else if (reada_ok && filp->f_ramax > MIN_READAHEAD)
653 filp->f_ramax = MIN_READAHEAD;
655 wait_on_page(page);
657 if (!PageUptodate(page))
658 goto page_read_error;
660 success:
662 * Ok, we have the page, it's up-to-date and ok,
663 * so now we can finally copy it to user space...
666 unsigned long offset, nr;
668 offset = pos & ~PAGE_MASK;
669 nr = PAGE_SIZE - offset;
670 if (nr > count)
671 nr = count;
672 if (nr > inode->i_size - pos)
673 nr = inode->i_size - pos;
674 nr -= copy_to_user(buf, (void *) (page_address(page) + offset), nr);
675 release_page(page);
676 error = -EFAULT;
677 if (!nr)
678 break;
679 buf += nr;
680 pos += nr;
681 read += nr;
682 count -= nr;
683 if (count)
684 continue;
685 break;
688 no_cached_page:
690 * Ok, it wasn't cached, so we need to create a new
691 * page..
693 if (!page_cache) {
694 page_cache = __get_free_page(GFP_KERNEL);
696 * That could have slept, so go around to the
697 * very beginning..
699 if (page_cache)
700 continue;
701 error = -ENOMEM;
702 break;
706 * Ok, add the new page to the hash-queues...
708 page = mem_map + MAP_NR(page_cache);
709 page_cache = 0;
710 add_to_page_cache(page, inode, pos & PAGE_MASK, hash);
713 * Error handling is tricky. If we get a read error,
714 * the cached page stays in the cache (but uptodate=0),
715 * and the next process that accesses it will try to
716 * re-read it. This is needed for NFS etc, where the
717 * identity of the reader can decide if we can read the
718 * page or not..
721 * We have to read the page.
722 * If we were reading ahead, we had previously tried to read this page.
723 * That means that the page has probably been removed from the cache before
724 * the application process needs it, or has been rewritten.
725 * Decrease max readahead size to the minimum value in that situation.
727 if (reada_ok && filp->f_ramax > MIN_READAHEAD)
728 filp->f_ramax = MIN_READAHEAD;
730 error = inode->i_op->readpage(filp, page);
731 if (!error)
732 goto found_page;
733 release_page(page);
734 break;
736 page_read_error:
738 * We found the page, but it wasn't up-to-date.
739 * Try to re-read it _once_. We do this synchronously,
740 * because this happens only if there were errors.
742 error = inode->i_op->readpage(filp, page);
743 if (!error) {
744 wait_on_page(page);
745 if (PageUptodate(page) && !PageError(page))
746 goto success;
747 error = -EIO; /* Some unspecified error occurred.. */
749 release_page(page);
750 break;
753 *ppos = pos;
754 filp->f_reada = 1;
755 if (page_cache)
756 free_page(page_cache);
757 UPDATE_ATIME(inode);
758 if (!read)
759 read = error;
760 return read;
764 * Semantics for shared and private memory areas are different past the end
765 * of the file. A shared mapping past the last page of the file is an error
766 * and results in a SIGBUS, while a private mapping just maps in a zero page.
768 * The goto's are kind of ugly, but this streamlines the normal case of having
769 * it in the page cache, and handles the special cases reasonably without
770 * having a lot of duplicated code.
772 * WSH 06/04/97: fixed a memory leak and moved the allocation of new_page
773 * ahead of the wait if we're sure to need it.
775 static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
777 struct file * file = area->vm_file;
778 struct dentry * dentry = file->f_dentry;
779 struct inode * inode = dentry->d_inode;
780 unsigned long offset;
781 struct page * page, **hash;
782 unsigned long old_page, new_page;
784 new_page = 0;
785 offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
786 if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
787 goto no_page;
790 * Do we have something in the page cache already?
792 hash = page_hash(inode, offset);
793 page = __find_page(inode, offset, *hash);
794 if (!page)
795 goto no_cached_page;
797 found_page:
799 * Ok, found a page in the page cache, now we need to check
800 * that it's up-to-date. First check whether we'll need an
801 * extra page -- better to overlap the allocation with the I/O.
803 if (no_share && !new_page) {
804 new_page = __get_free_page(GFP_KERNEL);
805 if (!new_page)
806 goto failure;
809 if (PageLocked(page))
810 goto page_locked_wait;
811 if (!PageUptodate(page))
812 goto page_read_error;
814 success:
816 * Found the page, need to check sharing and possibly
817 * copy it over to another page..
819 old_page = page_address(page);
820 if (!no_share) {
822 * Ok, we can share the cached page directly.. Get rid
823 * of any potential extra pages.
825 if (new_page)
826 free_page(new_page);
828 flush_page_to_ram(old_page);
829 return old_page;
833 * No sharing ... copy to the new page.
835 copy_page(new_page, old_page);
836 flush_page_to_ram(new_page);
837 release_page(page);
838 return new_page;
840 no_cached_page:
841 new_page = __get_free_page(GFP_KERNEL);
842 if (!new_page)
843 goto no_page;
846 * During getting the above page we might have slept,
847 * so we need to re-check the situation with the page
848 * cache.. The page we just got may be useful if we
849 * can't share, so don't get rid of it here.
851 page = find_page(inode, offset);
852 if (page)
853 goto found_page;
856 * Now, create a new page-cache page from the page we got
858 page = mem_map + MAP_NR(new_page);
859 new_page = 0;
860 add_to_page_cache(page, inode, offset, hash);
862 if (inode->i_op->readpage(file, page) != 0)
863 goto failure;
866 * Do a very limited read-ahead if appropriate
868 if (PageLocked(page))
869 new_page = try_to_read_ahead(file, offset + PAGE_SIZE, 0);
870 goto found_page;
872 page_locked_wait:
873 __wait_on_page(page);
874 if (PageUptodate(page))
875 goto success;
877 page_read_error:
879 * Umm, take care of errors if the page isn't up-to-date.
880 * Try to re-read it _once_. We do this synchronously,
881 * because there really aren't any performance issues here
882 * and we need to check for errors.
884 if (inode->i_op->readpage(file, page) != 0)
885 goto failure;
886 wait_on_page(page);
887 if (PageError(page))
888 goto failure;
889 if (PageUptodate(page))
890 goto success;
893 * Uhhuh.. Things didn't work out. Return zero to tell the
894 * mm layer so, possibly freeing the page cache page first.
896 failure:
897 release_page(page);
898 if (new_page)
899 free_page(new_page);
900 no_page:
901 return 0;
905 * Tries to write a shared mapped page to its backing store. May return -EIO
906 * if the disk is full.
908 static inline int do_write_page(struct inode * inode, struct file * file,
909 const char * page, unsigned long offset)
911 int retval;
912 unsigned long size;
913 mm_segment_t old_fs;
915 size = offset + PAGE_SIZE;
916 /* refuse to extend file size.. */
917 if (S_ISREG(inode->i_mode)) {
918 if (size > inode->i_size)
919 size = inode->i_size;
920 /* Ho humm.. We should have tested for this earlier */
921 if (size < offset)
922 return -EIO;
924 size -= offset;
925 old_fs = get_fs();
926 set_fs(KERNEL_DS);
927 retval = -EIO;
928 if (size == file->f_op->write(file, (const char *) page,
929 size, &file->f_pos))
930 retval = 0;
931 set_fs(old_fs);
932 return retval;
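/*
 * Editor's note (not in the original): the set_fs(KERNEL_DS) dance above
 * is what lets us call f_op->write() with a kernel-mapped page as the
 * buffer: the write path's user-access checks are widened to accept
 * kernel addresses for the duration of the call, and the old segment
 * limit is restored before returning.
 */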
935 static int filemap_write_page(struct vm_area_struct * vma,
936 unsigned long offset,
937 unsigned long page)
939 int result;
940 struct file * file;
941 struct dentry * dentry;
942 struct inode * inode;
943 struct buffer_head * bh;
945 bh = mem_map[MAP_NR(page)].buffers;
946 if (bh) {
947 /* whee.. just mark the buffer heads dirty */
948 struct buffer_head * tmp = bh;
949 do {
951 * WSH: There's a race here: mark_buffer_dirty()
952 * could block, and the buffers aren't pinned down.
954 mark_buffer_dirty(tmp, 0);
955 tmp = tmp->b_this_page;
956 } while (tmp != bh);
957 return 0;
960 file = vma->vm_file;
961 dentry = file->f_dentry;
962 inode = dentry->d_inode;
963 if (!file->f_op->write)
964 return -EIO;
967 * If a task terminates while we're swapping the page, the vma
968 * and file could be released ... increment the count to be safe.
970 file->f_count++;
971 down(&inode->i_sem);
972 result = do_write_page(inode, file, (const char *) page, offset);
973 up(&inode->i_sem);
974 fput(file);
975 return result;
980 * Swapping to a shared file: while we're busy writing out the page
981 * (and the page still exists in memory), we save the page information
982 * in the page table, so that "filemap_swapin()" can re-use the page
983 * immediately if it is called while we're busy swapping it out..
985 * Once we've written it all out, we mark the page entry "empty", which
986 * will result in a normal page-in (instead of a swap-in) from the now
987 * up-to-date disk file.
989 int filemap_swapout(struct vm_area_struct * vma,
990 unsigned long offset,
991 pte_t *page_table)
993 int error;
994 unsigned long page = pte_page(*page_table);
995 unsigned long entry = SWP_ENTRY(SHM_SWP_TYPE, MAP_NR(page));
997 flush_cache_page(vma, (offset + vma->vm_start - vma->vm_offset));
998 set_pte(page_table, __pte(entry));
999 flush_tlb_page(vma, (offset + vma->vm_start - vma->vm_offset));
1000 error = filemap_write_page(vma, offset, page);
1001 if (pte_val(*page_table) == entry)
1002 pte_clear(page_table);
1003 return error;
1007 * filemap_swapin() is called only if we have something in the page
1008 * tables that is non-zero (but not present), which we know to be the
1009 * page index of a page that is busy being swapped out (see above).
1010 * So we just use it directly..
1012 static pte_t filemap_swapin(struct vm_area_struct * vma,
1013 unsigned long offset,
1014 unsigned long entry)
1016 unsigned long page = SWP_OFFSET(entry);
1018 atomic_inc(&mem_map[page].count);
1019 page = (page << PAGE_SHIFT) + PAGE_OFFSET;
1020 return mk_pte(page,vma->vm_page_prot);
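/*
 * Editor's illustration of how swapout and swapin cooperate (not in the
 * original): filemap_swapout() leaves SWP_ENTRY(SHM_SWP_TYPE, MAP_NR(page))
 * in the pte while the page is being written out.  If the process faults
 * on it before the write-out finishes, filemap_swapin() rebuilds a present
 * pte directly from that map number ((page << PAGE_SHIFT) + PAGE_OFFSET)
 * without any IO.  If nothing faults it back in, the pte is cleared once
 * the write completes and the next access becomes an ordinary page-in
 * through filemap_nopage().
 */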
1024 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
1025 unsigned long address, unsigned int flags)
1027 pte_t pte = *ptep;
1028 unsigned long page;
1029 int error;
1031 if (!(flags & MS_INVALIDATE)) {
1032 if (!pte_present(pte))
1033 return 0;
1034 if (!pte_dirty(pte))
1035 return 0;
1036 flush_page_to_ram(pte_page(pte));
1037 flush_cache_page(vma, address);
1038 set_pte(ptep, pte_mkclean(pte));
1039 flush_tlb_page(vma, address);
1040 page = pte_page(pte);
1041 atomic_inc(&mem_map[MAP_NR(page)].count);
1042 } else {
1043 if (pte_none(pte))
1044 return 0;
1045 flush_cache_page(vma, address);
1046 pte_clear(ptep);
1047 flush_tlb_page(vma, address);
1048 if (!pte_present(pte)) {
1049 swap_free(pte_val(pte));
1050 return 0;
1052 page = pte_page(pte);
1053 if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
1054 free_page(page);
1055 return 0;
1058 error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
1059 free_page(page);
1060 return error;
1063 static inline int filemap_sync_pte_range(pmd_t * pmd,
1064 unsigned long address, unsigned long size,
1065 struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
1067 pte_t * pte;
1068 unsigned long end;
1069 int error;
1071 if (pmd_none(*pmd))
1072 return 0;
1073 if (pmd_bad(*pmd)) {
1074 printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
1075 pmd_clear(pmd);
1076 return 0;
1078 pte = pte_offset(pmd, address);
1079 offset += address & PMD_MASK;
1080 address &= ~PMD_MASK;
1081 end = address + size;
1082 if (end > PMD_SIZE)
1083 end = PMD_SIZE;
1084 error = 0;
1085 do {
1086 error |= filemap_sync_pte(pte, vma, address + offset, flags);
1087 address += PAGE_SIZE;
1088 pte++;
1089 } while (address < end);
1090 return error;
1093 static inline int filemap_sync_pmd_range(pgd_t * pgd,
1094 unsigned long address, unsigned long size,
1095 struct vm_area_struct *vma, unsigned int flags)
1097 pmd_t * pmd;
1098 unsigned long offset, end;
1099 int error;
1101 if (pgd_none(*pgd))
1102 return 0;
1103 if (pgd_bad(*pgd)) {
1104 printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
1105 pgd_clear(pgd);
1106 return 0;
1108 pmd = pmd_offset(pgd, address);
1109 offset = address & PGDIR_MASK;
1110 address &= ~PGDIR_MASK;
1111 end = address + size;
1112 if (end > PGDIR_SIZE)
1113 end = PGDIR_SIZE;
1114 error = 0;
1115 do {
1116 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
1117 address = (address + PMD_SIZE) & PMD_MASK;
1118 pmd++;
1119 } while (address < end);
1120 return error;
1123 static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
1124 size_t size, unsigned int flags)
1126 pgd_t * dir;
1127 unsigned long end = address + size;
1128 int error = 0;
1130 dir = pgd_offset(vma->vm_mm, address);
1131 flush_cache_range(vma->vm_mm, end - size, end);
1132 while (address < end) {
1133 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
1134 address = (address + PGDIR_SIZE) & PGDIR_MASK;
1135 dir++;
1137 flush_tlb_range(vma->vm_mm, end - size, end);
1138 return error;
1142 * This handles (potentially partial) area unmaps..
1144 static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
1146 filemap_sync(vma, start, len, MS_ASYNC);
1150 * Shared mappings need to be able to do the right thing at
1151 * close/unmap/sync. They will also use the private file as
1152 * backing-store for swapping..
1154 static struct vm_operations_struct file_shared_mmap = {
1155 NULL, /* no special open */
1156 NULL, /* no special close */
1157 filemap_unmap, /* unmap - we need to sync the pages */
1158 NULL, /* no special protect */
1159 filemap_sync, /* sync */
1160 NULL, /* advise */
1161 filemap_nopage, /* nopage */
1162 NULL, /* wppage */
1163 filemap_swapout, /* swapout */
1164 filemap_swapin, /* swapin */
1168 * Private mappings just need to be able to load in the map.
1170 * (This is actually used for shared mappings as well, if we
1171 * know they can't ever get write permissions..)
1173 static struct vm_operations_struct file_private_mmap = {
1174 NULL, /* open */
1175 NULL, /* close */
1176 NULL, /* unmap */
1177 NULL, /* protect */
1178 NULL, /* sync */
1179 NULL, /* advise */
1180 filemap_nopage, /* nopage */
1181 NULL, /* wppage */
1182 NULL, /* swapout */
1183 NULL, /* swapin */
1186 /* This is used for a general mmap of a disk file */
1188 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1190 struct vm_operations_struct * ops;
1191 struct inode *inode = file->f_dentry->d_inode;
1193 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1194 ops = &file_shared_mmap;
1195 /* share_page() can only guarantee proper page sharing if
1196 * the offsets are all page aligned. */
1197 if (vma->vm_offset & (PAGE_SIZE - 1))
1198 return -EINVAL;
1199 } else {
1200 ops = &file_private_mmap;
1201 if (vma->vm_offset & (inode->i_sb->s_blocksize - 1))
1202 return -EINVAL;
1204 if (!inode->i_sb || !S_ISREG(inode->i_mode))
1205 return -EACCES;
1206 if (!inode->i_op || !inode->i_op->readpage)
1207 return -ENOEXEC;
1208 UPDATE_ATIME(inode);
1209 vma->vm_file = file;
1210 file->f_count++;
1211 vma->vm_ops = ops;
1212 return 0;
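/*
 * Editor's note (not in the original): as the checks above show, all a
 * filesystem needs in order to use this generic mmap (and the generic
 * read path) is a working inode->i_op->readpage(); generic_file_write()
 * additionally requires i_op->updatepage().  Disk filesystems in this
 * kernel series (ext2, for instance) are expected to point their
 * file_operations mmap entry straight at generic_file_mmap().
 */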
1217 * The msync() system call.
1220 static int msync_interval(struct vm_area_struct * vma,
1221 unsigned long start, unsigned long end, int flags)
1223 if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
1224 int error;
1225 error = vma->vm_ops->sync(vma, start, end-start, flags);
1226 if (!error && (flags & MS_SYNC)) {
1227 struct file * file = vma->vm_file;
1228 if (file) {
1229 struct dentry * dentry = file->f_dentry;
1230 struct inode * inode = dentry->d_inode;
1231 down(&inode->i_sem);
1232 error = file_fsync(file, dentry);
1233 up(&inode->i_sem);
1236 return error;
1238 return 0;
1241 asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
1243 unsigned long end;
1244 struct vm_area_struct * vma;
1245 int unmapped_error, error = -EINVAL;
1247 lock_kernel();
1248 if (start & ~PAGE_MASK)
1249 goto out;
1250 len = (len + ~PAGE_MASK) & PAGE_MASK;
1251 end = start + len;
1252 if (end < start)
1253 goto out;
1254 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1255 goto out;
1256 error = 0;
1257 if (end == start)
1258 goto out;
1260 * If the interval [start,end) covers some unmapped address ranges,
1261 * just ignore them, but return -EFAULT at the end.
1263 vma = find_vma(current->mm, start);
1264 unmapped_error = 0;
1265 for (;;) {
1266 /* Still start < end. */
1267 error = -EFAULT;
1268 if (!vma)
1269 goto out;
1270 /* Here start < vma->vm_end. */
1271 if (start < vma->vm_start) {
1272 unmapped_error = -EFAULT;
1273 start = vma->vm_start;
1275 /* Here vma->vm_start <= start < vma->vm_end. */
1276 if (end <= vma->vm_end) {
1277 if (start < end) {
1278 error = msync_interval(vma, start, end, flags);
1279 if (error)
1280 goto out;
1282 error = unmapped_error;
1283 goto out;
1285 /* Here vma->vm_start <= start < vma->vm_end < end. */
1286 error = msync_interval(vma, start, vma->vm_end, flags);
1287 if (error)
1288 goto out;
1289 start = vma->vm_end;
1290 vma = vma->vm_next;
1292 out:
1293 unlock_kernel();
1294 return error;
1298 * Write to a file through the page cache. This is mainly for the
1299 * benefit of NFS and possibly other network-based file systems.
1301 * We currently put everything into the page cache prior to writing it.
1302 * This is not a problem when writing full pages. With partial pages,
1303 * however, we first have to read the data into the cache, then
1304 * dirty the page, and finally schedule it for writing. Alternatively, we
1305 * could write-through just the portion of data that would go into that
1306 * page, but that would kill performance for applications that write data
1307 * line by line, and it's prone to race conditions.
1309 * Note that this routine doesn't try to keep track of dirty pages. Each
1310 * file system has to do this all by itself, unfortunately.
1311 * okir@monad.swb.de
1313 ssize_t
1314 generic_file_write(struct file *file, const char *buf,
1315 size_t count, loff_t *ppos)
1317 struct dentry *dentry = file->f_dentry;
1318 struct inode *inode = dentry->d_inode;
1319 struct page *page, **hash;
1320 unsigned long page_cache = 0;
1321 unsigned long pgpos, offset;
1322 unsigned long bytes, written;
1323 unsigned long pos;
1324 long status, sync, didread;
1326 if (!inode->i_op || !inode->i_op->updatepage)
1327 return -EIO;
1329 sync = file->f_flags & O_SYNC;
1330 pos = *ppos;
1331 written = 0;
1332 status = 0;
1334 if (file->f_flags & O_APPEND)
1335 pos = inode->i_size;
1337 while (count) {
1339 * Try to find the page in the cache. If it isn't there,
1340 * allocate a free page.
1342 offset = (pos & ~PAGE_MASK);
1343 pgpos = pos & PAGE_MASK;
1345 if ((bytes = PAGE_SIZE - offset) > count)
1346 bytes = count;
1348 hash = page_hash(inode, pgpos);
1349 if (!(page = __find_page(inode, pgpos, *hash))) {
1350 if (!page_cache) {
1351 page_cache = __get_free_page(GFP_KERNEL);
1352 if (page_cache)
1353 continue;
1354 status = -ENOMEM;
1355 break;
1357 page = mem_map + MAP_NR(page_cache);
1358 add_to_page_cache(page, inode, pgpos, hash);
1359 page_cache = 0;
1363 * Note: setting of the PG_locked bit is handled
1364 * below the i_op->xxx interface.
1366 didread = 0;
1367 page_wait:
1368 wait_on_page(page);
1369 if (PageUptodate(page))
1370 goto do_update_page;
1373 * The page is not up-to-date ... if we're writing less
1374 * than a full page of data, we may have to read it first.
1375 * But if the page is past the current end of file, we must
1376 * clear it before updating.
1378 if (bytes < PAGE_SIZE) {
1379 if (pgpos < inode->i_size) {
1380 status = -EIO;
1381 if (didread >= 2)
1382 goto done_with_page;
1383 status = inode->i_op->readpage(file, page);
1384 if (status < 0)
1385 goto done_with_page;
1386 didread++;
1387 goto page_wait;
1388 } else {
1389 /* Must clear for partial writes */
1390 memset((void *) page_address(page), 0,
1391 PAGE_SIZE);
1395 * N.B. We should defer setting PG_uptodate at least until
1396 * the data is copied. A failure in i_op->updatepage() could
1397 * leave the page with garbage data.
1399 set_bit(PG_uptodate, &page->flags);
1401 do_update_page:
1402 /* Alright, the page is there. Now update it. */
1403 status = inode->i_op->updatepage(file, page, buf,
1404 offset, bytes, sync);
1405 done_with_page:
1406 __free_page(page);
1407 if (status < 0)
1408 break;
1410 written += status;
1411 count -= status;
1412 pos += status;
1413 buf += status;
1415 *ppos = pos;
1416 if (pos > inode->i_size)
1417 inode->i_size = pos;
1419 if (page_cache)
1420 free_page(page_cache);
1421 return written ? written : status;
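/*
 * Editor's illustration of the partial-page write path above (not in the
 * original), assuming PAGE_SIZE = 4096: writing 100 bytes at pos = 4146
 * (offset 50 into the page at pgpos = 4096) with that page absent from
 * the cache first reads the page in via readpage() when pgpos is below
 * i_size, and only then lets updatepage() merge the 100 bytes; if pgpos
 * were at or beyond i_size, the page would simply be cleared instead of
 * read.
 */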
1425 * Support routines for directory caching using the page cache.
1429 * Finds the page at the specified offset, installing a new page
1430 * if requested. The count is incremented and the page is locked.
1432 * Note: we don't have to worry about races here, as the caller
1433 * is holding the inode semaphore.
1435 unsigned long get_cached_page(struct inode * inode, unsigned long offset,
1436 int new)
1438 struct page * page;
1439 struct page ** hash;
1440 unsigned long page_cache = 0;
1442 hash = page_hash(inode, offset);
1443 page = __find_page(inode, offset, *hash);
1444 if (!page) {
1445 if (!new)
1446 goto out;
1447 page_cache = get_free_page(GFP_KERNEL);
1448 if (!page_cache)
1449 goto out;
1450 page = mem_map + MAP_NR(page_cache);
1451 add_to_page_cache(page, inode, offset, hash);
1453 if (atomic_read(&page->count) != 2)
1454 printk(KERN_ERR "get_cached_page: page count=%d\n",
1455 atomic_read(&page->count));
1456 if (test_bit(PG_locked, &page->flags))
1457 printk(KERN_ERR "get_cached_page: page already locked!\n");
1458 set_bit(PG_locked, &page->flags);
1459 page_cache = page_address(page);
1461 out:
1462 return page_cache;
1466 * Unlock and free a page.
1468 void put_cached_page(unsigned long addr)
1470 struct page * page = mem_map + MAP_NR(addr);
1472 if (!test_bit(PG_locked, &page->flags))
1473 printk("put_cached_page: page not locked!\n");
1474 if (atomic_read(&page->count) != 2)
1475 printk("put_cached_page: page count=%d\n",
1476 atomic_read(&page->count));
1477 clear_bit(PG_locked, &page->flags);
1478 wake_up(&page->wait);
1479 __free_page(page);
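#if 0
/*
 * Editor's illustration, not part of the original file: roughly how a
 * filesystem could use the two helpers above to touch a directory page.
 * The caller is expected to hold inode->i_sem; zeroing the page is just
 * a stand-in for a real modification, and the function name is made up.
 */
static int example_clear_dir_page(struct inode * dir, unsigned long offset)
{
	unsigned long addr;

	addr = get_cached_page(dir, offset & PAGE_MASK, 1);
	if (!addr)
		return -ENOMEM;
	memset((void *) addr, 0, PAGE_SIZE);	/* the actual modification */
	put_cached_page(addr);			/* unlocks and drops our reference */
	return 0;
}
#endif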