 *	Copyright (C) 1994, 1995  Linus Torvalds
 *
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem does this differently, for example)
#include <linux/stat.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/shm.h>
#include <linux/errno.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/malloc.h>
#include <linux/locks.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/blkdev.h>
#include <linux/file.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/uaccess.h>
/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet, though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 */
unsigned long page_cache_size = 0;
struct page * page_hash_table[PAGE_HASH_SIZE];
/*
 * Simple routines for both non-shared and shared mappings.
 */

#define release_page(page) __free_page((page))
/*
 * Invalidate the pages of an inode, removing all pages that aren't
 * locked down (those are sure to be up-to-date anyway, so we shouldn't
 * invalidate them).
 */
void invalidate_inode_pages(struct inode * inode)
	while ((page = *p) != NULL) {
		if (PageLocked(page)) {
		if ((*p = page->next) != NULL)
			(*p)->prev = page->prev;
		remove_page_from_hash_queue(page);
/*
 * Truncate the page cache at a set offset, removing the pages
 * that are beyond that offset (and zeroing out partial pages).
 */
void truncate_inode_pages(struct inode * inode, unsigned long start)
	while ((page = *p) != NULL) {
		unsigned long offset = page->offset;

		/* page wholly truncated - free it */
		if (offset >= start) {
			if (PageLocked(page)) {
			if ((*p = page->next) != NULL)
				(*p)->prev = page->prev;
			remove_page_from_hash_queue(page);
		offset = start - offset;
		/* partial truncate, clear end of page */
		if (offset < PAGE_SIZE) {
			unsigned long address = page_address(page);
			memset((void *) (offset + address), 0, PAGE_SIZE - offset);
			flush_page_to_ram(address);
int shrink_mmap(int priority, int gfp_mask)
	static unsigned long clock = 0;
	unsigned long limit = num_physpages;
	struct buffer_head *tmp, *bh;
	int count_max, count_min;

	count_max = (limit<<1) >> (priority>>1);
	count_min = (limit<<1) >> (priority);

	page = mem_map + clock;
		if (page->inode || page->buffers)
		if (PageLocked(page))
		if ((gfp_mask & __GFP_DMA) && !PageDMA(page))

		/* First of all, regenerate the page's referenced bit
		   from any buffers in the page */
				if (buffer_touched(tmp)) {
					clear_bit(BH_Touched, &tmp->b_state);
					set_bit(PG_referenced, &page->flags);
				tmp = tmp->b_this_page;

		/* We can't throw away shared pages, but we do mark
		   them as referenced.  This relies on the fact that
		   no page is currently in both the page cache and the
		   buffer cache; we'd have to modify the following
		   test to allow for that case. */

		switch (atomic_read(&page->count)) {
			/* If it has been referenced recently, don't free it */
			if (test_and_clear_bit(PG_referenced, &page->flags))

			/* is it a swap-cache or page-cache page? */
			if (PageSwapCache(page)) {
				delete_from_swap_cache(page);
			remove_page_from_hash_queue(page);
			remove_page_from_inode_queue(page);

			/* is it a buffer cache page? */
			if ((gfp_mask & __GFP_IO) && bh && try_to_free_buffer(bh, &bh, 6))

			/* more than one user: we can't throw it away */
			set_bit(PG_referenced, &page->flags);

		if (clock >= limit) {
	} while (count_max > 0 && count_min > 0);
/*
 * This is called from try_to_swap_out() when we try to get rid of some
 * pages..  If we're unmapping the last occurrence of this page, we also
 * free it from the page hash-queues etc, as we don't want to keep it
 * in-core unnecessarily.
 */
unsigned long page_unuse(unsigned long page)
	struct page * p = mem_map + MAP_NR(page);
	int count = atomic_read(&p->count);

	if (PageSwapCache(p))
		panic ("Doing a normal page_unuse of a swap cache page");
	remove_page_from_hash_queue(p);
	remove_page_from_inode_queue(p);
/*
 * Update a page cache copy, when we're doing a "write()" system call
 * See also "update_vm_cache()".
 */
void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count)
	unsigned long offset, len;

	offset = (pos & ~PAGE_MASK);
	pos = pos & PAGE_MASK;
	len = PAGE_SIZE - offset;
		page = find_page(inode, pos);
			memcpy((void *) (offset + page_address(page)), buf, len);
static inline void add_to_page_cache(struct page * page,
	struct inode * inode, unsigned long offset,
	struct page **hash)
	atomic_inc(&page->count);
	page->flags &= ~((1 << PG_uptodate) | (1 << PG_error));
	page->offset = offset;
	add_page_to_inode_queue(inode, page);
	__add_page_to_hash_queue(page, hash);
/*
 * Try to read ahead in the file. "page_cache" is a potentially free page
 * that we could use for the cache (if it is 0 we can try to create one,
 * this is all overlapped with the IO on the previous page finishing anyway)
 */
static unsigned long try_to_read_ahead(struct file * file,
				unsigned long offset, unsigned long page_cache)
	struct inode *inode = file->f_dentry->d_inode;

	switch (page_cache) {
		page_cache = __get_free_page(GFP_KERNEL);
	if (offset >= inode->i_size)
	hash = page_hash(inode, offset);
	page = __find_page(inode, offset, *hash);
		/*
		 * Ok, add the new page to the hash-queues...
		 */
		page = mem_map + MAP_NR(page_cache);
		add_to_page_cache(page, inode, offset, hash);
		inode->i_op->readpage(file, page);
/*
 * Wait for IO to complete on a locked page.
 *
 * This must be called with the caller "holding" the page,
 * ie with increased "page->count" so that the page won't
 * go away during the wait..
 */
void __wait_on_page(struct page *page)
	struct task_struct *tsk = current;
	struct wait_queue wait;

	add_wait_queue(&page->wait, &wait);
	tsk->state = TASK_UNINTERRUPTIBLE;
	run_task_queue(&tq_disk);
	if (PageLocked(page)) {
	tsk->state = TASK_RUNNING;
	remove_wait_queue(&page->wait, &wait);
#define PROFILE_READAHEAD
#define DEBUG_READAHEAD

/*
 * Read-ahead profiling information
 * --------------------------------
 * Every PROFILE_MAXREADCOUNT, the following information is written
 * to the syslog:
 *	Percentage of asynchronous read-ahead.
 *	Average of read-ahead fields context value.
 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
 * to the syslog as well.
 */

#ifdef PROFILE_READAHEAD

#define PROFILE_MAXREADCOUNT 1000

static unsigned long total_reada;
static unsigned long total_async;
static unsigned long total_ramax;
static unsigned long total_ralen;
static unsigned long total_rawin;

static void profile_readahead(int async, struct file *filp)
	total_ramax	+= filp->f_ramax;
	total_ralen	+= filp->f_ralen;
	total_rawin	+= filp->f_rawin;

	if (total_reada > PROFILE_MAXREADCOUNT) {
		if (!(total_reada > PROFILE_MAXREADCOUNT)) {
			restore_flags(flags);
		printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
			total_ramax/total_reada,
			total_ralen/total_reada,
			total_rawin/total_reada,
			(total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
		printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
			filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
#endif
		restore_flags(flags);
#endif /* defined PROFILE_READAHEAD */
/*
 * Read-ahead context:
 * -------------------
 * The read ahead context fields of the "struct file" are the following:
 * - f_raend : position of the first byte after the last page we tried to
 *	       read ahead.
 * - f_ramax : current read-ahead maximum size.
 * - f_ralen : length of the current IO read block we tried to read-ahead.
 * - f_rawin : length of the current read-ahead window.
 *		if last read-ahead was synchronous then
 *			f_rawin = f_ralen
 *		otherwise (was asynchronous)
 *			f_rawin = previous value of f_ralen + f_ralen
 *
 * MIN_READAHEAD : minimum read-ahead size when reading ahead.
 * MAX_READAHEAD : maximum read-ahead size when reading ahead.
 *
 * Synchronous read-ahead benefits:
 * --------------------------------
 * Using a reasonable IO xfer length from peripheral devices increases system
 * performance.
 * Reasonable means, in this context, not too large but not too small.
 * The actual maximum value is:
 *	MAX_READAHEAD + PAGE_SIZE = 76k if CONFIG_READA_SMALL is undefined
 *	and 32K if defined (4K page size assumed).
 *
 * Asynchronous read-ahead benefits:
 * ---------------------------------
 * Overlapping the next read request with user process execution increases
 * system performance.
 *
 * We have to guess which further data are needed by the user process.
 * If these data are often not really needed, it's bad for system
 * performance.
 * However, we know that files are often accessed sequentially by
 * application programs and it seems that it is possible to have some good
 * strategy in that guessing.
 * We only try to read ahead files that seem to be read sequentially.
 *
 * Asynchronous read-ahead risks:
 * ------------------------------
 * In order to maximize overlapping, we must start some asynchronous read
 * requests from the device as soon as possible.
 * We must be very careful about:
 * - The number of effective pending IO read requests.
 *   ONE seems to be the only reasonable value.
 * - The total memory pool usage for the file access stream.
 *   This maximum memory usage is implicitly 2 IO read chunks:
 *   2*(MAX_READAHEAD + PAGE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
 *   64k if defined (4K page size assumed).
 */
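/*
 * Illustrative sketch (not part of the original source): the "is this
 * position still inside the read-ahead window?" test that
 * generic_file_read() performs further below, written out on a
 * stand-alone structure that mirrors the f_raend/f_rawin fields
 * described above.  The window is the f_rawin bytes ending at f_raend,
 * so a page-aligned position pgpos is treated as sequential when it
 * lies in [f_raend - f_rawin, f_raend].  Kept inside #if 0 so it has no
 * effect on this file.
 */
#if 0
struct ra_window_example {
	unsigned long raend;	/* first byte after the last read-ahead page */
	unsigned long rawin;	/* length of the current read-ahead window */
};

static int inside_ra_window(const struct ra_window_example *ra,
			    unsigned long pgpos)
{
	/* same condition, negated, as the reset test in generic_file_read() */
	return !(pgpos > ra->raend || pgpos + ra->rawin < ra->raend);
}
#endif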
static inline int get_max_readahead(struct inode * inode)
	if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
		return MAX_READAHEAD;
	return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
static inline unsigned long generic_file_readahead(int reada_ok,
	struct file * filp, struct inode * inode,
	unsigned long ppos, struct page * page, unsigned long page_cache)
	unsigned long max_ahead, ahead;
	int max_readahead = get_max_readahead(inode);

	raend = filp->f_raend & PAGE_MASK;

/*
 * The current page is locked.
 * If the current position is inside the previous read IO request, do not
 * try to reread previously read ahead pages.
 * Otherwise decide whether or not to read ahead some pages synchronously.
 * If we are not going to read ahead, set the read ahead context for this
 * page only.
 */
	if (PageLocked(page)) {
		if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
			if (raend < inode->i_size)
				max_ahead = filp->f_ramax;
			filp->f_ralen = PAGE_SIZE;
			filp->f_raend = ppos + filp->f_ralen;
			filp->f_rawin += filp->f_ralen;
/*
 * The current page is not locked.
 * If we were reading ahead and,
 * if the current max read ahead size is not zero and,
 * if the current position is inside the last read-ahead IO request,
 * it is the moment to try to read ahead asynchronously.
 * We will later force unplug of the device in order to force asynchronous read IO.
 */
	else if (reada_ok && filp->f_ramax && raend >= PAGE_SIZE &&
		 ppos <= raend && ppos + filp->f_ralen >= raend) {
/*
 * Add ONE page to max_ahead in order to try to have about the same IO max size
 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_SIZE.
 * Compute the position of the last page we have tried to read in order to
 * begin to read ahead just at the next page.
 */
		if (raend < inode->i_size)
			max_ahead = filp->f_ramax + PAGE_SIZE;

		filp->f_rawin = filp->f_ralen;
/*
 * Try to read ahead pages.
 * We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the
 * scheduler, will work enough for us to avoid too bad actual IO requests.
 */
	while (ahead < max_ahead) {
		page_cache = try_to_read_ahead(filp, raend + ahead,
					       page_cache);
/*
 * If we tried to read ahead some pages,
 * If we tried to read ahead asynchronously,
 *   Try to force unplug of the device in order to start an asynchronous
 *   read IO request.
 * Update the read-ahead context.
 * Store the length of the current read-ahead window.
 * Double the current max read ahead size.
 *   This heuristic avoids doing large IO for files that are not really
 *   accessed sequentially.
 */
		run_task_queue(&tq_disk);

	filp->f_ralen += ahead;
	filp->f_rawin += filp->f_ralen;
	filp->f_raend = raend + ahead + PAGE_SIZE;

	filp->f_ramax += filp->f_ramax;

	if (filp->f_ramax > max_readahead)
		filp->f_ramax = max_readahead;

#ifdef PROFILE_READAHEAD
	profile_readahead((reada_ok == 2), filp);
/*
 * This is a generic file read routine, and uses the
 * inode->i_op->readpage() function for the actual low-level
 * stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 */
ssize_t
generic_file_read(struct file * filp, char * buf,
		  size_t count, loff_t *ppos)
	struct dentry *dentry = filp->f_dentry;
	struct inode *inode = dentry->d_inode;
	size_t pos, pgpos, page_cache;
	int max_readahead = get_max_readahead(inode);

	if (!access_ok(VERIFY_WRITE, buf, count))
	pgpos = pos & PAGE_MASK;
/*
 * If the current position is outside the previous read-ahead window,
 * we reset the current read-ahead context and set read ahead max to zero
 * (will be set to just needed value later),
 * otherwise, we assume that the file accesses are sequential enough to
 * continue read-ahead.
 */
	if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) {
/*
 * Adjust the current value of read-ahead max.
 * If the read operation stays in the first half page, force no readahead.
 * Otherwise try to increase read ahead max just enough to do the read request.
 * Then, at least MIN_READAHEAD if read ahead is ok,
 * and at most MAX_READAHEAD in all cases.
 */
	if (pos + count <= (PAGE_SIZE >> 1)) {
		unsigned long needed;

		needed = ((pos + count) & PAGE_MASK) - pgpos;

		if (filp->f_ramax < needed)
			filp->f_ramax = needed;

		if (reada_ok && filp->f_ramax < MIN_READAHEAD)
			filp->f_ramax = MIN_READAHEAD;
		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;
		struct page *page, **hash;

		if (pos >= inode->i_size)

/*
 * Try to find the data in the page cache..
 */
		hash = page_hash(inode, pos & PAGE_MASK);
		page = __find_page(inode, pos & PAGE_MASK, *hash);

/*
 * Try to read ahead only if the current page is filled or being filled.
 * Otherwise, if we were reading ahead, decrease max read ahead size to
 * the minimum value.
 * In this context, this seems to happen only on some read error or if
 * the page has been rewritten.
 */
		if (PageUptodate(page) || PageLocked(page))
			page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_MASK, page, page_cache);
		else if (reada_ok && filp->f_ramax > MIN_READAHEAD)
			filp->f_ramax = MIN_READAHEAD;
		if (!PageUptodate(page))
			goto page_read_error;

/*
 * Ok, we have the page, it's up-to-date and ok,
 * so now we can finally copy it to user space...
 */
		unsigned long offset, nr;

		offset = pos & ~PAGE_MASK;
		nr = PAGE_SIZE - offset;

		if (nr > inode->i_size - pos)
			nr = inode->i_size - pos;
		nr -= copy_to_user(buf, (void *) (page_address(page) + offset), nr);
/*
 * Ok, it wasn't cached, so we need to create a new
 * page..
 */
		page_cache = __get_free_page(GFP_KERNEL);

/*
 * That could have slept, so go around to the
 */

/*
 * Ok, add the new page to the hash-queues...
 */
		page = mem_map + MAP_NR(page_cache);
		add_to_page_cache(page, inode, pos & PAGE_MASK, hash);
/*
 * Error handling is tricky. If we get a read error,
 * the cached page stays in the cache (but uptodate=0),
 * and the next process that accesses it will try to
 * re-read it. This is needed for NFS etc, where the
 * identity of the reader can decide if we can read the
 * page or not..
 */

/*
 * We have to read the page.
 * If we were reading ahead, we had previously tried to read this page.
 * That means that the page has probably been removed from the cache before
 * the application process needs it, or has been rewritten.
 * Decrease max readahead size to the minimum value in that situation.
 */
		if (reada_ok && filp->f_ramax > MIN_READAHEAD)
			filp->f_ramax = MIN_READAHEAD;

		error = inode->i_op->readpage(filp, page);
/*
 * We found the page, but it wasn't up-to-date.
 * Try to re-read it _once_. We do this synchronously,
 * because this happens only if there were errors.
 */
		error = inode->i_op->readpage(filp, page);

		if (PageUptodate(page) && !PageError(page))

		error = -EIO; /* Some unspecified error occurred.. */

	free_page(page_cache);
/*
 * Semantics for shared and private memory areas are different past the end
 * of the file. A shared mapping past the last page of the file is an error
 * and results in a SIGBUS, while a private mapping just maps in a zero page.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 *
 * WSH 06/04/97: fixed a memory leak and moved the allocation of new_page
 * ahead of the wait if we're sure to need it.
 */
static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
	struct file * file = area->vm_file;
	struct dentry * dentry = file->f_dentry;
	struct inode * inode = dentry->d_inode;
	unsigned long offset;
	struct page * page, **hash;
	unsigned long old_page, new_page;

	offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
	if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
/*
 * Do we have something in the page cache already?
 */
	hash = page_hash(inode, offset);
	page = __find_page(inode, offset, *hash);

/*
 * Ok, found a page in the page cache, now we need to check
 * that it's up-to-date. First check whether we'll need an
 * extra page -- better to overlap the allocation with the I/O.
 */
	if (no_share && !new_page) {
		new_page = __get_free_page(GFP_KERNEL);

	if (PageLocked(page))
		goto page_locked_wait;
	if (!PageUptodate(page))
		goto page_read_error;
/*
 * Found the page, need to check sharing and possibly
 * copy it over to another page..
 */
	old_page = page_address(page);

/*
 * Ok, we can share the cached page directly.. Get rid
 * of any potential extra pages.
 */
		flush_page_to_ram(old_page);

/*
 * No sharing ... copy to the new page.
 */
	copy_page(new_page, old_page);
	flush_page_to_ram(new_page);
	new_page = __get_free_page(GFP_KERNEL);

/*
 * During getting the above page we might have slept,
 * so we need to re-check the situation with the page
 * cache.. The page we just got may be useful if we
 * can't share, so don't get rid of it here.
 */
	page = find_page(inode, offset);

/*
 * Now, create a new page-cache page from the page we got
 */
	page = mem_map + MAP_NR(new_page);
	add_to_page_cache(page, inode, offset, hash);
	if (inode->i_op->readpage(file, page) != 0)

/*
 * Do a very limited read-ahead if appropriate
 */
	if (PageLocked(page))
		new_page = try_to_read_ahead(file, offset + PAGE_SIZE, 0);
	__wait_on_page(page);
	if (PageUptodate(page))

/*
 * Umm, take care of errors if the page isn't up-to-date.
 * Try to re-read it _once_. We do this synchronously,
 * because there really aren't any performance issues here
 * and we need to check for errors.
 */
	if (inode->i_op->readpage(file, page) != 0)

	if (PageUptodate(page))

/*
 * Uhhuh.. Things didn't work out. Return zero to tell the
 * mm layer so, possibly freeing the page cache page first.
 */
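/*
 * Illustrative user-space sketch (not part of the original source) of the
 * shared-mapping semantics described above filemap_nopage(): touching a
 * page of a MAP_SHARED mapping that lies wholly past the end of the file
 * faults in here, is refused, and the process gets SIGBUS.  The file path
 * is an assumption for the example; the program is stand-alone and kept
 * inside #if 0 so it has no effect on this file.
 */
#if 0
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long pagesize = sysconf(_SC_PAGESIZE);
	int fd = open("/tmp/past-eof-demo", O_RDWR | O_CREAT, 0644);
	volatile char *p;

	ftruncate(fd, 10);	/* the file covers only part of its first page */
	p = mmap(NULL, 2 * pagesize, PROT_READ | PROT_WRITE,
		 MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	p[0] = 'x';		/* first page is backed by the file: fine */
	p[pagesize] = 'x';	/* wholly past EOF in a shared mapping: SIGBUS */
	printf("not reached\n");
	return 0;
}
#endif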
/*
 * Tries to write a shared mapped page to its backing store. May return -EIO
 * if the disk is full.
 */
static inline int do_write_page(struct inode * inode, struct file * file,
	const char * page, unsigned long offset)
	size = offset + PAGE_SIZE;
	/* refuse to extend file size.. */
	if (S_ISREG(inode->i_mode)) {
		if (size > inode->i_size)
			size = inode->i_size;
		/* Ho humm.. We should have tested for this earlier */
	if (size == file->f_op->write(file, (const char *) page,
static int filemap_write_page(struct vm_area_struct * vma,
	unsigned long offset,
	struct dentry * dentry;
	struct inode * inode;
	struct buffer_head * bh;

	bh = mem_map[MAP_NR(page)].buffers;
		/* whee.. just mark the buffer heads dirty */
		struct buffer_head * tmp = bh;
		/*
		 * WSH: There's a race here: mark_buffer_dirty()
		 * could block, and the buffers aren't pinned down.
		 */
		mark_buffer_dirty(tmp, 0);
		tmp = tmp->b_this_page;

	dentry = file->f_dentry;
	inode = dentry->d_inode;
	if (!file->f_op->write)

	/*
	 * If a task terminates while we're swapping the page, the vma
	 * and file could be released ... increment the count to be safe.
	 */
	result = do_write_page(inode, file, (const char *) page, offset);
/*
 * Swapping to a shared file: while we're busy writing out the page
 * (and the page still exists in memory), we save the page information
 * in the page table, so that "filemap_swapin()" can re-use the page
 * immediately if it is called while we're busy swapping it out..
 *
 * Once we've written it all out, we mark the page entry "empty", which
 * will result in a normal page-in (instead of a swap-in) from the now
 * up-to-date disk file.
 */
int filemap_swapout(struct vm_area_struct * vma,
	unsigned long offset,
	unsigned long page = pte_page(*page_table);
	unsigned long entry = SWP_ENTRY(SHM_SWP_TYPE, MAP_NR(page));

	flush_cache_page(vma, (offset + vma->vm_start - vma->vm_offset));
	set_pte(page_table, __pte(entry));
	flush_tlb_page(vma, (offset + vma->vm_start - vma->vm_offset));
	error = filemap_write_page(vma, offset, page);
	if (pte_val(*page_table) == entry)
		pte_clear(page_table);
/*
 * filemap_swapin() is called only if we have something in the page
 * tables that is non-zero (but not present), which we know to be the
 * page index of a page that is busy being swapped out (see above).
 * So we just use it directly..
 */
static pte_t filemap_swapin(struct vm_area_struct * vma,
	unsigned long offset,
	unsigned long entry)
	unsigned long page = SWP_OFFSET(entry);

	atomic_inc(&mem_map[page].count);
	page = (page << PAGE_SHIFT) + PAGE_OFFSET;
	return mk_pte(page, vma->vm_page_prot);
static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
	unsigned long address, unsigned int flags)
	if (!(flags & MS_INVALIDATE)) {
		if (!pte_present(pte))
		if (!pte_dirty(pte))
		flush_page_to_ram(pte_page(pte));
		flush_cache_page(vma, address);
		set_pte(ptep, pte_mkclean(pte));
		flush_tlb_page(vma, address);
		page = pte_page(pte);
		atomic_inc(&mem_map[MAP_NR(page)].count);
		flush_cache_page(vma, address);
		flush_tlb_page(vma, address);
		if (!pte_present(pte)) {
			swap_free(pte_val(pte));
		page = pte_page(pte);
		if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
	error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
static inline int filemap_sync_pte_range(pmd_t * pmd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
	if (pmd_bad(*pmd)) {
		printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
	pte = pte_offset(pmd, address);
	offset += address & PMD_MASK;
	address &= ~PMD_MASK;
	end = address + size;
		error |= filemap_sync_pte(pte, vma, address + offset, flags);
		address += PAGE_SIZE;
	} while (address < end);
static inline int filemap_sync_pmd_range(pgd_t * pgd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned int flags)
	unsigned long offset, end;

	if (pgd_bad(*pgd)) {
		printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
	pmd = pmd_offset(pgd, address);
	offset = address & PGDIR_MASK;
	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
		address = (address + PMD_SIZE) & PMD_MASK;
	} while (address < end);
static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
	size_t size, unsigned int flags)
	unsigned long end = address + size;

	dir = pgd_offset(vma->vm_mm, address);
	flush_cache_range(vma->vm_mm, end - size, end);
	while (address < end) {
		error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
	flush_tlb_range(vma->vm_mm, end - size, end);
/*
 * This handles (potentially partial) area unmaps..
 */
static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
	filemap_sync(vma, start, len, MS_ASYNC);
/*
 * Shared mappings need to be able to do the right thing at
 * close/unmap/sync. They will also use the private file as
 * backing-store for swapping..
 */
static struct vm_operations_struct file_shared_mmap = {
	NULL,			/* no special open */
	NULL,			/* no special close */
	filemap_unmap,		/* unmap - we need to sync the pages */
	NULL,			/* no special protect */
	filemap_sync,		/* sync */
	filemap_nopage,		/* nopage */
	filemap_swapout,	/* swapout */
	filemap_swapin,		/* swapin */
};

/*
 * Private mappings just need to be able to load in the map.
 *
 * (This is actually used for shared mappings as well, if we
 * know they can't ever get write permissions..)
 */
static struct vm_operations_struct file_private_mmap = {
	filemap_nopage,		/* nopage */
};
/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
	struct vm_operations_struct * ops;
	struct inode *inode = file->f_dentry->d_inode;

	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
		ops = &file_shared_mmap;
		/* share_page() can only guarantee proper page sharing if
		 * the offsets are all page aligned. */
		if (vma->vm_offset & (PAGE_SIZE - 1))
	ops = &file_private_mmap;
	if (vma->vm_offset & (inode->i_sb->s_blocksize - 1))
	if (!inode->i_sb || !S_ISREG(inode->i_mode))
	if (!inode->i_op || !inode->i_op->readpage)
	UPDATE_ATIME(inode);
	vma->vm_file = file;
/*
 * The msync() system call.
 */

static int msync_interval(struct vm_area_struct * vma,
	unsigned long start, unsigned long end, int flags)
	if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
		error = vma->vm_ops->sync(vma, start, end-start, flags);
		if (!error && (flags & MS_SYNC)) {
			struct file * file = vma->vm_file;
			struct dentry * dentry = file->f_dentry;
			struct inode * inode = dentry->d_inode;
			down(&inode->i_sem);
			error = file_fsync(file, dentry);
asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
	struct vm_area_struct * vma;
	int unmapped_error, error = -EINVAL;

	if (start & ~PAGE_MASK)
	len = (len + ~PAGE_MASK) & PAGE_MASK;
	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))

	/*
	 * If the interval [start,end) covers some unmapped address ranges,
	 * just ignore them, but return -EFAULT at the end.
	 */
	vma = find_vma(current->mm, start);
		/* Still start < end. */
		/* Here start < vma->vm_end. */
		if (start < vma->vm_start) {
			unmapped_error = -EFAULT;
			start = vma->vm_start;
		/* Here vma->vm_start <= start < vma->vm_end. */
		if (end <= vma->vm_end) {
			error = msync_interval(vma, start, end, flags);
			error = unmapped_error;
		/* Here vma->vm_start <= start < vma->vm_end < end. */
		error = msync_interval(vma, start, vma->vm_end, flags);
		start = vma->vm_end;
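/*
 * Illustrative user-space sketch (not part of the original source) of the
 * mmap()/msync() path implemented by generic_file_mmap() and sys_msync()
 * above: dirty a page of a shared file mapping, then use MS_SYNC to push
 * it back to the file through the vm_ops->sync / file_fsync path.  The
 * file path is an assumption for the example; the program is stand-alone
 * and kept inside #if 0 so it has no effect on this file.
 */
#if 0
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long pagesize = sysconf(_SC_PAGESIZE);
	int fd = open("/tmp/msync-demo", O_RDWR | O_CREAT, 0644);
	char *p;

	ftruncate(fd, pagesize);
	p = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	memcpy(p, "hello", 5);		/* dirty the shared page */
	msync(p, pagesize, MS_SYNC);	/* write it back synchronously */
	munmap(p, pagesize);
	close(fd);
	return 0;
}
#endif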
/*
 * Write to a file through the page cache. This is mainly for the
 * benefit of NFS and possibly other network-based file systems.
 *
 * We currently put everything into the page cache prior to writing it.
 * This is not a problem when writing full pages. With partial pages,
 * however, we first have to read the data into the cache, then
 * dirty the page, and finally schedule it for writing. Alternatively, we
 * could write-through just the portion of data that would go into that
 * page, but that would kill performance for applications that write data
 * line by line, and it's prone to race conditions.
 *
 * Note that this routine doesn't try to keep track of dirty pages. Each
 * file system has to do this all by itself, unfortunately.
 */
ssize_t
generic_file_write(struct file *file, const char *buf,
		   size_t count, loff_t *ppos)
	struct dentry *dentry = file->f_dentry;
	struct inode *inode = dentry->d_inode;
	struct page *page, **hash;
	unsigned long page_cache = 0;
	unsigned long pgpos, offset;
	unsigned long bytes, written;
	long status, sync, didread;

	if (!inode->i_op || !inode->i_op->updatepage)

	sync = file->f_flags & O_SYNC;
	if (file->f_flags & O_APPEND)
		pos = inode->i_size;

	/*
	 * Try to find the page in the cache. If it isn't there,
	 * allocate a free page.
	 */
	offset = (pos & ~PAGE_MASK);
	pgpos = pos & PAGE_MASK;

	if ((bytes = PAGE_SIZE - offset) > count)

	hash = page_hash(inode, pgpos);
	if (!(page = __find_page(inode, pgpos, *hash))) {
		page_cache = __get_free_page(GFP_KERNEL);
		page = mem_map + MAP_NR(page_cache);
		add_to_page_cache(page, inode, pgpos, hash);

	/*
	 * Note: setting of the PG_locked bit is handled
	 * below the i_op->xxx interface.
	 */
	if (PageUptodate(page))
		goto do_update_page;
	/*
	 * The page is not up-to-date ... if we're writing less
	 * than a full page of data, we may have to read it first.
	 * But if the page is past the current end of file, we must
	 * clear it before updating.
	 */
	if (bytes < PAGE_SIZE) {
		if (pgpos < inode->i_size) {
				goto done_with_page;
			status = inode->i_op->readpage(file, page);
				goto done_with_page;
		/* Must clear for partial writes */
		memset((void *) page_address(page), 0,
	/*
	 * N.B. We should defer setting PG_uptodate at least until
	 * the data is copied. A failure in i_op->updatepage() could
	 * leave the page with garbage data.
	 */
	set_bit(PG_uptodate, &page->flags);
	/* Alright, the page is there.  Now update it. */
	status = inode->i_op->updatepage(file, page, buf,
						offset, bytes, sync);

	if (pos > inode->i_size)
		inode->i_size = pos;

	free_page(page_cache);
	return written ? written : status;
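/*
 * Illustrative sketch (not part of the original source): the per-page
 * chunking that generic_file_write() performs above.  Each pass through
 * its loop handles at most the remainder of the current page, so a write
 * that is not page aligned is split into a partial first chunk, some
 * full pages, and a partial last chunk.  PAGE_SIZE/PAGE_MASK are the same
 * macros used throughout this file; the helper is kept inside #if 0 so it
 * has no effect here.
 */
#if 0
static unsigned long write_chunk_size(unsigned long pos, unsigned long count)
{
	unsigned long offset = pos & ~PAGE_MASK;	/* offset inside the page */
	unsigned long bytes = PAGE_SIZE - offset;	/* room left in this page */

	if (bytes > count)
		bytes = count;
	return bytes;
}
#endif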
/*
 * Support routines for directory caching using the page cache.
 */

/*
 * Finds the page at the specified offset, installing a new page
 * if requested. The count is incremented and the page is locked.
 *
 * Note: we don't have to worry about races here, as the caller
 * is holding the inode semaphore.
 */
unsigned long get_cached_page(struct inode * inode, unsigned long offset,
	struct page ** hash;
	unsigned long page_cache = 0;

	hash = page_hash(inode, offset);
	page = __find_page(inode, offset, *hash);
		page_cache = get_free_page(GFP_KERNEL);
		page = mem_map + MAP_NR(page_cache);
		add_to_page_cache(page, inode, offset, hash);
	if (atomic_read(&page->count) != 2)
		printk(KERN_ERR "get_cached_page: page count=%d\n",
			atomic_read(&page->count));
	if (test_bit(PG_locked, &page->flags))
		printk(KERN_ERR "get_cached_page: page already locked!\n");
	set_bit(PG_locked, &page->flags);
	page_cache = page_address(page);
/*
 * Unlock and free a page.
 */
void put_cached_page(unsigned long addr)
	struct page * page = mem_map + MAP_NR(addr);

	if (!test_bit(PG_locked, &page->flags))
		printk("put_cached_page: page not locked!\n");
	if (atomic_read(&page->count) != 2)
		printk("put_cached_page: page count=%d\n",
			atomic_read(&page->count));
	clear_bit(PG_locked, &page->flags);
	wake_up(&page->wait);