/*
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/malloc.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/locks.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/swapctl.h>
#include <linux/slab.h>
#include <linux/init.h>

#include <asm/pgalloc.h>
#include <asm/uaccess.h>

#include <linux/highmem.h>
/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 */

atomic_t page_cache_size = ATOMIC_INIT(0);
unsigned int page_hash_bits;
struct page **page_hash_table;

spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
/*
 * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
 *       the pagemap_lru_lock held.
 */
spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;
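/*
 * Illustrative sketch, not part of the original source: the ordering rule
 * above allows pagemap_lru_lock to nest inside pagecache_lock, never the
 * other way around.  A hypothetical helper obeying the rule looks like:
 *
 *	spin_lock(&pagecache_lock);	// outer lock first
 *	spin_lock(&pagemap_lru_lock);	// inner lock second
 *	... touch the hash queues and the LRU list ...
 *	spin_unlock(&pagemap_lru_lock);
 *	spin_unlock(&pagecache_lock);
 *
 * Taking pagecache_lock while already holding pagemap_lru_lock can deadlock
 * against a CPU doing the nesting shown here.
 */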
#define CLUSTER_PAGES		(1 << page_cluster)
#define CLUSTER_OFFSET(x)	(((x) >> page_cluster) << page_cluster)
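/*
 * Worked example (illustrative assumption, not from the original file):
 * with page_cluster == 4, CLUSTER_PAGES is 16 pages and CLUSTER_OFFSET()
 * rounds a page index down to a 16-page boundary, e.g.
 *
 *	CLUSTER_OFFSET(19) == 16,  CLUSTER_OFFSET(15) == 0
 *
 * so a cluster read always starts on an aligned page index.
 */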
/* Insert "page" into the hash chain whose head is pointed to by "p". */
void __add_page_to_hash_queue(struct page * page, struct page **p)
{
	atomic_inc(&page_cache_size);
	if ((page->next_hash = *p) != NULL)
		(*p)->pprev_hash = &page->next_hash;
	*p = page;
	page->pprev_hash = p;
}
static void remove_page_from_hash_queue(struct page * page)
{
	if (page->pprev_hash) {
		if (page->next_hash)
			page->next_hash->pprev_hash = page->pprev_hash;
		*page->pprev_hash = page->next_hash;
		page->pprev_hash = NULL;
	}
	atomic_dec(&page_cache_size);
}
/*
 * Remove a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
 * is safe.
 */
void remove_inode_page(struct page *page)
{
	if (!PageLocked(page))
		PAGE_BUG(page);

	spin_lock(&pagecache_lock);
	remove_page_from_inode_queue(page);
	remove_page_from_hash_queue(page);
	spin_unlock(&pagecache_lock);
}
void invalidate_inode_pages(struct inode * inode)
{
	struct list_head *head, *curr;
	struct page * page;

	head = &inode->i_data.pages;
	spin_lock(&pagecache_lock);
	curr = head->next;

	while (curr != head) {
		page = list_entry(curr, struct page, list);
		curr = curr->next;

		/* We cannot invalidate a locked page */
		if (PageLocked(page))
			continue;

		remove_page_from_inode_queue(page);
		remove_page_from_hash_queue(page);
		page->mapping = NULL;
		page_cache_release(page);
	}
	spin_unlock(&pagecache_lock);
}
/*
 * Truncate the page cache at a set offset, removing the pages
 * that are beyond that offset (and zeroing out partial pages).
 */
void truncate_inode_pages(struct inode * inode, loff_t lstart)
{
	struct list_head *head, *curr;
	struct page * page;
	unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
	unsigned long start;

	start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

repeat:
	head = &inode->i_data.pages;
	spin_lock(&pagecache_lock);
	curr = head->next;
	while (curr != head) {
		unsigned long offset;

		page = list_entry(curr, struct page, list);
		curr = curr->next;
		offset = page->index;

		/* page wholly truncated - free it */
		if (offset >= start) {
			get_page(page);
			spin_unlock(&pagecache_lock);

			lock_page(page);

			if (!page->buffers || block_flushpage(page, 0))
				lru_cache_del(page);

			/*
			 * We remove the page from the page cache
			 * _after_ we have destroyed all buffer-cache
			 * references to it. Otherwise some other process
			 * might think this inode page is not in the
			 * page cache and creates a buffer-cache alias
			 * to it causing all sorts of fun problems ...
			 */
			remove_inode_page(page);

			UnlockPage(page);
			page_cache_release(page);
			page_cache_release(page);

			/*
			 * We have done things without the pagecache lock,
			 * so we'll have to repeat the scan.
			 * It's not possible to deadlock here because
			 * we are guaranteed to make progress. (ie. we have
			 * just removed a page)
			 */
			goto repeat;
		}
		/*
		 * there is only one partial page possible.
		 */
		if (!partial)
			continue;

		/* and it's the one preceding the first wholly truncated page */
		if ((offset + 1) != start)
			continue;

		/* partial truncate, clear end of page */
		get_page(page);
		spin_unlock(&pagecache_lock);

		lock_page(page);

		memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
		if (page->buffers)
			block_flushpage(page, partial);

		partial = 0;

		/*
		 * we have dropped the spinlock so we have to
		 * restart.
		 */
		UnlockPage(page);
		page_cache_release(page);
		goto repeat;
	}
	spin_unlock(&pagecache_lock);
}
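/*
 * Worked example (illustrative, not from the original source): truncating
 * to lstart == 10000 with PAGE_CACHE_SIZE == 4096 gives
 *
 *	partial = 10000 & 4095         = 1808
 *	start   = (10000 + 4095) >> 12 = 3
 *
 * so pages with index >= 3 are dropped entirely, and the tail of page 2
 * (bytes 1808..4095 within that page) is cleared by
 * memclear_highpage_flush().
 */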
int shrink_mmap(int priority, int gfp_mask, zone_t *zone)
{
	int ret = 0, count;
	LIST_HEAD(young);
	LIST_HEAD(old);
	struct list_head * page_lru, * dispose;
	struct page * page;

	count = nr_lru_pages / (priority+1);

	spin_lock(&pagemap_lru_lock);

	while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
		page = list_entry(page_lru, struct page, lru);
		list_del(page_lru);

		dispose = &lru_cache;
		if (test_and_clear_bit(PG_referenced, &page->flags))
			/* Roll the page at the top of the lru list,
			 * we could also be more aggressive putting
			 * the page in the young-dispose-list, so
			 * avoiding to free young pages in each pass.
			 */
			goto dispose_continue;

		dispose = &old;
		/* don't account passes over not DMA pages */
		if (zone && (!memclass(page->zone, zone)))
			goto dispose_continue;

		count--;

		dispose = &young;
		if (TryLockPage(page))
			goto dispose_continue;

		/* Release the pagemap_lru lock even if the page is not yet
		   queued in any lru queue since we have just locked down
		   the page so nobody else may SMP race with us running
		   a lru_cache_del() (lru_cache_del() always run with the
		   page locked down ;). */
		spin_unlock(&pagemap_lru_lock);

		/* avoid unscalable SMP locking */
		if (!page->buffers && page_count(page) > 1)
			goto unlock_noput_continue;

		/* Take the pagecache_lock spinlock held to avoid
		   other tasks to notice the page while we are looking at its
		   page count. If it's a pagecache-page we'll free it
		   in one atomic transaction after checking its page count. */
		spin_lock(&pagecache_lock);

		/* avoid freeing the page while it's locked */
		get_page(page);

		/* Is it a buffer page? */
		if (page->buffers) {
			spin_unlock(&pagecache_lock);
			if (!try_to_free_buffers(page))
				goto unlock_continue;
			/* page was locked, inode can't go away under us */
			if (!page->mapping) {
				atomic_dec(&buffermem_pages);
				goto made_buffer_progress;
			}
			spin_lock(&pagecache_lock);
		}

		/*
		 * We can't free pages unless there's just one user
		 * (count == 2 because we added one ourselves above).
		 */
		if (page_count(page) != 2)
			goto cache_unlock_continue;

		/*
		 * Is it a page swap page? If so, we want to
		 * drop it if it is no longer used, even if it
		 * were to be marked referenced..
		 */
		if (PageSwapCache(page)) {
			spin_unlock(&pagecache_lock);
			__delete_from_swap_cache(page);
			goto made_inode_progress;
		}

		/* is it a page-cache page? */
		if (page->mapping) {
			if (!pgcache_under_min()) {
				remove_page_from_inode_queue(page);
				remove_page_from_hash_queue(page);
				page->mapping = NULL;
				spin_unlock(&pagecache_lock);
				goto made_inode_progress;
			}
			goto cache_unlock_continue;
		}

		printk(KERN_ERR "shrink_mmap: unknown LRU page!\n");

cache_unlock_continue:
		spin_unlock(&pagecache_lock);
unlock_continue:
		UnlockPage(page);
		page_cache_release(page);
dispose_relock_continue:
		/* even if the dispose list is local, a truncate_inode_page()
		   may remove a page from its queue so always
		   synchronize with the lru lock while accessing the
		   page->lru field */
		spin_lock(&pagemap_lru_lock);
		list_add(page_lru, dispose);
		continue;

unlock_noput_continue:
		UnlockPage(page);
		goto dispose_relock_continue;

dispose_continue:
		list_add(page_lru, dispose);
	}
	goto out;

made_inode_progress:
	page_cache_release(page);
made_buffer_progress:
	UnlockPage(page);
	page_cache_release(page);
	ret = 1;
	spin_lock(&pagemap_lru_lock);
	/* nr_lru_pages needs the spinlock */
	nr_lru_pages--;
out:
	list_splice(&young, &lru_cache);
	list_splice(&old, lru_cache.prev);

	spin_unlock(&pagemap_lru_lock);

	return ret;
}
static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
{
	goto inside;

	for (;;) {
		page = page->next_hash;
inside:
		if (!page)
			goto not_found;
		if (page->mapping != mapping)
			continue;
		if (page->index == offset)
			break;
	}
	/* Mark the page referenced so shrink_mmap() keeps it around longer. */
	set_bit(PG_referenced, &page->flags);
not_found:
	return page;
}
/*
 * By the time this is called, the page is locked and
 * we don't have to worry about any races any more.
 *
 * Start the IO..
 */
static int writeout_one_page(struct page *page)
{
	struct buffer_head *bh, *head = page->buffers;

	bh = head;
	do {
		if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
			continue;

		bh->b_flushtime = 0;
		ll_rw_block(WRITE, 1, &bh);
	} while ((bh = bh->b_this_page) != head);
	return 0;
}
static int waitfor_one_page(struct page *page)
{
	int error = 0;
	struct buffer_head *bh, *head = page->buffers;

	bh = head;
	do {
		wait_on_buffer(bh);
		if (buffer_req(bh) && !buffer_uptodate(bh))
			error = -EIO;
	} while ((bh = bh->b_this_page) != head);
	return error;
}
static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end, int (*fn)(struct page *))
{
	struct list_head *head, *curr;
	struct page *page;
	int retval = 0;

	head = &inode->i_data.pages;

	spin_lock(&pagecache_lock);
	curr = head->next;
	while (curr != head) {
		page = list_entry(curr, struct page, list);
		curr = curr->next;
		if (!page->buffers)
			continue;
		if (page->index >= end)
			continue;
		if (page->index < start)
			continue;

		get_page(page);
		spin_unlock(&pagecache_lock);
		lock_page(page);

		/* The buffers could have been free'd while we waited for the page lock */
		if (page->buffers)
			retval |= fn(page);

		UnlockPage(page);
		spin_lock(&pagecache_lock);
		curr = page->list.next;
		page_cache_release(page);
	}
	spin_unlock(&pagecache_lock);

	return retval;
}
/*
 * Two-stage data sync: first start the IO, then go back and
 * collect the information..
 */
int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
{
	int retval;

	retval = do_buffer_fdatasync(inode, start_idx, end_idx, writeout_one_page);
	retval |= do_buffer_fdatasync(inode, start_idx, end_idx, waitfor_one_page);

	return retval;
}
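/*
 * Illustrative usage sketch (an assumption, not from the original file):
 * a filesystem that wants to push every cached page of an inode to disk
 * and then wait for the IO could call
 *
 *	int err = generic_buffer_fdatasync(inode, 0, ~0UL);
 *
 * The two passes matter: the first pass queues all the writes so the block
 * layer can merge and sort them, and the second pass only sleeps, instead
 * of issuing and waiting for one page at a time.
 */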
/*
 * This adds a page to the page cache, starting out as locked,
 * owned by us, referenced, but not uptodate and with no errors.
 */
static inline void __add_to_page_cache(struct page * page,
	struct address_space *mapping, unsigned long offset,
	struct page **hash)
{
	unsigned long flags;
	struct page *alias;

	flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced));
	page->flags = flags | (1 << PG_locked);
	get_page(page);
	page->index = offset;
	add_page_to_inode_queue(mapping, page);
	__add_page_to_hash_queue(page, hash);
	lru_cache_add(page);
	alias = __find_page_nolock(mapping, offset, *hash);
	if (alias != page)
		PAGE_BUG(page);
}
void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
{
	spin_lock(&pagecache_lock);
	__add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
	spin_unlock(&pagecache_lock);
}
static int add_to_page_cache_unique(struct page * page,
	struct address_space *mapping, unsigned long offset,
	struct page **hash)
{
	int err;
	struct page *alias;

	spin_lock(&pagecache_lock);
	alias = __find_page_nolock(mapping, offset, *hash);

	err = 1;
	if (!alias) {
		__add_to_page_cache(page,mapping,offset,hash);
		err = 0;
	}

	spin_unlock(&pagecache_lock);
	return err;
}
/*
 * This adds the requested page to the page cache if it isn't already there,
 * and schedules an I/O to read in its contents from disk.
 */
static inline int page_cache_read(struct file * file, unsigned long offset)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct page **hash = page_hash(&inode->i_data, offset);
	struct page *page;

	spin_lock(&pagecache_lock);
	page = __find_page_nolock(&inode->i_data, offset, *hash);
	spin_unlock(&pagecache_lock);
	if (page)
		return 0;

	page = page_cache_alloc();
	if (!page)
		return -ENOMEM;

	if (!add_to_page_cache_unique(page, &inode->i_data, offset, hash)) {
		int error = inode->i_op->readpage(file->f_dentry, page);
		page_cache_release(page);
		return error;
	}
	/*
	 * We arrive here in the unlikely event that someone
	 * raced with us and added our page to the cache first.
	 */
	page_cache_free(page);
	return 0;
}
/*
 * Read in an entire cluster at once.  A cluster is usually a 64k-
 * aligned block that includes the page requested in "offset."
 */
static int read_cluster_nonblocking(struct file * file, unsigned long offset,
	unsigned long filesize)
{
	unsigned long pages = CLUSTER_PAGES;

	offset = CLUSTER_OFFSET(offset);
	while ((pages-- > 0) && (offset < filesize)) {
		int error = page_cache_read(file, offset);
		if (error < 0)
			return error;
		offset++;
	}

	return 0;
}
/*
 * Wait for a page to get unlocked.
 *
 * This must be called with the caller "holding" the page,
 * ie with increased "page->count" so that the page won't
 * go away during the wait..
 */
void ___wait_on_page(struct page *page)
{
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	add_wait_queue(&page->wait, &wait);
	do {
		run_task_queue(&tq_disk);
		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		if (!PageLocked(page))
			break;
		schedule();
	} while (PageLocked(page));
	tsk->state = TASK_RUNNING;
	remove_wait_queue(&page->wait, &wait);
}
/*
 * Get an exclusive lock on the page..
 */
void lock_page(struct page *page)
{
	while (TryLockPage(page))
		___wait_on_page(page);
}
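/*
 * Illustrative sketch (an assumption, not from the original source): the
 * usual calling pattern around page IO in this file is
 *
 *	get_page(page);			// keep the page from being freed
 *	lock_page(page);		// sleep until we own PG_locked
 *	... look at or modify page->buffers ...
 *	UnlockPage(page);		// wakes up ___wait_on_page() sleepers
 *	page_cache_release(page);	// drop our reference
 *
 * lock_page() may sleep, so it must not be called while holding either
 * pagecache_lock or pagemap_lru_lock.
 */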
/*
 * a rather lightweight function, finding and getting a reference to a
 * hashed page atomically, waiting for it if it's locked.
 */
struct page * __find_get_page (struct address_space *mapping,
				unsigned long offset, struct page **hash)
{
	struct page *page;

	/*
	 * We scan the hash list read-only. Addition to and removal from
	 * the hash-list needs a held write-lock.
	 */
repeat:
	spin_lock(&pagecache_lock);
	page = __find_page_nolock(mapping, offset, *hash);
	if (page)
		get_page(page);
	spin_unlock(&pagecache_lock);

	/* Found the page, sleep if locked. */
	if (page && PageLocked(page)) {
		struct task_struct *tsk = current;
		DECLARE_WAITQUEUE(wait, tsk);

		run_task_queue(&tq_disk);

		__set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		add_wait_queue(&page->wait, &wait);

		if (PageLocked(page))
			schedule();
		__set_task_state(tsk, TASK_RUNNING);
		remove_wait_queue(&page->wait, &wait);

		/*
		 * The page might have been unhashed meanwhile. It's
		 * not freed though because we hold a reference to it.
		 * If this is the case then it will be freed _here_,
		 * and we recheck the hash anyway.
		 */
		page_cache_release(page);
		goto repeat;
	}
	/*
	 * It's not locked so we can return the page and we hold
	 * a reference to it.
	 */
	return page;
}
/*
 * Get the lock to a page atomically.
 */
struct page * __find_lock_page (struct address_space *mapping,
				unsigned long offset, struct page **hash)
{
	struct page *page;

	/*
	 * We scan the hash list read-only. Addition to and removal from
	 * the hash-list needs a held write-lock.
	 */
repeat:
	spin_lock(&pagecache_lock);
	page = __find_page_nolock(mapping, offset, *hash);
	if (page)
		get_page(page);
	spin_unlock(&pagecache_lock);

	/* Found the page, sleep if locked. */
	if (page && TryLockPage(page)) {
		struct task_struct *tsk = current;
		DECLARE_WAITQUEUE(wait, tsk);

		run_task_queue(&tq_disk);

		__set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		add_wait_queue(&page->wait, &wait);

		if (PageLocked(page))
			schedule();
		__set_task_state(tsk, TASK_RUNNING);
		remove_wait_queue(&page->wait, &wait);

		/*
		 * The page might have been unhashed meanwhile. It's
		 * not freed though because we hold a reference to it.
		 * If this is the case then it will be freed _here_,
		 * and we recheck the hash anyway.
		 */
		page_cache_release(page);
		goto repeat;
	}
	/*
	 * It's not locked so we can return the page and we hold
	 * a reference to it.
	 */
	return page;
}
#define PROFILE_READAHEAD
#define DEBUG_READAHEAD

/*
 * Read-ahead profiling information
 * --------------------------------
 * Every PROFILE_MAXREADCOUNT, the following information is written
 * to the syslog:
 *   Percentage of asynchronous read-ahead.
 *   Average of read-ahead fields context value.
 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
 * to the syslog as well.
 */

#ifdef PROFILE_READAHEAD

#define PROFILE_MAXREADCOUNT 1000

static unsigned long total_reada;
static unsigned long total_async;
static unsigned long total_ramax;
static unsigned long total_ralen;
static unsigned long total_rawin;

static void profile_readahead(int async, struct file *filp)
{
	unsigned long flags;

	++total_reada;
	if (async)
		++total_async;

	total_ramax += filp->f_ramax;
	total_ralen += filp->f_ralen;
	total_rawin += filp->f_rawin;

	if (total_reada > PROFILE_MAXREADCOUNT) {
		save_flags(flags);
		cli();
		if (!(total_reada > PROFILE_MAXREADCOUNT)) {
			restore_flags(flags);
			return;
		}

		printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
			total_ramax/total_reada,
			total_ralen/total_reada,
			total_rawin/total_reada,
			(total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
		printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
			filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
#endif

		total_reada = 0;
		total_async = 0;
		total_ramax = 0;
		total_ralen = 0;
		total_rawin = 0;

		restore_flags(flags);
	}
}
#endif  /* defined PROFILE_READAHEAD */
/*
 * Read-ahead context:
 * -------------------
 * The read ahead context fields of the "struct file" are the following:
 * - f_raend : position of the first byte after the last page we tried to
 *	       read ahead.
 * - f_ramax : current read-ahead maximum size.
 * - f_ralen : length of the current IO read block we tried to read-ahead.
 * - f_rawin : length of the current read-ahead window.
 *		if last read-ahead was synchronous then
 *			f_rawin = f_ralen
 *		otherwise (was asynchronous)
 *			f_rawin = previous value of f_ralen + f_ralen
 *
 * Read-ahead limits:
 * ------------------
 * MIN_READAHEAD   : minimum read-ahead size when read-ahead.
 * MAX_READAHEAD   : maximum read-ahead size when read-ahead.
 *
 * Synchronous read-ahead benefits:
 * --------------------------------
 * Using reasonable IO xfer lengths from peripheral devices increases system
 * performance.
 * Reasonable means, in this context, not too large but not too small.
 * The actual maximum value is:
 *	MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
 *	and 32K if defined (4K page size assumed).
 *
 * Asynchronous read-ahead benefits:
 * ---------------------------------
 * Overlapping the next read request with user process execution increases
 * system performance.
 *
 * Read-ahead risks:
 * -----------------
 * We have to guess which further data are needed by the user process.
 * If these data are often not really needed, it's bad for system
 * performance.
 * However, we know that files are often accessed sequentially by
 * application programs and it seems that it is possible to have some good
 * strategy in that guessing.
 * We only try to read-ahead files that seem to be read sequentially.
 *
 * Asynchronous read-ahead risks:
 * ------------------------------
 * In order to maximize overlapping, we must start some asynchronous read
 * request from the device, as soon as possible.
 * We must be very careful about:
 * - The number of effective pending IO read requests.
 *   ONE seems to be the only reasonable value.
 * - The total memory pool usage for the file access stream.
 *   This maximum memory usage is implicitly 2 IO read chunks:
 *   2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
 *   64k if defined (4K page size assumed).
 */
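/*
 * Worked example (illustrative, not from the original source), using the
 * page-index units the code below actually stores in these fields: suppose
 * the last synchronous read-ahead block covered page indexes 24..31.  Then
 *
 *	f_raend = 32	(first index after the read-ahead block)
 *	f_ralen = 8	(length of that block, in pages)
 *	f_rawin = 8	(window == f_ralen after a synchronous pass)
 *
 * If the next asynchronous pass reads indexes 32..39, f_raend becomes 40
 * and f_rawin becomes 16 (previous f_ralen + new f_ralen); that window is
 * what do_generic_file_read() tests against to decide whether accesses
 * still look sequential enough to keep reading ahead.
 */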
static inline int get_max_readahead(struct inode * inode)
{
	if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
		return MAX_READAHEAD;
	return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
}
static void generic_file_readahead(int reada_ok,
	struct file * filp, struct inode * inode,
	struct page * page)
{
	unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
	unsigned long index = page->index;
	unsigned long max_ahead, ahead;
	unsigned long raend;
	int max_readahead = get_max_readahead(inode);

	raend = filp->f_raend;
	max_ahead = 0;

/*
 * The current page is locked.
 * If the current position is inside the previous read IO request, do not
 * try to reread previously read ahead pages.
 * Otherwise decide or not to read ahead some pages synchronously.
 * If we are not going to read ahead, set the read ahead context for this
 * page only.
 */
	if (PageLocked(page)) {
		if (!filp->f_ralen || index >= raend || index + filp->f_ralen < raend) {
			raend = index;
			if (raend < end_index)
				max_ahead = filp->f_ramax;
			filp->f_rawin = 0;
			filp->f_ralen = 1;
			if (!max_ahead) {
				filp->f_raend = index + filp->f_ralen;
				filp->f_rawin += filp->f_ralen;
			}
		}
	}
/*
 * The current page is not locked.
 * If we were reading ahead and,
 * if the current max read ahead size is not zero and,
 * if the current position is inside the last read-ahead IO request,
 *   it is the moment to try to read ahead asynchronously.
 * We will later force unplug device in order to force asynchronous read IO.
 */
	else if (reada_ok && filp->f_ramax && raend >= 1 &&
		 index <= raend && index + filp->f_ralen >= raend) {
/*
 * Add ONE page to max_ahead in order to try to have about the same IO max size
 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
 * Compute the position of the last page we have tried to read in order to
 * begin to read ahead just at the next page.
 */
		raend -= 1;
		if (raend < end_index)
			max_ahead = filp->f_ramax + 1;

		if (max_ahead) {
			filp->f_rawin = filp->f_ralen;
			filp->f_ralen = 0;
			reada_ok = 2;
		}
	}
/*
 * Try to read ahead pages.
 * We hope that ll_rw_blk() plug/unplug, coalescence, request sorting and the
 * scheduler will work well enough for us to avoid too many bad actual IO
 * requests.
 */
	ahead = 0;
	while (ahead < max_ahead) {
		ahead++;
		if ((raend + ahead) >= end_index)
			break;
		if (page_cache_read(filp, raend + ahead) < 0)
			break;
	}
/*
 * If we tried to read ahead some pages,
 * If we tried to read ahead asynchronously,
 *   Try to force unplug of the device in order to start an asynchronous
 *   read IO request.
 * Update the read-ahead context.
 * Store the length of the current read-ahead window.
 * Double the current max read ahead size.
 *   That heuristic avoids doing large IO for files that are not really
 *   accessed sequentially.
 */
	if (ahead) {
		if (reada_ok == 2) {
			run_task_queue(&tq_disk);
		}

		filp->f_ralen += ahead;
		filp->f_rawin += filp->f_ralen;
		filp->f_raend = raend + ahead + 1;

		filp->f_ramax += filp->f_ramax;

		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;

#ifdef PROFILE_READAHEAD
		profile_readahead((reada_ok == 2), filp);
#endif
	}

	return;
}
/*
 * This is a generic file read routine, and uses the
 * inode->i_op->readpage() function for the actual low-level
 * stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 */
void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
{
	struct dentry *dentry = filp->f_dentry;
	struct inode *inode = dentry->d_inode;
	unsigned long index, offset;
	struct page *cached_page;
	int reada_ok;
	int error;
	int max_readahead = get_max_readahead(inode);

	cached_page = NULL;
	index = *ppos >> PAGE_CACHE_SHIFT;
	offset = *ppos & ~PAGE_CACHE_MASK;

/*
 * If the current position is outside the previous read-ahead window,
 * we reset the current read-ahead context and set read ahead max to zero
 * (will be set to just needed value later),
 * otherwise, we assume that the file accesses are sequential enough to
 * continue read-ahead.
 */
	if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {
		reada_ok = 0;
		filp->f_raend = 0;
		filp->f_ralen = 0;
		filp->f_ramax = 0;
		filp->f_rawin = 0;
	} else {
		reada_ok = 1;
	}
/*
 * Adjust the current value of read-ahead max.
 * If the read operation stays in the first half of the page, force no
 * readahead. Otherwise try to increase read ahead max just enough to do
 * the read request.
 * Then, at least MIN_READAHEAD if read ahead is ok,
 * and at most MAX_READAHEAD in all cases.
 */
	if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
		filp->f_ramax = 0;
	} else {
		unsigned long needed;

		needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;

		if (filp->f_ramax < needed)
			filp->f_ramax = needed;

		if (reada_ok && filp->f_ramax < MIN_READAHEAD)
			filp->f_ramax = MIN_READAHEAD;
		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;
	}

	for (;;) {
		struct page *page, **hash;
		unsigned long end_index, nr;

		end_index = inode->i_size >> PAGE_CACHE_SHIFT;
		if (index > end_index)
			break;
		nr = PAGE_CACHE_SIZE;
		if (index == end_index) {
			nr = inode->i_size & ~PAGE_CACHE_MASK;
			if (nr <= offset)
				break;
		}

		nr = nr - offset;

		/*
		 * Try to find the data in the page cache..
		 */
		hash = page_hash(&inode->i_data, index);

		spin_lock(&pagecache_lock);
		page = __find_page_nolock(&inode->i_data, index, *hash);
		if (!page)
			goto no_cached_page;
found_page:
		get_page(page);
		spin_unlock(&pagecache_lock);

		if (!Page_Uptodate(page))
			goto page_not_up_to_date;
page_ok:
		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 *
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		nr = actor(desc, page, offset, nr);
		offset += nr;
		index += offset >> PAGE_CACHE_SHIFT;
		offset &= ~PAGE_CACHE_MASK;

		page_cache_release(page);
		if (nr && desc->count)
			continue;
		break;

/*
 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
 */
page_not_up_to_date:
		generic_file_readahead(reada_ok, filp, inode, page);

		if (Page_Uptodate(page))
			goto page_ok;

		/* Get exclusive access to the page ... */
		lock_page(page);
		if (Page_Uptodate(page)) {
			UnlockPage(page);
			goto page_ok;
		}

readpage:
		/* ... and start the actual read. The read will unlock the page. */
		error = inode->i_op->readpage(filp->f_dentry, page);

		if (!error) {
			if (Page_Uptodate(page))
				goto page_ok;

			/* Again, try some read-ahead while waiting for the page to finish.. */
			generic_file_readahead(reada_ok, filp, inode, page);
			wait_on_page(page);
			if (Page_Uptodate(page))
				goto page_ok;
			error = -EIO;
		}

		/* UHHUH! A synchronous read error occurred. Report it */
		desc->error = error;
		page_cache_release(page);
		break;

no_cached_page:
		/*
		 * Ok, it wasn't cached, so we need to create a new
		 * page..
		 *
		 * We get here with the page cache lock held.
		 */
		if (!cached_page) {
			spin_unlock(&pagecache_lock);
			cached_page = page_cache_alloc();
			if (!cached_page) {
				desc->error = -ENOMEM;
				break;
			}

			/*
			 * Somebody may have added the page while we
			 * dropped the page cache lock. Check for that.
			 */
			spin_lock(&pagecache_lock);
			page = __find_page_nolock(&inode->i_data, index, *hash);
			if (page)
				goto found_page;
		}

		/*
		 * Ok, add the new page to the hash-queues...
		 */
		page = cached_page;
		__add_to_page_cache(page, &inode->i_data, index, hash);
		spin_unlock(&pagecache_lock);
		cached_page = NULL;

		goto readpage;
	}

	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
	filp->f_reada = 1;
	if (cached_page)
		page_cache_free(cached_page);
	UPDATE_ATIME(inode);
}
static int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
{
	unsigned long kaddr;
	unsigned long left, count = desc->count;

	if (size > count)
		size = count;

	kaddr = kmap(page);
	left = __copy_to_user(desc->buf, (void *)(kaddr + offset), size);
	kunmap(page);

	if (left) {
		size -= left;
		desc->error = -EFAULT;
	}
	desc->count = count - size;
	desc->written += size;
	desc->buf += size;
	return size;
}
/*
 * This is the "read()" routine for all filesystems
 * that can use the page cache directly.
 */
ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
{
	ssize_t retval;

	retval = -EFAULT;
	if (access_ok(VERIFY_WRITE, buf, count)) {
		retval = 0;
		if (count) {
			read_descriptor_t desc;

			desc.written = 0;
			desc.count = count;
			desc.buf = buf;
			desc.error = 0;
			do_generic_file_read(filp, ppos, &desc, file_read_actor);

			retval = desc.written;
			if (!retval)
				retval = desc.error;
		}
	}
	return retval;
}
static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
{
	unsigned long kaddr;
	ssize_t written;
	unsigned long count = desc->count;
	struct file *file = (struct file *) desc->buf;
	mm_segment_t old_fs;

	if (size > count)
		size = count;
	old_fs = get_fs();
	set_fs(KERNEL_DS);

	kaddr = kmap(page);
	written = file->f_op->write(file, (char *)kaddr + offset,
				    size, &file->f_pos);
	kunmap(page);
	set_fs(old_fs);

	if (written < 0) {
		desc->error = written;
		written = 0;
	}
	desc->count = count - written;
	desc->written += written;
	return written;
}
asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
{
	ssize_t retval;
	struct file * in_file, * out_file;
	struct inode * in_inode, * out_inode;

	/*
	 * Get input file, and verify that it is ok..
	 */
	retval = -EBADF;
	in_file = fget(in_fd);
	if (!in_file)
		goto out;
	if (!(in_file->f_mode & FMODE_READ))
		goto fput_in;
	retval = -EINVAL;
	in_inode = in_file->f_dentry->d_inode;
	if (!in_inode)
		goto fput_in;
	if (!in_inode->i_op || !in_inode->i_op->readpage)
		goto fput_in;
	retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
	if (retval)
		goto fput_in;

	/*
	 * Get output file, and verify that it is ok..
	 */
	retval = -EBADF;
	out_file = fget(out_fd);
	if (!out_file)
		goto fput_in;
	if (!(out_file->f_mode & FMODE_WRITE))
		goto fput_out;
	retval = -EINVAL;
	if (!out_file->f_op || !out_file->f_op->write)
		goto fput_out;
	out_inode = out_file->f_dentry->d_inode;
	if (!out_inode)
		goto fput_out;
	retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
	if (retval)
		goto fput_out;

	retval = 0;
	if (count) {
		read_descriptor_t desc;
		loff_t pos = 0, *ppos;

		retval = -EFAULT;
		ppos = &in_file->f_pos;
		if (offset) {
			if (get_user(pos, offset))
				goto fput_out;
			ppos = &pos;
		}

		desc.written = 0;
		desc.count = count;
		desc.buf = (char *) out_file;
		desc.error = 0;
		do_generic_file_read(in_file, ppos, &desc, file_send_actor);

		retval = desc.written;
		if (!retval)
			retval = desc.error;
		if (offset)
			put_user(pos, offset);
	}

fput_out:
	fput(out_file);
fput_in:
	fput(in_file);
out:
	return retval;
}
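/*
 * Illustrative userspace sketch (an assumption, not part of this file):
 * copying the first 64KB of one file into another through the syscall
 * implemented above.  Note that in this kernel the output side goes
 * through out_file->f_op->write(), so out_fd may be a socket or a file.
 */
#if 0	/* example only, built as a normal user program */
#include <sys/sendfile.h>
#include <fcntl.h>
#include <unistd.h>

int example_copy_64k(const char *from, const char *to)
{
	int in_fd = open(from, O_RDONLY);
	int out_fd = open(to, O_WRONLY | O_CREAT | O_TRUNC, 0644);
	off_t offset = 0;	/* read position within the input file */
	ssize_t n = -1;

	if (in_fd >= 0 && out_fd >= 0)
		n = sendfile(out_fd, in_fd, &offset, 64 * 1024);
	if (in_fd >= 0)
		close(in_fd);
	if (out_fd >= 0)
		close(out_fd);
	return n < 0 ? -1 : 0;
}
#endif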
/*
 * filemap_nopage() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 */
struct page * filemap_nopage(struct vm_area_struct * area,
	unsigned long address, int no_share)
{
	int error;
	struct file *file = area->vm_file;
	struct dentry *dentry = file->f_dentry;
	struct inode *inode = dentry->d_inode;
	struct page *page, **hash, *old_page;
	unsigned long size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	unsigned long pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;

	/*
	 * Semantics for shared and private memory areas are different
	 * past the end of the file. A shared mapping past the last page
	 * of the file is an error and results in a SIGBUS, while a
	 * private mapping just maps in a zero page.
	 */
	if ((pgoff >= size) &&
		(area->vm_flags & VM_SHARED) && (area->vm_mm == current->mm))
		return NULL;

	/*
	 * Do we have something in the page cache already?
	 */
	hash = page_hash(&inode->i_data, pgoff);
retry_find:
	page = __find_get_page(&inode->i_data, pgoff, hash);
	if (!page)
		goto no_cached_page;

	/*
	 * Ok, found a page in the page cache, now we need to check
	 * that it's up-to-date.
	 */
	if (!Page_Uptodate(page))
		goto page_not_uptodate;

success:
	/*
	 * Found the page and have a reference on it, need to check sharing
	 * and possibly copy it over to another page..
	 */
	old_page = page;
	if (no_share) {
		struct page *new_page = page_cache_alloc();

		if (new_page) {
			copy_highpage(new_page, old_page);
			flush_page_to_ram(new_page);
		} else
			new_page = NOPAGE_OOM;
		page_cache_release(page);
		return new_page;
	}

	flush_page_to_ram(old_page);
	return old_page;

no_cached_page:
	/*
	 * If the requested offset is within our file, try to read a whole
	 * cluster of pages at once.
	 *
	 * Otherwise, we're off the end of a privately mapped file,
	 * so we need to map a zero page.
	 */
	if (pgoff < size)
		error = read_cluster_nonblocking(file, pgoff, size);
	else
		error = page_cache_read(file, pgoff);

	/*
	 * The page we want has now been added to the page cache.
	 * In the unlikely event that someone removed it in the
	 * meantime, we'll just come back here and read it again.
	 */
	if (error >= 0)
		goto retry_find;

	/*
	 * An error return from page_cache_read can result if the
	 * system is low on memory, or a problem occurs while trying
	 * to schedule the I/O.
	 */
	if (error == -ENOMEM)
		return NOPAGE_OOM;
	return NULL;

page_not_uptodate:
	lock_page(page);
	if (Page_Uptodate(page)) {
		UnlockPage(page);
		goto success;
	}

	if (!inode->i_op->readpage(file->f_dentry, page)) {
		wait_on_page(page);
		if (Page_Uptodate(page))
			goto success;
	}

	/*
	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	 */
	lock_page(page);
	if (Page_Uptodate(page)) {
		UnlockPage(page);
		goto success;
	}
	ClearPageError(page);
	if (!inode->i_op->readpage(file->f_dentry, page)) {
		wait_on_page(page);
		if (Page_Uptodate(page))
			goto success;
	}

	/*
	 * Things didn't work out. Return zero to tell the
	 * mm layer so, possibly freeing the page cache page first.
	 */
	page_cache_release(page);
	return NULL;
}
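/*
 * Worked example (illustrative, not from the original source): for a file
 * of 5000 bytes and PAGE_CACHE_SIZE == 4096, "size" above is 2, so page
 * offsets 0 and 1 are valid.  Faulting a MAP_SHARED mapping at pgoff 2
 * makes this function return NULL, which the fault handler turns into a
 * SIGBUS, while a MAP_PRIVATE mapping at the same offset falls through to
 * the "off the end of a privately mapped file" path and gets a zero-filled
 * page instead.
 */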
/*
 * Tries to write a shared mapped page to its backing store. May return -EIO
 * if the disk is full.
 */
static inline int do_write_page(struct inode * inode, struct file * file,
	struct page * page, unsigned long index)
{
	int retval;
	int (*writepage) (struct dentry *, struct page *);

	/* refuse to extend file size.. */
	if (S_ISREG(inode->i_mode)) {
		unsigned long size_idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

		/* Ho humm.. We should have tested for this earlier */
		if (size_idx <= index)
			return -EIO;
	}
	writepage = inode->i_op->writepage;
	lock_page(page);

	retval = writepage(file->f_dentry, page);

	UnlockPage(page);
	return retval;
}
static int filemap_write_page(struct file *file,
			      unsigned long index,
			      struct page * page,
			      int wait)
{
	int result;
	struct dentry * dentry;
	struct inode * inode;

	dentry = file->f_dentry;
	inode = dentry->d_inode;

	/*
	 * If a task terminates while we're swapping the page, the vma
	 * and file could be released: try_to_swap_out has done a get_file.
	 * vma/file is guaranteed to exist in the unmap/sync cases because
	 * mmap_sem is held.
	 */
	result = do_write_page(inode, file, page, index);
	return result;
}
/*
 * The page cache takes care of races between somebody
 * trying to swap something out and swap something in
 * at the same time..
 */
extern void wakeup_bdflush(int);
int filemap_swapout(struct page * page, struct file * file)
{
	int retval = filemap_write_page(file, page->index, page, 0);
	wakeup_bdflush(0);
	return retval;
}
static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
	unsigned long address, unsigned int flags)
{
	unsigned long pgoff;
	pte_t pte = *ptep;
	struct page *page;
	int error;

	if (!(flags & MS_INVALIDATE)) {
		if (!pte_present(pte))
			return 0;
		if (!pte_dirty(pte))
			return 0;
		flush_page_to_ram(pte_page(pte));
		flush_cache_page(vma, address);
		set_pte(ptep, pte_mkclean(pte));
		flush_tlb_page(vma, address);
		page = pte_page(pte);
		get_page(page);
	} else {
		if (pte_none(pte))
			return 0;
		flush_cache_page(vma, address);
		pte_clear(ptep);
		flush_tlb_page(vma, address);
		if (!pte_present(pte)) {
			swap_free(pte_to_swp_entry(pte));
			return 0;
		}
		page = pte_page(pte);
		if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
			page_cache_free(page);
			return 0;
		}
	}
	pgoff = (address - vma->vm_start) >> PAGE_CACHE_SHIFT;
	pgoff += vma->vm_pgoff;
	if (page->index != pgoff) {
		printk("weirdness: pgoff=%lu index=%lu address=%lu vm_start=%lu vm_pgoff=%lu\n",
			pgoff, page->index, address, vma->vm_start, vma->vm_pgoff);
	}
	error = filemap_write_page(vma->vm_file, pgoff, page, 1);
	page_cache_free(page);
	return error;
}
static inline int filemap_sync_pte_range(pmd_t * pmd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
{
	pte_t * pte;
	unsigned long end;
	int error;

	if (pmd_none(*pmd))
		return 0;
	if (pmd_bad(*pmd)) {
		pmd_ERROR(*pmd);
		pmd_clear(pmd);
		return 0;
	}
	pte = pte_offset(pmd, address);
	offset += address & PMD_MASK;
	address &= ~PMD_MASK;
	end = address + size;
	if (end > PMD_SIZE)
		end = PMD_SIZE;
	error = 0;
	do {
		error |= filemap_sync_pte(pte, vma, address + offset, flags);
		address += PAGE_SIZE;
		pte++;
	} while (address && (address < end));
	return error;
}
static inline int filemap_sync_pmd_range(pgd_t * pgd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned int flags)
{
	pmd_t * pmd;
	unsigned long offset, end;
	int error;

	if (pgd_none(*pgd))
		return 0;
	if (pgd_bad(*pgd)) {
		pgd_ERROR(*pgd);
		pgd_clear(pgd);
		return 0;
	}
	pmd = pmd_offset(pgd, address);
	offset = address & PGDIR_MASK;
	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		end = PGDIR_SIZE;
	error = 0;
	do {
		error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address && (address < end));
	return error;
}
int filemap_sync(struct vm_area_struct * vma, unsigned long address,
	size_t size, unsigned int flags)
{
	pgd_t * dir;
	unsigned long end = address + size;
	int error = 0;

	dir = pgd_offset(vma->vm_mm, address);
	flush_cache_range(vma->vm_mm, end - size, end);
	do {
		error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		dir++;
	} while (address && (address < end));
	flush_tlb_range(vma->vm_mm, end - size, end);
	return error;
}
/*
 * This handles (potentially partial) area unmaps..
 */
static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
{
	filemap_sync(vma, start, len, MS_ASYNC);
}
/*
 * Shared mappings need to be able to do the right thing at
 * close/unmap/sync. They will also use the private file as
 * backing-store for swapping..
 */
static struct vm_operations_struct file_shared_mmap = {
	NULL,			/* no special open */
	NULL,			/* no special close */
	filemap_unmap,		/* unmap - we need to sync the pages */
	NULL,			/* no special protect */
	filemap_sync,		/* sync */
	NULL,			/* advise */
	filemap_nopage,		/* nopage */
	NULL,			/* wppage */
	filemap_swapout		/* swapout */
};

/*
 * Private mappings just need to be able to load in the map.
 *
 * (This is actually used for shared mappings as well, if we
 * know they can't ever get write permissions..)
 */
static struct vm_operations_struct file_private_mmap = {
	NULL,			/* no special open */
	NULL,			/* no special close */
	NULL,			/* no special unmap */
	NULL,			/* no special protect */
	NULL,			/* no special sync */
	NULL,			/* advise */
	filemap_nopage,		/* nopage */
};
/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	struct vm_operations_struct * ops;
	struct inode *inode = file->f_dentry->d_inode;

	ops = &file_private_mmap;
	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
		if (!inode->i_op || !inode->i_op->writepage)
			return -EINVAL;
		ops = &file_shared_mmap;
	}
	if (!inode->i_sb || !S_ISREG(inode->i_mode))
		return -EACCES;
	if (!inode->i_op || !inode->i_op->readpage)
		return -ENOEXEC;
	UPDATE_ATIME(inode);
	vma->vm_ops = ops;
	return 0;
}
/*
 * The msync() system call.
 */

static int msync_interval(struct vm_area_struct * vma,
	unsigned long start, unsigned long end, int flags)
{
	if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
		int error;
		error = vma->vm_ops->sync(vma, start, end-start, flags);
		if (!error && (flags & MS_SYNC)) {
			struct file * file = vma->vm_file;
			if (file) {
				struct dentry * dentry = file->f_dentry;
				error = file_fsync(file, dentry);
			}
		}
		return error;
	}
	return 0;
}
asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
{
	unsigned long end;
	struct vm_area_struct * vma;
	int unmapped_error, error = -EINVAL;

	down(&current->mm->mmap_sem);
	lock_kernel();
	if (start & ~PAGE_MASK)
		goto out;
	len = (len + ~PAGE_MASK) & PAGE_MASK;
	end = start + len;
	if (end < start)
		goto out;
	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
		goto out;
	error = 0;
	if (end == start)
		goto out;
	/*
	 * If the interval [start,end) covers some unmapped address ranges,
	 * just ignore them, but return -EFAULT at the end.
	 */
	vma = find_vma(current->mm, start);
	unmapped_error = 0;
	for (;;) {
		/* Still start < end. */
		error = -EFAULT;
		if (!vma)
			goto out;
		/* Here start < vma->vm_end. */
		if (start < vma->vm_start) {
			unmapped_error = -EFAULT;
			start = vma->vm_start;
		}
		/* Here vma->vm_start <= start < vma->vm_end. */
		if (end <= vma->vm_end) {
			if (start < end) {
				error = msync_interval(vma, start, end, flags);
				if (error)
					goto out;
			}
			error = unmapped_error;
			goto out;
		}
		/* Here vma->vm_start <= start < vma->vm_end < end. */
		error = msync_interval(vma, start, vma->vm_end, flags);
		if (error)
			goto out;
		start = vma->vm_end;
		vma = vma->vm_next;
	}
out:
	unlock_kernel();
	up(&current->mm->mmap_sem);
	return error;
}
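/*
 * Illustrative userspace sketch (an assumption, not part of this file):
 * flushing a modified shared mapping back to its file through the syscall
 * above.  MS_SYNC makes msync_interval() follow the vm_ops->sync() pass
 * with a file_fsync(), so the data is on disk when the call returns.
 */
#if 0	/* example only, built as a normal user program */
#include <sys/mman.h>

int example_flush(void *map, size_t len)
{
	/* map must be page aligned, as checked by sys_msync() above */
	return msync(map, len, MS_SYNC);
}
#endif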
struct page *read_cache_page(struct address_space *mapping,
				unsigned long index,
				int (*filler)(void *,struct page*),
				void *data)
{
	struct page **hash = page_hash(mapping, index);
	struct page *page, *cached_page = NULL;
	int err;
repeat:
	page = __find_get_page(mapping, index, hash);
	if (!page) {
		if (!cached_page) {
			cached_page = page_cache_alloc();
			if (!cached_page)
				return ERR_PTR(-ENOMEM);
		}
		page = cached_page;
		if (add_to_page_cache_unique(page, mapping, index, hash))
			goto repeat;
		cached_page = NULL;
		err = filler(data, page);
		if (err < 0) {
			page_cache_release(page);
			page = ERR_PTR(err);
		}
	}
	if (cached_page)
		page_cache_free(cached_page);
	return page;
}
static inline struct page * __grab_cache_page(struct address_space *mapping,
				unsigned long index, struct page **cached_page)
{
	struct page *page, **hash = page_hash(mapping, index);
repeat:
	page = __find_lock_page(mapping, index, hash);
	if (!page) {
		if (!*cached_page) {
			*cached_page = page_cache_alloc();
			if (!*cached_page)
				return NULL;
		}
		page = *cached_page;
		if (add_to_page_cache_unique(page, mapping, index, hash))
			goto repeat;
		*cached_page = NULL;
	}
	return page;
}
/*
 * Returns locked page at given index in given cache, creating it if needed.
 */
struct page *grab_cache_page(struct address_space *mapping, unsigned long index)
{
	struct page *cached_page = NULL;
	struct page *page = __grab_cache_page(mapping,index,&cached_page);
	if (cached_page)
		page_cache_free(cached_page);
	return page;
}
/*
 * Write to a file through the page cache. This is mainly for the
 * benefit of NFS and possibly other network-based file systems.
 *
 * We currently put everything into the page cache prior to writing it.
 * This is not a problem when writing full pages. With partial pages,
 * however, we first have to read the data into the cache, then
 * dirty the page, and finally schedule it for writing. Alternatively, we
 * could write-through just the portion of data that would go into that
 * page, but that would kill performance for applications that write data
 * line by line, and it's prone to race conditions.
 *
 * Note that this routine doesn't try to keep track of dirty pages. Each
 * file system has to do this all by itself, unfortunately.
 */
ssize_t
generic_file_write(struct file *file, const char *buf,
		   size_t count, loff_t *ppos,
		   writepage_t write_one_page)
{
	struct dentry	*dentry = file->f_dentry;
	struct inode	*inode = dentry->d_inode;
	unsigned long	limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
	loff_t		pos;
	struct page	*page, *cached_page;
	unsigned long	written;
	long		status;
	int		err;

	cached_page = NULL;

	down(&inode->i_sem);

	pos = *ppos;
	err = -EINVAL;
	if (pos < 0)
		goto out;

	err = file->f_error;
	if (err) {
		file->f_error = 0;
		goto out;
	}

	written = 0;

	if (file->f_flags & O_APPEND)
		pos = inode->i_size;

	/*
	 * Check whether we've reached the file size limit.
	 */
	err = -EFBIG;
	if (limit != RLIM_INFINITY) {
		if (pos >= limit) {
			send_sig(SIGXFSZ, current, 0);
			goto out;
		}
		if (count > limit - pos) {
			send_sig(SIGXFSZ, current, 0);
			count = limit - pos;
		}
	}

	status = 0;

	while (count) {
		unsigned long bytes, index, offset;

		/*
		 * Try to find the page in the cache. If it isn't there,
		 * allocate a free page.
		 */
		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
		index = pos >> PAGE_CACHE_SHIFT;
		bytes = PAGE_CACHE_SIZE - offset;
		if (bytes > count)
			bytes = count;

		status = -ENOMEM;	/* we'll assign it later anyway */
		page = __grab_cache_page(&inode->i_data, index, &cached_page);
		if (!page)
			break;

		/* We have exclusive IO access to the page.. */
		if (!PageLocked(page)) {
			PAGE_BUG(page);
		}

		status = write_one_page(file, page, offset, bytes, buf);

		if (status >= 0) {
			written += status;
			count -= status;
			pos += status;
			buf += status;
			if (pos > inode->i_size)
				inode->i_size = pos;
		}
		/* Mark it unlocked again and drop the page.. */
		UnlockPage(page);
		page_cache_release(page);

		if (status < 0)
			break;
	}
	*ppos = pos;

	if (cached_page)
		page_cache_free(cached_page);

	err = written ? written : status;
out:
	up(&inode->i_sem);
	return err;
}
void __init page_cache_init(unsigned long mempages)
{
	unsigned long htable_size, order;

	htable_size = mempages;
	htable_size *= sizeof(struct page *);
	for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
		;

	do {
		unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);

		page_hash_bits = 0;
		while((tmp >>= 1UL) != 0UL)
			page_hash_bits++;

		page_hash_table = (struct page **)
			__get_free_pages(GFP_ATOMIC, order);
	} while(page_hash_table == NULL && --order > 0);

	printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
	       (1 << page_hash_bits), order, (PAGE_SIZE << order));
	if (!page_hash_table)
		panic("Failed to allocate page hash table\n");
	memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
}