4 * Copyright (C) 1994-1999 Linus Torvalds
8 * This file handles the generic file mmap semantics used by
9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem used to do this differently, for example)
12 #include <linux/malloc.h>
13 #include <linux/shm.h>
14 #include <linux/mman.h>
15 #include <linux/locks.h>
16 #include <linux/pagemap.h>
17 #include <linux/swap.h>
18 #include <linux/smp_lock.h>
19 #include <linux/blkdev.h>
20 #include <linux/file.h>
21 #include <linux/swapctl.h>
22 #include <linux/slab.h>
23 #include <linux/init.h>
25 #include <asm/pgtable.h>
26 #include <asm/uaccess.h>
29 * Shared mappings implemented 30.11.1994. It's not fully working yet,
32 * Shared mappings now work. 15.8.1995 Bruno.
34 * finished 'unifying' the page and buffer cache and SMP-threaded the
35 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
37 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
/*
 * Page-cache global state.
 * NOTE(review): this file is a garbled extract -- many original lines
 * are missing and logical lines are split across physical lines.
 * Comments below state only what the visible code shows.
 */
/* Number of pages currently in the page cache. */
40 atomic_t page_cache_size
= ATOMIC_INIT(0);
/* Sizing and storage of the (inode, offset) -> page hash table. */
41 unsigned int page_hash_bits
;
42 struct page
**page_hash_table
;
/* Protects the page hash chains and the per-inode page lists. */
44 spinlock_t pagecache_lock
= SPIN_LOCK_UNLOCKED
;
46 * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
47 * the pagemap_lru_lock held.
49 spinlock_t pagemap_lru_lock
= SPIN_LOCK_UNLOCKED
;
/* Read-ahead clustering: a cluster is (1 << page_cluster) cache pages. */
51 #define CLUSTER_PAGES (1 << page_cluster)
52 #define CLUSTER_SHIFT (PAGE_CACHE_SHIFT + page_cluster)
53 #define CLUSTER_BYTES (1 << CLUSTER_SHIFT)
/* Round a byte offset down to the start of its cluster. */
54 #define CLUSTER_OFFSET(x) (((x) >> CLUSTER_SHIFT) << CLUSTER_SHIFT)
/*
 * Link "page" into the hash chain anchored at *p and account it in
 * page_cache_size.  Caller is expected to hold pagecache_lock.
 * NOTE(review): the tail of this function (storing the page back into
 * *p and setting page->pprev_hash) is missing from this extract --
 * confirm against the full file.
 */
56 void __add_page_to_hash_queue(struct page
* page
, struct page
**p
)
58 atomic_inc(&page_cache_size
);
/* Head insertion into the doubly-linked hash chain. */
59 if((page
->next_hash
= *p
) != NULL
)
60 (*p
)->pprev_hash
= &page
->next_hash
;
/*
 * Unlink "page" from its hash chain (only if hashed, i.e. pprev_hash
 * is set) and decrement page_cache_size.
 * NOTE(review): line 71 dereferences page->next_hash -- the usual
 * NULL check for the chain tail is not visible in this extract;
 * verify against the full file.
 */
67 static void remove_page_from_hash_queue(struct page
* page
)
69 if(page
->pprev_hash
) {
71 page
->next_hash
->pprev_hash
= page
->pprev_hash
;
72 *page
->pprev_hash
= page
->next_hash
;
/* Mark the page unhashed so a repeated removal is a no-op. */
73 page
->pprev_hash
= NULL
;
75 atomic_dec(&page_cache_size
);
/*
 * Remove "page" from its inode's i_pages list.
 * NOTE(review): only the list-head fixup is visible here; the
 * prev/next pointer surgery is missing from this extract.
 */
78 static void remove_page_from_inode_queue(struct page
* page
)
80 struct inode
* inode
= page
->inode
;
81 struct page
*prev
, *next
;
/* If the page was the list head, advance the head past it. */
86 if (inode
->i_pages
== page
)
87 inode
->i_pages
= next
;
97 * Remove a page from the page cache and free it. Caller has to make
98 * sure the page is locked and that nobody else uses it - or that usage
/*
 * Drops "page" from both the inode queue and the hash queue under
 * pagecache_lock.  The PageLocked() check enforces the caller's
 * locking contract (its failure branch is missing in this extract).
 */
101 void remove_inode_page(struct page
*page
)
103 if (!PageLocked(page
))
106 spin_lock(&pagecache_lock
);
107 remove_page_from_inode_queue(page
);
108 remove_page_from_hash_queue(page
);
110 spin_unlock(&pagecache_lock
);
/*
 * Throw away all cached pages of an inode.  Walks the i_pages list
 * under pagecache_lock, skipping (and presumably retrying) pages that
 * are currently locked.
 * NOTE(review): the list-walk setup, retry label and per-page unlock
 * steps are missing from this extract.
 */
113 void invalidate_inode_pages(struct inode
* inode
)
119 spin_lock(&pagecache_lock
);
121 while ((page
= *p
) != NULL
) {
/* Somebody else holds the page lock: drop our lock/ref and retry. */
123 if (TryLockPage(page
)) {
124 spin_unlock(&pagecache_lock
);
126 page_cache_release(page
);
/* Diagnostic only -- count==2 means "us + the cache".  (The printk
   text contains a typo, "necesserily"; left as-is since it is a
   runtime string.) */
129 if (page_count(page
) != 2)
130 printk("hm, busy page invalidated? (not necesserily a bug)\n");
133 remove_page_from_inode_queue(page
);
134 remove_page_from_hash_queue(page
);
/* Two releases: one for our reference, one for the cache's. */
137 page_cache_release(page
);
138 page_cache_release(page
);
141 spin_unlock(&pagecache_lock
);
144 * Truncate the page cache at a set offset, removing the pages
145 * that are beyond that offset (and zeroing out partial pages).
/*
 * NOTE(review): the hash-walk setup, restart label and several
 * lock/unlock/wait steps are missing from this extract; comments
 * below describe only the visible logic.
 */
147 void truncate_inode_pages(struct inode
* inode
, unsigned long start
)
154 spin_lock(&pagecache_lock
);
156 while ((page
= *p
) != NULL
) {
157 unsigned long offset
= page
->offset
;
159 /* page wholly truncated - free it */
160 if (offset
>= start
) {
162 spin_unlock(&pagecache_lock
);
/* Destroy buffer-cache references before unhashing the page. */
166 if (!inode
->i_op
->flushpage
||
167 inode
->i_op
->flushpage(inode
, page
, 0))
171 * We remove the page from the page cache
172 * _after_ we have destroyed all buffer-cache
173 * references to it. Otherwise some other process
174 * might think this inode page is not in the
175 * page cache and creates a buffer-cache alias
176 * to it causing all sorts of fun problems ...
178 remove_inode_page(page
);
181 page_cache_release(page
);
182 page_cache_release(page
);
185 * We have done things without the pagecache lock,
186 * so we'll have to repeat the scan.
187 * It's not possible to deadlock here because
188 * we are guaranteed to make progress. (ie. we have
189 * just removed a page)
195 * there is only one partial page possible.
/* From here on "offset" is the number of bytes kept in the page. */
200 offset
= start
- offset
;
201 /* partial truncate, clear end of page */
202 if (offset
< PAGE_CACHE_SIZE
) {
203 unsigned long address
;
205 spin_unlock(&pagecache_lock
);
/* Zero the tail of the page beyond the new end-of-file. */
210 address
= page_address(page
);
211 memset((void *) (offset
+ address
), 0, PAGE_CACHE_SIZE
- offset
);
212 flush_page_to_ram(address
);
/* Let the filesystem invalidate buffers past "offset". */
214 if (inode
->i_op
->flushpage
)
215 inode
->i_op
->flushpage(inode
, page
, offset
);
217 * we have dropped the spinlock so we have to
221 page_cache_release(page
);
225 spin_unlock(&pagecache_lock
);
/*
 * Scan the LRU list and try to free page-cache / buffer / swap-cache
 * pages.  Lock ordering follows the file-top NOTE: pagemap_lru_lock
 * is dropped before pagecache_lock is taken.
 * NOTE(review): local declarations, several labels, return statements
 * and loop braces are missing from this extract.
 */
228 int shrink_mmap(int priority
, int gfp_mask
)
234 struct list_head
* page_lru
, * dispose
;
/* Higher priority (smaller divisor) scans a larger LRU fraction. */
237 count
= nr_lru_pages
/ (priority
+1);
239 spin_lock(&pagemap_lru_lock
);
241 while (count
> 0 && (page_lru
= lru_cache
.prev
) != &lru_cache
) {
242 page
= list_entry(page_lru
, struct page
, lru
);
245 dispose
= &lru_cache
;
/* Recently-referenced pages get another trip around the LRU. */
246 if (test_and_clear_bit(PG_referenced
, &page
->flags
))
247 /* Roll the page at the top of the lru list,
248 * we could also be more aggressive putting
249 * the page in the young-dispose-list, so
250 * avoiding to free young pages in each pass.
252 goto dispose_continue
;
255 /* don't account passes over not DMA pages */
256 if ((gfp_mask
& __GFP_DMA
) && !PageDMA(page
))
257 goto dispose_continue
;
258 if (!(gfp_mask
& __GFP_BIGMEM
) && PageBIGMEM(page
))
259 goto dispose_continue
;
264 if (TryLockPage(page
))
265 goto dispose_continue
;
267 /* Release the pagemap_lru lock even if the page is not yet
268 queued in any lru queue since we have just locked down
269 the page so nobody else may SMP race with us running
270 a lru_cache_del() (lru_cache_del() always run with the
271 page locked down ;). */
272 spin_unlock(&pagemap_lru_lock
);
274 /* avoid unscalable SMP locking */
275 if (!page
->buffers
&& page_count(page
) > 1)
276 goto unlock_noput_continue
;
278 /* Take the pagecache_lock spinlock held to avoid
279 other tasks to notice the page while we are looking at its
280 page count. If it's a pagecache-page we'll free it
281 in one atomic transaction after checking its page count. */
282 spin_lock(&pagecache_lock
);
284 /* avoid freeing the page while it's locked */
287 /* Is it a buffer page? */
289 spin_unlock(&pagecache_lock
);
290 if (!try_to_free_buffers(page
))
291 goto unlock_continue
;
292 /* page was locked, inode can't go away under us */
294 atomic_sub(PAGE_CACHE_SIZE
, &buffermem
);
295 goto made_buffer_progress
;
297 spin_lock(&pagecache_lock
);
301 * We can't free pages unless there's just one user
302 * (count == 2 because we added one ourselves above).
304 if (page_count(page
) != 2)
305 goto cache_unlock_continue
;
308 * Is it a page swap page? If so, we want to
309 * drop it if it is no longer used, even if it
310 * were to be marked referenced..
312 if (PageSwapCache(page
)) {
313 spin_unlock(&pagecache_lock
);
314 __delete_from_swap_cache(page
);
315 goto made_inode_progress
;
318 /* is it a page-cache page? */
322 if (!pgcache_under_min())
324 remove_page_from_inode_queue(page
);
325 remove_page_from_hash_queue(page
);
327 spin_unlock(&pagecache_lock
);
328 goto made_inode_progress
;
330 goto cache_unlock_continue
;
/* Page on the LRU but in no recognized cache: should not happen. */
334 printk(KERN_ERR
"shrink_mmap: unknown LRU page!\n");
336 cache_unlock_continue
:
337 spin_unlock(&pagecache_lock
);
341 dispose_relock_continue
:
342 /* even if the dispose list is local, a truncate_inode_page()
343 may remove a page from its queue so always
344 synchronize with the lru lock while accesing the
346 spin_lock(&pagemap_lru_lock
);
347 list_add(page_lru
, dispose
);
350 unlock_noput_continue
:
352 goto dispose_relock_continue
;
355 list_add(page_lru
, dispose
);
360 page_cache_release(page
);
361 made_buffer_progress
:
365 spin_lock(&pagemap_lru_lock
);
366 /* nr_lru_pages needs the spinlock */
/* Put surviving pages back: young at the head, old at the tail. */
370 list_splice(&young
, &lru_cache
);
371 list_splice(&old
, lru_cache
.prev
);
373 spin_unlock(&pagemap_lru_lock
);
/*
 * Walk a hash chain looking for the page at (inode, offset).  Caller
 * holds pagecache_lock.  On a hit the page is marked referenced for
 * LRU aging in shrink_mmap().
 * NOTE(review): the loop head, not-found exit and return statement
 * are missing from this extract.
 */
378 static inline struct page
* __find_page_nolock(struct inode
* inode
, unsigned long offset
, struct page
*page
)
383 page
= page
->next_hash
;
387 if (page
->inode
!= inode
)
389 if (page
->offset
== offset
)
392 set_bit(PG_referenced
, &page
->flags
);
398 * By the time this is called, the page is locked and
399 * we don't have to worry about any races any more.
/*
 * First fdatasync pass: start write-out of every dirty, up-to-date,
 * unlocked buffer attached to the page.
 * NOTE(review): the loop setup and return value are missing from
 * this extract.
 */
403 static int writeout_one_page(struct page
*page
)
405 struct buffer_head
*bh
, *head
= page
->buffers
;
/* Only dirty + uptodate + unlocked buffers are queued for WRITE. */
409 if (buffer_locked(bh
) || !buffer_dirty(bh
) || !buffer_uptodate(bh
))
413 ll_rw_block(WRITE
, 1, &bh
);
414 } while ((bh
= bh
->b_this_page
) != head
);
/*
 * Second fdatasync pass: wait for the page's buffers and flag an
 * error for any requested buffer that is still not up to date.
 * NOTE(review): the wait call and the error accumulation/return are
 * missing from this extract.
 */
418 static int waitfor_one_page(struct page
*page
)
421 struct buffer_head
*bh
, *head
= page
->buffers
;
426 if (buffer_req(bh
) && !buffer_uptodate(bh
))
428 } while ((bh
= bh
->b_this_page
) != head
);
/*
 * Apply "fn" (writeout_one_page / waitfor_one_page) to every cached
 * page of the inode whose offset lies in [start, end).
 * NOTE(review): the per-page locking, restart logic and the
 * accumulated return value are missing from this extract.
 */
432 static int do_buffer_fdatasync(struct inode
*inode
, unsigned long start
, unsigned long end
, int (*fn
)(struct page
*))
439 spin_lock(&pagecache_lock
);
440 next
= inode
->i_pages
;
442 struct page
*page
= next
;
/* Range filter on each candidate page. */
446 if (page
->offset
>= end
)
448 if (page
->offset
< start
)
452 spin_unlock(&pagecache_lock
);
455 /* The buffers could have been free'd while we waited for the page lock */
460 spin_lock(&pagecache_lock
);
462 page_cache_release(page
);
464 spin_unlock(&pagecache_lock
);
470 * Two-stage data sync: first start the IO, then go back and
471 * collect the information..
/*
 * Returns the OR of both passes' results.
 * NOTE(review): the declaration of "retval" and the final return are
 * missing from this extract.
 */
473 int generic_buffer_fdatasync(struct inode
*inode
, unsigned long start
, unsigned long end
)
477 retval
= do_buffer_fdatasync(inode
, start
, end
, writeout_one_page
);
478 retval
|= do_buffer_fdatasync(inode
, start
, end
, waitfor_one_page
);
483 * This adds a page to the page cache, starting out as locked,
484 * owned by us, referenced, but not uptodate and with no errors.
/*
 * Caller holds pagecache_lock.
 * NOTE(review): the reference-count bump and the page->inode
 * assignment are missing from this extract.
 */
486 static inline void __add_to_page_cache(struct page
* page
,
487 struct inode
* inode
, unsigned long offset
,
/* Clear stale state bits, then hand the page back locked. */
492 flags
= page
->flags
& ~((1 << PG_uptodate
) | (1 << PG_error
) | (1 << PG_referenced
));
493 page
->flags
= flags
| (1 << PG_locked
);
494 page
->owner
= current
; /* REMOVEME */
496 page
->offset
= offset
;
497 add_page_to_inode_queue(inode
, page
);
498 __add_page_to_hash_queue(page
, hash
);
502 void add_to_page_cache(struct page
* page
, struct inode
* inode
, unsigned long offset
)
504 spin_lock(&pagecache_lock
);
505 __add_to_page_cache(page
, inode
, offset
, page_hash(inode
, offset
));
506 spin_unlock(&pagecache_lock
);
/*
 * Like add_to_page_cache() but only inserts when no page for
 * (inode, offset) is already hashed.
 * NOTE(review): the handling of a found alias and the return values
 * are missing from this extract.
 */
509 int add_to_page_cache_unique(struct page
* page
,
510 struct inode
* inode
, unsigned long offset
,
516 spin_lock(&pagecache_lock
);
/* Re-check under the lock for a racing insertion. */
517 alias
= __find_page_nolock(inode
, offset
, *hash
);
521 __add_to_page_cache(page
,inode
,offset
,hash
);
525 spin_unlock(&pagecache_lock
);
530 * This adds the requested page to the page cache if it isn't already there,
531 * and schedules an I/O to read in its contents from disk.
/*
 * NOTE(review): the early-return paths after the fast-path lookup and
 * after the allocation failure are missing from this extract.
 */
533 static inline void page_cache_read(struct file
* file
, unsigned long offset
)
535 unsigned long new_page
;
536 struct inode
*inode
= file
->f_dentry
->d_inode
;
537 struct page
** hash
= page_hash(inode
, offset
);
/* Fast path: nothing to do when the page is already cached. */
540 spin_lock(&pagecache_lock
);
541 page
= __find_page_nolock(inode
, offset
, *hash
);
542 spin_unlock(&pagecache_lock
);
546 new_page
= page_cache_alloc();
549 page
= page_cache_entry(new_page
);
/* Insertion succeeded: start the read, drop our extra reference. */
551 if (!add_to_page_cache_unique(page
, inode
, offset
, hash
)) {
552 inode
->i_op
->readpage(file
, page
);
553 page_cache_release(page
);
558 * We arrive here in the unlikely event that someone
559 * raced with us and added our page to the cache first.
561 page_cache_free(new_page
);
566 * Read in an entire cluster at once. A cluster is usually a 64k-
567 * aligned block that includes the address requested in "offset."
569 static void read_cluster_nonblocking(struct file
* file
,
570 unsigned long offset
)
572 off_t filesize
= file
->f_dentry
->d_inode
->i_size
;
573 unsigned long pages
= CLUSTER_PAGES
;
575 offset
= CLUSTER_OFFSET(offset
);
576 while ((pages
-- > 0) && (offset
< filesize
)) {
577 page_cache_read(file
, offset
);
578 offset
+= PAGE_CACHE_SIZE
;
585 * Wait for a page to get unlocked.
587 * This must be called with the caller "holding" the page,
588 * ie with increased "page->count" so that the page won't
589 * go away during the wait..
/*
 * Classic sleep loop: join the page's wait queue, kick the disk
 * queue, set TASK_UNINTERRUPTIBLE, and loop until PageLocked()
 * clears.  NOTE(review): the loop head and the schedule() call are
 * missing from this extract.
 */
591 void ___wait_on_page(struct page
*page
)
593 struct task_struct
*tsk
= current
;
594 DECLARE_WAITQUEUE(wait
, tsk
);
596 add_wait_queue(&page
->wait
, &wait
);
598 run_task_queue(&tq_disk
);
599 set_task_state(tsk
, TASK_UNINTERRUPTIBLE
);
/* Re-check after setting the task state to avoid a lost wakeup. */
600 if (!PageLocked(page
))
603 } while (PageLocked(page
));
604 tsk
->state
= TASK_RUNNING
;
605 remove_wait_queue(&page
->wait
, &wait
);
/*
 * Get an exclusive lock on the page: keep attempting TryLockPage(),
 * sleeping in ___wait_on_page() each time somebody else holds it.
 */
void lock_page(struct page *page)
{
	for (;;) {
		if (!TryLockPage(page))
			return;
		___wait_on_page(page);
	}
}
619 * a rather lightweight function, finding and getting a reference to a
620 * hashed page atomically, waiting for it if it's locked.
/*
 * NOTE(review): the retry label, the reference acquisition and the
 * schedule() call inside the wait sequence are missing from this
 * extract.
 */
622 struct page
* __find_get_page (struct inode
* inode
,
623 unsigned long offset
, struct page
**hash
)
628 * We scan the hash list read-only. Addition to and removal from
629 * the hash-list needs a held write-lock.
632 spin_lock(&pagecache_lock
);
633 page
= __find_page_nolock(inode
, offset
, *hash
);
636 spin_unlock(&pagecache_lock
);
638 /* Found the page, sleep if locked. */
639 if (page
&& PageLocked(page
)) {
640 struct task_struct
*tsk
= current
;
641 DECLARE_WAITQUEUE(wait
, tsk
);
643 run_task_queue(&tq_disk
);
645 __set_task_state(tsk
, TASK_UNINTERRUPTIBLE
);
646 add_wait_queue(&page
->wait
, &wait
);
/* Re-check after queueing to avoid a lost wakeup. */
648 if (PageLocked(page
))
650 __set_task_state(tsk
, TASK_RUNNING
);
651 remove_wait_queue(&page
->wait
, &wait
);
654 * The page might have been unhashed meanwhile. It's
655 * not freed though because we hold a reference to it.
656 * If this is the case then it will be freed _here_,
657 * and we recheck the hash anyway.
659 page_cache_release(page
);
663 * It's not locked so we can return the page and we hold
670 * Get the lock to a page atomically.
/*
 * Like __find_get_page() but returns with the page locked, using
 * TryLockPage() and re-trying after sleeping.
 * NOTE(review): the retry label, reference acquisition and the
 * schedule() call are missing from this extract.
 */
672 struct page
* __find_lock_page (struct inode
* inode
,
673 unsigned long offset
, struct page
**hash
)
678 * We scan the hash list read-only. Addition to and removal from
679 * the hash-list needs a held write-lock.
682 spin_lock(&pagecache_lock
);
683 page
= __find_page_nolock(inode
, offset
, *hash
);
686 spin_unlock(&pagecache_lock
);
688 /* Found the page, sleep if locked. */
689 if (page
&& TryLockPage(page
)) {
690 struct task_struct
*tsk
= current
;
691 DECLARE_WAITQUEUE(wait
, tsk
);
693 run_task_queue(&tq_disk
);
695 __set_task_state(tsk
, TASK_UNINTERRUPTIBLE
);
696 add_wait_queue(&page
->wait
, &wait
);
/* Re-check after queueing to avoid a lost wakeup. */
698 if (PageLocked(page
))
700 __set_task_state(tsk
, TASK_RUNNING
);
701 remove_wait_queue(&page
->wait
, &wait
);
704 * The page might have been unhashed meanwhile. It's
705 * not freed though because we hold a reference to it.
706 * If this is the case then it will be freed _here_,
707 * and we recheck the hash anyway.
709 page_cache_release(page
);
713 * It's not locked so we can return the page and we hold
/* Compile-time switches for the read-ahead statistics below. */
720 #define PROFILE_READAHEAD
721 #define DEBUG_READAHEAD
725 * Read-ahead profiling information
726 * --------------------------------
727 * Every PROFILE_MAXREADCOUNT, the following information is written
729 * Percentage of asynchronous read-ahead.
730 * Average of read-ahead fields context value.
731 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
735 #ifdef PROFILE_READAHEAD
737 #define PROFILE_MAXREADCOUNT 1000
/* Accumulators sampled and reported by profile_readahead(). */
739 static unsigned long total_reada
;
740 static unsigned long total_async
;
741 static unsigned long total_ramax
;
742 static unsigned long total_ralen
;
743 static unsigned long total_rawin
;
/*
 * Accumulate per-file read-ahead statistics and print the averages
 * every PROFILE_MAXREADCOUNT samples.
 * NOTE(review): the interrupt save/disable, the counter increments
 * (total_reada/total_async) and the reset of the totals are missing
 * from this extract; restore_flags() implies interrupts are disabled
 * in the missing part -- confirm against the full file.
 */
745 static void profile_readahead(int async
, struct file
*filp
)
753 total_ramax
+= filp
->f_ramax
;
754 total_ralen
+= filp
->f_ralen
;
755 total_rawin
+= filp
->f_rawin
;
757 if (total_reada
> PROFILE_MAXREADCOUNT
) {
/* Re-check the threshold to avoid a double report. */
760 if (!(total_reada
> PROFILE_MAXREADCOUNT
)) {
761 restore_flags(flags
);
765 printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
766 total_ramax
/total_reada
,
767 total_ralen
/total_reada
,
768 total_rawin
/total_reada
,
769 (total_async
*100)/total_reada
);
770 #ifdef DEBUG_READAHEAD
771 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
772 filp
->f_ramax
, filp
->f_ralen
, filp
->f_rawin
, filp
->f_raend
);
781 restore_flags(flags
);
784 #endif /* defined PROFILE_READAHEAD */
787 * Read-ahead context:
788 * -------------------
789 * The read ahead context fields of the "struct file" are the following:
790 * - f_raend : position of the first byte after the last page we tried to
792 * - f_ramax : current read-ahead maximum size.
793 * - f_ralen : length of the current IO read block we tried to read-ahead.
794 * - f_rawin : length of the current read-ahead window.
795 * if last read-ahead was synchronous then
797 * otherwise (was asynchronous)
798 * f_rawin = previous value of f_ralen + f_ralen
802 * MIN_READAHEAD : minimum read-ahead size when read-ahead.
803 * MAX_READAHEAD : maximum read-ahead size when read-ahead.
805 * Synchronous read-ahead benefits:
806 * --------------------------------
807 * Using reasonable IO xfer length from peripheral devices increase system
809 * Reasonable means, in this context, not too large but not too small.
810 * The actual maximum value is:
811 * MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
812 * and 32K if defined (4K page size assumed).
814 * Asynchronous read-ahead benefits:
815 * ---------------------------------
816 * Overlapping next read request and user process execution increase system
821 * We have to guess which further data are needed by the user process.
822 * If these data are often not really needed, it's bad for system
824 * However, we know that files are often accessed sequentially by
825 * application programs and it seems that it is possible to have some good
826 * strategy in that guessing.
827 * We only try to read-ahead files that seems to be read sequentially.
829 * Asynchronous read-ahead risks:
830 * ------------------------------
831 * In order to maximize overlapping, we must start some asynchronous read
832 * request from the device, as soon as possible.
833 * We must be very careful about:
834 * - The number of effective pending IO read requests.
835 * ONE seems to be the only reasonable value.
836 * - The total memory pool usage for the file access stream.
837 * This maximum memory usage is implicitly 2 IO read chunks:
838 * 2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
839 * 64k if defined (4K page size assumed).
842 static inline int get_max_readahead(struct inode
* inode
)
844 if (!inode
->i_dev
|| !max_readahead
[MAJOR(inode
->i_dev
)])
845 return MAX_READAHEAD
;
846 return max_readahead
[MAJOR(inode
->i_dev
)][MINOR(inode
->i_dev
)];
/*
 * Decide whether to do synchronous or asynchronous read-ahead around
 * "ppos" and issue page_cache_read() for up to max_ahead bytes past
 * the current read-ahead end.
 * NOTE(review): the initialization of raend/ahead/max_ahead and the
 * closing braces of several branches are missing from this extract.
 */
849 static void generic_file_readahead(int reada_ok
,
850 struct file
* filp
, struct inode
* inode
,
851 unsigned long ppos
, struct page
* page
)
853 unsigned long max_ahead
, ahead
;
855 int max_readahead
= get_max_readahead(inode
);
857 raend
= filp
->f_raend
& PAGE_CACHE_MASK
;
861 * The current page is locked.
862 * If the current position is inside the previous read IO request, do not
863 * try to reread previously read ahead pages.
864 * Otherwise decide or not to read ahead some pages synchronously.
865 * If we are not going to read ahead, set the read ahead context for this
868 if (PageLocked(page
)) {
869 if (!filp
->f_ralen
|| ppos
>= raend
|| ppos
+ filp
->f_ralen
< raend
) {
871 if (raend
< inode
->i_size
)
872 max_ahead
= filp
->f_ramax
;
874 filp
->f_ralen
= PAGE_CACHE_SIZE
;
876 filp
->f_raend
= ppos
+ filp
->f_ralen
;
877 filp
->f_rawin
+= filp
->f_ralen
;
882 * The current page is not locked.
883 * If we were reading ahead and,
884 * if the current max read ahead size is not zero and,
885 * if the current position is inside the last read-ahead IO request,
886 * it is the moment to try to read ahead asynchronously.
887 * We will later force unplug device in order to force asynchronous read IO.
889 else if (reada_ok
&& filp
->f_ramax
&& raend
>= PAGE_CACHE_SIZE
&&
890 ppos
<= raend
&& ppos
+ filp
->f_ralen
>= raend
) {
892 * Add ONE page to max_ahead in order to try to have about the same IO max size
893 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
894 * Compute the position of the last page we have tried to read in order to
895 * begin to read ahead just at the next page.
897 raend
-= PAGE_CACHE_SIZE
;
898 if (raend
< inode
->i_size
)
899 max_ahead
= filp
->f_ramax
+ PAGE_CACHE_SIZE
;
902 filp
->f_rawin
= filp
->f_ralen
;
908 * Try to read ahead pages.
909 * We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the
910 * scheduler, will work enough for us to avoid too bad actuals IO requests.
913 while (ahead
< max_ahead
) {
914 ahead
+= PAGE_CACHE_SIZE
;
915 page_cache_read(filp
, raend
+ ahead
);
918 * If we tried to read ahead some pages,
919 * If we tried to read ahead asynchronously,
920 * Try to force unplug of the device in order to start an asynchronous
922 * Update the read-ahead context.
923 * Store the length of the current read-ahead window.
924 * Double the current max read ahead size.
925 * That heuristic avoid to do some large IO for files that are not really
926 * accessed sequentially.
930 run_task_queue(&tq_disk
);
933 filp
->f_ralen
+= ahead
;
934 filp
->f_rawin
+= filp
->f_ralen
;
935 filp
->f_raend
= raend
+ ahead
+ PAGE_CACHE_SIZE
;
/* Exponential window growth, clamped to the device maximum. */
937 filp
->f_ramax
+= filp
->f_ramax
;
939 if (filp
->f_ramax
> max_readahead
)
940 filp
->f_ramax
= max_readahead
;
942 #ifdef PROFILE_READAHEAD
943 profile_readahead((reada_ok
== 2), filp
);
952 * This is a generic file read routine, and uses the
953 * inode->i_op->readpage() function for the actual low-level
/*
 * Core page-cache read loop shared by generic_file_read() and
 * sys_sendfile().  NOTE(review): many lines (local declarations,
 * labels, loop braces and several error paths) are missing from this
 * extract; comments below describe only what is visible.
 */
956 * This is really ugly. But the goto's actually try to clarify some
957 * of the logic when it comes to error handling etc.
959 void do_generic_file_read(struct file
* filp
, loff_t
*ppos
, read_descriptor_t
* desc
, read_actor_t actor
)
961 struct dentry
*dentry
= filp
->f_dentry
;
962 struct inode
*inode
= dentry
->d_inode
;
963 size_t pos
, pgpos
, page_cache
;
966 int max_readahead
= get_max_readahead(inode
);
971 pgpos
= pos
& PAGE_CACHE_MASK
;
973 * If the current position is outside the previous read-ahead window,
974 * we reset the current read-ahead context and set read ahead max to zero
975 * (will be set to just needed value later),
976 * otherwise, we assume that the file accesses are sequential enough to
977 * continue read-ahead.
979 if (pgpos
> filp
->f_raend
|| pgpos
+ filp
->f_rawin
< filp
->f_raend
) {
989 * Adjust the current value of read-ahead max.
990 * If the read operation stay in the first half page, force no readahead.
991 * Otherwise try to increase read ahead max just enough to do the read request.
992 * Then, at least MIN_READAHEAD if read ahead is ok,
993 * and at most MAX_READAHEAD in all cases.
995 if (pos
+ desc
->count
<= (PAGE_CACHE_SIZE
>> 1)) {
998 unsigned long needed
;
1000 needed
= ((pos
+ desc
->count
) & PAGE_CACHE_MASK
) - pgpos
;
1002 if (filp
->f_ramax
< needed
)
1003 filp
->f_ramax
= needed
;
1005 if (reada_ok
&& filp
->f_ramax
< MIN_READAHEAD
)
1006 filp
->f_ramax
= MIN_READAHEAD
;
1007 if (filp
->f_ramax
> max_readahead
)
1008 filp
->f_ramax
= max_readahead
;
1012 struct page
*page
, **hash
;
/* Stop at end-of-file. */
1014 if (pos
>= inode
->i_size
)
1018 * Try to find the data in the page cache..
1020 hash
= page_hash(inode
, pos
& PAGE_CACHE_MASK
);
1022 spin_lock(&pagecache_lock
);
1023 page
= __find_page_nolock(inode
, pos
& PAGE_CACHE_MASK
, *hash
);
1025 goto no_cached_page
;
1028 spin_unlock(&pagecache_lock
);
1030 if (!Page_Uptodate(page
))
1031 goto page_not_up_to_date
;
1034 * Ok, we have the page, and it's up-to-date, so
1035 * now we can copy it to user space...
1038 unsigned long offset
, nr
;
1040 offset
= pos
& ~PAGE_CACHE_MASK
;
1041 nr
= PAGE_CACHE_SIZE
- offset
;
/* Clamp the copy to the end of file. */
1042 if (nr
> inode
->i_size
- pos
)
1043 nr
= inode
->i_size
- pos
;
1046 * The actor routine returns how many bytes were actually used..
1047 * NOTE! This may not be the same as how much of a user buffer
1048 * we filled up (we may be padding etc), so we can only update
1049 * "pos" here (the actor routine has to update the user buffer
1050 * pointers and the remaining count).
1052 nr
= actor(desc
, (const char *) (page_address(page
) + offset
), nr
);
1054 page_cache_release(page
);
1055 if (nr
&& desc
->count
)
1061 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
1063 page_not_up_to_date
:
1064 generic_file_readahead(reada_ok
, filp
, inode
,
1065 pos
& PAGE_CACHE_MASK
, page
);
1067 if (Page_Uptodate(page
))
1070 /* Get exclusive access to the page ... */
1072 if (Page_Uptodate(page
)) {
1078 /* ... and start the actual read. The read will unlock the page. */
1079 error
= inode
->i_op
->readpage(filp
, page
);
1082 if (Page_Uptodate(page
))
1085 /* Again, try some read-ahead while waiting for the page to finish.. */
1086 generic_file_readahead(reada_ok
, filp
, inode
,
1087 pos
& PAGE_CACHE_MASK
, page
);
1089 if (Page_Uptodate(page
))
1094 /* UHHUH! A synchronous read error occurred. Report it */
1095 desc
->error
= error
;
1096 page_cache_release(page
);
1101 * Ok, it wasn't cached, so we need to create a new
1104 * We get here with the page cache lock held.
1107 spin_unlock(&pagecache_lock
);
1108 page_cache
= page_cache_alloc();
1110 desc
->error
= -ENOMEM
;
1115 * Somebody may have added the page while we
1116 * dropped the page cache lock. Check for that.
1118 spin_lock(&pagecache_lock
);
1119 page
= __find_page_nolock(inode
, pos
& PAGE_CACHE_MASK
, *hash
);
1125 * Ok, add the new page to the hash-queues...
1127 page
= page_cache_entry(page_cache
);
1128 __add_to_page_cache(page
, inode
, pos
& PAGE_CACHE_MASK
, hash
);
1129 spin_unlock(&pagecache_lock
);
1138 page_cache_free(page_cache
);
1139 UPDATE_ATIME(inode
);
/*
 * Read actor for generic_file_read(): copy up to "size" bytes from
 * the kernel page area to the user buffer in desc->buf, recording
 * -EFAULT on a partial copy.
 * NOTE(review): the size clamping against desc->count, the buf
 * advance and the return statement are missing from this extract.
 */
1142 static int file_read_actor(read_descriptor_t
* desc
, const char *area
, unsigned long size
)
1145 unsigned long count
= desc
->count
;
1149 left
= __copy_to_user(desc
->buf
, area
, size
);
1152 desc
->error
= -EFAULT
;
1154 desc
->count
= count
- size
;
1155 desc
->written
+= size
;
1161 * This is the "read()" routine for all filesystems
1162 * that can use the page cache directly.
/*
 * Validates the user buffer, then delegates to do_generic_file_read()
 * with file_read_actor.  Returns the bytes written, or the recorded
 * error when nothing was written.
 * NOTE(review): the desc initialization and return are missing from
 * this extract.
 */
1164 ssize_t
generic_file_read(struct file
* filp
, char * buf
, size_t count
, loff_t
*ppos
)
1169 if (access_ok(VERIFY_WRITE
, buf
, count
)) {
1172 read_descriptor_t desc
;
1178 do_generic_file_read(filp
, ppos
, &desc
, file_read_actor
);
1180 retval
= desc
.written
;
1182 retval
= desc
.error
;
/*
 * Read actor for sys_sendfile(): desc->buf smuggles the output
 * "struct file *"; "size" bytes are pushed through the target's
 * f_op->write.  The old_fs local suggests a set_fs(KERNEL_DS)
 * bracket in the lines missing from this extract -- confirm.
 */
1188 static int file_send_actor(read_descriptor_t
* desc
, const char *area
, unsigned long size
)
1191 unsigned long count
= desc
->count
;
1192 struct file
*file
= (struct file
*) desc
->buf
;
1193 mm_segment_t old_fs
;
1199 written
= file
->f_op
->write(file
, area
, size
, &file
->f_pos
);
/* A failed/short write is propagated through desc->error. */
1202 desc
->error
= written
;
1205 desc
->count
= count
- written
;
1206 desc
->written
+= written
;
/*
 * sendfile(2): copy "count" bytes from in_fd to out_fd inside the
 * kernel via do_generic_file_read() with file_send_actor.
 * NOTE(review): fput() calls, error labels and some checks are
 * missing from this extract.
 */
1210 asmlinkage ssize_t
sys_sendfile(int out_fd
, int in_fd
, off_t
*offset
, size_t count
)
1213 struct file
* in_file
, * out_file
;
1214 struct inode
* in_inode
, * out_inode
;
1217 * Get input file, and verify that it is ok..
1220 in_file
= fget(in_fd
);
1223 if (!(in_file
->f_mode
& FMODE_READ
))
1226 in_inode
= in_file
->f_dentry
->d_inode
;
/* The input side must support page-cache reads. */
1229 if (!in_inode
->i_op
|| !in_inode
->i_op
->readpage
)
1231 retval
= locks_verify_area(FLOCK_VERIFY_READ
, in_inode
, in_file
, in_file
->f_pos
, count
);
1236 * Get output file, and verify that it is ok..
1239 out_file
= fget(out_fd
);
1242 if (!(out_file
->f_mode
& FMODE_WRITE
))
1245 if (!out_file
->f_op
|| !out_file
->f_op
->write
)
1247 out_inode
= out_file
->f_dentry
->d_inode
;
1250 retval
= locks_verify_area(FLOCK_VERIFY_WRITE
, out_inode
, out_file
, out_file
->f_pos
, count
);
1256 read_descriptor_t desc
;
1257 loff_t pos
= 0, *ppos
;
/* Default to the input file's own position unless "offset" given. */
1260 ppos
= &in_file
->f_pos
;
1262 if (get_user(pos
, offset
))
1269 desc
.buf
= (char *) out_file
;
1271 do_generic_file_read(in_file
, ppos
, &desc
, file_send_actor
);
1273 retval
= desc
.written
;
1275 retval
= desc
.error
;
/* Hand the updated position back to userspace when one was given. */
1277 put_user(pos
, offset
);
1289 * filemap_nopage() is invoked via the vma operations vector for a
1290 * mapped memory region to read in file data during a page fault.
1292 * The goto's are kind of ugly, but this streamlines the normal case of having
1293 * it in the page cache, and handles the special cases reasonably without
1294 * having a lot of duplicated code.
1296 * XXX - at some point, this should return unique values to indicate to
1297 * the caller whether this is EIO, OOM, or SIGBUS.
/*
 * NOTE(review): the success/no_cached_page/page_not_uptodate labels,
 * lock/unlock calls and several return statements are missing from
 * this extract; comments below describe only the visible logic.
 */
1299 static unsigned long filemap_nopage(struct vm_area_struct
* area
,
1300 unsigned long address
, int no_share
)
1302 struct file
* file
= area
->vm_file
;
1303 struct dentry
* dentry
= file
->f_dentry
;
1304 struct inode
* inode
= dentry
->d_inode
;
1305 struct page
* page
, **hash
;
1306 unsigned long old_page
, new_page
= 0;
1308 unsigned long offset
= address
- area
->vm_start
+ area
->vm_offset
;
1311 * Semantics for shared and private memory areas are different
1312 * past the end of the file. A shared mapping past the last page
1313 * of the file is an error and results in a SIGBUS, while a
1314 * private mapping just maps in a zero page.
1316 if ((offset
>= inode
->i_size
) &&
1317 (area
->vm_flags
& VM_SHARED
) && (area
->vm_mm
== current
->mm
))
1321 * Do we have something in the page cache already?
1323 hash
= page_hash(inode
, offset
);
1325 page
= __find_get_page(inode
, offset
, hash
);
1327 goto no_cached_page
;
1330 * Ok, found a page in the page cache, now we need to check
1331 * that it's up-to-date.
1333 if (!Page_Uptodate(page
))
1334 goto page_not_uptodate
;
1338 * Found the page and have a reference on it, need to check sharing
1339 * and possibly copy it over to another page..
1341 old_page
= page_address(page
);
1343 flush_page_to_ram(old_page
);
/* no_share fault path: hand back a private copy of the page. */
1347 new_page
= page_cache_alloc();
1349 copy_page(new_page
, old_page
);
1350 flush_page_to_ram(new_page
);
1352 page_cache_release(page
);
1357 * If the requested offset is within our file, try to read a whole
1358 * cluster of pages at once.
1360 * Otherwise, we're off the end of a privately mapped file,
1361 * so we need to map a zero page.
1363 if (offset
< inode
->i_size
)
1364 read_cluster_nonblocking(file
, offset
);
1366 page_cache_read(file
, offset
);
1369 * The page we want has now been added to the page cache.
1370 * In the unlikely event that someone removed it in the
1371 * meantime, we'll just come back here and read it again.
1377 if (Page_Uptodate(page
)) {
1382 if (!inode
->i_op
->readpage(file
, page
)) {
1384 if (Page_Uptodate(page
))
1389 * Umm, take care of errors if the page isn't up-to-date.
1390 * Try to re-read it _once_. We do this synchronously,
1391 * because there really aren't any performance issues here
1392 * and we need to check for errors.
1395 if (Page_Uptodate(page
)) {
1399 ClearPageError(page
);
1400 if (!inode
->i_op
->readpage(file
, page
)) {
1402 if (Page_Uptodate(page
))
1407 * Things didn't work out. Return zero to tell the
1408 * mm layer so, possibly freeing the page cache page first.
1410 page_cache_release(page
);
1412 page_cache_free(new_page
);
1417 * Tries to write a shared mapped page to its backing store. May return -EIO
1418 * if the disk is full.
/*
 * Resolves the inode's writepage op, clamps the write size to i_size
 * for regular files, and invokes writepage on the page frame that
 * backs page_addr.
 * NOTE(review): the use of "size", error returns and any surrounding
 * locking are missing from this extract.
 */
1420 static inline int do_write_page(struct inode
* inode
, struct file
* file
,
1421 const char * page_addr
, unsigned long offset
)
1425 int (*writepage
) (struct file
*, struct page
*);
1428 size
= offset
+ PAGE_SIZE
;
1429 /* refuse to extend file size.. */
1430 if (S_ISREG(inode
->i_mode
)) {
1431 if (size
> inode
->i_size
)
1432 size
= inode
->i_size
;
1433 /* Ho humm.. We should have tested for this earlier */
1439 writepage
= inode
->i_op
->writepage
;
/* Translate the kernel address back to its struct page. */
1440 page
= mem_map
+ MAP_NR(page_addr
);
1443 retval
= writepage(file
, page
);
1449 static int filemap_write_page(struct vm_area_struct
* vma
,
1450 unsigned long offset
,
1456 struct dentry
* dentry
;
1457 struct inode
* inode
;
1459 file
= vma
->vm_file
;
1460 dentry
= file
->f_dentry
;
1461 inode
= dentry
->d_inode
;
1464 * If a task terminates while we're swapping the page, the vma and
1465 * and file could be released ... increment the count to be safe.
1468 result
= do_write_page(inode
, file
, (const char *) page
, offset
);
1475 * The page cache takes care of races between somebody
1476 * trying to swap something out and swap something in
1477 * at the same time..
1479 extern void wakeup_bdflush(int);
1480 int filemap_swapout(struct vm_area_struct
* vma
, struct page
* page
)
1482 int retval
= filemap_write_page(vma
, page
->offset
, page_address(page
), 0);
1487 static inline int filemap_sync_pte(pte_t
* ptep
, struct vm_area_struct
*vma
,
1488 unsigned long address
, unsigned int flags
)
1491 unsigned long pageaddr
;
1495 if (!(flags
& MS_INVALIDATE
)) {
1496 if (!pte_present(pte
))
1498 if (!pte_dirty(pte
))
1500 flush_page_to_ram(pte_page(pte
));
1501 flush_cache_page(vma
, address
);
1502 set_pte(ptep
, pte_mkclean(pte
));
1503 flush_tlb_page(vma
, address
);
1504 pageaddr
= pte_page(pte
);
1505 page
= page_cache_entry(pageaddr
);
1510 flush_cache_page(vma
, address
);
1512 flush_tlb_page(vma
, address
);
1513 if (!pte_present(pte
)) {
1514 swap_free(pte_val(pte
));
1517 pageaddr
= pte_page(pte
);
1518 if (!pte_dirty(pte
) || flags
== MS_INVALIDATE
) {
1519 page_cache_free(pageaddr
);
1523 error
= filemap_write_page(vma
, address
- vma
->vm_start
+ vma
->vm_offset
, pageaddr
, 1);
1524 page_cache_free(pageaddr
);
1528 static inline int filemap_sync_pte_range(pmd_t
* pmd
,
1529 unsigned long address
, unsigned long size
,
1530 struct vm_area_struct
*vma
, unsigned long offset
, unsigned int flags
)
1538 if (pmd_bad(*pmd
)) {
1539 printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd
));
1543 pte
= pte_offset(pmd
, address
);
1544 offset
+= address
& PMD_MASK
;
1545 address
&= ~PMD_MASK
;
1546 end
= address
+ size
;
1551 error
|= filemap_sync_pte(pte
, vma
, address
+ offset
, flags
);
1552 address
+= PAGE_SIZE
;
1554 } while (address
< end
);
1558 static inline int filemap_sync_pmd_range(pgd_t
* pgd
,
1559 unsigned long address
, unsigned long size
,
1560 struct vm_area_struct
*vma
, unsigned int flags
)
1563 unsigned long offset
, end
;
1568 if (pgd_bad(*pgd
)) {
1569 printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd
));
1573 pmd
= pmd_offset(pgd
, address
);
1574 offset
= address
& PGDIR_MASK
;
1575 address
&= ~PGDIR_MASK
;
1576 end
= address
+ size
;
1577 if (end
> PGDIR_SIZE
)
1581 error
|= filemap_sync_pte_range(pmd
, address
, end
- address
, vma
, offset
, flags
);
1582 address
= (address
+ PMD_SIZE
) & PMD_MASK
;
1584 } while (address
< end
);
1588 static int filemap_sync(struct vm_area_struct
* vma
, unsigned long address
,
1589 size_t size
, unsigned int flags
)
1592 unsigned long end
= address
+ size
;
1595 dir
= pgd_offset(vma
->vm_mm
, address
);
1596 flush_cache_range(vma
->vm_mm
, end
- size
, end
);
1597 while (address
< end
) {
1598 error
|= filemap_sync_pmd_range(dir
, address
, end
- address
, vma
, flags
);
1599 address
= (address
+ PGDIR_SIZE
) & PGDIR_MASK
;
1602 flush_tlb_range(vma
->vm_mm
, end
- size
, end
);
1607 * This handles (potentially partial) area unmaps..
1609 static void filemap_unmap(struct vm_area_struct
*vma
, unsigned long start
, size_t len
)
1611 filemap_sync(vma
, start
, len
, MS_ASYNC
);
1615 * Shared mappings need to be able to do the right thing at
1616 * close/unmap/sync. They will also use the private file as
1617 * backing-store for swapping..
1619 static struct vm_operations_struct file_shared_mmap
= {
1620 NULL
, /* no special open */
1621 NULL
, /* no special close */
1622 filemap_unmap
, /* unmap - we need to sync the pages */
1623 NULL
, /* no special protect */
1624 filemap_sync
, /* sync */
1626 filemap_nopage
, /* nopage */
1628 filemap_swapout
/* swapout */
1632 * Private mappings just need to be able to load in the map.
1634 * (This is actually used for shared mappings as well, if we
1635 * know they can't ever get write permissions..)
1637 static struct vm_operations_struct file_private_mmap
= {
1644 filemap_nopage
, /* nopage */
1649 /* This is used for a general mmap of a disk file */
1651 int generic_file_mmap(struct file
* file
, struct vm_area_struct
* vma
)
1653 struct vm_operations_struct
* ops
;
1654 struct inode
*inode
= file
->f_dentry
->d_inode
;
1656 ops
= &file_private_mmap
;
1657 if ((vma
->vm_flags
& VM_SHARED
) && (vma
->vm_flags
& VM_MAYWRITE
)) {
1658 if (!inode
->i_op
|| !inode
->i_op
->writepage
)
1660 ops
= &file_shared_mmap
;
1662 if (!inode
->i_sb
|| !S_ISREG(inode
->i_mode
))
1664 if (!inode
->i_op
|| !inode
->i_op
->readpage
)
1666 UPDATE_ATIME(inode
);
1673 * The msync() system call.
1676 static int msync_interval(struct vm_area_struct
* vma
,
1677 unsigned long start
, unsigned long end
, int flags
)
1679 if (vma
->vm_file
&& vma
->vm_ops
&& vma
->vm_ops
->sync
) {
1681 error
= vma
->vm_ops
->sync(vma
, start
, end
-start
, flags
);
1682 if (!error
&& (flags
& MS_SYNC
)) {
1683 struct file
* file
= vma
->vm_file
;
1685 struct dentry
* dentry
= file
->f_dentry
;
1686 error
= file_fsync(file
, dentry
);
1694 asmlinkage
long sys_msync(unsigned long start
, size_t len
, int flags
)
1697 struct vm_area_struct
* vma
;
1698 int unmapped_error
, error
= -EINVAL
;
1700 down(¤t
->mm
->mmap_sem
);
1702 if (start
& ~PAGE_MASK
)
1704 len
= (len
+ ~PAGE_MASK
) & PAGE_MASK
;
1708 if (flags
& ~(MS_ASYNC
| MS_INVALIDATE
| MS_SYNC
))
1714 * If the interval [start,end) covers some unmapped address ranges,
1715 * just ignore them, but return -EFAULT at the end.
1717 vma
= find_vma(current
->mm
, start
);
1720 /* Still start < end. */
1724 /* Here start < vma->vm_end. */
1725 if (start
< vma
->vm_start
) {
1726 unmapped_error
= -EFAULT
;
1727 start
= vma
->vm_start
;
1729 /* Here vma->vm_start <= start < vma->vm_end. */
1730 if (end
<= vma
->vm_end
) {
1732 error
= msync_interval(vma
, start
, end
, flags
);
1736 error
= unmapped_error
;
1739 /* Here vma->vm_start <= start < vma->vm_end < end. */
1740 error
= msync_interval(vma
, start
, vma
->vm_end
, flags
);
1743 start
= vma
->vm_end
;
1748 up(¤t
->mm
->mmap_sem
);
1753 * Write to a file through the page cache. This is mainly for the
1754 * benefit of NFS and possibly other network-based file systems.
1756 * We currently put everything into the page cache prior to writing it.
1757 * This is not a problem when writing full pages. With partial pages,
1758 * however, we first have to read the data into the cache, then
1759 * dirty the page, and finally schedule it for writing. Alternatively, we
1760 * could write-through just the portion of data that would go into that
1761 * page, but that would kill performance for applications that write data
1762 * line by line, and it's prone to race conditions.
1764 * Note that this routine doesn't try to keep track of dirty pages. Each
1765 * file system has to do this all by itself, unfortunately.
1769 generic_file_write(struct file
*file
, const char *buf
,
1770 size_t count
, loff_t
*ppos
,
1771 writepage_t write_one_page
)
1773 struct dentry
*dentry
= file
->f_dentry
;
1774 struct inode
*inode
= dentry
->d_inode
;
1775 unsigned long pos
= *ppos
;
1776 unsigned long limit
= current
->rlim
[RLIMIT_FSIZE
].rlim_cur
;
1777 struct page
*page
, **hash
;
1778 unsigned long page_cache
= 0;
1779 unsigned long written
;
1783 err
= file
->f_error
;
1791 if (file
->f_flags
& O_APPEND
)
1792 pos
= inode
->i_size
;
1795 * Check whether we've reached the file size limit.
1799 send_sig(SIGXFSZ
, current
, 0);
1805 * Check whether to truncate the write,
1806 * and send the signal if we do.
1808 if (count
> limit
- pos
) {
1809 send_sig(SIGXFSZ
, current
, 0);
1810 count
= limit
- pos
;
1814 unsigned long bytes
, pgpos
, offset
;
1816 * Try to find the page in the cache. If it isn't there,
1817 * allocate a free page.
1819 offset
= (pos
& ~PAGE_CACHE_MASK
);
1820 pgpos
= pos
& PAGE_CACHE_MASK
;
1821 bytes
= PAGE_CACHE_SIZE
- offset
;
1825 hash
= page_hash(inode
, pgpos
);
1827 page
= __find_lock_page(inode
, pgpos
, hash
);
1830 page_cache
= page_cache_alloc();
1836 page
= page_cache_entry(page_cache
);
1837 if (add_to_page_cache_unique(page
,inode
,pgpos
,hash
))
1843 /* We have exclusive IO access to the page.. */
1844 if (!PageLocked(page
)) {
1847 if (page
->owner
!= current
) {
1852 status
= write_one_page(file
, page
, offset
, bytes
, buf
);
1859 if (pos
> inode
->i_size
)
1860 inode
->i_size
= pos
;
1862 /* Mark it unlocked again and drop the page.. */
1864 page_cache_release(page
);
1872 page_cache_free(page_cache
);
1874 err
= written
? written
: status
;
/*
 * Support routines for directory caching using the page cache.
 */

/*
 * Unlock and free a page.
 */
void put_cached_page(unsigned long addr)
{
	struct page * page = page_cache_entry(addr);

	UnlockPage(page);
	/* Count must be exactly 2: the cache's reference plus ours. */
	if (page_count(page) != 2)
		panic("put_cached_page: page count=%d\n",
			page_count(page));
	page_cache_release(page);
}
1897 void __init
page_cache_init(unsigned long memory_size
)
1899 unsigned long htable_size
, order
;
1901 htable_size
= memory_size
>> PAGE_SHIFT
;
1902 htable_size
*= sizeof(struct page
*);
1903 for(order
= 0; (PAGE_SIZE
<< order
) < htable_size
; order
++)
1907 unsigned long tmp
= (PAGE_SIZE
<< order
) / sizeof(struct page
*);
1910 while((tmp
>>= 1UL) != 0UL)
1913 page_hash_table
= (struct page
**)
1914 __get_free_pages(GFP_ATOMIC
, order
);
1915 } while(page_hash_table
== NULL
&& --order
> 0);
1917 printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
1918 (1 << page_hash_bits
), order
, (PAGE_SIZE
<< order
));
1919 if (!page_hash_table
)
1920 panic("Failed to allocate page hash table\n");
1921 memset(page_hash_table
, 0, PAGE_HASH_SIZE
* sizeof(struct page
*));