/*
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/malloc.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/locks.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/swapctl.h>
#include <linux/slab.h>
#include <linux/init.h>

#include <asm/pgtable.h>
#include <asm/uaccess.h>
/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet, though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 */

atomic_t page_cache_size = ATOMIC_INIT(0);
unsigned int page_hash_bits;
struct page **page_hash_table;

spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
void __add_page_to_hash_queue(struct page * page, struct page **p)
	atomic_inc(&page_cache_size);
	if((page->next_hash = *p) != NULL)
		(*p)->pprev_hash = &page->next_hash;
static void remove_page_from_hash_queue(struct page * page)
	if(page->pprev_hash) {
		page->next_hash->pprev_hash = page->pprev_hash;
		*page->pprev_hash = page->next_hash;
		page->pprev_hash = NULL;
	atomic_dec(&page_cache_size);
static void remove_page_from_inode_queue(struct page * page)
	struct inode * inode = page->inode;
	struct page *prev, *next;

	if (inode->i_pages == page)
		inode->i_pages = next;
/*
 * Remove a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
 * is safe.
 */
void remove_inode_page(struct page *page)
	if (!PageLocked(page))

	spin_lock(&pagecache_lock);
	remove_page_from_inode_queue(page);
	remove_page_from_hash_queue(page);
	spin_unlock(&pagecache_lock);
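/*
 * Illustrative sketch (not part of the original file): the calling pattern
 * the comment above asks for.  "example_drop_page" is a hypothetical helper
 * and assumes the caller already holds its own reference on the page.
 */
#if 0
static void example_drop_page(struct page *page)
{
	lock_page(page);		/* remove_inode_page() insists on PageLocked() */
	remove_inode_page(page);	/* detach from the inode list and hash queue */
	/* ... unlock the page with the tree's unlock primitive ... */
	page_cache_release(page);	/* drop our reference; may free the page */
}
#endif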
void invalidate_inode_pages(struct inode * inode)
	spin_lock(&pagecache_lock);
	while ((page = *p) != NULL) {
		if (TryLockPage(page)) {
			spin_unlock(&pagecache_lock);
			page_cache_release(page);
		if (page_count(page) != 2)
			printk("hm, busy page invalidated? (not necessarily a bug)\n");
		remove_page_from_inode_queue(page);
		remove_page_from_hash_queue(page);
		page_cache_release(page);
		page_cache_release(page);
	spin_unlock(&pagecache_lock);
/*
 * Truncate the page cache at a set offset, removing the pages
 * that are beyond that offset (and zeroing out partial pages).
 */
void truncate_inode_pages(struct inode * inode, unsigned long start)
	spin_lock(&pagecache_lock);
	while ((page = *p) != NULL) {
		unsigned long offset = page->offset;

		/* page wholly truncated - free it */
		if (offset >= start) {
			spin_unlock(&pagecache_lock);
			if (inode->i_op->flushpage)
				inode->i_op->flushpage(inode, page, 0);
			/*
			 * We remove the page from the page cache
			 * _after_ we have destroyed all buffer-cache
			 * references to it. Otherwise some other process
			 * might think this inode page is not in the
			 * page cache and creates a buffer-cache alias
			 * to it causing all sorts of fun problems ...
			 */
			remove_inode_page(page);
			page_cache_release(page);
			page_cache_release(page);
			/*
			 * We have done things without the pagecache lock,
			 * so we'll have to repeat the scan.
			 * It's not possible to deadlock here because
			 * we are guaranteed to make progress. (ie. we have
			 * just removed a page)
			 */
		/*
		 * there is only one partial page possible.
		 */
		offset = start - offset;
		/* partial truncate, clear end of page */
		if (offset < PAGE_CACHE_SIZE) {
			unsigned long address;

			spin_unlock(&pagecache_lock);
			address = page_address(page);
			memset((void *) (offset + address), 0, PAGE_CACHE_SIZE - offset);
			flush_page_to_ram(address);
			if (inode->i_op->flushpage)
				inode->i_op->flushpage(inode, page, offset);
			/* we have dropped the spinlock so we have to ... */
			page_cache_release(page);
	spin_unlock(&pagecache_lock);
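/*
 * Worked example (added for illustration, assuming PAGE_CACHE_SIZE == 4096):
 * truncating to start = 0x1800 hits the partial page whose page->offset is
 * 0x1000.  Then offset = start - offset = 0x800, which is < PAGE_CACHE_SIZE,
 * so the memset() above clears bytes 0x800..0xfff of that page - everything
 * past the new end of file - while the first 0x800 bytes are left intact.
 */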
extern atomic_t too_many_dirty_buffers;

int shrink_mmap(int priority, int gfp_mask)
	static unsigned long clock = 0;
	unsigned long limit = num_physpages << 1;

	count = limit >> priority;
	page = mem_map + clock;
		/* This works even in the presence of PageSkip because
		 * the first two entries at the beginning of a hole will
		 * be marked, not just the first.
		 */
		if (clock >= max_mapnr) {
		if (PageSkip(page)) {
			/* next_hash is overloaded for PageSkip */
			page = page->next_hash;
			clock = page - mem_map;
		referenced = test_and_clear_bit(PG_referenced, &page->flags);

		if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
		/*
		 * Some common cases that we just short-circuit without
		 * getting the locks - we need to re-check this once we
		 * have the lock, but that's fine.
		 */
		users = page_count(page);
		if (!page->buffers) {
		/*
		 * ok, now the page looks interesting. Re-check things
		 */
		spin_lock(&pagecache_lock);
		if (!page->inode && !page->buffers) {
			spin_unlock(&pagecache_lock);
		if (!page_count(page)) {
			spin_unlock(&pagecache_lock);
		if (TryLockPage(page)) {
			spin_unlock(&pagecache_lock);
		/*
		 * we keep pagecache_lock locked and unlock it in
		 * each branch, so that the page->inode case doesn't
		 * have to re-grab it. Here comes the 'real' logic
		 */
		/* Is it a buffer page? */
			spin_unlock(&pagecache_lock);
			if (!try_to_free_buffers(page))
				goto unlock_continue;
			/* page was locked, inode can't go away under us */
			atomic_sub(PAGE_CACHE_SIZE, &buffermem);
			spin_lock(&pagecache_lock);
		/*
		 * We can't free pages unless there's just one user
		 * (count == 2 because we added one ourselves above).
		 */
		if (page_count(page) != 2)
			goto spin_unlock_continue;
		/*
		 * Is it a page swap page? If so, we want to
		 * drop it if it is no longer used, even if it
		 * were to be marked referenced..
		 */
		if (PageSwapCache(page)) {
			spin_unlock(&pagecache_lock);
			if (referenced && swap_count(page->offset) != 2)
				goto unlock_continue;
			__delete_from_swap_cache(page);
			page_cache_release(page);
		/* is it a page-cache page? */
		if (!referenced && page->inode && !pgcache_under_min()) {
			remove_page_from_inode_queue(page);
			remove_page_from_hash_queue(page);
			spin_unlock(&pagecache_lock);
			page_cache_release(page);
spin_unlock_continue:
		spin_unlock(&pagecache_lock);
static inline struct page * __find_page_nolock(struct inode * inode, unsigned long offset, struct page *page)
		page = page->next_hash;
		if (page->inode != inode)
		if (page->offset == offset)
	set_bit(PG_referenced, &page->flags);
/*
 * By the time this is called, the page is locked and
 * we don't have to worry about any races any more.
 */
static int writeout_one_page(struct page *page)
	struct buffer_head *bh, *head = page->buffers;

		if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
		ll_rw_block(WRITE, 1, &bh);
	} while ((bh = bh->b_this_page) != head);

static int waitfor_one_page(struct page *page)
	struct buffer_head *bh, *head = page->buffers;

		if (buffer_req(bh) && !buffer_uptodate(bh))
	} while ((bh = bh->b_this_page) != head);
static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end, int (*fn)(struct page *))
	spin_lock(&pagecache_lock);
	next = inode->i_pages;
		struct page *page = next;

		if (page->offset >= end)
		if (page->offset < start)
		spin_unlock(&pagecache_lock);

		/* The buffers could have been free'd while we waited for the page lock */
		spin_lock(&pagecache_lock);
		page_cache_release(page);
	spin_unlock(&pagecache_lock);

/*
 * Two-stage data sync: first start the IO, then go back and
 * collect the information..
 */
int generic_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end)
	retval = do_buffer_fdatasync(inode, start, end, writeout_one_page);
	retval |= do_buffer_fdatasync(inode, start, end, waitfor_one_page);
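/*
 * Illustrative sketch (hypothetical "examplefs", not part of the original
 * file): a filesystem fsync method could flush one byte range of an inode
 * with the two-stage helper above - the first pass starts the IO, the
 * second pass waits for it.  The start/end arguments are byte offsets, as
 * suggested by the page->offset comparisons in do_buffer_fdatasync().
 */
#if 0
static int examplefs_fsync_range(struct inode *inode, unsigned long start, unsigned long end)
{
	return generic_buffer_fdatasync(inode, start, end);
}
#endif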
/*
 * This adds a page to the page cache, starting out as locked,
 * owned by us, referenced, but not uptodate and with no errors.
 */
static inline void __add_to_page_cache(struct page * page,
	struct inode * inode, unsigned long offset,
	struct page **hash)
	flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error));
	page->flags = flags | ((1 << PG_locked) | (1 << PG_referenced));
	page->owner = current;	/* REMOVEME */
	page->offset = offset;
	add_page_to_inode_queue(inode, page);
	__add_page_to_hash_queue(page, hash);

void add_to_page_cache(struct page * page, struct inode * inode, unsigned long offset)
	spin_lock(&pagecache_lock);
	__add_to_page_cache(page, inode, offset, page_hash(inode, offset));
	spin_unlock(&pagecache_lock);

int add_to_page_cache_unique(struct page * page,
	struct inode * inode, unsigned long offset,
	struct page **hash)
	spin_lock(&pagecache_lock);
	alias = __find_page_nolock(inode, offset, *hash);
		__add_to_page_cache(page,inode,offset,hash);
	spin_unlock(&pagecache_lock);
/*
 * Try to read ahead in the file. "page_cache" is a potentially free page
 * that we could use for the cache (if it is 0 we can try to create one,
 * this is all overlapped with the IO on the previous page finishing anyway)
 */
static unsigned long try_to_read_ahead(struct file * file,
				unsigned long offset, unsigned long page_cache)
	struct inode *inode = file->f_dentry->d_inode;

	offset &= PAGE_CACHE_MASK;
	switch (page_cache) {
		page_cache = page_cache_alloc();
		if (offset >= inode->i_size)
		hash = page_hash(inode, offset);
		page = page_cache_entry(page_cache);
		if (!add_to_page_cache_unique(page, inode, offset, hash)) {
			/*
			 * We do not have to check the return value here
			 * because it's a readahead.
			 */
			inode->i_op->readpage(file, page);
			page_cache_release(page);
/*
 * Wait for a page to get unlocked.
 *
 * This must be called with the caller "holding" the page,
 * ie with increased "page->count" so that the page won't
 * go away during the wait..
 */
void ___wait_on_page(struct page *page)
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	add_wait_queue(&page->wait, &wait);
		tsk->state = TASK_UNINTERRUPTIBLE;
		run_task_queue(&tq_disk);
		if (!PageLocked(page))
	} while (PageLocked(page));
	tsk->state = TASK_RUNNING;
	remove_wait_queue(&page->wait, &wait);
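/*
 * Illustrative sketch (not part of the original file): what the comment
 * above means in practice - only sleep on the lock bit while holding a
 * reference of our own (here assumed to come from __find_get_page() or a
 * similar lookup).
 */
#if 0
static void example_wait(struct page *page)
{
	/* assumed: page->count was already raised on our behalf */
	if (PageLocked(page))
		___wait_on_page(page);	/* safe: our reference keeps the page alive */
}
#endif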
/*
 * Get an exclusive lock on the page..
 */
void lock_page(struct page *page)
	if (TryLockPage(page)) {
		struct task_struct *tsk = current;
		DECLARE_WAITQUEUE(wait, current);

		run_task_queue(&tq_disk);
		add_wait_queue(&page->wait, &wait);
		tsk->state = TASK_UNINTERRUPTIBLE;
		while (TryLockPage(page)) {
			run_task_queue(&tq_disk);
			tsk->state = TASK_UNINTERRUPTIBLE;
		remove_wait_queue(&page->wait, &wait);
		tsk->state = TASK_RUNNING;
/*
 * a rather lightweight function, finding and getting a reference to a
 * hashed page atomically, waiting for it if it's locked.
 */
struct page * __find_get_page (struct inode * inode,
			      unsigned long offset, struct page **hash)
	/*
	 * We scan the hash list read-only. Addition to and removal from
	 * the hash-list needs a held write-lock.
	 */
	spin_lock(&pagecache_lock);
	page = __find_page_nolock(inode, offset, *hash);
	spin_unlock(&pagecache_lock);

	/* Found the page, sleep if locked. */
	if (page && PageLocked(page)) {
		struct task_struct *tsk = current;
		DECLARE_WAITQUEUE(wait, tsk);

		add_wait_queue(&page->wait, &wait);
		tsk->state = TASK_UNINTERRUPTIBLE;
		run_task_queue(&tq_disk);
		if (PageLocked(page))
		tsk->state = TASK_RUNNING;
		remove_wait_queue(&page->wait, &wait);
		/*
		 * The page might have been unhashed meanwhile. It's
		 * not freed though because we hold a reference to it.
		 * If this is the case then it will be freed _here_,
		 * and we recheck the hash anyway.
		 */
		page_cache_release(page);
	/*
	 * It's not locked so we can return the page and we hold
	 * a reference to it.
	 */
/*
 * Get the lock to a page atomically.
 */
struct page * __find_lock_page (struct inode * inode,
			       unsigned long offset, struct page **hash)
	/*
	 * We scan the hash list read-only. Addition to and removal from
	 * the hash-list needs a held write-lock.
	 */
	spin_lock(&pagecache_lock);
	page = __find_page_nolock(inode, offset, *hash);
	spin_unlock(&pagecache_lock);

	/* Found the page, sleep if locked. */
	if (page && TryLockPage(page)) {
		struct task_struct *tsk = current;
		DECLARE_WAITQUEUE(wait, tsk);

		add_wait_queue(&page->wait, &wait);
		tsk->state = TASK_UNINTERRUPTIBLE;
		run_task_queue(&tq_disk);
		if (PageLocked(page))
		tsk->state = TASK_RUNNING;
		remove_wait_queue(&page->wait, &wait);
		/*
		 * The page might have been unhashed meanwhile. It's
		 * not freed though because we hold a reference to it.
		 * If this is the case then it will be freed _here_,
		 * and we recheck the hash anyway.
		 */
		page_cache_release(page);
	/*
	 * It's not locked so we can return the page and we hold
	 * a reference to it.
	 */
#define PROFILE_READAHEAD
#define DEBUG_READAHEAD

/*
 * Read-ahead profiling information
 * --------------------------------
 * Every PROFILE_MAXREADCOUNT, the following information is written:
 *   Percentage of asynchronous read-ahead.
 *   Average of read-ahead fields context value.
 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
 */
#ifdef PROFILE_READAHEAD

#define PROFILE_MAXREADCOUNT 1000

static unsigned long total_reada;
static unsigned long total_async;
static unsigned long total_ramax;
static unsigned long total_ralen;
static unsigned long total_rawin;

static void profile_readahead(int async, struct file *filp)
	total_ramax	+= filp->f_ramax;
	total_ralen	+= filp->f_ralen;
	total_rawin	+= filp->f_rawin;

	if (total_reada > PROFILE_MAXREADCOUNT) {
		if (!(total_reada > PROFILE_MAXREADCOUNT)) {
			restore_flags(flags);
		printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
			total_ramax/total_reada,
			total_ralen/total_reada,
			total_rawin/total_reada,
			(total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
		printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
			filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
		restore_flags(flags);
#endif  /* defined PROFILE_READAHEAD */
/*
 * Read-ahead context:
 * -------------------
 * The read ahead context fields of the "struct file" are the following:
 * - f_raend : position of the first byte after the last page we tried to
 *	       read ahead.
 * - f_ramax : current read-ahead maximum size.
 * - f_ralen : length of the current IO read block we tried to read-ahead.
 * - f_rawin : length of the current read-ahead window.
 *		if last read-ahead was synchronous then
 *			f_rawin = f_ralen
 *		otherwise (was asynchronous)
 *			f_rawin = previous value of f_ralen + f_ralen
 *
 * MIN_READAHEAD : minimum read-ahead size when read-ahead.
 * MAX_READAHEAD : maximum read-ahead size when read-ahead.
 *
 * Synchronous read-ahead benefits:
 * --------------------------------
 * Using a reasonable IO xfer length from peripheral devices increases system
 * performance.
 * Reasonable means, in this context, not too large but not too small.
 * The actual maximum value is:
 *	MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
 *	and 32K if defined (4K page size assumed).
 *
 * Asynchronous read-ahead benefits:
 * ---------------------------------
 * Overlapping the next read request with user process execution increases
 * system performance.
 *
 * We have to guess which further data are needed by the user process.
 * If these data are often not really needed, it's bad for system
 * performance.
 * However, we know that files are often accessed sequentially by
 * application programs and it seems that it is possible to have some good
 * strategy in that guessing.
 * We only try to read-ahead files that seem to be read sequentially.
 *
 * Asynchronous read-ahead risks:
 * ------------------------------
 * In order to maximize overlapping, we must start some asynchronous read
 * request from the device, as soon as possible.
 * We must be very careful about:
 * - The number of effective pending IO read requests.
 *   ONE seems to be the only reasonable value.
 * - The total memory pool usage for the file access stream.
 *   This maximum memory usage is implicitly 2 IO read chunks:
 *   2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
 *   64k if defined (4K page size assumed).
 */

static inline int get_max_readahead(struct inode * inode)
	if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
		return MAX_READAHEAD;
	return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
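/*
 * Worked example (added for illustration, 4K pages assumed): after a
 * synchronous read-ahead of 3 pages ending at byte 64k we have
 * f_ralen = 12k, f_rawin = 12k and f_raend = 64k.  If the next,
 * asynchronous read-ahead then reads 4 more pages, f_ralen becomes 16k,
 * f_rawin becomes the previous f_ralen plus the new one (12k + 16k = 28k),
 * and f_raend moves to 80k - matching the f_rawin rule stated above.
 */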
static inline unsigned long generic_file_readahead(int reada_ok,
	struct file * filp, struct inode * inode,
	unsigned long ppos, struct page * page, unsigned long page_cache)
	unsigned long max_ahead, ahead;
	int max_readahead = get_max_readahead(inode);

	raend = filp->f_raend & PAGE_CACHE_MASK;
/*
 * The current page is locked.
 * If the current position is inside the previous read IO request, do not
 * try to reread previously read ahead pages.
 * Otherwise decide or not to read ahead some pages synchronously.
 * If we are not going to read ahead, set the read ahead context for this
 * page only.
 */
	if (PageLocked(page)) {
		if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
			if (raend < inode->i_size)
				max_ahead = filp->f_ramax;
			filp->f_ralen = PAGE_CACHE_SIZE;
			filp->f_raend = ppos + filp->f_ralen;
			filp->f_rawin += filp->f_ralen;
/*
 * The current page is not locked.
 * If we were reading ahead and,
 * if the current max read ahead size is not zero and,
 * if the current position is inside the last read-ahead IO request,
 *   it is the moment to try to read ahead asynchronously.
 * We will later force unplug of the device in order to force asynchronous read IO.
 */
	else if (reada_ok && filp->f_ramax && raend >= PAGE_CACHE_SIZE &&
		 ppos <= raend && ppos + filp->f_ralen >= raend) {
/*
 * Add ONE page to max_ahead in order to try to have about the same IO max size
 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
 * Compute the position of the last page we have tried to read in order to
 * begin to read ahead just at the next page.
 */
		raend -= PAGE_CACHE_SIZE;
		if (raend < inode->i_size)
			max_ahead = filp->f_ramax + PAGE_CACHE_SIZE;

		filp->f_rawin = filp->f_ralen;
/*
 * Try to read ahead pages.
 * We hope that ll_rw_blk() plug/unplug, coalescence, request sorting and the
 * scheduler will work well enough for us to avoid overly bad actual IO requests.
 */
	while (ahead < max_ahead) {
		ahead += PAGE_CACHE_SIZE;
		page_cache = try_to_read_ahead(filp, raend + ahead,
					       page_cache);
/*
 * If we tried to read ahead some pages,
 * If we tried to read ahead asynchronously,
 *   try to force unplug of the device in order to start an asynchronous
 *   read IO request.
 * Update the read-ahead context.
 * Store the length of the current read-ahead window.
 * Double the current max read ahead size.
 *   That heuristic avoids doing large IO for files that are not really
 *   accessed sequentially.
 */
			run_task_queue(&tq_disk);

		filp->f_ralen += ahead;
		filp->f_rawin += filp->f_ralen;
		filp->f_raend = raend + ahead + PAGE_CACHE_SIZE;

		filp->f_ramax += filp->f_ramax;

		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;

#ifdef PROFILE_READAHEAD
		profile_readahead((reada_ok == 2), filp);
921 * "descriptor" for what we're up to with a read.
922 * This allows us to use the same read code yet
923 * have multiple different users of the data that
924 * we read from a file.
926 * The simplest case just copies the data to user
936 typedef int (*read_actor_t
)(read_descriptor_t
*, const char *, unsigned long);
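/*
 * Illustrative sketch (not part of the original file): a minimal actor that
 * only counts bytes instead of copying them.  It follows the convention used
 * by file_read_actor() and file_send_actor() below: consume at most "size"
 * bytes, decrement desc->count, bump desc->written, and return how many
 * bytes were actually used.
 */
#if 0
static int count_only_actor(read_descriptor_t *desc, const char *area, unsigned long size)
{
	if (size > desc->count)
		size = desc->count;	/* never consume more than was asked for */
	desc->count -= size;
	desc->written += size;
	return size;			/* tells do_generic_file_read() how much we took */
}
#endif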
/*
 * This is a generic file read routine, and uses the
 * inode->i_op->readpage() function for the actual low-level
 * stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 */
static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
	struct dentry *dentry = filp->f_dentry;
	struct inode *inode = dentry->d_inode;
	size_t pos, pgpos, page_cache;
	int max_readahead = get_max_readahead(inode);

	pgpos = pos & PAGE_CACHE_MASK;
/*
 * If the current position is outside the previous read-ahead window,
 * we reset the current read-ahead context and set read ahead max to zero
 * (will be set to just needed value later),
 * otherwise, we assume that the file accesses are sequential enough to
 * continue read-ahead.
 */
	if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) {
/*
 * Adjust the current value of read-ahead max.
 * If the read operation stays in the first half page, force no readahead.
 * Otherwise try to increase read ahead max just enough to do the read request.
 * Then, at least MIN_READAHEAD if read ahead is ok,
 * and at most MAX_READAHEAD in all cases.
 */
	if (pos + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
		unsigned long needed;

		needed = ((pos + desc->count) & PAGE_CACHE_MASK) - pgpos;

		if (filp->f_ramax < needed)
			filp->f_ramax = needed;

		if (reada_ok && filp->f_ramax < MIN_READAHEAD)
				filp->f_ramax = MIN_READAHEAD;
		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;

		struct page *page, **hash;

		if (pos >= inode->i_size)
		/*
		 * Try to find the data in the page cache..
		 */
		hash = page_hash(inode, pos & PAGE_CACHE_MASK);

		spin_lock(&pagecache_lock);
		page = __find_page_nolock(inode, pos & PAGE_CACHE_MASK, *hash);
			goto no_cached_page;
		spin_unlock(&pagecache_lock);

		if (!Page_Uptodate(page))
			goto page_not_up_to_date;
		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 */
		unsigned long offset, nr;

		offset = pos & ~PAGE_CACHE_MASK;
		nr = PAGE_CACHE_SIZE - offset;
		if (nr > inode->i_size - pos)
			nr = inode->i_size - pos;

		/*
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		nr = actor(desc, (const char *) (page_address(page) + offset), nr);
		page_cache_release(page);
		if (nr && desc->count)

/*
 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
 */
page_not_up_to_date:
		page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);

		if (Page_Uptodate(page))

		/* Get exclusive access to the page ... */
		if (Page_Uptodate(page)) {

		/* ... and start the actual read. The read will unlock the page. */
		error = inode->i_op->readpage(filp, page);

		if (Page_Uptodate(page))

		/* Again, try some read-ahead while waiting for the page to finish.. */
		page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
		if (Page_Uptodate(page))

		/* UHHUH! A synchronous read error occurred. Report it */
		desc->error = error;
		page_cache_release(page);

/*
 * Ok, it wasn't cached, so we need to create a new
 * page..
 *
 * We get here with the page cache lock held.
 */
		spin_unlock(&pagecache_lock);
		page_cache = page_cache_alloc();
			desc->error = -ENOMEM;
		/*
		 * Somebody may have added the page while we
		 * dropped the page cache lock. Check for that.
		 */
		spin_lock(&pagecache_lock);
		page = __find_page_nolock(inode, pos & PAGE_CACHE_MASK, *hash);
		/*
		 * Ok, add the new page to the hash-queues...
		 */
		page = page_cache_entry(page_cache);
		__add_to_page_cache(page, inode, pos & PAGE_CACHE_MASK, hash);
		spin_unlock(&pagecache_lock);

	page_cache_free(page_cache);
	UPDATE_ATIME(inode);
static int file_read_actor(read_descriptor_t * desc, const char *area, unsigned long size)
	unsigned long count = desc->count;

	left = __copy_to_user(desc->buf, area, size);
		desc->error = -EFAULT;
	desc->count = count - size;
	desc->written += size;
1146 * This is the "read()" routine for all filesystems
1147 * that can use the page cache directly.
1149 ssize_t
generic_file_read(struct file
* filp
, char * buf
, size_t count
, loff_t
*ppos
)
1154 if (access_ok(VERIFY_WRITE
, buf
, count
)) {
1157 read_descriptor_t desc
;
1163 do_generic_file_read(filp
, ppos
, &desc
, file_read_actor
);
1165 retval
= desc
.written
;
1167 retval
= desc
.error
;
static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned long size)
	unsigned long count = desc->count;
	struct file *file = (struct file *) desc->buf;
	mm_segment_t old_fs;

	written = file->f_op->write(file, area, size, &file->f_pos);
		desc->error = written;
	desc->count = count - written;
	desc->written += written;
asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
	struct file * in_file, * out_file;
	struct inode * in_inode, * out_inode;

	/*
	 * Get input file, and verify that it is ok..
	 */
	in_file = fget(in_fd);
	if (!(in_file->f_mode & FMODE_READ))
	in_inode = in_file->f_dentry->d_inode;
	if (!in_inode->i_op || !in_inode->i_op->readpage)
	retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);

	/*
	 * Get output file, and verify that it is ok..
	 */
	out_file = fget(out_fd);
	if (!(out_file->f_mode & FMODE_WRITE))
	if (!out_file->f_op || !out_file->f_op->write)
	out_inode = out_file->f_dentry->d_inode;
	retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);

		read_descriptor_t desc;
		loff_t pos = 0, *ppos;

		ppos = &in_file->f_pos;
		if (get_user(pos, offset))

		desc.buf = (char *) out_file;

		do_generic_file_read(in_file, ppos, &desc, file_send_actor);

		retval = desc.written;
			retval = desc.error;
		put_user(pos, offset);
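/*
 * Illustrative user-space sketch (not part of this file): how the syscall
 * above is typically used - copy an already-open input file to an
 * already-open output descriptor.  "in_fd", "out_fd" and "st" are assumed
 * to be set up by the caller.
 */
#if 0
#include <sys/sendfile.h>
#include <sys/stat.h>

static int copy_file(int out_fd, int in_fd, const struct stat *st)
{
	off_t offset = 0;		/* start reading at the beginning of the input */
	ssize_t sent = sendfile(out_fd, in_fd, &offset, st->st_size);
	return sent == st->st_size ? 0 : -1;
}
#endif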
/*
 * Semantics for shared and private memory areas are different past the end
 * of the file. A shared mapping past the last page of the file is an error
 * and results in a SIGBUS, while a private mapping just maps in a zero page.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 *
 * WSH 06/04/97: fixed a memory leak and moved the allocation of new_page
 * ahead of the wait if we're sure to need it.
 */
static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
	struct file * file = area->vm_file;
	struct dentry * dentry = file->f_dentry;
	struct inode * inode = dentry->d_inode;
	unsigned long offset, reada, i;
	struct page * page, **hash;
	unsigned long old_page, new_page;

	offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
	if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)

	/*
	 * Do we have something in the page cache already?
	 */
	hash = page_hash(inode, offset);
	page = __find_get_page(inode, offset, hash);
		goto no_cached_page;

	/*
	 * Ok, found a page in the page cache, now we need to check
	 * that it's up-to-date. First check whether we'll need an
	 * extra page -- better to overlap the allocation with the I/O.
	 */
	if (no_share && !new_page) {
		new_page = page_cache_alloc();

	if (!Page_Uptodate(page)) {
		if (!Page_Uptodate(page))
			goto page_not_uptodate;

	/*
	 * Found the page and have a reference on it, need to check sharing
	 * and possibly copy it over to another page..
	 */
	old_page = page_address(page);
		/*
		 * Ok, we can share the cached page directly.. Get rid
		 * of any potential extra pages.
		 */
			page_cache_free(new_page);
		flush_page_to_ram(old_page);

	/*
	 * No sharing ... copy to the new page.
	 */
	copy_page(new_page, old_page);
	flush_page_to_ram(new_page);
	page_cache_release(page);

	/*
	 * Try to read in an entire cluster at once.
	 */
	reada >>= PAGE_CACHE_SHIFT + page_cluster;
	reada <<= PAGE_CACHE_SHIFT + page_cluster;

	for (i = 1 << page_cluster; i > 0; --i, reada += PAGE_CACHE_SIZE)
		new_page = try_to_read_ahead(file, reada, new_page);
	new_page = page_cache_alloc();

	/*
	 * During getting the above page we might have slept,
	 * so we need to re-check the situation with the page
	 * cache.. The page we just got may be useful if we
	 * can't share, so don't get rid of it here.
	 */
	page = __find_get_page(inode, offset, hash);

	/*
	 * Now, create a new page-cache page from the page we got
	 */
	page = page_cache_entry(new_page);
	if (add_to_page_cache_unique(page, inode, offset, hash))

	/*
	 * Now it's ours and locked, we can do initial IO to it:
	 */
	error = inode->i_op->readpage(file, page);
		if (PageError(page))
			goto page_read_error;

	/*
	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	 */
	if (!PageLocked(page))
	ClearPageError(page);
	error = inode->i_op->readpage(file, page);
	if (Page_Uptodate(page))

	/*
	 * Things didn't work out. Return zero to tell the
	 * mm layer so, possibly freeing the page cache page first.
	 */
	page_cache_release(page);
		page_cache_free(new_page);
/*
 * Tries to write a shared mapped page to its backing store. May return -EIO
 * if the disk is full.
 */
static inline int do_write_page(struct inode * inode, struct file * file,
	const char * page_addr, unsigned long offset)
	int (*writepage) (struct file *, struct page *);

	size = offset + PAGE_SIZE;
	/* refuse to extend file size.. */
	if (S_ISREG(inode->i_mode)) {
		if (size > inode->i_size)
			size = inode->i_size;
		/* Ho humm.. We should have tested for this earlier */
	writepage = inode->i_op->writepage;
	page = mem_map + MAP_NR(page_addr);
	retval = writepage(file, page);
* vma
,
1465 unsigned long offset
,
1471 struct dentry
* dentry
;
1472 struct inode
* inode
;
1474 file
= vma
->vm_file
;
1475 dentry
= file
->f_dentry
;
1476 inode
= dentry
->d_inode
;
1479 * If a task terminates while we're swapping the page, the vma and
1480 * and file could be released ... increment the count to be safe.
1483 result
= do_write_page(inode
, file
, (const char *) page
, offset
);
/*
 * The page cache takes care of races between somebody
 * trying to swap something out and swap something in
 * at the same time..
 */
extern void wakeup_bdflush(int);
int filemap_swapout(struct vm_area_struct * vma, struct page * page)
	int retval = filemap_write_page(vma, page->offset, page_address(page), 0);
static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
	unsigned long address, unsigned int flags)
	unsigned long pageaddr;

	if (!(flags & MS_INVALIDATE)) {
		if (!pte_present(pte))
		if (!pte_dirty(pte))
		flush_page_to_ram(pte_page(pte));
		flush_cache_page(vma, address);
		set_pte(ptep, pte_mkclean(pte));
		flush_tlb_page(vma, address);
		pageaddr = pte_page(pte);
		page = page_cache_entry(pageaddr);
		flush_cache_page(vma, address);
		flush_tlb_page(vma, address);
		if (!pte_present(pte)) {
			swap_free(pte_val(pte));
		pageaddr = pte_page(pte);
		if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
			page_cache_free(pageaddr);
	error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, pageaddr, 1);
	page_cache_free(pageaddr);
static inline int filemap_sync_pte_range(pmd_t * pmd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
	if (pmd_bad(*pmd)) {
		printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
	pte = pte_offset(pmd, address);
	offset += address & PMD_MASK;
	address &= ~PMD_MASK;
	end = address + size;
		error |= filemap_sync_pte(pte, vma, address + offset, flags);
		address += PAGE_SIZE;
	} while (address < end);
static inline int filemap_sync_pmd_range(pgd_t * pgd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned int flags)
	unsigned long offset, end;

	if (pgd_bad(*pgd)) {
		printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
	pmd = pmd_offset(pgd, address);
	offset = address & PGDIR_MASK;
	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
		address = (address + PMD_SIZE) & PMD_MASK;
	} while (address < end);
static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
	size_t size, unsigned int flags)
	unsigned long end = address + size;

	dir = pgd_offset(vma->vm_mm, address);
	flush_cache_range(vma->vm_mm, end - size, end);
	while (address < end) {
		error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
	flush_tlb_range(vma->vm_mm, end - size, end);

/*
 * This handles (potentially partial) area unmaps..
 */
static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
	filemap_sync(vma, start, len, MS_ASYNC);
/*
 * Shared mappings need to be able to do the right thing at
 * close/unmap/sync. They will also use the private file as
 * backing-store for swapping..
 */
static struct vm_operations_struct file_shared_mmap = {
	NULL,			/* no special open */
	NULL,			/* no special close */
	filemap_unmap,		/* unmap - we need to sync the pages */
	NULL,			/* no special protect */
	filemap_sync,		/* sync */
	filemap_nopage,		/* nopage */
	filemap_swapout		/* swapout */

/*
 * Private mappings just need to be able to load in the map.
 *
 * (This is actually used for shared mappings as well, if we
 * know they can't ever get write permissions..)
 */
static struct vm_operations_struct file_private_mmap = {
	filemap_nopage,		/* nopage */

/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
	struct vm_operations_struct * ops;
	struct inode *inode = file->f_dentry->d_inode;

	ops = &file_private_mmap;
	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
		if (!inode->i_op || !inode->i_op->writepage)
		ops = &file_shared_mmap;
	if (!inode->i_sb || !S_ISREG(inode->i_mode))
	if (!inode->i_op || !inode->i_op->readpage)
	UPDATE_ATIME(inode);
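/*
 * Illustrative sketch (hypothetical "examplefs", not part of this file): a
 * disk filesystem normally has no mmap logic of its own - its
 * file_operations mmap entry just delegates to generic_file_mmap(), provided
 * its inode_operations supply readpage() (and writepage() if writable shared
 * mappings are wanted, as checked above).
 */
#if 0
static int examplefs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	return generic_file_mmap(file, vma);
}
#endif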
/*
 * The msync() system call.
 */
static int msync_interval(struct vm_area_struct * vma,
	unsigned long start, unsigned long end, int flags)
	if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
		error = vma->vm_ops->sync(vma, start, end-start, flags);
		if (!error && (flags & MS_SYNC)) {
			struct file * file = vma->vm_file;
			struct dentry * dentry = file->f_dentry;
			error = file_fsync(file, dentry);

asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
	struct vm_area_struct * vma;
	int unmapped_error, error = -EINVAL;

	down(&current->mm->mmap_sem);
	if (start & ~PAGE_MASK)
	len = (len + ~PAGE_MASK) & PAGE_MASK;
	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
	/*
	 * If the interval [start,end) covers some unmapped address ranges,
	 * just ignore them, but return -EFAULT at the end.
	 */
	vma = find_vma(current->mm, start);
		/* Still start < end. */
		/* Here start < vma->vm_end. */
		if (start < vma->vm_start) {
			unmapped_error = -EFAULT;
			start = vma->vm_start;
		/* Here vma->vm_start <= start < vma->vm_end. */
		if (end <= vma->vm_end) {
			error = msync_interval(vma, start, end, flags);
			error = unmapped_error;
		/* Here vma->vm_start <= start < vma->vm_end < end. */
		error = msync_interval(vma, start, vma->vm_end, flags);
		start = vma->vm_end;
	up(&current->mm->mmap_sem);
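/*
 * Illustrative user-space sketch (not part of this file): the usual way the
 * syscall above is reached - write through a shared mapping, then msync()
 * the range synchronously.  "fd" and "length" are assumed to be set up by
 * the caller, with "length" a multiple of the page size.
 */
#if 0
#include <sys/mman.h>

static int update_and_sync(int fd, size_t length)
{
	char *map = mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		return -1;
	map[0] = 'x';				/* dirty the first page */
	return msync(map, length, MS_SYNC);	/* flush it to the backing file */
}
#endif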
/*
 * Write to a file through the page cache. This is mainly for the
 * benefit of NFS and possibly other network-based file systems.
 *
 * We currently put everything into the page cache prior to writing it.
 * This is not a problem when writing full pages. With partial pages,
 * however, we first have to read the data into the cache, then
 * dirty the page, and finally schedule it for writing. Alternatively, we
 * could write-through just the portion of data that would go into that
 * page, but that would kill performance for applications that write data
 * line by line, and it's prone to race conditions.
 *
 * Note that this routine doesn't try to keep track of dirty pages. Each
 * file system has to do this all by itself, unfortunately.
 */
ssize_t
generic_file_write(struct file *file, const char *buf,
		   size_t count, loff_t *ppos,
		   writepage_t write_one_page)
	struct dentry	*dentry = file->f_dentry;
	struct inode	*inode = dentry->d_inode;
	unsigned long	pos = *ppos;
	unsigned long	limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
	struct page	*page, **hash;
	unsigned long	page_cache = 0;
	unsigned long	written;

	err = file->f_error;

	if (file->f_flags & O_APPEND)
		pos = inode->i_size;

	/*
	 * Check whether we've reached the file size limit.
	 */
		send_sig(SIGXFSZ, current, 0);

	/*
	 * Check whether to truncate the write,
	 * and send the signal if we do.
	 */
	if (count > limit - pos) {
		send_sig(SIGXFSZ, current, 0);
		count = limit - pos;

		unsigned long bytes, pgpos, offset;
		/*
		 * Try to find the page in the cache. If it isn't there,
		 * allocate a free page.
		 */
		offset = (pos & ~PAGE_CACHE_MASK);
		pgpos = pos & PAGE_CACHE_MASK;
		bytes = PAGE_CACHE_SIZE - offset;

		hash = page_hash(inode, pgpos);
		page = __find_lock_page(inode, pgpos, hash);
			page_cache = page_cache_alloc();
			page = page_cache_entry(page_cache);
			if (add_to_page_cache_unique(page,inode,pgpos,hash))

		/* We have exclusive IO access to the page.. */
		if (!PageLocked(page)) {
			if (page->owner != current) {
		status = write_one_page(file, page, offset, bytes, buf);

		/* Mark it unlocked again and drop the page.. */
		page_cache_release(page);

	if (pos > inode->i_size)
		inode->i_size = pos;

		page_cache_free(page_cache);
	err = written ? written : status;
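/*
 * Illustrative sketch (hypothetical "examplefs", not part of this file): a
 * filesystem's write() method can be a thin wrapper that hands its own
 * per-page writer to generic_file_write().  The writer's argument order
 * (file, page, offset, bytes, buf) and the writepage_t return type are
 * assumed from the write_one_page() call above.
 */
#if 0
static long examplefs_write_one_page(struct file *file, struct page *page,
	unsigned long offset, unsigned long bytes, const char *buf)
{
	/* copy "bytes" bytes from buf into the page at "offset" and start IO */
	return bytes;
}

static ssize_t examplefs_file_write(struct file *file, const char *buf,
	size_t count, loff_t *ppos)
{
	return generic_file_write(file, buf, count, ppos, examplefs_write_one_page);
}
#endif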
/*
 * Support routines for directory caching using the page cache.
 */

/*
 * Unlock and free a page.
 */
void put_cached_page(unsigned long addr)
	struct page * page = page_cache_entry(addr);

	if (page_count(page) != 2)
		panic("put_cached_page: page count=%d\n",
			page_count(page));
	page_cache_release(page);

void __init page_cache_init(unsigned long memory_size)
	unsigned long htable_size, order;

	htable_size = memory_size >> PAGE_SHIFT;
	htable_size *= sizeof(struct page *);
	for(order = 0; (PAGE_SIZE << order) < htable_size; order++)

		unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);

		while((tmp >>= 1UL) != 0UL)

		page_hash_table = (struct page **)
			__get_free_pages(GFP_ATOMIC, order);
	} while(page_hash_table == NULL && --order > 0);

	printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
		(1 << page_hash_bits), order, (PAGE_SIZE << order));
	if (!page_hash_table)
		panic("Failed to allocate page hash table\n");
	memset(page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));