 * Copyright (C) 1994-1999  Linus Torvalds
 *
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
#include <linux/malloc.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/locks.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/swapctl.h>
#include <linux/slab.h>
#include <linux/init.h>

#include <asm/pgtable.h>
#include <asm/uaccess.h>
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
atomic_t page_cache_size = ATOMIC_INIT(0);
unsigned int page_hash_bits;
struct page **page_hash_table;

spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
void __add_page_to_hash_queue(struct page * page, struct page **p)
	atomic_inc(&page_cache_size);
	if ((page->next_hash = *p) != NULL)
		(*p)->pprev_hash = &page->next_hash;
static void remove_page_from_hash_queue(struct page * page)
	if (page->pprev_hash) {
		page->next_hash->pprev_hash = page->pprev_hash;
		*page->pprev_hash = page->next_hash;
		page->pprev_hash = NULL;
	atomic_dec(&page_cache_size);
static void remove_page_from_inode_queue(struct page * page)
	struct inode * inode = page->inode;
	struct page *prev, *next;

	if (inode->i_pages == page)
		inode->i_pages = next;
 * Remove a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
void remove_inode_page(struct page *page)
	if (!PageLocked(page))
	spin_lock(&pagecache_lock);
	remove_page_from_inode_queue(page);
	remove_page_from_hash_queue(page);
	spin_unlock(&pagecache_lock);
void invalidate_inode_pages(struct inode * inode)
	spin_lock(&pagecache_lock);
	while ((page = *p) != NULL) {
		if (TryLockPage(page)) {
			spin_unlock(&pagecache_lock);
			page_cache_release(page);
		if (page_count(page) != 2)
			printk("hm, busy page invalidated? (not necessarily a bug)\n");
		remove_page_from_inode_queue(page);
		remove_page_from_hash_queue(page);
		page_cache_release(page);
		page_cache_release(page);
	spin_unlock(&pagecache_lock);
 * Truncate the page cache at a set offset, removing the pages
 * that are beyond that offset (and zeroing out partial pages).
void truncate_inode_pages(struct inode * inode, unsigned long start)
	spin_lock(&pagecache_lock);
	while ((page = *p) != NULL) {
		unsigned long offset = page->offset;

		/* page wholly truncated - free it */
		if (offset >= start) {
			spin_unlock(&pagecache_lock);
			if (inode->i_op->flushpage)
				inode->i_op->flushpage(inode, page, 0);
			 * We remove the page from the page cache
			 * _after_ we have destroyed all buffer-cache
			 * references to it. Otherwise some other process
			 * might think this inode page is not in the
			 * page cache and create a buffer-cache alias
			 * to it, causing all sorts of fun problems ...
			remove_inode_page(page);
			page_cache_release(page);
			page_cache_release(page);
			 * We have done things without the pagecache lock,
			 * so we'll have to repeat the scan.
			 * It's not possible to deadlock here because
			 * we are guaranteed to make progress. (ie. we have
			 * just removed a page)

		 * there is only one partial page possible.
		offset = start - offset;
		/* partial truncate, clear end of page */
		if (offset < PAGE_CACHE_SIZE) {
			unsigned long address;

			spin_unlock(&pagecache_lock);
			address = page_address(page);
			memset((void *) (offset + address), 0, PAGE_CACHE_SIZE - offset);
			flush_page_to_ram(address);
			if (inode->i_op->flushpage)
				inode->i_op->flushpage(inode, page, offset);
			 * we have dropped the spinlock so we have to
			page_cache_release(page);

	spin_unlock(&pagecache_lock);
extern atomic_t too_many_dirty_buffers;

int shrink_mmap(int priority, int gfp_mask)
	static unsigned long clock = 0;
	unsigned long limit = num_physpages << 1;

	count = limit >> priority;

	page = mem_map + clock;

		/* This works even in the presence of PageSkip because
		 * the first two entries at the beginning of a hole will
		 * be marked, not just the first.
		if (clock >= max_mapnr) {
		if (PageSkip(page)) {
			/* next_hash is overloaded for PageSkip */
			page = page->next_hash;
			clock = page - mem_map;

		referenced = test_and_clear_bit(PG_referenced, &page->flags);

		if ((gfp_mask & __GFP_DMA) && !PageDMA(page))

		 * Some common cases that we just short-circuit without
		 * getting the locks - we need to re-check this once we
		 * have the lock, but that's fine.
		users = page_count(page);
		if (!page->buffers) {

		 * ok, now the page looks interesting. Re-check things
		spin_lock(&pagecache_lock);
		if (!page->inode && !page->buffers) {
			spin_unlock(&pagecache_lock);
		if (!page_count(page)) {
			spin_unlock(&pagecache_lock);
		if (TryLockPage(page)) {
			spin_unlock(&pagecache_lock);

		 * we keep pagecache_lock locked and unlock it in
		 * each branch, so that the page->inode case doesn't
		 * have to re-grab it. Here comes the 'real' logic
	/* Is it a buffer page? */
		int mem = page->inode ? 0 : PAGE_CACHE_SIZE;
		spin_unlock(&pagecache_lock);
		if (!try_to_free_buffers(page))
			goto unlock_continue;
		atomic_sub(mem, &buffermem);
		spin_lock(&pagecache_lock);

	 * We can't free pages unless there's just one user
	 * (count == 2 because we added one ourselves above).
	if (page_count(page) != 2)
		goto spin_unlock_continue;

	 * Is it a swap-cache page? If so, we want to
	 * drop it if it is no longer used, even if it
	 * were to be marked referenced..
	if (PageSwapCache(page)) {
		spin_unlock(&pagecache_lock);
		if (referenced && swap_count(page->offset) != 2)
			goto unlock_continue;
		__delete_from_swap_cache(page);
		page_cache_release(page);

	/* is it a page-cache page? */
	if (!referenced && page->inode && !pgcache_under_min()) {
		remove_page_from_inode_queue(page);
		remove_page_from_hash_queue(page);
		spin_unlock(&pagecache_lock);
		page_cache_release(page);

spin_unlock_continue:
	spin_unlock(&pagecache_lock);
static inline struct page * __find_page_nolock(struct inode * inode, unsigned long offset, struct page *page)
		page = page->next_hash;
		if (page->inode != inode)
		if (page->offset == offset)

	set_bit(PG_referenced, &page->flags);
 * By the time this is called, the page is locked and
 * we don't have to worry about any races any more.
static int writeout_one_page(struct page *page)
	struct buffer_head *bh, *head = page->buffers;

		if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
		ll_rw_block(WRITE, 1, &bh);
	} while ((bh = bh->b_this_page) != head);

static int waitfor_one_page(struct page *page)
	struct buffer_head *bh, *head = page->buffers;

		if (buffer_req(bh) && !buffer_uptodate(bh))
	} while ((bh = bh->b_this_page) != head);
static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end, int (*fn)(struct page *))
	spin_lock(&pagecache_lock);
	next = inode->i_pages;
		struct page *page = next;

		if (page->offset >= end)
		if (page->offset < start)

		spin_unlock(&pagecache_lock);

		/* The buffers could have been free'd while we waited for the page lock */

		spin_lock(&pagecache_lock);
		page_cache_release(page);
	spin_unlock(&pagecache_lock);
 * Two-stage data sync: first start the IO, then go back and
 * collect the information..
int generic_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end)
	retval = do_buffer_fdatasync(inode, start, end, writeout_one_page);
	retval |= do_buffer_fdatasync(inode, start, end, waitfor_one_page);
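/*
 * Illustrative sketch, not part of the original file: a filesystem that
 * wants "write out and then wait for the cached pages of this inode"
 * semantics in its fsync/fdatasync method could simply wrap the helper
 * above.  The function name is made up for the example; passing ~0UL as
 * the end offset just means "no upper bound".
 */
static int example_fdatasync_whole_file(struct inode *inode)
{
	/* pass 1 starts the IO, pass 2 waits for it (see above) */
	return generic_buffer_fdatasync(inode, 0, ~0UL);
}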
 * This adds a page to the page cache, starting out as locked,
 * owned by us, referenced, but not uptodate and with no errors.
static inline void __add_to_page_cache(struct page * page,
	struct inode * inode, unsigned long offset,

	flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error));
	page->flags = flags | ((1 << PG_locked) | (1 << PG_referenced));
	page->owner = current;	/* REMOVEME */
	page->offset = offset;
	add_page_to_inode_queue(inode, page);
	__add_page_to_hash_queue(page, hash);

void add_to_page_cache(struct page * page, struct inode * inode, unsigned long offset)
	spin_lock(&pagecache_lock);
	__add_to_page_cache(page, inode, offset, page_hash(inode, offset));
	spin_unlock(&pagecache_lock);
int add_to_page_cache_unique(struct page * page,
	struct inode * inode, unsigned long offset,

	spin_lock(&pagecache_lock);
	alias = __find_page_nolock(inode, offset, *hash);
		__add_to_page_cache(page, inode, offset, hash);
	spin_unlock(&pagecache_lock);
 * Try to read ahead in the file. "page_cache" is a potentially free page
 * that we could use for the cache (if it is 0 we can try to create one,
 * this is all overlapped with the IO on the previous page finishing anyway)
static unsigned long try_to_read_ahead(struct file * file,
	unsigned long offset, unsigned long page_cache)
	struct inode *inode = file->f_dentry->d_inode;

	offset &= PAGE_CACHE_MASK;
	switch (page_cache) {
		page_cache = page_cache_alloc();
		if (offset >= inode->i_size)
		hash = page_hash(inode, offset);
		page = page_cache_entry(page_cache);
		if (!add_to_page_cache_unique(page, inode, offset, hash)) {
			 * We do not have to check the return value here
			 * because it's a readahead.
			inode->i_op->readpage(file, page);
			page_cache_release(page);
 * Wait for a page to get unlocked.
 *
 * This must be called with the caller "holding" the page,
 * ie with increased "page->count" so that the page won't
 * go away during the wait..
void ___wait_on_page(struct page *page)
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	add_wait_queue(&page->wait, &wait);
		tsk->state = TASK_UNINTERRUPTIBLE;
		run_task_queue(&tq_disk);
		if (!PageLocked(page))
	} while (PageLocked(page));
	tsk->state = TASK_RUNNING;
	remove_wait_queue(&page->wait, &wait);
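/*
 * Illustrative sketch, not part of the original file: what a caller of
 * the routine above has to do.  It pins the page with an extra reference
 * first, exactly as the comment above requires, so the page cannot be
 * freed while we sleep; the function name is made up for the example.
 */
static void example_wait_for_page(struct page *page)
{
	atomic_inc(&page->count);		/* "hold" the page */
	if (PageLocked(page))
		___wait_on_page(page);		/* sleep until PG_locked clears */
	page_cache_release(page);		/* drop the extra reference */
}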
 * Get an exclusive lock on the page..
void lock_page(struct page *page)
	if (TryLockPage(page)) {
		struct task_struct *tsk = current;
		DECLARE_WAITQUEUE(wait, current);

		run_task_queue(&tq_disk);
		add_wait_queue(&page->wait, &wait);
		tsk->state = TASK_UNINTERRUPTIBLE;

		while (TryLockPage(page)) {
			run_task_queue(&tq_disk);
			tsk->state = TASK_UNINTERRUPTIBLE;

		remove_wait_queue(&page->wait, &wait);
		tsk->state = TASK_RUNNING;
 * a rather lightweight function, finding and getting a reference to a
 * hashed page atomically, waiting for it if it's locked.
struct page * __find_get_page (struct inode * inode,
	unsigned long offset, struct page **hash)
	 * We scan the hash list read-only. Addition to and removal from
	 * the hash-list needs a held write-lock.
	spin_lock(&pagecache_lock);
	page = __find_page_nolock(inode, offset, *hash);
	spin_unlock(&pagecache_lock);

	/* Found the page, sleep if locked. */
	if (page && PageLocked(page)) {
		struct task_struct *tsk = current;
		DECLARE_WAITQUEUE(wait, tsk);

		add_wait_queue(&page->wait, &wait);
		tsk->state = TASK_UNINTERRUPTIBLE;
		run_task_queue(&tq_disk);
		if (PageLocked(page))
		tsk->state = TASK_RUNNING;
		remove_wait_queue(&page->wait, &wait);

		 * The page might have been unhashed meanwhile. It's
		 * not freed though because we hold a reference to it.
		 * If this is the case then it will be freed _here_,
		 * and we recheck the hash anyway.
		page_cache_release(page);

	 * It's not locked so we can return the page and we hold
 * Get the lock to a page atomically.
struct page * __find_lock_page (struct inode * inode,
	unsigned long offset, struct page **hash)
	 * We scan the hash list read-only. Addition to and removal from
	 * the hash-list needs a held write-lock.
	spin_lock(&pagecache_lock);
	page = __find_page_nolock(inode, offset, *hash);
	spin_unlock(&pagecache_lock);

	/* Found the page, sleep if locked. */
	if (page && TryLockPage(page)) {
		struct task_struct *tsk = current;
		DECLARE_WAITQUEUE(wait, tsk);

		add_wait_queue(&page->wait, &wait);
		tsk->state = TASK_UNINTERRUPTIBLE;
		run_task_queue(&tq_disk);
		if (PageLocked(page))
		tsk->state = TASK_RUNNING;
		remove_wait_queue(&page->wait, &wait);

		 * The page might have been unhashed meanwhile. It's
		 * not freed though because we hold a reference to it.
		 * If this is the case then it will be freed _here_,
		 * and we recheck the hash anyway.
		page_cache_release(page);

	 * It's not locked so we can return the page and we hold
#define PROFILE_READAHEAD
#define DEBUG_READAHEAD

 * Read-ahead profiling information
 * --------------------------------
 * Every PROFILE_MAXREADCOUNT, the following information is written
 *   Percentage of asynchronous read-ahead.
 *   Average of the read-ahead context field values.
 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written

#ifdef PROFILE_READAHEAD

#define PROFILE_MAXREADCOUNT 1000

static unsigned long total_reada;
static unsigned long total_async;
static unsigned long total_ramax;
static unsigned long total_ralen;
static unsigned long total_rawin;

static void profile_readahead(int async, struct file *filp)
	total_ramax += filp->f_ramax;
	total_ralen += filp->f_ralen;
	total_rawin += filp->f_rawin;

	if (total_reada > PROFILE_MAXREADCOUNT) {
		if (!(total_reada > PROFILE_MAXREADCOUNT)) {
			restore_flags(flags);

		printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
			total_ramax/total_reada,
			total_ralen/total_reada,
			total_rawin/total_reada,
			(total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
		printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
			filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);

		restore_flags(flags);

#endif /* defined PROFILE_READAHEAD */
 * Read-ahead context:
 * -------------------
 * The read ahead context fields of the "struct file" are the following:
 * - f_raend : position of the first byte after the last page we tried to
 * - f_ramax : current read-ahead maximum size.
 * - f_ralen : length of the current IO read block we tried to read-ahead.
 * - f_rawin : length of the current read-ahead window.
 *		if last read-ahead was synchronous then
 *			f_rawin = f_ralen
 *		otherwise (was asynchronous)
 *			f_rawin = previous value of f_ralen + f_ralen
 *
 * MIN_READAHEAD : minimum read-ahead size when read-ahead.
 * MAX_READAHEAD : maximum read-ahead size when read-ahead.
 *
 * Synchronous read-ahead benefits:
 * --------------------------------
 * Using reasonable IO xfer length from peripheral devices increases system
 * Reasonable means, in this context, not too large but not too small.
 * The actual maximum value is:
 *	MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
 *	and 32K if defined (4K page size assumed).
 *
 * Asynchronous read-ahead benefits:
 * ---------------------------------
 * Overlapping the next read request and user process execution increases system
 * We have to guess which further data are needed by the user process.
 * If these data are often not really needed, it's bad for system
 * However, we know that files are often accessed sequentially by
 * application programs and it seems that it is possible to have some good
 * strategy in that guessing.
 * We only try to read ahead files that seem to be read sequentially.
 *
 * Asynchronous read-ahead risks:
 * ------------------------------
 * In order to maximize overlapping, we must start some asynchronous read
 * request from the device, as soon as possible.
 * We must be very careful about:
 * - The number of effective pending IO read requests.
 *   ONE seems to be the only reasonable value.
 * - The total memory pool usage for the file access stream.
 *   This maximum memory usage is implicitly 2 IO read chunks:
 *	2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
 *	64k if defined (4K page size assumed).
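/*
 * Illustrative sketch, not part of the original file: the f_ralen/f_rawin
 * book-keeping described above, written out as plain C.  The fields are
 * the real struct file read-ahead fields; the helper itself and its
 * "sync" flag are made up for the example.
 */
static void example_update_readahead_window(struct file *filp,
					    unsigned long ralen, int sync)
{
	if (sync) {
		/* synchronous read-ahead: the window is just this block */
		filp->f_ralen = ralen;
		filp->f_rawin = ralen;
	} else {
		/* asynchronous: the window also spans the previous block */
		filp->f_rawin = filp->f_ralen + ralen;
		filp->f_ralen = ralen;
	}
}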
static inline int get_max_readahead(struct inode * inode)
	if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
		return MAX_READAHEAD;
	return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
static inline unsigned long generic_file_readahead(int reada_ok,
	struct file * filp, struct inode * inode,
	unsigned long ppos, struct page * page, unsigned long page_cache)
	unsigned long max_ahead, ahead;

	int max_readahead = get_max_readahead(inode);

	raend = filp->f_raend & PAGE_CACHE_MASK;

	 * The current page is locked.
	 * If the current position is inside the previous read IO request, do not
	 * try to reread previously read ahead pages.
	 * Otherwise decide whether or not to read ahead some pages synchronously.
	 * If we are not going to read ahead, set the read ahead context for this
	if (PageLocked(page)) {
		if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
			if (raend < inode->i_size)
				max_ahead = filp->f_ramax;
			filp->f_ralen = PAGE_CACHE_SIZE;
			filp->f_raend = ppos + filp->f_ralen;
			filp->f_rawin += filp->f_ralen;

	 * The current page is not locked.
	 * If we were reading ahead and,
	 * if the current max read ahead size is not zero and,
	 * if the current position is inside the last read-ahead IO request,
	 * it is the moment to try to read ahead asynchronously.
	 * We will later force unplug of the device in order to force asynchronous read IO.
	else if (reada_ok && filp->f_ramax && raend >= PAGE_CACHE_SIZE &&
		 ppos <= raend && ppos + filp->f_ralen >= raend) {
		 * Add ONE page to max_ahead in order to try to have about the same IO max size
		 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
		 * Compute the position of the last page we have tried to read in order to
		 * begin to read ahead just at the next page.
		raend -= PAGE_CACHE_SIZE;
		if (raend < inode->i_size)
			max_ahead = filp->f_ramax + PAGE_CACHE_SIZE;

		filp->f_rawin = filp->f_ralen;

	 * Try to read ahead pages.
	 * We hope that ll_rw_blk() plug/unplug, coalescence, request sorting and the
	 * scheduler will work well enough for us to avoid overly bad actual IO requests.
	while (ahead < max_ahead) {
		ahead += PAGE_CACHE_SIZE;
		page_cache = try_to_read_ahead(filp, raend + ahead,

	 * If we tried to read ahead some pages,
	 * If we tried to read ahead asynchronously,
	 * Try to force unplug of the device in order to start an asynchronous
	 * Update the read-ahead context.
	 * Store the length of the current read-ahead window.
	 * Double the current max read ahead size.
	 * That heuristic avoids doing large IO for files that are not really
	 * accessed sequentially.
		run_task_queue(&tq_disk);

		filp->f_ralen += ahead;
		filp->f_rawin += filp->f_ralen;
		filp->f_raend = raend + ahead + PAGE_CACHE_SIZE;

		filp->f_ramax += filp->f_ramax;

		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;

#ifdef PROFILE_READAHEAD
		profile_readahead((reada_ok == 2), filp);
 * "descriptor" for what we're up to with a read.
 * This allows us to use the same read code yet
 * have multiple different users of the data that
 * we read from a file.
 *
 * The simplest case just copies the data to user
typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long);
 * This is a generic file read routine, and uses the
 * inode->i_op->readpage() function for the actual low-level
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
	struct dentry *dentry = filp->f_dentry;
	struct inode *inode = dentry->d_inode;
	size_t pos, pgpos, page_cache;

	int max_readahead = get_max_readahead(inode);

	pgpos = pos & PAGE_CACHE_MASK;
	 * If the current position is outside the previous read-ahead window,
	 * we reset the current read-ahead context and set read ahead max to zero
	 * (will be set to the just-needed value later),
	 * otherwise, we assume that the file accesses are sequential enough to
	 * continue read-ahead.
	if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) {

	 * Adjust the current value of read-ahead max.
	 * If the read operation stays in the first half page, force no readahead.
	 * Otherwise try to increase read ahead max just enough to do the read request.
	 * Then, at least MIN_READAHEAD if read ahead is ok,
	 * and at most MAX_READAHEAD in all cases.
	if (pos + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
		unsigned long needed;

		needed = ((pos + desc->count) & PAGE_CACHE_MASK) - pgpos;

		if (filp->f_ramax < needed)
			filp->f_ramax = needed;

		if (reada_ok && filp->f_ramax < MIN_READAHEAD)
			filp->f_ramax = MIN_READAHEAD;
		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;
		struct page *page, **hash;

		if (pos >= inode->i_size)

		 * Try to find the data in the page cache..
		hash = page_hash(inode, pos & PAGE_CACHE_MASK);

		spin_lock(&pagecache_lock);
		page = __find_page_nolock(inode, pos & PAGE_CACHE_MASK, *hash);
			goto no_cached_page;

		spin_unlock(&pagecache_lock);

		if (!Page_Uptodate(page))
			goto page_not_up_to_date;

	 * Ok, we have the page, and it's up-to-date, so
	 * now we can copy it to user space...
		unsigned long offset, nr;

		offset = pos & ~PAGE_CACHE_MASK;
		nr = PAGE_CACHE_SIZE - offset;
		if (nr > inode->i_size - pos)
			nr = inode->i_size - pos;

		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		nr = actor(desc, (const char *) (page_address(page) + offset), nr);

		page_cache_release(page);
		if (nr && desc->count)
 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
page_not_up_to_date:
	page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);

	if (Page_Uptodate(page))

	/* Get exclusive access to the page ... */

	if (Page_Uptodate(page)) {

	/* ... and start the actual read. The read will unlock the page. */
	error = inode->i_op->readpage(filp, page);

	if (Page_Uptodate(page))

	/* Again, try some read-ahead while waiting for the page to finish.. */
	page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
	if (Page_Uptodate(page))

	/* UHHUH! A synchronous read error occurred. Report it */
	desc->error = error;
	page_cache_release(page);

 * Ok, it wasn't cached, so we need to create a new
 *
 * We get here with the page cache lock held.
	spin_unlock(&pagecache_lock);
	page_cache = page_cache_alloc();
		desc->error = -ENOMEM;

	 * Somebody may have added the page while we
	 * dropped the page cache lock. Check for that.
	spin_lock(&pagecache_lock);
	page = __find_page_nolock(inode, pos & PAGE_CACHE_MASK, *hash);

	 * Ok, add the new page to the hash-queues...
	page = page_cache_entry(page_cache);
	__add_to_page_cache(page, inode, pos & PAGE_CACHE_MASK, hash);
	spin_unlock(&pagecache_lock);

	page_cache_free(page_cache);
	UPDATE_ATIME(inode);
static int file_read_actor(read_descriptor_t * desc, const char *area, unsigned long size)
	unsigned long count = desc->count;

	left = __copy_to_user(desc->buf, area, size);
		desc->error = -EFAULT;
	desc->count = count - size;
	desc->written += size;
 * This is the "read()" routine for all filesystems
 * that can use the page cache directly.
ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
	if (access_ok(VERIFY_WRITE, buf, count)) {
		read_descriptor_t desc;

		do_generic_file_read(filp, ppos, &desc, file_read_actor);
			retval = desc.written;
			retval = desc.error;
static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned long size)
	unsigned long count = desc->count;
	struct file *file = (struct file *) desc->buf;
	mm_segment_t old_fs;

	written = file->f_op->write(file, area, size, &file->f_pos);
		desc->error = written;
	desc->count = count - written;
	desc->written += written;
asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
	struct file * in_file, * out_file;
	struct inode * in_inode, * out_inode;

	 * Get input file, and verify that it is ok..
	in_file = fget(in_fd);
	if (!(in_file->f_mode & FMODE_READ))
	in_inode = in_file->f_dentry->d_inode;
	if (!in_inode->i_op || !in_inode->i_op->readpage)
	retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);

	 * Get output file, and verify that it is ok..
	out_file = fget(out_fd);
	if (!(out_file->f_mode & FMODE_WRITE))
	if (!out_file->f_op || !out_file->f_op->write)
	out_inode = out_file->f_dentry->d_inode;
	retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);

		read_descriptor_t desc;
		loff_t pos = 0, *ppos;

		ppos = &in_file->f_pos;
		if (get_user(pos, offset))

		desc.buf = (char *) out_file;

		do_generic_file_read(in_file, ppos, &desc, file_send_actor);

		retval = desc.written;
			retval = desc.error;

		put_user(pos, offset);
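/*
 * Illustrative user-space usage, not part of the original file: the
 * syscall above is what the C library's sendfile(2) wrapper reaches.
 * The descriptors and the count are made up for the example; the #if 0
 * only marks this as an example, it is not kernel code.
 */
#if 0
	off_t pos = 0;
	ssize_t sent = sendfile(out_fd, in_fd, &pos, count);
	/* "pos" comes back advanced, via the put_user() above */
#endif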
 * Semantics for shared and private memory areas are different past the end
 * of the file. A shared mapping past the last page of the file is an error
 * and results in a SIGBUS, while a private mapping just maps in a zero page.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 *
 * WSH 06/04/97: fixed a memory leak and moved the allocation of new_page
 * ahead of the wait if we're sure to need it.
static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
	struct file * file = area->vm_file;
	struct dentry * dentry = file->f_dentry;
	struct inode * inode = dentry->d_inode;
	unsigned long offset, reada, i;
	struct page * page, **hash;
	unsigned long old_page, new_page;

	offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
	if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)

	 * Do we have something in the page cache already?
	hash = page_hash(inode, offset);
	page = __find_get_page(inode, offset, hash);
		goto no_cached_page;

	 * Ok, found a page in the page cache, now we need to check
	 * that it's up-to-date. First check whether we'll need an
	 * extra page -- better to overlap the allocation with the I/O.
	if (no_share && !new_page) {
		new_page = page_cache_alloc();

	if (!Page_Uptodate(page)) {
		if (!Page_Uptodate(page))
			goto page_not_uptodate;

	 * Found the page and have a reference on it, need to check sharing
	 * and possibly copy it over to another page..
	old_page = page_address(page);

		 * Ok, we can share the cached page directly.. Get rid
		 * of any potential extra pages.
			page_cache_free(new_page);

		flush_page_to_ram(old_page);

	 * No sharing ... copy to the new page.
	copy_page(new_page, old_page);
	flush_page_to_ram(new_page);
	page_cache_release(page);
	 * Try to read in an entire cluster at once.
	reada >>= PAGE_CACHE_SHIFT + page_cluster;
	reada <<= PAGE_CACHE_SHIFT + page_cluster;

	for (i = 1 << page_cluster; i > 0; --i, reada += PAGE_CACHE_SIZE)
		new_page = try_to_read_ahead(file, reada, new_page);

		new_page = page_cache_alloc();

	 * During getting the above page we might have slept,
	 * so we need to re-check the situation with the page
	 * cache.. The page we just got may be useful if we
	 * can't share, so don't get rid of it here.
	page = __find_get_page(inode, offset, hash);

	 * Now, create a new page-cache page from the page we got
	page = page_cache_entry(new_page);
	if (add_to_page_cache_unique(page, inode, offset, hash))

	 * Now it's ours and locked, we can do initial IO to it:
	error = inode->i_op->readpage(file, page);

		if (PageError(page))
			goto page_read_error;

	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	if (!PageLocked(page))
	ClearPageError(page);
	error = inode->i_op->readpage(file, page);

	if (Page_Uptodate(page))

	 * Things didn't work out. Return zero to tell the
	 * mm layer so, possibly freeing the page cache page first.
	page_cache_release(page);
	page_cache_free(new_page);
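/*
 * Illustrative user-space view of the semantics documented above
 * filemap_nopage(), not part of the original file.  "fd" refers to a
 * one-byte file, so the second page of each mapping lies past EOF; the
 * names are made up for the example and the #if 0 only marks it as an
 * example, it is not kernel code.
 */
#if 0
	char *shared  = mmap(NULL, 2*PAGE_SIZE, PROT_READ, MAP_SHARED,  fd, 0);
	char *private = mmap(NULL, 2*PAGE_SIZE, PROT_READ, MAP_PRIVATE, fd, 0);

	c = private[PAGE_SIZE];	/* private mapping: reads a zero page */
	c = shared[PAGE_SIZE];	/* shared mapping past EOF: SIGBUS */
#endif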
 * Tries to write a shared mapped page to its backing store. May return -EIO
 * if the disk is full.
static inline int do_write_page(struct inode * inode, struct file * file,
	const char * page_addr, unsigned long offset)
	int (*writepage) (struct file *, struct page *);

	size = offset + PAGE_SIZE;
	/* refuse to extend file size.. */
	if (S_ISREG(inode->i_mode)) {
		if (size > inode->i_size)
			size = inode->i_size;
		/* Ho humm.. We should have tested for this earlier */

	writepage = inode->i_op->writepage;
	page = mem_map + MAP_NR(page_addr);

	retval = writepage(file, page);
static int filemap_write_page(struct vm_area_struct * vma,
	unsigned long offset,
	struct dentry * dentry;
	struct inode * inode;

	file = vma->vm_file;
	dentry = file->f_dentry;
	inode = dentry->d_inode;

	 * If a task terminates while we're swapping the page, the vma and
	 * file could be released ... increment the count to be safe.
	result = do_write_page(inode, file, (const char *) page, offset);
 * The page cache takes care of races between somebody
 * trying to swap something out and swap something in
 * at the same time..
extern void wakeup_bdflush(int);
int filemap_swapout(struct vm_area_struct * vma, struct page * page)
	int retval = filemap_write_page(vma, page->offset, page_address(page), 0);
static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
	unsigned long address, unsigned int flags)
	unsigned long pageaddr;

	if (!(flags & MS_INVALIDATE)) {
		if (!pte_present(pte))
		if (!pte_dirty(pte))
		flush_page_to_ram(pte_page(pte));
		flush_cache_page(vma, address);
		set_pte(ptep, pte_mkclean(pte));
		flush_tlb_page(vma, address);
		pageaddr = pte_page(pte);
		page = page_cache_entry(pageaddr);
		flush_cache_page(vma, address);
		flush_tlb_page(vma, address);
		if (!pte_present(pte)) {
			swap_free(pte_val(pte));
		pageaddr = pte_page(pte);
		if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
			page_cache_free(pageaddr);
	error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, pageaddr, 1);
	page_cache_free(pageaddr);
static inline int filemap_sync_pte_range(pmd_t * pmd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
	if (pmd_bad(*pmd)) {
		printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
	pte = pte_offset(pmd, address);
	offset += address & PMD_MASK;
	address &= ~PMD_MASK;
	end = address + size;
		error |= filemap_sync_pte(pte, vma, address + offset, flags);
		address += PAGE_SIZE;
	} while (address < end);
static inline int filemap_sync_pmd_range(pgd_t * pgd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned int flags)
	unsigned long offset, end;

	if (pgd_bad(*pgd)) {
		printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
	pmd = pmd_offset(pgd, address);
	offset = address & PGDIR_MASK;
	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
		address = (address + PMD_SIZE) & PMD_MASK;
	} while (address < end);
static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
	size_t size, unsigned int flags)
	unsigned long end = address + size;

	dir = pgd_offset(vma->vm_mm, address);
	flush_cache_range(vma->vm_mm, end - size, end);
	while (address < end) {
		error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
	flush_tlb_range(vma->vm_mm, end - size, end);
 * This handles (potentially partial) area unmaps..
static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
	filemap_sync(vma, start, len, MS_ASYNC);
 * Shared mappings need to be able to do the right thing at
 * close/unmap/sync. They will also use the private file as
 * backing-store for swapping..
static struct vm_operations_struct file_shared_mmap = {
	NULL,			/* no special open */
	NULL,			/* no special close */
	filemap_unmap,		/* unmap - we need to sync the pages */
	NULL,			/* no special protect */
	filemap_sync,		/* sync */
	filemap_nopage,		/* nopage */
	filemap_swapout		/* swapout */

 * Private mappings just need to be able to load in the map.
 *
 * (This is actually used for shared mappings as well, if we
 * know they can't ever get write permissions..)
static struct vm_operations_struct file_private_mmap = {
	filemap_nopage,		/* nopage */
/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
	struct vm_operations_struct * ops;
	struct inode *inode = file->f_dentry->d_inode;

	ops = &file_private_mmap;
	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
		if (!inode->i_op || !inode->i_op->writepage)
		ops = &file_shared_mmap;
	if (!inode->i_sb || !S_ISREG(inode->i_mode))
	if (!inode->i_op || !inode->i_op->readpage)
	UPDATE_ATIME(inode);
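/*
 * Illustrative sketch, not part of the original file: roughly how a
 * "normal" filesystem plugs the generic routines into its
 * file_operations, so that read() and mmap() go through the page cache
 * and the filesystem only supplies readpage()/writepage().  It uses
 * gcc's labelled-element initialiser syntax for brevity; the struct
 * name and the fsync method are made up for the example.
 */
#if 0
static struct file_operations example_file_operations = {
	read:	generic_file_read,	/* served from the page cache */
	mmap:	generic_file_mmap,	/* faults handled by filemap_nopage */
	fsync:	example_fsync,		/* filesystem-specific */
};
#endif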
 * The msync() system call.
static int msync_interval(struct vm_area_struct * vma,
	unsigned long start, unsigned long end, int flags)
	if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
		error = vma->vm_ops->sync(vma, start, end-start, flags);
		if (!error && (flags & MS_SYNC)) {
			struct file * file = vma->vm_file;
			struct dentry * dentry = file->f_dentry;
			error = file_fsync(file, dentry);
asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
	struct vm_area_struct * vma;
	int unmapped_error, error = -EINVAL;

	down(&current->mm->mmap_sem);
	if (start & ~PAGE_MASK)
	len = (len + ~PAGE_MASK) & PAGE_MASK;
	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))

	 * If the interval [start,end) covers some unmapped address ranges,
	 * just ignore them, but return -EFAULT at the end.
	vma = find_vma(current->mm, start);
		/* Still start < end. */
		/* Here start < vma->vm_end. */
		if (start < vma->vm_start) {
			unmapped_error = -EFAULT;
			start = vma->vm_start;
		/* Here vma->vm_start <= start < vma->vm_end. */
		if (end <= vma->vm_end) {
				error = msync_interval(vma, start, end, flags);
				error = unmapped_error;
		/* Here vma->vm_start <= start < vma->vm_end < end. */
		error = msync_interval(vma, start, vma->vm_end, flags);
		start = vma->vm_end;
	up(&current->mm->mmap_sem);
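/*
 * Illustrative user-space usage, not part of the original file: after
 * writing through a shared file mapping, a process asks for the dirty
 * pages to be written back.  MS_SYNC makes the call wait until the pages
 * (and, via file_fsync() above, the metadata) have been written.  The
 * #if 0 only marks this as an example, it is not kernel code.
 */
#if 0
	char *map = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	map[0] = 'x';				/* dirty the first page */
	if (msync(map, len, MS_SYNC) < 0)	/* write it back and wait */
		perror("msync");
#endif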
 * Write to a file through the page cache. This is mainly for the
 * benefit of NFS and possibly other network-based file systems.
 *
 * We currently put everything into the page cache prior to writing it.
 * This is not a problem when writing full pages. With partial pages,
 * however, we first have to read the data into the cache, then
 * dirty the page, and finally schedule it for writing. Alternatively, we
 * could write-through just the portion of data that would go into that
 * page, but that would kill performance for applications that write data
 * line by line, and it's prone to race conditions.
 *
 * Note that this routine doesn't try to keep track of dirty pages. Each
 * file system has to do this all by itself, unfortunately.
generic_file_write(struct file *file, const char *buf,
	size_t count, loff_t *ppos,
	writepage_t write_one_page)
	struct dentry *dentry = file->f_dentry;
	struct inode *inode = dentry->d_inode;
	unsigned long pos = *ppos;
	unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
	struct page *page, **hash;
	unsigned long page_cache = 0;
	unsigned long written;

	err = file->f_error;

	if (file->f_flags & O_APPEND)
		pos = inode->i_size;

	 * Check whether we've reached the file size limit.
		send_sig(SIGXFSZ, current, 0);

	 * Check whether to truncate the write,
	 * and send the signal if we do.
	if (count > limit - pos) {
		send_sig(SIGXFSZ, current, 0);
		count = limit - pos;
		unsigned long bytes, pgpos, offset;
		 * Try to find the page in the cache. If it isn't there,
		 * allocate a free page.
		offset = (pos & ~PAGE_CACHE_MASK);
		pgpos = pos & PAGE_CACHE_MASK;
		bytes = PAGE_CACHE_SIZE - offset;

		hash = page_hash(inode, pgpos);

		page = __find_lock_page(inode, pgpos, hash);
			page_cache = page_cache_alloc();

			page = page_cache_entry(page_cache);
			if (add_to_page_cache_unique(page, inode, pgpos, hash))

		/* We have exclusive IO access to the page.. */
		if (!PageLocked(page)) {
			if (page->owner != current) {

		status = write_one_page(file, page, offset, bytes, buf);

		/* Mark it unlocked again and drop the page.. */
		page_cache_release(page);

		if (pos > inode->i_size)
			inode->i_size = pos;

	page_cache_free(page_cache);

	err = written ? written : status;
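/*
 * Illustrative sketch, not part of the original file: the shape of a
 * write_one_page callback, with the argument order taken from the
 * write_one_page() call above (file, locked page, offset within the
 * page, byte count, user buffer) and the return value assumed to be the
 * byte count or a negative error.  A real filesystem would also bring
 * the rest of a partial page up to date and hand the data to the block
 * layer; the function name is made up for the example.
 */
static long example_write_one_page(struct file *file, struct page *page,
	unsigned long offset, unsigned long bytes, const char *buf)
{
	/* copy the user data into the page cache page */
	if (copy_from_user((void *) (page_address(page) + offset), buf, bytes))
		return -EFAULT;
	/* ... schedule the page for write-out here ... */
	return bytes;
}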
 * Support routines for directory caching using the page cache.

 * Unlock and free a page.
void put_cached_page(unsigned long addr)
	struct page * page = page_cache_entry(addr);

	if (page_count(page) != 2)
		panic("put_cached_page: page count=%d\n",
	page_cache_release(page);
void __init page_cache_init(unsigned long memory_size)
	unsigned long htable_size, order;

	htable_size = memory_size >> PAGE_SHIFT;
	htable_size *= sizeof(struct page *);
	for (order = 0; (PAGE_SIZE << order) < htable_size; order++)

		unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);

		while ((tmp >>= 1UL) != 0UL)

		page_hash_table = (struct page **)
			__get_free_pages(GFP_ATOMIC, order);
	} while (page_hash_table == NULL && --order > 0);

	printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
	       (1 << page_hash_bits), order, (PAGE_SIZE << order));
	if (!page_hash_table)
		panic("Failed to allocate page hash table\n");
	memset(page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
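/*
 * Worked example, not part of the original file, assuming 64 MB of
 * memory, 4 kB pages and 4-byte pointers: memory_size >> PAGE_SHIFT is
 * 16384 pages, so htable_size is 64 kB and the sizing loop stops at
 * order 4 (PAGE_SIZE << 4 == 64 kB).  That table holds 16384 hash
 * pointers, so the shift loop (whose counting body is elided above)
 * leaves page_hash_bits at 14.
 */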