/*
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/malloc.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/locks.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/swapctl.h>
#include <linux/slab.h>

#include <asm/pgtable.h>
#include <asm/uaccess.h>
/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 */
atomic_t page_cache_size = ATOMIC_INIT(0);
struct page * page_hash_table[PAGE_HASH_SIZE];
/*
 * Define a request structure for outstanding page write requests
 * to the background page io daemon
 */

struct pio_request {
	struct pio_request *	next;
	struct file *		file;
	unsigned long		offset;
	unsigned long		page;
};
static struct pio_request *pio_first = NULL, **pio_last = &pio_first;
static kmem_cache_t *pio_request_cache;
static DECLARE_WAIT_QUEUE_HEAD(pio_wait);
spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;

static inline void make_pio_request(struct file *, unsigned long, unsigned long);
void __add_page_to_hash_queue(struct page * page, struct page **p)
{
	atomic_inc(&page_cache_size);
	if ((page->next_hash = *p) != NULL)
		(*p)->pprev_hash = &page->next_hash;
	*p = page;
	page->pprev_hash = p;
}
static void remove_page_from_hash_queue(struct page * page)
{
	if (page->pprev_hash) {
		if (page->next_hash)
			page->next_hash->pprev_hash = page->pprev_hash;
		*page->pprev_hash = page->next_hash;
		page->pprev_hash = NULL;
	}
	atomic_dec(&page_cache_size);
}
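/*
 * Note on the hash chain layout (editorial sketch, inferred from the two
 * helpers above): each bucket of page_hash_table is a singly linked list
 * through page->next_hash, while page->pprev_hash points back at whatever
 * "struct page *" slot currently points to this page - either the bucket
 * head itself or the next_hash field of the previous page.  That is why
 * unlinking needs no special case for the first element:
 *
 *	*page->pprev_hash = page->next_hash;
 *
 * rewrites the bucket head and an interior link with the same code.
 */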
void invalidate_inode_pages(struct inode * inode)
	spin_lock(&pagecache_lock);
	while ((page = *p) != NULL) {
		if (TryLockPage(page)) {
			spin_unlock(&pagecache_lock);
			page_cache_release(page);
		if (page_count(page) != 2)
			printk("hm, busy page invalidated? (not necessarily a bug)\n");
		if ((*p = page->next) != NULL)
			(*p)->prev = page->prev;
		remove_page_from_hash_queue(page);
		page_cache_release(page);
		page_cache_release(page);
	spin_unlock(&pagecache_lock);
/*
 * Truncate the page cache at a set offset, removing the pages
 * that are beyond that offset (and zeroing out partial pages).
 */
void truncate_inode_pages(struct inode * inode, unsigned long start)
	spin_lock(&pagecache_lock);
	while ((page = *p) != NULL) {
		unsigned long offset = page->offset;

		/* page wholly truncated - free it */
		if (offset >= start) {
			if (TryLockPage(page)) {
				spin_unlock(&pagecache_lock);
				page_cache_release(page);
			spin_unlock(&pagecache_lock);
			if (inode->i_op->flushpage)
				inode->i_op->flushpage(inode, page, 0);
			/*
			 * We remove the page from the page cache
			 * _after_ we have destroyed all buffer-cache
			 * references to it. Otherwise some other process
			 * might think this inode page is not in the
			 * page cache and create a buffer-cache alias
			 * to it, causing all sorts of fun problems ...
			 */
			spin_lock(&pagecache_lock);
			if ((*p = page->next) != NULL)
				(*p)->prev = page->prev;
			remove_page_from_hash_queue(page);
			spin_unlock(&pagecache_lock);

			page_cache_release(page);
			page_cache_release(page);

			/*
			 * We have done things without the pagecache lock,
			 * so we'll have to repeat the scan.
			 * It's not possible to deadlock here because
			 * we are guaranteed to make progress. (i.e. we have
			 * just removed a page)
			 */
		/* there is only one partial page possible. */
		offset = start - offset;
		/* partial truncate, clear end of page */
		if (offset < PAGE_CACHE_SIZE) {
			unsigned long address;
			if (TryLockPage(page)) {
				spin_unlock(&pagecache_lock);
				page_cache_release(page);
			/*
			 * It's worth dropping the write lock only at
			 * this point. We are holding the page lock
			 * so nobody can do anything bad to us.
			 */
			spin_unlock(&pagecache_lock);
			address = page_address(page);
			memset((void *) (offset + address), 0, PAGE_CACHE_SIZE - offset);
			flush_page_to_ram(address);
			if (inode->i_op->flushpage)
				inode->i_op->flushpage(inode, page, offset);
			/* we have dropped the spinlock so we have to ... */
			page_cache_release(page);
	spin_unlock(&pagecache_lock);
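/*
 * Worked example for the partial-page case in truncate_inode_pages()
 * above (editorial note): with a 4K PAGE_CACHE_SIZE, truncating at
 * start = 10240 leaves the page at offset 8192 as the single partial
 * page.  "offset" then becomes 10240 - 8192 = 2048, and the memset()
 * clears bytes 2048..4095 of that page while bytes 0..2047 survive
 * the truncate.
 */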
/*
 * Remove a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
 * is safe.
 */
void remove_inode_page(struct page *page)
	if (!PageLocked(page))

	spin_lock(&pagecache_lock);
	remove_page_from_inode_queue(page);
	remove_page_from_hash_queue(page);
	spin_unlock(&pagecache_lock);
int shrink_mmap(int priority, int gfp_mask)
	static unsigned long clock = 0;
	unsigned long limit = num_physpages;

	count = limit >> priority;
	page = mem_map + clock;

	/* This works even in the presence of PageSkip because
	 * the first two entries at the beginning of a hole will
	 * be marked, not just the first.
	 */
	if (clock >= max_mapnr) {
	if (PageSkip(page)) {
		/* next_hash is overloaded for PageSkip */
		page = page->next_hash;
		clock = page - mem_map;

	referenced = test_and_clear_bit(PG_referenced, &page->flags);

	if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
	if (PageLocked(page))

	/* Is it a buffer page? */
	if (buffer_under_min())
	if (TryLockPage(page))
	err = try_to_free_buffers(page);

	/* We can't free pages unless there's just one user */
	if (page_count(page) != 1)

	/*
	 * Is it a page swap page? If so, we want to
	 * drop it if it is no longer used, even if it
	 * were to be marked referenced..
	 */
	if (PageSwapCache(page)) {
		if (referenced && swap_count(page->offset) != 1)
		delete_from_swap_cache(page);

	/* is it a page-cache page? */
	spin_lock(&pagecache_lock);
	if (pgcache_under_min())
		goto unlock_continue;
	if (TryLockPage(page))
		goto unlock_continue;
	if (page_count(page) == 1) {
		remove_page_from_inode_queue(page);
		remove_page_from_hash_queue(page);
	spin_unlock(&pagecache_lock);
	page_cache_release(page);
	spin_unlock(&pagecache_lock);
	spin_unlock(&pagecache_lock);
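/*
 * Editorial note on the scan rate above: count = limit >> priority means
 * a priority-6 call examines at most num_physpages/64 mem_map entries per
 * invocation, while priority 0 may sweep the whole range once.  The
 * static "clock" makes successive calls continue where the previous one
 * stopped instead of rescanning the same pages every time.
 */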
static inline struct page * __find_page_nolock(struct inode * inode, unsigned long offset, struct page *page)
		page = page->next_hash;
		if (page->inode != inode)
		if (page->offset == offset)
/*
 * This adds a page to the page cache, starting out as locked,
 * owned by us, referenced, but not uptodate and with no errors.
 */
static inline void __add_to_page_cache(struct page * page,
	struct inode * inode, unsigned long offset,
	struct page **hash)
	flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error));
	page->flags = flags | ((1 << PG_locked) | (1 << PG_referenced));
	page->owner = (int)current;	/* REMOVEME */
	page->offset = offset;
	add_page_to_inode_queue(inode, page);
	__add_page_to_hash_queue(page, hash);
int add_to_page_cache_unique(struct page * page,
	struct inode * inode, unsigned long offset,
	struct page **hash)
	spin_lock(&pagecache_lock);
	alias = __find_page_nolock(inode, offset, *hash);
	__add_to_page_cache(page, inode, offset, hash);
	spin_unlock(&pagecache_lock);
/*
 * Try to read ahead in the file. "page_cache" is a potentially free page
 * that we could use for the cache (if it is 0 we can try to create one,
 * this is all overlapped with the IO on the previous page finishing anyway)
 */
static unsigned long try_to_read_ahead(struct file * file,
				unsigned long offset, unsigned long page_cache)
	struct inode *inode = file->f_dentry->d_inode;

	offset &= PAGE_CACHE_MASK;
	switch (page_cache) {
		page_cache = page_cache_alloc();
	if (offset >= inode->i_size)
	hash = page_hash(inode, offset);
	page = page_cache_entry(page_cache);
	if (!add_to_page_cache_unique(page, inode, offset, hash)) {
		/*
		 * We do not have to check the return value here
		 * because it's a readahead.
		 */
		inode->i_op->readpage(file, page);
		page_cache_release(page);
/*
 * Wait for a page to get unlocked.
 *
 * This must be called with the caller "holding" the page,
 * ie with increased "page->count" so that the page won't
 * go away during the wait..
 */
void ___wait_on_page(struct page *page)
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	add_wait_queue(&page->wait, &wait);
	tsk->state = TASK_UNINTERRUPTIBLE;
	run_task_queue(&tq_disk);
	if (PageLocked(page)) {
		left = schedule_timeout(HZ*20);
	tsk->state = TASK_RUNNING;
	remove_wait_queue(&page->wait, &wait);
/*
 * Get an exclusive lock on the page..
 */
void lock_page(struct page *page)
	if (TryLockPage(page)) {
		struct task_struct *tsk = current;
		DECLARE_WAITQUEUE(wait, current);

		run_task_queue(&tq_disk);
		add_wait_queue(&page->wait, &wait);
		tsk->state = TASK_UNINTERRUPTIBLE;

		while (TryLockPage(page)) {
			run_task_queue(&tq_disk);
			tsk->state = TASK_UNINTERRUPTIBLE;

		remove_wait_queue(&page->wait, &wait);
		tsk->state = TASK_RUNNING;
/*
 * a rather lightweight function, finding and getting a reference to a
 * hashed page atomically, waiting for it if it's locked.
 */
struct page * __find_get_page (struct inode * inode,
				unsigned long offset, struct page *page)
	/*
	 * We scan the hash list read-only. Addition to and removal from
	 * the hash-list needs a held write-lock.
	 */
	spin_lock(&pagecache_lock);
	page = __find_page_nolock(inode, offset, page);
	spin_unlock(&pagecache_lock);

	/* Found the page, sleep if locked. */
	if (page && PageLocked(page)) {
		struct task_struct *tsk = current;
		DECLARE_WAITQUEUE(wait, tsk);

		add_wait_queue(&page->wait, &wait);
		tsk->state = TASK_UNINTERRUPTIBLE;

		run_task_queue(&tq_disk);
		if (PageLocked(page))
		tsk->state = TASK_RUNNING;
		remove_wait_queue(&page->wait, &wait);

		/*
		 * The page might have been unhashed meanwhile. It's
		 * not freed though because we hold a reference to it.
		 * If this is the case then it will be freed _here_,
		 * and we recheck the hash anyway.
		 */
		page_cache_release(page);

	/*
	 * It's not locked so we can return the page and we hold
	 * a reference to it.
	 */
/*
 * Get the lock to a page atomically.
 */
struct page * __find_lock_page (struct inode * inode,
				unsigned long offset, struct page *page)
	/*
	 * We scan the hash list read-only. Addition to and removal from
	 * the hash-list needs a held write-lock.
	 */
	spin_lock(&pagecache_lock);
	page = __find_page_nolock(inode, offset, page);
	if (TryLockPage(page))
	spin_unlock(&pagecache_lock);

	/* Found the page, sleep if locked. */
	if (page && locked) {
		struct task_struct *tsk = current;
		DECLARE_WAITQUEUE(wait, tsk);

		add_wait_queue(&page->wait, &wait);
		tsk->state = TASK_UNINTERRUPTIBLE;

		run_task_queue(&tq_disk);
		if (PageLocked(page))
		tsk->state = TASK_RUNNING;
		remove_wait_queue(&page->wait, &wait);

		/*
		 * The page might have been unhashed meanwhile. It's
		 * not freed though because we hold a reference to it.
		 * If this is the case then it will be freed _here_,
		 * and we recheck the hash anyway.
		 */
		page_cache_release(page);

	/*
	 * It's not locked so we can return the page and we hold
	 * a reference to it.
	 */
#define PROFILE_READAHEAD
#define DEBUG_READAHEAD

/*
 * Read-ahead profiling information
 * --------------------------------
 * Every PROFILE_MAXREADCOUNT, the following information is written:
 *   Percentage of asynchronous read-ahead.
 *   Average of read-ahead fields context value.
 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
 * as well.
 */

#ifdef PROFILE_READAHEAD

#define PROFILE_MAXREADCOUNT 1000

static unsigned long total_reada;
static unsigned long total_async;
static unsigned long total_ramax;
static unsigned long total_ralen;
static unsigned long total_rawin;

static void profile_readahead(int async, struct file *filp)
	total_ramax	+= filp->f_ramax;
	total_ralen	+= filp->f_ralen;
	total_rawin	+= filp->f_rawin;

	if (total_reada > PROFILE_MAXREADCOUNT) {
		if (!(total_reada > PROFILE_MAXREADCOUNT)) {
			restore_flags(flags);

		printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
			total_ramax/total_reada,
			total_ralen/total_reada,
			total_rawin/total_reada,
			(total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
		printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
			filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);

		restore_flags(flags);
#endif  /* defined PROFILE_READAHEAD */
/*
 * Read-ahead context:
 * -------------------
 * The read ahead context fields of the "struct file" are the following:
 * - f_raend : position of the first byte after the last page we tried to
 *             read ahead.
 * - f_ramax : current read-ahead maximum size.
 * - f_ralen : length of the current IO read block we tried to read-ahead.
 * - f_rawin : length of the current read-ahead window.
 *             if last read-ahead was synchronous then
 *                 f_rawin = f_ralen
 *             otherwise (was asynchronous)
 *                 f_rawin = previous value of f_ralen + f_ralen
 *
 * MIN_READAHEAD : minimum read-ahead size when reading ahead.
 * MAX_READAHEAD : maximum read-ahead size when reading ahead.
 *
 * Synchronous read-ahead benefits:
 * --------------------------------
 * Using a reasonable IO transfer length from peripheral devices increases
 * system performance.
 * Reasonable means, in this context, not too large but not too small.
 * The actual maximum value is:
 *	MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
 *	and 32K if it is defined (4K page size assumed).
 *
 * Asynchronous read-ahead benefits:
 * ---------------------------------
 * Overlapping the next read request with user process execution increases
 * system performance.
 * We have to guess which further data are needed by the user process.
 * If these data are often not really needed, it's bad for system performance.
 * However, we know that files are often accessed sequentially by
 * application programs, and it seems possible to have a good
 * strategy for that guessing.
 * We only try to read ahead in files that seem to be read sequentially.
 *
 * Asynchronous read-ahead risks:
 * ------------------------------
 * In order to maximize overlapping, we must start some asynchronous read
 * request from the device as soon as possible.
 * We must be very careful about:
 * - The number of effective pending IO read requests.
 *   ONE seems to be the only reasonable value.
 * - The total memory pool usage for the file access stream.
 *   This maximum memory usage is implicitly 2 IO read chunks:
 *	2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
 *	64k if it is defined (4K page size assumed).
 */
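/*
 * Worked example of the context fields (editorial sketch, 4K pages):
 * suppose the last synchronous read-ahead covered 8 pages ending at byte
 * 64K.  Then f_raend = 64K, f_ralen = 32K and, since it was synchronous,
 * f_rawin = f_ralen = 32K.  If the following asynchronous read-ahead
 * covers another 8 pages, f_raend moves to 96K, f_ralen is again 32K and
 * f_rawin becomes the previous f_ralen plus the new one, i.e. 64K - the
 * window that do_generic_file_read() later uses to decide whether an
 * access is still "sequential enough" to keep reading ahead.
 */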
static inline int get_max_readahead(struct inode * inode)
	if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
		return MAX_READAHEAD;
	return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
static inline unsigned long generic_file_readahead(int reada_ok,
	struct file * filp, struct inode * inode,
	unsigned long ppos, struct page * page, unsigned long page_cache)
	unsigned long max_ahead, ahead;
	int max_readahead = get_max_readahead(inode);

	raend = filp->f_raend & PAGE_CACHE_MASK;

	/*
	 * The current page is locked.
	 * If the current position is inside the previous read IO request, do not
	 * try to reread previously read ahead pages.
	 * Otherwise, decide whether or not to read ahead some pages synchronously.
	 * If we are not going to read ahead, set the read ahead context for this
	 * page only.
	 */
	if (PageLocked(page)) {
		if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
			if (raend < inode->i_size)
				max_ahead = filp->f_ramax;
			filp->f_ralen = PAGE_CACHE_SIZE;
			filp->f_raend = ppos + filp->f_ralen;
			filp->f_rawin += filp->f_ralen;
	/*
	 * The current page is not locked.
	 * If we were reading ahead, the current max read ahead size is not zero,
	 * and the current position is inside the last read-ahead IO request,
	 * this is the moment to try to read ahead asynchronously.
	 * We will later force an unplug of the device in order to start the
	 * asynchronous read IO.
	 */
	else if (reada_ok && filp->f_ramax && raend >= PAGE_CACHE_SIZE &&
		 ppos <= raend && ppos + filp->f_ralen >= raend) {
		/*
		 * Add ONE page to max_ahead in order to try to have about the same
		 * IO max size as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
		 * Compute the position of the last page we have tried to read in order to
		 * begin to read ahead just at the next page.
		 */
		raend -= PAGE_CACHE_SIZE;
		if (raend < inode->i_size)
			max_ahead = filp->f_ramax + PAGE_CACHE_SIZE;

		filp->f_rawin = filp->f_ralen;

	/*
	 * Try to read ahead pages.
	 * We hope that ll_rw_blk() plug/unplug, coalescence, request sorting and
	 * the scheduler will work well enough for us to avoid issuing badly-sized
	 * actual IO requests.
	 */
	while (ahead < max_ahead) {
		ahead += PAGE_CACHE_SIZE;
		page_cache = try_to_read_ahead(filp, raend + ahead,
						page_cache);
	/*
	 * If we tried to read ahead some pages, and
	 * if we tried to read ahead asynchronously,
	 * try to force an unplug of the device in order to start the
	 * asynchronous read IO.
	 * Update the read-ahead context:
	 * store the length of the current read-ahead window and
	 * double the current max read ahead size.
	 * This heuristic avoids doing large IO for files that are not really
	 * accessed sequentially.
	 */
		run_task_queue(&tq_disk);

		filp->f_ralen += ahead;
		filp->f_rawin += filp->f_ralen;
		filp->f_raend = raend + ahead + PAGE_CACHE_SIZE;

		filp->f_ramax += filp->f_ramax;

		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;

#ifdef PROFILE_READAHEAD
		profile_readahead((reada_ok == 2), filp);
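/*
 * Editorial note: the f_ramax += f_ramax above doubles the read-ahead
 * maximum after every successful read-ahead pass, so a sequential reader
 * ramps up geometrically (e.g. 4K, 8K, 16K, ...) until it hits the
 * per-device max_readahead cap, while a file that stops being read
 * sequentially never gets the large IOs.
 */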
/*
 * "descriptor" for what we're up to with a read.
 * This allows us to use the same read code yet
 * have multiple different users of the data that
 * we read from a file.
 *
 * The simplest case just copies the data to user
 * mode.
 */
typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long);
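/*
 * For reference (editorial sketch - the real definition lives in the fs
 * headers): the descriptor carries the four fields used by the actors
 * below, roughly
 *
 *	typedef struct {
 *		size_t written;		(bytes handed to the consumer so far)
 *		size_t count;		(bytes still wanted)
 *		char * buf;		(user buffer, or reused as a file *)
 *		int error;		(sticky error code)
 *	} read_descriptor_t;
 */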
*, const char *, unsigned long);
842 * This is a generic file read routine, and uses the
843 * inode->i_op->readpage() function for the actual low-level
846 * This is really ugly. But the goto's actually try to clarify some
static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
	struct dentry *dentry = filp->f_dentry;
	struct inode *inode = dentry->d_inode;
	size_t pos, pgpos, page_cache;
	int max_readahead = get_max_readahead(inode);

	pgpos = pos & PAGE_CACHE_MASK;
	/*
	 * If the current position is outside the previous read-ahead window,
	 * we reset the current read-ahead context and set read ahead max to zero
	 * (it will be set to just the needed value later);
	 * otherwise, we assume that the file accesses are sequential enough to
	 * continue read-ahead.
	 */
	if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) {
	/*
	 * Adjust the current value of read-ahead max.
	 * If the read operation stays within the first half page, force no readahead.
	 * Otherwise try to increase read ahead max just enough to do the read request.
	 * Then, use at least MIN_READAHEAD if read ahead is ok,
	 * and at most MAX_READAHEAD in all cases.
	 */
	if (pos + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
		unsigned long needed;

		needed = ((pos + desc->count) & PAGE_CACHE_MASK) - pgpos;

		if (filp->f_ramax < needed)
			filp->f_ramax = needed;

		if (reada_ok && filp->f_ramax < MIN_READAHEAD)
			filp->f_ramax = MIN_READAHEAD;
		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;

		struct page *page, **hash;

		if (pos >= inode->i_size)

		/*
		 * Try to find the data in the page cache..
		 */
		hash = page_hash(inode, pos & PAGE_CACHE_MASK);

		spin_lock(&pagecache_lock);
		page = __find_page_nolock(inode, pos & PAGE_CACHE_MASK, *hash);
		spin_unlock(&pagecache_lock);

		if (!Page_Uptodate(page))
			goto page_not_up_to_date;
		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 */
		unsigned long offset, nr;

		offset = pos & ~PAGE_CACHE_MASK;
		nr = PAGE_CACHE_SIZE - offset;
		if (nr > inode->i_size - pos)
			nr = inode->i_size - pos;

		/*
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		nr = actor(desc, (const char *) (page_address(page) + offset), nr);
		page_cache_release(page);
		if (nr && desc->count)

	/*
	 * Ok, the page was not immediately readable, so let's try to read
	 * ahead while we're at it..
	 */
	page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);

	if (Page_Uptodate(page))

	/* Get exclusive access to the page ... */

	if (Page_Uptodate(page)) {

	/* ... and start the actual read. The read will unlock the page. */
	error = inode->i_op->readpage(filp, page);

	if (Page_Uptodate(page))

	/* Again, try some read-ahead while waiting for the page to finish.. */
	page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
	if (Page_Uptodate(page))

	/* UHHUH! A synchronous read error occurred. Report it */
	page_cache_release(page);

	/*
	 * Ok, it wasn't cached, so we need to create a new
	 * page..
	 *
	 * We get here with the page cache lock held.
	 */
	spin_unlock(&pagecache_lock);
	page_cache = page_cache_alloc();
		desc->error = -ENOMEM;

	/*
	 * Somebody may have added the page while we
	 * dropped the page cache lock. Check for that.
	 */
	spin_lock(&pagecache_lock);
	page = __find_page_nolock(inode, pos & PAGE_CACHE_MASK, *hash);

	/*
	 * Ok, add the new page to the hash-queues...
	 */
	page = page_cache_entry(page_cache);
	__add_to_page_cache(page, inode, pos & PAGE_CACHE_MASK, hash);
	spin_unlock(&pagecache_lock);

	page_cache_free(page_cache);
	UPDATE_ATIME(inode);
static int file_read_actor(read_descriptor_t * desc, const char *area, unsigned long size)
	unsigned long count = desc->count;

	left = __copy_to_user(desc->buf, area, size);
		desc->error = -EFAULT;
	desc->count = count - size;
	desc->written += size;
/*
 * This is the "read()" routine for all filesystems
 * that can use the page cache directly.
 */
ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
	if (access_ok(VERIFY_WRITE, buf, count)) {
		read_descriptor_t desc;

		do_generic_file_read(filp, ppos, &desc, file_read_actor);
		retval = desc.written;
			retval = desc.error;
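/*
 * Usage note (editorial sketch): a filesystem that keeps regular file
 * data in the page cache can point the read slot of its positional
 * file_operations initializer straight at generic_file_read() and only
 * supply a readpage() inode operation, e.g.
 *
 *	static struct file_operations example_fops = {
 *		NULL,			(lseek - default)
 *		generic_file_read,	(read)
 *		...
 *	};
 *
 * The name example_fops is illustrative only, and the field layout is
 * abbreviated.
 */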
static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned long size)
	unsigned long count = desc->count;
	struct file *file = (struct file *) desc->buf;
	mm_segment_t old_fs;

	written = file->f_op->write(file, area, size, &file->f_pos);
		desc->error = written;
	desc->count = count - written;
	desc->written += written;
asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
	struct file * in_file, * out_file;
	struct inode * in_inode, * out_inode;

	/*
	 * Get input file, and verify that it is ok..
	 */
	in_file = fget(in_fd);
	if (!(in_file->f_mode & FMODE_READ))
	in_inode = in_file->f_dentry->d_inode;
	if (!in_inode->i_op || !in_inode->i_op->readpage)
	retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);

	/*
	 * Get output file, and verify that it is ok..
	 */
	out_file = fget(out_fd);
	if (!(out_file->f_mode & FMODE_WRITE))
	if (!out_file->f_op || !out_file->f_op->write)
	out_inode = out_file->f_dentry->d_inode;
	retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);

		read_descriptor_t desc;
		loff_t pos = 0, *ppos;

		ppos = &in_file->f_pos;
		if (get_user(pos, offset))

		desc.buf = (char *) out_file;

		do_generic_file_read(in_file, ppos, &desc, file_send_actor);

		retval = desc.written;
			retval = desc.error;
		put_user(pos, offset);
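/*
 * Editorial note, judging from the code above: a NULL offset makes
 * sendfile read from (and advance) the input file's own f_pos, while a
 * user-supplied offset provides the starting position and receives the
 * updated position back via put_user(); the output side always goes
 * through the target file's f_op->write() via file_send_actor().
 */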
/*
 * Semantics for shared and private memory areas are different past the end
 * of the file. A shared mapping past the last page of the file is an error
 * and results in a SIGBUS, while a private mapping just maps in a zero page.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 *
 * WSH 06/04/97: fixed a memory leak and moved the allocation of new_page
 * ahead of the wait if we're sure to need it.
 */
static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
	struct file * file = area->vm_file;
	struct dentry * dentry = file->f_dentry;
	struct inode * inode = dentry->d_inode;
	unsigned long offset, reada, i;
	struct page * page, **hash;
	unsigned long old_page, new_page;

	offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
	if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
		goto no_page_nolock;

	/*
	 * Do we have something in the page cache already?
	 */
	hash = page_hash(inode, offset);
	page = __find_get_page(inode, offset, *hash);
		goto no_cached_page;

	/*
	 * Ok, found a page in the page cache, now we need to check
	 * that it's up-to-date. First check whether we'll need an
	 * extra page -- better to overlap the allocation with the I/O.
	 */
	if (no_share && !new_page) {
		new_page = page_cache_alloc();

	if (!Page_Uptodate(page))

	/*
	 * Found the page and have a reference on it, need to check sharing
	 * and possibly copy it over to another page..
	 */
	old_page = page_address(page);
		/*
		 * Ok, we can share the cached page directly.. Get rid
		 * of any potential extra pages.
		 */
			page_cache_free(new_page);

		flush_page_to_ram(old_page);

	/*
	 * No sharing ... copy to the new page.
	 */
	copy_page(new_page, old_page);
	flush_page_to_ram(new_page);
	page_cache_release(page);

	/*
	 * Try to read in an entire cluster at once.
	 */
	reada >>= PAGE_CACHE_SHIFT + page_cluster;
	reada <<= PAGE_CACHE_SHIFT + page_cluster;

	for (i = 1 << page_cluster; i > 0; --i, reada += PAGE_CACHE_SIZE)
		new_page = try_to_read_ahead(file, reada, new_page);

	new_page = page_cache_alloc();

	/*
	 * During getting the above page we might have slept,
	 * so we need to re-check the situation with the page
	 * cache.. The page we just got may be useful if we
	 * can't share, so don't get rid of it here.
	 */
	page = __find_get_page(inode, offset, *hash);

	/*
	 * Now, create a new page-cache page from the page we got
	 */
	page = page_cache_entry(new_page);
	if (add_to_page_cache_unique(page, inode, offset, hash))

	/*
	 * Now it's ours and locked, we can do initial IO to it:
	 */
	error = inode->i_op->readpage(file, page);

	if (PageError(page))
		goto page_read_error;

	/*
	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	 */
	if (!PageLocked(page))
	ClearPageError(page);
	error = inode->i_op->readpage(file, page);

	if (Page_Uptodate(page))

	/*
	 * Things didn't work out. Return zero to tell the
	 * mm layer so, possibly freeing the page cache page first.
	 */
	page_cache_release(page);
	page_cache_free(new_page);
/*
 * Tries to write a shared mapped page to its backing store. May return -EIO
 * if the disk is full.
 */
static inline int do_write_page(struct inode * inode, struct file * file,
	const char * page_addr, unsigned long offset)
	loff_t loff = offset;
	int (*writepage) (struct file *, struct page *);

	size = offset + PAGE_SIZE;
	/* refuse to extend file size.. */
	if (S_ISREG(inode->i_mode)) {
		if (size > inode->i_size)
			size = inode->i_size;
		/* Ho humm.. We should have tested for this earlier */

	writepage = inode->i_op->writepage;
	page = mem_map + MAP_NR(page_addr);
		retval = writepage(file, page);
		mm_segment_t old_fs = get_fs();

		if (size == file->f_op->write(file, page_addr, size, &loff))
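/*
 * Editorial note: do_write_page() above has two back ends.  If the
 * filesystem provides an i_op->writepage() it is handed the struct page
 * directly; otherwise the data is pushed through the ordinary
 * f_op->write() path, and the saved old_fs strongly suggests the usual
 * set_fs(KERNEL_DS)/set_fs(old_fs) dance around that call, since
 * page_addr is a kernel pointer rather than a user-space buffer.
 */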
static int filemap_write_page(struct vm_area_struct * vma,
	unsigned long offset,
	struct dentry * dentry;
	struct inode * inode;

	file = vma->vm_file;
	dentry = file->f_dentry;
	inode = dentry->d_inode;
	if (!file->f_op->write)

	/*
	 * If a task terminates while we're swapping the page, the vma and
	 * file could be released ... increment the count to be safe.
	 */

	/*
	 * If this is a swapping operation rather than msync(), then
	 * leave the actual IO, and the restoration of the file count,
	 * to the kpiod thread.  Just queue the request for now.
	 */
	make_pio_request(file, offset, page);

	result = do_write_page(inode, file, (const char *) page, offset);
/*
 * The page cache takes care of races between somebody
 * trying to swap something out and swap something in
 * at the same time..
 */
int filemap_swapout(struct vm_area_struct * vma, struct page * page)
	return filemap_write_page(vma, page->offset, page_address(page), 0);

static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
	unsigned long address, unsigned int flags)
	unsigned long pageaddr;

	if (!(flags & MS_INVALIDATE)) {
		if (!pte_present(pte))
		if (!pte_dirty(pte))
		flush_page_to_ram(pte_page(pte));
		flush_cache_page(vma, address);
		set_pte(ptep, pte_mkclean(pte));
		flush_tlb_page(vma, address);
		pageaddr = pte_page(pte);
		page = page_cache_entry(pageaddr);
		flush_cache_page(vma, address);
		flush_tlb_page(vma, address);
		if (!pte_present(pte)) {
			swap_free(pte_val(pte));
		pageaddr = pte_page(pte);
		if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
			page_cache_free(pageaddr);
		error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, pageaddr, 1);
		page_cache_free(pageaddr);
static inline int filemap_sync_pte_range(pmd_t * pmd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
	if (pmd_bad(*pmd)) {
		printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
	pte = pte_offset(pmd, address);
	offset += address & PMD_MASK;
	address &= ~PMD_MASK;
	end = address + size;
		error |= filemap_sync_pte(pte, vma, address + offset, flags);
		address += PAGE_SIZE;
	} while (address < end);
static inline int filemap_sync_pmd_range(pgd_t * pgd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned int flags)
	unsigned long offset, end;

	if (pgd_bad(*pgd)) {
		printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
	pmd = pmd_offset(pgd, address);
	offset = address & PGDIR_MASK;
	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
		address = (address + PMD_SIZE) & PMD_MASK;
	} while (address < end);
static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
	size_t size, unsigned int flags)
	unsigned long end = address + size;

	dir = pgd_offset(vma->vm_mm, address);
	flush_cache_range(vma->vm_mm, end - size, end);
	while (address < end) {
		error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
	flush_tlb_range(vma->vm_mm, end - size, end);
/*
 * This handles (potentially partial) area unmaps..
 */
static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
	filemap_sync(vma, start, len, MS_ASYNC);

/*
 * Shared mappings need to be able to do the right thing at
 * close/unmap/sync. They will also use the private file as
 * backing-store for swapping..
 */
static struct vm_operations_struct file_shared_mmap = {
	NULL,			/* no special open */
	NULL,			/* no special close */
	filemap_unmap,		/* unmap - we need to sync the pages */
	NULL,			/* no special protect */
	filemap_sync,		/* sync */
	filemap_nopage,		/* nopage */
	filemap_swapout,	/* swapout */

/*
 * Private mappings just need to be able to load in the map.
 *
 * (This is actually used for shared mappings as well, if we
 * know they can't ever get write permissions..)
 */
static struct vm_operations_struct file_private_mmap = {
	filemap_nopage,		/* nopage */
/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
	struct vm_operations_struct * ops;
	struct inode *inode = file->f_dentry->d_inode;

	ops = &file_private_mmap;
	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
		ops = &file_shared_mmap;
	if (!inode->i_sb || !S_ISREG(inode->i_mode))
	if (!inode->i_op || !inode->i_op->readpage)
	UPDATE_ATIME(inode);
/*
 * The msync() system call.
 */

static int msync_interval(struct vm_area_struct * vma,
	unsigned long start, unsigned long end, int flags)
	if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
		error = vma->vm_ops->sync(vma, start, end-start, flags);
		if (!error && (flags & MS_SYNC)) {
			struct file * file = vma->vm_file;
			struct dentry * dentry = file->f_dentry;
			error = file_fsync(file, dentry);

asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
	struct vm_area_struct * vma;
	int unmapped_error, error = -EINVAL;

	down(&current->mm->mmap_sem);
	if (start & ~PAGE_MASK)
	len = (len + ~PAGE_MASK) & PAGE_MASK;
	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
	/*
	 * If the interval [start,end) covers some unmapped address ranges,
	 * just ignore them, but return -EFAULT at the end.
	 */
	vma = find_vma(current->mm, start);
		/* Still start < end. */
		/* Here start < vma->vm_end. */
		if (start < vma->vm_start) {
			unmapped_error = -EFAULT;
			start = vma->vm_start;
		/* Here vma->vm_start <= start < vma->vm_end. */
		if (end <= vma->vm_end) {
			error = msync_interval(vma, start, end, flags);
				error = unmapped_error;
		/* Here vma->vm_start <= start < vma->vm_end < end. */
		error = msync_interval(vma, start, vma->vm_end, flags);
		start = vma->vm_end;
	up(&current->mm->mmap_sem);
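/*
 * Editorial note on the rounding in sys_msync() above:
 * len = (len + ~PAGE_MASK) & PAGE_MASK rounds the length up to a whole
 * number of pages, e.g. with 4K pages a len of 5000 becomes 8192, while
 * the start address itself must already be page aligned or the call
 * fails with the initial -EINVAL.
 */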
/*
 * Write to a file through the page cache. This is mainly for the
 * benefit of NFS and possibly other network-based file systems.
 *
 * We currently put everything into the page cache prior to writing it.
 * This is not a problem when writing full pages. With partial pages,
 * however, we first have to read the data into the cache, then
 * dirty the page, and finally schedule it for writing. Alternatively, we
 * could write-through just the portion of data that would go into that
 * page, but that would kill performance for applications that write data
 * line by line, and it's prone to race conditions.
 *
 * Note that this routine doesn't try to keep track of dirty pages. Each
 * file system has to do this all by itself, unfortunately.
 */
ssize_t
generic_file_write(struct file *file, const char *buf,
		   size_t count, loff_t *ppos,
		   writepage_t write_one_page)
	struct dentry *dentry = file->f_dentry;
	struct inode *inode = dentry->d_inode;
	unsigned long pos = *ppos;
	unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
	struct page *page, **hash;
	unsigned long page_cache = 0;
	unsigned long written;

	err = file->f_error;

	if (file->f_flags & O_APPEND)
		pos = inode->i_size;

	/*
	 * Check whether we've reached the file size limit.
	 */
		send_sig(SIGXFSZ, current, 0);

	/*
	 * Check whether to truncate the write,
	 * and send the signal if we do.
	 */
	if (count > limit - pos) {
		send_sig(SIGXFSZ, current, 0);
		count = limit - pos;

		unsigned long bytes, pgpos, offset;
		/*
		 * Try to find the page in the cache. If it isn't there,
		 * allocate a free page.
		 */
		offset = (pos & ~PAGE_CACHE_MASK);
		pgpos = pos & PAGE_CACHE_MASK;
		bytes = PAGE_CACHE_SIZE - offset;

		hash = page_hash(inode, pgpos);
		page = __find_lock_page(inode, pgpos, *hash);
			page_cache = page_cache_alloc();
			page = page_cache_entry(page_cache);
			if (add_to_page_cache_unique(page, inode, pgpos, hash))

		/* We have exclusive IO access to the page.. */
		if (!PageLocked(page)) {
		if (page->owner != (int)current) {

		status = write_one_page(file, page, offset, bytes, buf);

		/* Mark it unlocked again and drop the page.. */
		page_cache_release(page);

	if (pos > inode->i_size)
		inode->i_size = pos;

	page_cache_free(page_cache);

	err = written ? written : status;
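/*
 * Worked example for the RLIMIT_FSIZE handling above (editorial note):
 * with rlim_cur = 1M, a write of 500 bytes at pos = 1M - 100 raises
 * SIGXFSZ and is shortened to the 100 bytes that still fit below the
 * limit; the earlier check covers the case where pos has already reached
 * the limit before any data is written.
 */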
/*
 * Support routines for directory caching using the page cache.
 */

/*
 * Unlock and free a page.
 */
void put_cached_page(unsigned long addr)
	struct page * page = page_cache_entry(addr);

	if (page_count(page) != 2)
		panic("put_cached_page: page count=%d\n",
		      page_count(page));
	page_cache_release(page);
/* Add request for page IO to the queue */

static inline void put_pio_request(struct pio_request *p)
	pio_last = &p->next;

/* Take the first page IO request off the queue */

static inline struct pio_request * get_pio_request(void)
	struct pio_request * p = pio_first;
	pio_first = p->next;
		pio_last = &pio_first;
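/*
 * Editorial note: pio_first/pio_last implement the usual tail-pointer
 * FIFO.  pio_last always points at the "next" field that should receive
 * a newly queued request (initially &pio_first itself), so
 * put_pio_request() can append in O(1) and get_pio_request() only has to
 * reset pio_last when it empties the list.
 */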
/* Make a new page IO request and queue it to the kpiod thread */

static inline void make_pio_request(struct file *file,
				    unsigned long offset,
				    unsigned long pageaddr)
	struct pio_request *p;

	page = page_cache_entry(pageaddr);

	/*
	 * We need to allocate without causing any recursive IO in the
	 * current thread's context. We might currently be swapping out
	 * as a result of an allocation made while holding a critical
	 * filesystem lock. To avoid deadlock, we *MUST* not reenter
	 * the filesystem in this thread.
	 *
	 * We can wait for kswapd to free memory, or we can try to free
	 * pages without actually performing further IO, without fear of
	 * deadlock.
	 */
	while ((p = kmem_cache_alloc(pio_request_cache, GFP_BUFFER)) == NULL) {
		if (try_to_free_pages(__GFP_WAIT))
		current->state = TASK_INTERRUPTIBLE;
		schedule_timeout(HZ/10);
/*
 * This is the only thread which is allowed to write out filemap pages
 * while swapping.
 *
 * To avoid deadlock, it is important that we never reenter this thread.
 * Although recursive memory allocations within this thread may result
 * in more page swapping, that swapping will always be done by queuing
 * another IO request to the same thread: we will never actually start
 * that IO request until we have finished with the current one, and so
 * we will not deadlock.
 */
int kpiod(void * unused)
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);
	struct inode * inode;
	struct dentry * dentry;
	struct pio_request * p;

	strcpy(tsk->comm, "kpiod");
	sigfillset(&tsk->blocked);
	/*
	 * Mark this task as a memory allocator - we don't want to get caught
	 * up in the regular mm freeing frenzy if we have to allocate memory
	 * in order to write stuff out.
	 */
	tsk->flags |= PF_MEMALLOC;

	pio_request_cache = kmem_cache_create("pio_request",
					      sizeof(struct pio_request),
					      0, SLAB_HWCACHE_ALIGN,
	if (!pio_request_cache)
		panic("Could not create pio_request slab cache");

		tsk->state = TASK_INTERRUPTIBLE;
		add_wait_queue(&pio_wait, &wait);

		remove_wait_queue(&pio_wait, &wait);
		tsk->state = TASK_RUNNING;

			p = get_pio_request();
			dentry = p->file->f_dentry;
			inode = dentry->d_inode;

			do_write_page(inode, p->file,
				      (const char *) p->page, p->offset);

			page_cache_free(p->page);
			kmem_cache_free(pio_request_cache, p);