/*
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/malloc.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/locks.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/swapctl.h>
#include <linux/slab.h>
#include <linux/init.h>

#include <asm/pgalloc.h>
#include <asm/uaccess.h>

#include <linux/highmem.h>
/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet, though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 */
atomic_t page_cache_size = ATOMIC_INIT(0);
unsigned int page_hash_bits;
struct page **page_hash_table;

spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;

/*
 * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
 * the pagemap_lru_lock held.
 */
spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;
#define CLUSTER_PAGES		(1 << page_cluster)
#define CLUSTER_OFFSET(x)	(((x) >> page_cluster) << page_cluster)
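/*
 * Worked example (assuming page_cluster == 4): CLUSTER_PAGES is then
 * 1 << 4 == 16 pages per cluster, and CLUSTER_OFFSET(21) == (21 >> 4) << 4
 * == 16, i.e. the index of the first page of the cluster containing page 21.
 */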
void __add_page_to_hash_queue(struct page * page, struct page **p)
	atomic_inc(&page_cache_size);
	if ((page->next_hash = *p) != NULL)
		(*p)->pprev_hash = &page->next_hash;
static inline void remove_page_from_hash_queue(struct page * page)
	if (page->pprev_hash) {
		page->next_hash->pprev_hash = page->pprev_hash;
		*page->pprev_hash = page->next_hash;
		page->pprev_hash = NULL;
	atomic_dec(&page_cache_size);
static inline int sync_page(struct page *page)
	struct address_space *mapping = page->mapping;

	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
		return mapping->a_ops->sync_page(page);
/*
 * Remove a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
 * is safe.
 */
void __remove_inode_page(struct page *page)
	remove_page_from_inode_queue(page);
	remove_page_from_hash_queue(page);

void remove_inode_page(struct page *page)
	if (!PageLocked(page))

	spin_lock(&pagecache_lock);
	__remove_inode_page(page);
	spin_unlock(&pagecache_lock);
/**
 * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
 * @inode: the inode which pages we want to invalidate
 *
 * This function only removes the unlocked pages; if you want to
 * remove all the pages of one inode, you must call truncate_inode_pages.
 */
void invalidate_inode_pages(struct inode * inode)
	struct list_head *head, *curr;

	head = &inode->i_mapping->pages;

	spin_lock(&pagecache_lock);
	spin_lock(&pagemap_lru_lock);

	while (curr != head) {
		page = list_entry(curr, struct page, list);

		/* We cannot invalidate a locked page */
		if (TryLockPage(page))

		/* Neither can we invalidate something in use.. */
		if (page_count(page) != 1) {

		__lru_cache_del(page);
		__remove_inode_page(page);
		page_cache_release(page);

	spin_unlock(&pagemap_lru_lock);
	spin_unlock(&pagecache_lock);
static inline void truncate_partial_page(struct page *page, unsigned partial)
	memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);

	block_flushpage(page, partial);

static inline void truncate_complete_page(struct page *page)
	/* Leave it on the LRU if it gets converted into anonymous buffers */
	if (!page->buffers || block_flushpage(page, 0))

	/*
	 * We remove the page from the page cache _after_ we have
	 * destroyed all buffer-cache references to it. Otherwise some
	 * other process might think this inode page is not in the
	 * page cache and create a buffer-cache alias to it causing
	 * all sorts of fun problems ...
	 */
	ClearPageDirty(page);
	ClearPageUptodate(page);
	remove_inode_page(page);
	page_cache_release(page);
/**
 * truncate_inode_pages - truncate *all* the pages from an offset
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 *
 * Truncate the page cache at a set offset, removing the pages
 * that are beyond that offset (and zeroing out partial pages).
 * If any page is locked we wait for it to become unlocked.
 */
void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
	struct list_head *head, *curr;

	unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);

	start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	head = &mapping->pages;
	spin_lock(&pagecache_lock);

	while (curr != head) {
		unsigned long offset;

		page = list_entry(curr, struct page, list);

		offset = page->index;

		/* Is this one of the pages to truncate? */
		if ((offset >= start) || (partial && (offset + 1) == start)) {
			if (TryLockPage(page)) {
				page_cache_get(page);
				spin_unlock(&pagecache_lock);

				page_cache_release(page);

			page_cache_get(page);
			spin_unlock(&pagecache_lock);

			if (partial && (offset + 1) == start) {
				truncate_partial_page(page, partial);

			truncate_complete_page(page);

			page_cache_release(page);

			/*
			 * We have done things without the pagecache lock,
			 * so we'll have to repeat the scan.
			 * It's not possible to deadlock here because
			 * we are guaranteed to make progress. (ie. we have
			 * just removed a page)
			 */

	spin_unlock(&pagecache_lock);
static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
		page = page->next_hash;

		if (page->mapping != mapping)
		if (page->index == offset)

	/*
	 * Touching the page may move it to the active list.
	 * If we end up with too few inactive pages, we wake
	 * up kswapd.
	 */
	if (inactive_shortage() > inactive_target / 2 && free_shortage())
/*
 * By the time this is called, the page is locked and
 * we don't have to worry about any races any more.
 */
static int writeout_one_page(struct page *page)
	struct buffer_head *bh, *head = page->buffers;

		if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))

		bh->b_flushtime = jiffies;
		ll_rw_block(WRITE, 1, &bh);
	} while ((bh = bh->b_this_page) != head);

static int waitfor_one_page(struct page *page)
	struct buffer_head *bh, *head = page->buffers;

		if (buffer_req(bh) && !buffer_uptodate(bh))
	} while ((bh = bh->b_this_page) != head);
static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end, int (*fn)(struct page *))
	struct list_head *head, *curr;

	head = &inode->i_mapping->pages;

	spin_lock(&pagecache_lock);
	while (curr != head) {
		page = list_entry(curr, struct page, list);

		if (page->index >= end)
		if (page->index < start)

		page_cache_get(page);
		spin_unlock(&pagecache_lock);

		/* The buffers could have been free'd while we waited for the page lock */

		spin_lock(&pagecache_lock);
		curr = page->list.next;
		page_cache_release(page);
	spin_unlock(&pagecache_lock);

/*
 * Two-stage data sync: first start the IO, then go back and
 * collect the information..
 */
int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
	retval = do_buffer_fdatasync(inode, start_idx, end_idx, writeout_one_page);
	retval |= do_buffer_fdatasync(inode, start_idx, end_idx, waitfor_one_page);
/*
 * Add a page to the inode page cache.
 *
 * The caller must have locked the page and
 * set all the page flags correctly..
 */
void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
	if (!PageLocked(page))

	page_cache_get(page);
	spin_lock(&pagecache_lock);
	add_page_to_inode_queue(mapping, page);
	__add_page_to_hash_queue(page, page_hash(mapping, index));
	spin_unlock(&pagecache_lock);
/*
 * This adds a page to the page cache, starting out as locked,
 * owned by us, but unreferenced, not uptodate and with no errors.
 */
static inline void __add_to_page_cache(struct page * page,
	struct address_space *mapping, unsigned long offset,
	struct page **hash)
	if (PageLocked(page))

	flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced) | (1 << PG_arch_1));
	page->flags = flags | (1 << PG_locked);
	page_cache_get(page);
	page->index = offset;
	add_page_to_inode_queue(mapping, page);
	__add_page_to_hash_queue(page, hash);

void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
	spin_lock(&pagecache_lock);
	__add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
	spin_unlock(&pagecache_lock);
static int add_to_page_cache_unique(struct page * page,
	struct address_space *mapping, unsigned long offset,
	struct page **hash)
	spin_lock(&pagecache_lock);
	alias = __find_page_nolock(mapping, offset, *hash);

	__add_to_page_cache(page, mapping, offset, hash);

	spin_unlock(&pagecache_lock);
/*
 * This adds the requested page to the page cache if it isn't already there,
 * and schedules an I/O to read in its contents from disk.
 */
static inline int page_cache_read(struct file * file, unsigned long offset)
	struct inode *inode = file->f_dentry->d_inode;
	struct address_space *mapping = inode->i_mapping;
	struct page **hash = page_hash(mapping, offset);

	spin_lock(&pagecache_lock);
	page = __find_page_nolock(mapping, offset, *hash);
	spin_unlock(&pagecache_lock);

	page = page_cache_alloc();

	if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
		int error = mapping->a_ops->readpage(file, page);
		page_cache_release(page);

	/*
	 * We arrive here in the unlikely event that someone
	 * raced with us and added our page to the cache first.
	 */
	page_cache_free(page);
/*
 * Read in an entire cluster at once.  A cluster is usually a 64k-
 * aligned block that includes the page requested in "offset."
 */
static int read_cluster_nonblocking(struct file * file, unsigned long offset,
	unsigned long filesize)
	unsigned long pages = CLUSTER_PAGES;

	offset = CLUSTER_OFFSET(offset);
	while ((pages-- > 0) && (offset < filesize)) {
		int error = page_cache_read(file, offset);
/*
 * Wait for a page to get unlocked.
 *
 * This must be called with the caller "holding" the page,
 * ie with increased "page->count" so that the page won't
 * go away during the wait..
 */
void ___wait_on_page(struct page *page)
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	add_wait_queue(&page->wait, &wait);
		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		if (!PageLocked(page))
		run_task_queue(&tq_disk);
	} while (PageLocked(page));
	tsk->state = TASK_RUNNING;
	remove_wait_queue(&page->wait, &wait);
/*
 * Get a lock on the page, assuming we need to sleep
 */
static void __lock_page(struct page *page)
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	add_wait_queue_exclusive(&page->wait, &wait);
		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		if (PageLocked(page)) {
			run_task_queue(&tq_disk);
		if (!TryLockPage(page))
	tsk->state = TASK_RUNNING;
	remove_wait_queue(&page->wait, &wait);

/*
 * Get an exclusive lock on the page, optimistically
 * assuming it's not locked..
 */
void lock_page(struct page *page)
	if (TryLockPage(page))
/*
 * a rather lightweight function, finding and getting a reference to a
 * hashed page atomically, waiting for it if it's locked.
 */
static struct page * __find_get_page(struct address_space *mapping,
				unsigned long offset, struct page **hash)
	/*
	 * We scan the hash list read-only. Addition to and removal from
	 * the hash-list needs a held write-lock.
	 */
	spin_lock(&pagecache_lock);
	page = __find_page_nolock(mapping, offset, *hash);
		page_cache_get(page);
	spin_unlock(&pagecache_lock);

/*
 * Get the lock to a page atomically.
 */
struct page * __find_lock_page(struct address_space *mapping,
				unsigned long offset, struct page **hash)
	/*
	 * We scan the hash list read-only. Addition to and removal from
	 * the hash-list needs a held write-lock.
	 */
	spin_lock(&pagecache_lock);
	page = __find_page_nolock(mapping, offset, *hash);
		page_cache_get(page);
		spin_unlock(&pagecache_lock);

		/* Is the page still hashed? Ok, good.. */

		/* Nope: we raced. Release and try again.. */
		page_cache_release(page);
	spin_unlock(&pagecache_lock);
#define PROFILE_READAHEAD
#define DEBUG_READAHEAD

/*
 * We combine this with read-ahead to deactivate pages when we
 * think there's sequential IO going on. Note that this is
 * harmless since we don't actually evict the pages from memory
 * but just move them to the inactive list.
 *
 * - make the readahead code smarter
 * - move readahead to the VMA level so we can do the same
 */
static void drop_behind(struct file * file, unsigned long index)
	struct inode *inode = file->f_dentry->d_inode;
	struct address_space *mapping = inode->i_mapping;

	/* Nothing to drop-behind if we're on the first page. */

	if (index > file->f_rawin)
		start = index - file->f_rawin;

	/*
	 * Go backwards from index-1 and drop all pages in the
	 * readahead window. Since the readahead window may have
	 * been increased since the last time we were called, we
	 * stop when the page isn't there.
	 */
	spin_lock(&pagecache_lock);
	while (--index >= start) {
		hash = page_hash(mapping, index);
		page = __find_page_nolock(mapping, index, *hash);

		deactivate_page(page);
	spin_unlock(&pagecache_lock);
/*
 * Read-ahead profiling information
 * --------------------------------
 * Every PROFILE_MAXREADCOUNT, the following information is written
 * to the syslog:
 *   Percentage of asynchronous read-ahead.
 *   Average of read-ahead fields context value.
 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
 * to the syslog each time that function is called.
 */
#ifdef PROFILE_READAHEAD

#define PROFILE_MAXREADCOUNT 1000

static unsigned long total_reada;
static unsigned long total_async;
static unsigned long total_ramax;
static unsigned long total_ralen;
static unsigned long total_rawin;

static void profile_readahead(int async, struct file *filp)
	total_ramax += filp->f_ramax;
	total_ralen += filp->f_ralen;
	total_rawin += filp->f_rawin;

	if (total_reada > PROFILE_MAXREADCOUNT) {

		if (!(total_reada > PROFILE_MAXREADCOUNT)) {
			restore_flags(flags);

		printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
			total_ramax/total_reada,
			total_ralen/total_reada,
			total_rawin/total_reada,
			(total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
		printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
			filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);

		restore_flags(flags);

#endif  /* defined PROFILE_READAHEAD */
/*
 * Read-ahead context:
 * -------------------
 * The read ahead context fields of the "struct file" are the following:
 * - f_raend : position of the first byte after the last page we tried to
 *             read ahead.
 * - f_ramax : current read-ahead maximum size.
 * - f_ralen : length of the current IO read block we tried to read-ahead.
 * - f_rawin : length of the current read-ahead window.
 *             if last read-ahead was synchronous then
 *                 f_rawin = f_ralen
 *             otherwise (was asynchronous)
 *                 f_rawin = previous value of f_ralen + f_ralen
 *
 * MIN_READAHEAD : minimum read-ahead size used when reading ahead.
 * MAX_READAHEAD : maximum read-ahead size used when reading ahead.
 *
 * Synchronous read-ahead benefits:
 * --------------------------------
 * Using reasonable IO xfer length from peripheral devices increases system
 * performance.
 * Reasonable means, in this context, not too large but not too small.
 * The actual maximum value is:
 *	MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
 *	and 32K if defined (4K page size assumed).
 *
 * Asynchronous read-ahead benefits:
 * ---------------------------------
 * Overlapping the next read request with user process execution increases
 * system performance.
 *
 * We have to guess which further data are needed by the user process.
 * If these data are often not really needed, it's bad for system
 * performance.
 * However, we know that files are often accessed sequentially by
 * application programs and it seems that it is possible to have some good
 * strategy in that guessing.
 * We only try to read-ahead files that seem to be read sequentially.
 *
 * Asynchronous read-ahead risks:
 * ------------------------------
 * In order to maximize overlapping, we must start some asynchronous read
 * request from the device, as soon as possible.
 * We must be very careful about:
 * - The number of effective pending IO read requests.
 *   ONE seems to be the only reasonable value.
 * - The total memory pool usage for the file access stream.
 *   This maximum memory usage is implicitly 2 IO read chunks:
 *   2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
 *   64k if defined (4K page size assumed).
 */
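/*
 * Worked example (hypothetical numbers, page-index units): after a
 * synchronous read-ahead of 8 pages ending just before page 40 we would
 * have f_ralen = 8, f_rawin = 8 and f_raend = 40.  If a later access inside
 * that window triggers an asynchronous read-ahead of 8 more pages, the new
 * f_ralen is 8, f_rawin becomes the previous f_ralen plus the new one (16),
 * and f_raend advances to 48.
 */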
static inline int get_max_readahead(struct inode * inode)
	if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
		return MAX_READAHEAD;
	return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
static void generic_file_readahead(int reada_ok,
	struct file * filp, struct inode * inode,
	unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
	unsigned long index = page->index;
	unsigned long max_ahead, ahead;

	int max_readahead = get_max_readahead(inode);

	raend = filp->f_raend;

/*
 * The current page is locked.
 * If the current position is inside the previous read IO request, do not
 * try to reread previously read ahead pages.
 * Otherwise decide or not to read ahead some pages synchronously.
 * If we are not going to read ahead, set the read ahead context for this
 * page only.
 */
	if (PageLocked(page)) {
		if (!filp->f_ralen || index >= raend || index + filp->f_rawin < raend) {
			if (raend < end_index)
				max_ahead = filp->f_ramax;

			filp->f_raend = index + filp->f_ralen;
			filp->f_rawin += filp->f_ralen;
/*
 * The current page is not locked.
 * If we were reading ahead and,
 * if the current max read ahead size is not zero and,
 * if the current position is inside the last read-ahead IO request,
 * it is the moment to try to read ahead asynchronously.
 * We will later force unplug of the device in order to force asynchronous read IO.
 */
	else if (reada_ok && filp->f_ramax && raend >= 1 &&
		 index <= raend && index + filp->f_ralen >= raend) {
/*
 * Add ONE page to max_ahead in order to try to have about the same IO max size
 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
 * Compute the position of the last page we have tried to read in order to
 * begin to read ahead just at the next page.
 */
		if (raend < end_index)
			max_ahead = filp->f_ramax + 1;

			filp->f_rawin = filp->f_ralen;
/*
 * Try to read ahead pages.
 * We hope that ll_rw_blk() plug/unplug, coalescence, request sorting and the
 * scheduler will work well enough for us to avoid too bad actual IO requests.
 */
	while (ahead < max_ahead) {
		if ((raend + ahead) >= end_index)
		if (page_cache_read(filp, raend + ahead) < 0)
/*
 * If we tried to read ahead some pages,
 * If we tried to read ahead asynchronously,
 *   Try to force unplug of the device in order to start an asynchronous
 *   read IO.
 * Update the read-ahead context.
 * Store the length of the current read-ahead window.
 * Double the current max read ahead size.
 *   That heuristic avoids doing large IO for files that are not really
 *   accessed sequentially.
 */
			run_task_queue(&tq_disk);

		filp->f_ralen += ahead;
		filp->f_rawin += filp->f_ralen;
		filp->f_raend = raend + ahead + 1;

		filp->f_ramax += filp->f_ramax;

		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;

		/*
		 * Move the pages that have already been passed
		 * to the inactive list.
		 */
		drop_behind(filp, index);

#ifdef PROFILE_READAHEAD
		profile_readahead((reada_ok == 2), filp);
/*
 * This is a generic file read routine, and uses the
 * inode->i_op->readpage() function for the actual low-level
 * stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 */
void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
	struct inode *inode = filp->f_dentry->d_inode;
	struct address_space *mapping = inode->i_mapping;
	unsigned long index, offset;
	struct page *cached_page;

	int max_readahead = get_max_readahead(inode);

	index = *ppos >> PAGE_CACHE_SHIFT;
	offset = *ppos & ~PAGE_CACHE_MASK;

/*
 * If the current position is outside the previous read-ahead window,
 * we reset the current read-ahead context and set read ahead max to zero
 * (will be set to just needed value later),
 * otherwise, we assume that the file accesses are sequential enough to
 * continue read-ahead.
 */
	if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {

/*
 * Adjust the current value of read-ahead max.
 * If the read operation stays in the first half page, force no readahead.
 * Otherwise try to increase read ahead max just enough to do the read request.
 * Then, at least MIN_READAHEAD if read ahead is ok,
 * and at most MAX_READAHEAD in all cases.
 */
	if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {

		unsigned long needed;

		needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;

		if (filp->f_ramax < needed)
			filp->f_ramax = needed;

		if (reada_ok && filp->f_ramax < MIN_READAHEAD)
			filp->f_ramax = MIN_READAHEAD;
		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;

		struct page *page, **hash;
		unsigned long end_index, nr;

		end_index = inode->i_size >> PAGE_CACHE_SHIFT;
		if (index > end_index)
		nr = PAGE_CACHE_SIZE;
		if (index == end_index) {
			nr = inode->i_size & ~PAGE_CACHE_MASK;

		/*
		 * Try to find the data in the page cache..
		 */
		hash = page_hash(mapping, index);

		spin_lock(&pagecache_lock);
		page = __find_page_nolock(mapping, index, *hash);

		page_cache_get(page);
		spin_unlock(&pagecache_lock);

		if (!Page_Uptodate(page))
			goto page_not_up_to_date;
		generic_file_readahead(reada_ok, filp, inode, page);

		/* If users can be writing to this page using arbitrary
		 * virtual addresses, take care about potential aliasing
		 * before reading the page on the kernel side.
		 */
		if (mapping->i_mmap_shared != NULL)
			flush_dcache_page(page);

		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 *
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		nr = actor(desc, page, offset, nr);
		index += offset >> PAGE_CACHE_SHIFT;
		offset &= ~PAGE_CACHE_MASK;

		page_cache_release(page);
		if (nr && desc->count)

/*
 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
 */
page_not_up_to_date:
		generic_file_readahead(reada_ok, filp, inode, page);

		if (Page_Uptodate(page))

		/* Get exclusive access to the page ... */

		/* Did it get unhashed before we got the lock? */
		if (!page->mapping) {
			page_cache_release(page);

		/* Did somebody else fill it already? */
		if (Page_Uptodate(page)) {

		/* ... and start the actual read. The read will unlock the page. */
		error = mapping->a_ops->readpage(filp, page);

			if (Page_Uptodate(page))

			/* Again, try some read-ahead while waiting for the page to finish.. */
			generic_file_readahead(reada_ok, filp, inode, page);
			if (Page_Uptodate(page))

		/* UHHUH! A synchronous read error occurred. Report it */
		desc->error = error;
		page_cache_release(page);

		/*
		 * Ok, it wasn't cached, so we need to create a new
		 * page..
		 *
		 * We get here with the page cache lock held.
		 */
		spin_unlock(&pagecache_lock);
		cached_page = page_cache_alloc();
			desc->error = -ENOMEM;

		/*
		 * Somebody may have added the page while we
		 * dropped the page cache lock. Check for that.
		 */
		spin_lock(&pagecache_lock);
		page = __find_page_nolock(mapping, index, *hash);

		/*
		 * Ok, add the new page to the hash-queues...
		 */
		__add_to_page_cache(page, mapping, index, hash);
		spin_unlock(&pagecache_lock);

	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;

	page_cache_free(cached_page);
	UPDATE_ATIME(inode);
static int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
	unsigned long left, count = desc->count;

	left = __copy_to_user(desc->buf, kaddr + offset, size);

		desc->error = -EFAULT;

	desc->count = count - size;
	desc->written += size;
/*
 * This is the "read()" routine for all filesystems
 * that can use the page cache directly.
 */
ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
	if (access_ok(VERIFY_WRITE, buf, count)) {

		read_descriptor_t desc;

		do_generic_file_read(filp, ppos, &desc, file_read_actor);

		retval = desc.written;

			retval = desc.error;
static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
	unsigned long count = desc->count;
	struct file *file = (struct file *) desc->buf;
	mm_segment_t old_fs;

	written = file->f_op->write(file, kaddr + offset, size, &file->f_pos);

		desc->error = written;

	desc->count = count - written;
	desc->written += written;
asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
	struct file * in_file, * out_file;
	struct inode * in_inode, * out_inode;

	/*
	 * Get input file, and verify that it is ok..
	 */
	in_file = fget(in_fd);

	if (!(in_file->f_mode & FMODE_READ))

	in_inode = in_file->f_dentry->d_inode;

	if (!in_inode->i_mapping->a_ops->readpage)

	retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);

	/*
	 * Get output file, and verify that it is ok..
	 */
	out_file = fget(out_fd);

	if (!(out_file->f_mode & FMODE_WRITE))

	if (!out_file->f_op || !out_file->f_op->write)

	out_inode = out_file->f_dentry->d_inode;
	retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);

		read_descriptor_t desc;
		loff_t pos = 0, *ppos;

		ppos = &in_file->f_pos;

			if (get_user(pos, offset))

		desc.buf = (char *) out_file;

		do_generic_file_read(in_file, ppos, &desc, file_send_actor);

		retval = desc.written;

			retval = desc.error;

			put_user(pos, offset);
/*
 * Read-ahead and flush behind for MADV_SEQUENTIAL areas.  Since we are
 * sure this is sequential access, we don't need a flexible read-ahead
 * window size -- we can always use a large fixed size window.
 */
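/*
 * Worked example (hypothetical numbers, page_cluster == 4): if
 * get_max_readahead() returns 31 pages, the code below rounds it up with
 * CLUSTER_OFFSET(31 + 16 - 1) == CLUSTER_OFFSET(46) == 32, so the fixed
 * window covers 32 pages (two whole clusters).
 */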
static void nopage_sequential_readahead(struct vm_area_struct * vma,
	unsigned long pgoff, unsigned long filesize)
	unsigned long ra_window;

	ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
	ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);

	/* vm_raend is zero if we haven't read ahead in this area yet. */
	if (vma->vm_raend == 0)
		vma->vm_raend = vma->vm_pgoff + ra_window;

	/*
	 * If we've just faulted the page half-way through our window,
	 * then schedule reads for the next window, and release the
	 * pages in the previous window.
	 */
	if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
		unsigned long start = vma->vm_pgoff + vma->vm_raend;
		unsigned long end = start + ra_window;

		if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
			end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;

		while ((start < end) && (start < filesize)) {
			if (read_cluster_nonblocking(vma->vm_file,
							start, filesize) < 0)
			start += CLUSTER_PAGES;
		run_task_queue(&tq_disk);

		/* if we're far enough past the beginning of this area,
		   recycle pages that are in the previous window. */
		if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
			unsigned long window = ra_window << PAGE_SHIFT;

			end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
			end -= window + window;
			filemap_sync(vma, end - window, window, MS_INVALIDATE);

		vma->vm_raend += ra_window;
/*
 * filemap_nopage() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 */
struct page * filemap_nopage(struct vm_area_struct * area,
	unsigned long address, int no_share)
	struct file *file = area->vm_file;
	struct inode *inode = file->f_dentry->d_inode;
	struct address_space *mapping = inode->i_mapping;
	struct page *page, **hash, *old_page;
	unsigned long size, pgoff;

	pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;

	/*
	 * An external ptracer can access pages that normally aren't
	 * accessible..
	 */
	size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if ((pgoff >= size) && (area->vm_mm == current->mm))

	/*
	 * Do we have something in the page cache already?
	 */
	hash = page_hash(mapping, pgoff);

	page = __find_get_page(mapping, pgoff, hash);
		goto no_cached_page;

	/*
	 * Ok, found a page in the page cache, now we need to check
	 * that it's up-to-date.
	 */
	if (!Page_Uptodate(page))
		goto page_not_uptodate;

	/*
	 * Try read-ahead for sequential areas.
	 */
	if (VM_SequentialReadHint(area))
		nopage_sequential_readahead(area, pgoff, size);

	/*
	 * Found the page and have a reference on it, need to check sharing
	 * and possibly copy it over to another page..
	 */
		struct page *new_page = page_cache_alloc();

			copy_user_highpage(new_page, old_page, address);
			flush_page_to_ram(new_page);

			new_page = NOPAGE_OOM;
		page_cache_release(page);

	flush_page_to_ram(old_page);

	/*
	 * If the requested offset is within our file, try to read a whole
	 * cluster of pages at once.
	 *
	 * Otherwise, we're off the end of a privately mapped file,
	 * so we need to map a zero page.
	 */
	if ((pgoff < size) && !VM_RandomReadHint(area))
		error = read_cluster_nonblocking(file, pgoff, size);
		error = page_cache_read(file, pgoff);

	/*
	 * The page we want has now been added to the page cache.
	 * In the unlikely event that someone removed it in the
	 * meantime, we'll just come back here and read it again.
	 */

	/*
	 * An error return from page_cache_read can result if the
	 * system is low on memory, or a problem occurs while trying
	 * to schedule I/O.
	 */
	if (error == -ENOMEM)

	/* Did it get unhashed while we waited for it? */
	if (!page->mapping) {
		page_cache_release(page);

	/* Did somebody else get it up-to-date? */
	if (Page_Uptodate(page)) {

	if (!mapping->a_ops->readpage(file, page)) {
		if (Page_Uptodate(page))

	/*
	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	 */

	/* Somebody truncated the page on us? */
	if (!page->mapping) {
		page_cache_release(page);

	/* Somebody else successfully read it in? */
	if (Page_Uptodate(page)) {

	ClearPageError(page);
	if (!mapping->a_ops->readpage(file, page)) {
		if (Page_Uptodate(page))

	/*
	 * Things didn't work out. Return zero to tell the
	 * mm layer so, possibly freeing the page cache page first.
	 */
	page_cache_release(page);
/*
 * If a task terminates while we're swapping the page, the vma and
 * file could be released: try_to_swap_out has done a get_file.
 * vma/file is guaranteed to exist in the unmap/sync cases because
 * mmap_sem is held.
 *
 * The "mapping" test takes care of somebody having truncated the
 * page and thus made this write-page a no-op..
 */
static int filemap_write_page(struct page * page, int wait)
	struct address_space * mapping = page->mapping;

	if (mapping && mapping->a_ops->writepage) {
		ClearPageDirty(page);
		error = mapping->a_ops->writepage(page);

/*
 * The page cache takes care of races between somebody
 * trying to swap something out and swap something in
 * at the same time..
 */
extern void wakeup_bdflush(int);
int filemap_swapout(struct page * page, struct file *file)
/* Called with mm->page_table_lock held to protect against other
 * threads/the swapper from ripping pte's out from under us.
 */
static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
	unsigned long address, unsigned int flags)
	if (!pte_present(pte))
	if (!ptep_test_and_clear_dirty(ptep))

	flush_page_to_ram(pte_page(pte));
	flush_cache_page(vma, address);
	flush_tlb_page(vma, address);
	page = pte_page(pte);
	page_cache_get(page);
	spin_unlock(&vma->vm_mm->page_table_lock);

	error = filemap_write_page(page, 1);
	page_cache_free(page);

	spin_lock(&vma->vm_mm->page_table_lock);
static inline int filemap_sync_pte_range(pmd_t * pmd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
	if (pmd_bad(*pmd)) {

	pte = pte_offset(pmd, address);
	offset += address & PMD_MASK;
	address &= ~PMD_MASK;
	end = address + size;

		error |= filemap_sync_pte(pte, vma, address + offset, flags);
		address += PAGE_SIZE;
	} while (address && (address < end));

static inline int filemap_sync_pmd_range(pgd_t * pgd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned int flags)
	unsigned long offset, end;

	if (pgd_bad(*pgd)) {

	pmd = pmd_offset(pgd, address);
	offset = address & PGDIR_MASK;
	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)

		error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
		address = (address + PMD_SIZE) & PMD_MASK;
	} while (address && (address < end));
int filemap_sync(struct vm_area_struct * vma, unsigned long address,
	size_t size, unsigned int flags)
	unsigned long end = address + size;

	/* Acquire the lock early; it may be possible to avoid dropping
	 * and reacquiring it repeatedly.
	 */
	spin_lock(&vma->vm_mm->page_table_lock);

	dir = pgd_offset(vma->vm_mm, address);
	flush_cache_range(vma->vm_mm, end - size, end);

		error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
	} while (address && (address < end));
	flush_tlb_range(vma->vm_mm, end - size, end);

	spin_unlock(&vma->vm_mm->page_table_lock);
/*
 * Shared mappings need to be able to do the right thing at
 * close/unmap/sync. They will also use the private file as
 * backing-store for swapping..
 */
static struct vm_operations_struct file_shared_mmap = {
	nopage:		filemap_nopage,
	swapout:	filemap_swapout,

/*
 * Private mappings just need to be able to load in the map.
 *
 * (This is actually used for shared mappings as well, if we
 * know they can't ever get write permissions..)
 */
static struct vm_operations_struct file_private_mmap = {
	nopage:		filemap_nopage,
/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
	struct vm_operations_struct * ops;
	struct inode *inode = file->f_dentry->d_inode;

	ops = &file_private_mmap;
	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
		if (!inode->i_mapping->a_ops->writepage)
		ops = &file_shared_mmap;
	if (!inode->i_sb || !S_ISREG(inode->i_mode))
	if (!inode->i_mapping->a_ops->readpage)
	UPDATE_ATIME(inode);
/*
 * The msync() system call.
 */
static int msync_interval(struct vm_area_struct * vma,
	unsigned long start, unsigned long end, int flags)
	if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
		error = vma->vm_ops->sync(vma, start, end-start, flags);
		if (!error && (flags & MS_SYNC)) {
			struct file * file = vma->vm_file;
			if (file && file->f_op && file->f_op->fsync) {
				down(&file->f_dentry->d_inode->i_sem);
				error = file->f_op->fsync(file, file->f_dentry, 1);
				up(&file->f_dentry->d_inode->i_sem);
asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
	struct vm_area_struct * vma;
	int unmapped_error, error = -EINVAL;

	down(&current->mm->mmap_sem);
	if (start & ~PAGE_MASK)
	len = (len + ~PAGE_MASK) & PAGE_MASK;

	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))

	/*
	 * If the interval [start,end) covers some unmapped address ranges,
	 * just ignore them, but return -EFAULT at the end.
	 */
	vma = find_vma(current->mm, start);
		/* Still start < end. */

		/* Here start < vma->vm_end. */
		if (start < vma->vm_start) {
			unmapped_error = -EFAULT;
			start = vma->vm_start;
		/* Here vma->vm_start <= start < vma->vm_end. */
		if (end <= vma->vm_end) {
			error = msync_interval(vma, start, end, flags);
				error = unmapped_error;
		/* Here vma->vm_start <= start < vma->vm_end < end. */
		error = msync_interval(vma, start, vma->vm_end, flags);
		start = vma->vm_end;

	up(&current->mm->mmap_sem);
static inline void setup_read_behavior(struct vm_area_struct * vma,
	int behavior)
	VM_ClearReadHint(vma);
	case MADV_SEQUENTIAL:
		vma->vm_flags |= VM_SEQ_READ;
		vma->vm_flags |= VM_RAND_READ;
static long madvise_fixup_start(struct vm_area_struct * vma,
	unsigned long end, int behavior)
	struct vm_area_struct * n;

	n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);

	setup_read_behavior(n, behavior);

	get_file(n->vm_file);
	if (n->vm_ops && n->vm_ops->open)

	lock_vma_mappings(vma);
	spin_lock(&vma->vm_mm->page_table_lock);
	vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
	vma->vm_start = end;
	__insert_vm_struct(current->mm, n);
	spin_unlock(&vma->vm_mm->page_table_lock);
	unlock_vma_mappings(vma);
static long madvise_fixup_end(struct vm_area_struct * vma,
	unsigned long start, int behavior)
	struct vm_area_struct * n;

	n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);

	n->vm_start = start;
	n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
	setup_read_behavior(n, behavior);

	get_file(n->vm_file);
	if (n->vm_ops && n->vm_ops->open)

	lock_vma_mappings(vma);
	spin_lock(&vma->vm_mm->page_table_lock);
	vma->vm_end = start;
	__insert_vm_struct(current->mm, n);
	spin_unlock(&vma->vm_mm->page_table_lock);
	unlock_vma_mappings(vma);
static long madvise_fixup_middle(struct vm_area_struct * vma,
	unsigned long start, unsigned long end, int behavior)
	struct vm_area_struct * left, * right;

	left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);

	right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
		kmem_cache_free(vm_area_cachep, left);

	left->vm_end = start;
	right->vm_start = end;
	right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;

	right->vm_raend = 0;
	atomic_add(2, &vma->vm_file->f_count);

	if (vma->vm_ops && vma->vm_ops->open) {
		vma->vm_ops->open(left);
		vma->vm_ops->open(right);

	lock_vma_mappings(vma);
	spin_lock(&vma->vm_mm->page_table_lock);
	vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
	vma->vm_start = start;

	setup_read_behavior(vma, behavior);

	__insert_vm_struct(current->mm, left);
	__insert_vm_struct(current->mm, right);
	spin_unlock(&vma->vm_mm->page_table_lock);
	unlock_vma_mappings(vma);
/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
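/*
 * For example (hypothetical layout): advising MADV_SEQUENTIAL on the middle
 * of a vma that covers pages [0, 16) -- say on pages [4, 8) -- leaves three
 * areas after madvise_fixup_middle(): [0, 4) and [8, 16) with the old
 * behavior, and [4, 8) with VM_SEQ_READ set.
 */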
static long madvise_behavior(struct vm_area_struct * vma,
	unsigned long start, unsigned long end, int behavior)
	/* This caps the number of vma's this process can own */
	if (vma->vm_mm->map_count > MAX_MAP_COUNT)

	if (start == vma->vm_start) {
		if (end == vma->vm_end) {
			setup_read_behavior(vma, behavior);
			error = madvise_fixup_start(vma, end, behavior);
		if (end == vma->vm_end)
			error = madvise_fixup_end(vma, start, behavior);
		error = madvise_fixup_middle(vma, start, end, behavior);
/*
 * Schedule all required I/O operations, then run the disk queue
 * to make sure they are started.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct * vma,
	unsigned long start, unsigned long end)
	long error = -EBADF;

	unsigned long size, rlim_rss;

	/* Doesn't work if there's no mapped file. */

	file = vma->vm_file;
	size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
							PAGE_CACHE_SHIFT;

	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
	if (end > vma->vm_end)
	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

	/* Make sure this doesn't exceed the process's max rss. */

	rlim_rss = current->rlim ? current->rlim[RLIMIT_RSS].rlim_cur :
				LONG_MAX; /* default: see resource.h */
	if ((vma->vm_mm->rss + (end - start)) > rlim_rss)

	/* round to cluster boundaries if this isn't a "random" area. */
	if (!VM_RandomReadHint(vma)) {
		start = CLUSTER_OFFSET(start);
		end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);

		while ((start < end) && (start < size)) {
			error = read_cluster_nonblocking(file, start, size);
			start += CLUSTER_PAGES;

		while ((start < end) && (start < size)) {
			error = page_cache_read(file, start);

	/* Don't wait for someone else to push these requests. */
	run_task_queue(&tq_disk);
/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for refill_inactive to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * refill_inactive to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do. This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed(struct vm_area_struct * vma,
	unsigned long start, unsigned long end)
	if (vma->vm_flags & VM_LOCKED)

	flush_cache_range(vma->vm_mm, start, end);
	zap_page_range(vma->vm_mm, start, end - start);
	flush_tlb_range(vma->vm_mm, start, end);
static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
	unsigned long end, int behavior)
	long error = -EBADF;

	case MADV_SEQUENTIAL:
		error = madvise_behavior(vma, start, end, behavior);

		error = madvise_willneed(vma, start, end);

		error = madvise_dontneed(vma, start, end);
/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the appli-
 *		cation will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
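/*
 * Userspace usage sketch (illustrative only; error handling omitted):
 *
 *	void *map = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
 *	madvise(map, len, MADV_SEQUENTIAL);	(expect one linear pass)
 *	... read through the mapping ...
 *	madvise(map, len, MADV_DONTNEED);	(done with these pages)
 */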
asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
	struct vm_area_struct * vma;
	int unmapped_error = 0;
	int error = -EINVAL;

	down(&current->mm->mmap_sem);

	if (start & ~PAGE_MASK)
	len = (len + ~PAGE_MASK) & PAGE_MASK;

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 */
	vma = find_vma(current->mm, start);
		/* Still start < end. */

		/* Here start < vma->vm_end. */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;

		/* Here vma->vm_start <= start < vma->vm_end. */
		if (end <= vma->vm_end) {
			error = madvise_vma(vma, start, end, behavior);
				error = unmapped_error;

		/* Here vma->vm_start <= start < vma->vm_end < end. */
		error = madvise_vma(vma, start, vma->vm_end, behavior);
		start = vma->vm_end;

	up(&current->mm->mmap_sem);
/*
 * Later we can get more picky about what "in core" means precisely.
 * For now, simply check to see if the page is in the page cache,
 * and is up to date; i.e. that no page-in operation would be required
 * at this time if an application were to map and access this page.
 */
static unsigned char mincore_page(struct vm_area_struct * vma,
	unsigned long pgoff)
	unsigned char present = 0;
	struct address_space * as = &vma->vm_file->f_dentry->d_inode->i_data;
	struct page * page, ** hash = page_hash(as, pgoff);

	spin_lock(&pagecache_lock);
	page = __find_page_nolock(as, pgoff, *hash);
	if ((page) && (Page_Uptodate(page)))
	spin_unlock(&pagecache_lock);
static long mincore_vma(struct vm_area_struct * vma,
	unsigned long start, unsigned long end, unsigned char * vec)
	long error, i, remaining;
	unsigned char * tmp;

	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
	if (end > vma->vm_end)
	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

	tmp = (unsigned char *) __get_free_page(GFP_KERNEL);

	/* (end - start) is # of pages, and also # of bytes in "vec" */
	remaining = (end - start),

	for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {

		long thispiece = (remaining < PAGE_SIZE) ?
						remaining : PAGE_SIZE;

		while (j < thispiece)
			tmp[j++] = mincore_page(vma, start++);

		if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {

	free_page((unsigned long) tmp);
/*
 * The mincore(2) system call.
 *
 * mincore() returns the memory residency status of the pages in the
 * current process's address space specified by [addr, addr + len).
 * The status is returned in a vector of bytes.  The least significant
 * bit of each byte is 1 if the referenced page is in memory, otherwise
 * it is zero.
 *
 * Because the status of a page can change after mincore() checks it
 * but before it returns to the application, the returned vector may
 * contain stale information.  Only locked pages are guaranteed to
 * remain in memory.
 *
 *  -EFAULT - vec points to an illegal address
 *  -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
 *		or len has a nonpositive value
 *  -ENOMEM - Addresses in the range [addr, addr + len] are
 *		invalid for the address space of this process, or
 *		specify one or more pages which are not currently
 *		mapped
 *  -EAGAIN - A kernel resource was temporarily unavailable.
 */
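/*
 * Userspace usage sketch (illustrative only; error handling omitted):
 *
 *	unsigned char vec[NPAGES];
 *	if (mincore(addr, NPAGES * page_size, vec) == 0) {
 *		for (i = 0; i < NPAGES; i++)
 *			resident += vec[i] & 1;
 *	}
 */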
asmlinkage long sys_mincore(unsigned long start, size_t len,
	unsigned char * vec)
	struct vm_area_struct * vma;
	int unmapped_error = 0;
	long error = -EINVAL;

	down(&current->mm->mmap_sem);

	if (start & ~PAGE_CACHE_MASK)
	len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK;

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 */
	vma = find_vma(current->mm, start);
		/* Still start < end. */

		/* Here start < vma->vm_end. */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;

		/* Here vma->vm_start <= start < vma->vm_end. */
		if (end <= vma->vm_end) {
			error = mincore_vma(vma, start, end, &vec[index]);
				error = unmapped_error;

		/* Here vma->vm_start <= start < vma->vm_end < end. */
		error = mincore_vma(vma, start, vma->vm_end, &vec[index]);

		index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
		start = vma->vm_end;

	up(&current->mm->mmap_sem);
struct page *__read_cache_page(struct address_space *mapping,
				unsigned long index,
				int (*filler)(void *, struct page *),
				void *data)
	struct page **hash = page_hash(mapping, index);
	struct page *page, *cached_page = NULL;

	page = __find_get_page(mapping, index, hash);

		cached_page = page_cache_alloc();
			return ERR_PTR(-ENOMEM);

		if (add_to_page_cache_unique(page, mapping, index, hash))

		err = filler(data, page);
			page_cache_release(page);
			page = ERR_PTR(err);

		page_cache_free(cached_page);
/*
 * Read into the page cache. If a page already exists,
 * and Page_Uptodate() is not set, try to fill the page.
 */
struct page *read_cache_page(struct address_space *mapping,
				unsigned long index,
				int (*filler)(void *, struct page *),
				void *data)
	page = __read_cache_page(mapping, index, filler, data);
	if (IS_ERR(page) || Page_Uptodate(page))

	if (!page->mapping) {
		page_cache_release(page);

	if (Page_Uptodate(page)) {

	err = filler(data, page);
		page_cache_release(page);
		page = ERR_PTR(err);
static inline struct page * __grab_cache_page(struct address_space *mapping,
				unsigned long index, struct page **cached_page)
	struct page *page, **hash = page_hash(mapping, index);

	page = __find_lock_page(mapping, index, hash);
		if (!*cached_page) {
			*cached_page = page_cache_alloc();

		page = *cached_page;
		if (add_to_page_cache_unique(page, mapping, index, hash))
		*cached_page = NULL;

/*
 * Returns locked page at given index in given cache, creating it if needed.
 */
struct page *grab_cache_page(struct address_space *mapping, unsigned long index)
	struct page *cached_page = NULL;
	struct page *page = __grab_cache_page(mapping, index, &cached_page);

	page_cache_free(cached_page);
static inline void remove_suid(struct inode *inode)
	/* set S_ISGID if S_IXGRP is set, and always set S_ISUID */
	mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
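	/*
	 * Worked example (octal): S_ISUID = 04000, S_ISGID = 02000 and
	 * S_IXGRP = 00010, so S_ISGID/S_IXGRP = 0200.  For a mode such as
	 * 02771 (setgid + group-exec), (mode & S_IXGRP) * 0200 = 02000, and
	 * OR-ing in S_ISUID gives 06000; the "mode &= inode->i_mode" below
	 * then keeps only the set-id bits that are actually set.
	 */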
	/* was any of the uid bits set? */
	mode &= inode->i_mode;
	if (mode && !capable(CAP_FSETID)) {
		inode->i_mode &= ~mode;
		mark_inode_dirty(inode);
/*
 * Write to a file through the page cache.
 *
 * We currently put everything into the page cache prior to writing it.
 * This is not a problem when writing full pages. With partial pages,
 * however, we first have to read the data into the cache, then
 * dirty the page, and finally schedule it for writing. Alternatively, we
 * could write-through just the portion of data that would go into that
 * page, but that would kill performance for applications that write data
 * line by line, and it's prone to race conditions.
 *
 * Note that this routine doesn't try to keep track of dirty pages. Each
 * file system has to do this all by itself, unfortunately.
 */
ssize_t
generic_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
	struct inode *inode = file->f_dentry->d_inode;
	struct address_space *mapping = inode->i_mapping;
	unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;

	struct page *page, *cached_page;
	unsigned long written;

	down(&inode->i_sem);

	err = file->f_error;

	if (file->f_flags & O_APPEND)
		pos = inode->i_size;

	/*
	 * Check whether we've reached the file size limit.
	 */
	if (limit != RLIM_INFINITY) {
			send_sig(SIGXFSZ, current, 0);
		if (count > limit - pos) {
			send_sig(SIGXFSZ, current, 0);
			count = limit - pos;

	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
	mark_inode_dirty_sync(inode);

		unsigned long bytes, index, offset;

		/*
		 * Try to find the page in the cache. If it isn't there,
		 * allocate a free page.
		 */
		offset = (pos & (PAGE_CACHE_SIZE-1)); /* Within page */
		index = pos >> PAGE_CACHE_SHIFT;
		bytes = PAGE_CACHE_SIZE - offset;

		status = -ENOMEM;	/* we'll assign it later anyway */
		page = __grab_cache_page(mapping, index, &cached_page);

		/* We have exclusive IO access to the page.. */
		if (!PageLocked(page)) {

		status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);

		kaddr = page_address(page);
		status = copy_from_user(kaddr+offset, buf, bytes);
		flush_dcache_page(page);

		status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);

		/* Mark it unlocked again and drop the page.. */

		deactivate_page(page);
		page_cache_release(page);

	page_cache_free(cached_page);

	/* For now, when the user asks for O_SYNC, we'll actually
	 * provide O_DSYNC. */
	if ((status >= 0) && (file->f_flags & O_SYNC))
		status = generic_osync_inode(inode, 1); /* 1 means datasync */

	err = written ? written : status;

	ClearPageUptodate(page);

void __init page_cache_init(unsigned long mempages)
	unsigned long htable_size, order;

	htable_size = mempages;
	htable_size *= sizeof(struct page *);
	for(order = 0; (PAGE_SIZE << order) < htable_size; order++)

		unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);

		while((tmp >>= 1UL) != 0UL)

		page_hash_table = (struct page **)
			__get_free_pages(GFP_ATOMIC, order);
	} while(page_hash_table == NULL && --order > 0);

	printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
	       (1 << page_hash_bits), order, (PAGE_SIZE << order));
	if (!page_hash_table)
		panic("Failed to allocate page hash table\n");
	memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
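/*
 * Worked example (hypothetical numbers, 4K pages, 4-byte pointers): with
 * mempages = 32768 (i.e. 128 MB of RAM), htable_size = 32768 * 4 = 128 KB,
 * so the sizing loop above settles on order = 5 (PAGE_SIZE << 5 == 128 KB).
 * That leaves room for 32768 bucket pointers, so page_hash_bits ends up as
 * 15 and the printk reports 32768 hash table entries.
 */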
*));