/*
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/malloc.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/locks.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/swapctl.h>
#include <linux/slab.h>
#include <linux/init.h>

#include <asm/pgalloc.h>
#include <asm/uaccess.h>

#include <linux/highmem.h>
/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet, though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 */
atomic_t page_cache_size = ATOMIC_INIT(0);
unsigned int page_hash_bits;
struct page **page_hash_table;

spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;

/*
 * NOTE: to avoid deadlocking you must never acquire the pagecache_lock
 * with the pagemap_lru_lock held.
 */
spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;
#define CLUSTER_PAGES           (1 << page_cluster)
#define CLUSTER_OFFSET(x)       (((x) >> page_cluster) << page_cluster)
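
/*
 * Worked example (illustrative numbers only, assuming page_cluster == 4):
 * CLUSTER_PAGES is then 16 pages, and CLUSTER_OFFSET() masks off the low
 * bits of an index to give the start of the cluster containing it:
 *
 *      CLUSTER_OFFSET(19) == (19 >> 4) << 4 == 1 << 4 == 16
 *
 * so a read around page 19 is rounded down to a 16-page (64k with 4K
 * pages) aligned cluster starting at page 16.
 */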
static void add_page_to_hash_queue(struct page * page, struct page **p)
        struct page *next = *p;

        page->next_hash = next;
        next->pprev_hash = &page->next_hash;
        atomic_inc(&page_cache_size);
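
/*
 * Hash chain layout (illustrative sketch): each bucket of page_hash_table
 * heads a singly linked list through page->next_hash, while page->pprev_hash
 * points back at whatever pointer currently points at this page (either the
 * bucket slot itself or the previous page's next_hash).  With bucket -> A -> B,
 * A->pprev_hash is &bucket and B->pprev_hash is &A->next_hash, which is why
 * remove_page_from_hash_queue() below can unlink a page through *pprev_hash
 * without walking the chain.
 */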
static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page)
        struct list_head *head = &mapping->clean_pages;

        list_add(&page->list, head);
        page->mapping = mapping;

static inline void remove_page_from_inode_queue(struct page * page)
        struct address_space * mapping = page->mapping;

        list_del(&page->list);

static inline void remove_page_from_hash_queue(struct page * page)
        struct page *next = page->next_hash;
        struct page **pprev = page->pprev_hash;

        next->pprev_hash = pprev;
        page->pprev_hash = NULL;
        atomic_dec(&page_cache_size);
/*
 * Remove a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
 * is safe.
 */
void __remove_inode_page(struct page *page)
        if (PageDirty(page)) BUG();
        remove_page_from_inode_queue(page);
        remove_page_from_hash_queue(page);
        page->mapping = NULL;
void remove_inode_page(struct page *page)
        if (!PageLocked(page))

        spin_lock(&pagecache_lock);
        __remove_inode_page(page);
        spin_unlock(&pagecache_lock);

static inline int sync_page(struct page *page)
        struct address_space *mapping = page->mapping;

        if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
                return mapping->a_ops->sync_page(page);
/*
 * Add a page to the dirty page list.
 */
void __set_page_dirty(struct page *page)
        struct address_space *mapping = page->mapping;

        spin_lock(&pagecache_lock);
        list_del(&page->list);
        list_add(&page->list, &mapping->dirty_pages);
        spin_unlock(&pagecache_lock);

        mark_inode_dirty_pages(mapping->host);
/**
 * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
 * @inode: the inode whose pages we want to invalidate
 *
 * This function only removes the unlocked pages; if you want to
 * remove all the pages of one inode, you must call truncate_inode_pages.
 */
void invalidate_inode_pages(struct inode * inode)
        struct list_head *head, *curr;

        head = &inode->i_mapping->clean_pages;

        spin_lock(&pagecache_lock);
        spin_lock(&pagemap_lru_lock);

        while (curr != head) {
                page = list_entry(curr, struct page, list);

                /* We cannot invalidate something in use.. */
                if (page_count(page) != 1)
                if (TryLockPage(page))

                __lru_cache_del(page);
                __remove_inode_page(page);
                page_cache_release(page);

        spin_unlock(&pagemap_lru_lock);
        spin_unlock(&pagecache_lock);
static inline void truncate_partial_page(struct page *page, unsigned partial)
        memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);

                block_flushpage(page, partial);

static inline void truncate_complete_page(struct page *page)
        /* Leave it on the LRU if it gets converted into anonymous buffers */
        if (!page->buffers || block_flushpage(page, 0))

        /*
         * We remove the page from the page cache _after_ we have
         * destroyed all buffer-cache references to it. Otherwise some
         * other process might think this inode page is not in the
         * page cache and create a buffer-cache alias to it, causing
         * all sorts of fun problems ...
         */
        ClearPageDirty(page);
        ClearPageUptodate(page);
        remove_inode_page(page);
        page_cache_release(page);
void truncate_list_pages(struct list_head *head, unsigned long start, unsigned partial)
        struct list_head *curr;

        spin_lock(&pagecache_lock);

        while (curr != head) {
                unsigned long offset;

                page = list_entry(curr, struct page, list);
                offset = page->index;

                /* Is this one of the pages to truncate? */
                if ((offset >= start) || (partial && (offset + 1) == start)) {
                        if (TryLockPage(page)) {
                                page_cache_get(page);
                                spin_unlock(&pagecache_lock);
                                page_cache_release(page);

                        page_cache_get(page);
                        spin_unlock(&pagecache_lock);

                        if (partial && (offset + 1) == start) {
                                truncate_partial_page(page, partial);
                        truncate_complete_page(page);

                        page_cache_release(page);

                        /*
                         * We have done things without the pagecache lock,
                         * so we'll have to repeat the scan.
                         * It's not possible to deadlock here because
                         * we are guaranteed to make progress. (ie. we have
                         * just removed a page)
                         */
        spin_unlock(&pagecache_lock);
/**
 * truncate_inode_pages - truncate *all* the pages from an offset
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 *
 * Truncate the page cache at a set offset, removing the pages
 * that are beyond that offset (and zeroing out partial pages).
 * If any page is locked we wait for it to become unlocked.
 */
void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
        unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);

        truncate_list_pages(&mapping->clean_pages, start, partial);
        truncate_list_pages(&mapping->dirty_pages, start, partial);
        truncate_list_pages(&mapping->locked_pages, start, partial);
static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
                page = page->next_hash;

                if (page->mapping != mapping)
                if (page->index == offset)

        /*
         * Touching the page may move it to the active list.
         * If we end up with too few inactive pages, we wake
         * up kswapd.
         */
        if (inactive_shortage() > inactive_target / 2 && free_shortage())
/*
 * By the time this is called, the page is locked and
 * we don't have to worry about any races any more.
 */
static int writeout_one_page(struct page *page)
        struct buffer_head *bh, *head = page->buffers;

                if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))

                bh->b_flushtime = jiffies;
                ll_rw_block(WRITE, 1, &bh);
        } while ((bh = bh->b_this_page) != head);

static int waitfor_one_page(struct page *page)
        struct buffer_head *bh, *head = page->buffers;

                if (buffer_req(bh) && !buffer_uptodate(bh))
        } while ((bh = bh->b_this_page) != head);

static int do_buffer_fdatasync(struct list_head *head, unsigned long start, unsigned long end, int (*fn)(struct page *))
        struct list_head *curr;

        spin_lock(&pagecache_lock);

        while (curr != head) {
                page = list_entry(curr, struct page, list);

                if (page->index >= end)
                if (page->index < start)

                page_cache_get(page);
                spin_unlock(&pagecache_lock);

                /* The buffers could have been free'd while we waited for the page lock */

                spin_lock(&pagecache_lock);
                curr = page->list.next;
                page_cache_release(page);

        spin_unlock(&pagecache_lock);
/*
 * Two-stage data sync: first start the IO, then go back and
 * collect the information..
 */
int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
        /* writeout dirty buffers on pages from both clean and dirty lists */
        retval = do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, writeout_one_page);
        retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, writeout_one_page);
        retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, writeout_one_page);

        /* now wait for locked buffers on pages from both clean and dirty lists */
        retval |= do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, waitfor_one_page);
        retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, waitfor_one_page);
        retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, waitfor_one_page);
/**
 * filemap_fdatasync - walk the list of dirty pages of the given address
 * space and writepage() all of them.
 *
 * @mapping: address space structure to write
 */
void filemap_fdatasync(struct address_space * mapping)
        int (*writepage)(struct page *) = mapping->a_ops->writepage;

        spin_lock(&pagecache_lock);

        while (!list_empty(&mapping->dirty_pages)) {
                struct page *page = list_entry(mapping->dirty_pages.next, struct page, list);

                list_del(&page->list);
                list_add(&page->list, &mapping->locked_pages);

                if (!PageDirty(page))

                page_cache_get(page);
                spin_unlock(&pagecache_lock);

                if (PageDirty(page)) {
                        ClearPageDirty(page);

                page_cache_release(page);
                spin_lock(&pagecache_lock);

        spin_unlock(&pagecache_lock);
/**
 * filemap_fdatawait - walk the list of locked pages of the given address
 * space and wait for all of them.
 *
 * @mapping: address space structure to wait for
 */
void filemap_fdatawait(struct address_space * mapping)
        spin_lock(&pagecache_lock);

        while (!list_empty(&mapping->locked_pages)) {
                struct page *page = list_entry(mapping->locked_pages.next, struct page, list);

                list_del(&page->list);
                list_add(&page->list, &mapping->clean_pages);

                if (!PageLocked(page))

                page_cache_get(page);
                spin_unlock(&pagecache_lock);

                ___wait_on_page(page);

                page_cache_release(page);
                spin_lock(&pagecache_lock);

        spin_unlock(&pagecache_lock);
/*
 * Add a page to the inode page cache.
 *
 * The caller must have locked the page and
 * set all the page flags correctly..
 */
void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
        if (!PageLocked(page))

        page_cache_get(page);
        spin_lock(&pagecache_lock);
        add_page_to_inode_queue(mapping, page);
        add_page_to_hash_queue(page, page_hash(mapping, index));
        spin_unlock(&pagecache_lock);
/*
 * This adds a page to the page cache, starting out as locked,
 * owned by us, but unreferenced, not uptodate and with no errors.
 */
static inline void __add_to_page_cache(struct page * page,
        struct address_space *mapping, unsigned long offset,
        struct page **hash)

        if (PageLocked(page))

        flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced) | (1 << PG_arch_1));
        page->flags = flags | (1 << PG_locked);
        page_cache_get(page);
        page->index = offset;
        add_page_to_inode_queue(mapping, page);
        add_page_to_hash_queue(page, hash);
void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
        spin_lock(&pagecache_lock);
        __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
        spin_unlock(&pagecache_lock);

static int add_to_page_cache_unique(struct page * page,
        struct address_space *mapping, unsigned long offset,
        struct page **hash)

        spin_lock(&pagecache_lock);
        alias = __find_page_nolock(mapping, offset, *hash);

                __add_to_page_cache(page, mapping, offset, hash);

        spin_unlock(&pagecache_lock);
/*
 * This adds the requested page to the page cache if it isn't already there,
 * and schedules an I/O to read in its contents from disk.
 */
static inline int page_cache_read(struct file * file, unsigned long offset)
        struct inode *inode = file->f_dentry->d_inode;
        struct address_space *mapping = inode->i_mapping;
        struct page **hash = page_hash(mapping, offset);

        spin_lock(&pagecache_lock);
        page = __find_page_nolock(mapping, offset, *hash);
        spin_unlock(&pagecache_lock);

        page = page_cache_alloc();

        if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
                int error = mapping->a_ops->readpage(file, page);
                page_cache_release(page);

        /*
         * We arrive here in the unlikely event that someone
         * raced with us and added our page to the cache first.
         */
        page_cache_free(page);
/*
 * Read in an entire cluster at once. A cluster is usually a 64k-
 * aligned block that includes the page requested in "offset."
 */
static int read_cluster_nonblocking(struct file * file, unsigned long offset,
        unsigned long filesize)
        unsigned long pages = CLUSTER_PAGES;

        offset = CLUSTER_OFFSET(offset);
        while ((pages-- > 0) && (offset < filesize)) {
                int error = page_cache_read(file, offset);

/*
 * Wait for a page to get unlocked.
 *
 * This must be called with the caller "holding" the page,
 * ie with increased "page->count" so that the page won't
 * go away during the wait..
 */
void ___wait_on_page(struct page *page)
        struct task_struct *tsk = current;
        DECLARE_WAITQUEUE(wait, tsk);

        add_wait_queue(&page->wait, &wait);
                set_task_state(tsk, TASK_UNINTERRUPTIBLE);
                if (!PageLocked(page))
                run_task_queue(&tq_disk);
        } while (PageLocked(page));
        tsk->state = TASK_RUNNING;
        remove_wait_queue(&page->wait, &wait);

/*
 * Get a lock on the page, assuming we need to sleep
 */
static void __lock_page(struct page *page)
        struct task_struct *tsk = current;
        DECLARE_WAITQUEUE(wait, tsk);

        add_wait_queue_exclusive(&page->wait, &wait);
                set_task_state(tsk, TASK_UNINTERRUPTIBLE);
                if (PageLocked(page)) {
                        run_task_queue(&tq_disk);
                if (!TryLockPage(page))
        tsk->state = TASK_RUNNING;
        remove_wait_queue(&page->wait, &wait);

/*
 * Get an exclusive lock on the page, optimistically
 * assuming it's not locked..
 */
void lock_page(struct page *page)
        if (TryLockPage(page))

/*
 * a rather lightweight function, finding and getting a reference to a
 * hashed page atomically, waiting for it if it's locked.
 */
struct page * __find_get_page(struct address_space *mapping,
                              unsigned long offset, struct page **hash)
        /*
         * We scan the hash list read-only. Addition to and removal from
         * the hash-list needs a held write-lock.
         */
        spin_lock(&pagecache_lock);
        page = __find_page_nolock(mapping, offset, *hash);
                page_cache_get(page);
        spin_unlock(&pagecache_lock);

/*
 * Get the lock to a page atomically.
 */
struct page * __find_lock_page (struct address_space *mapping,
                                unsigned long offset, struct page **hash)
        /*
         * We scan the hash list read-only. Addition to and removal from
         * the hash-list needs a held write-lock.
         */
        spin_lock(&pagecache_lock);
        page = __find_page_nolock(mapping, offset, *hash);
                page_cache_get(page);
                spin_unlock(&pagecache_lock);

                /* Is the page still hashed? Ok, good.. */

                /* Nope: we raced. Release and try again.. */
                page_cache_release(page);
        spin_unlock(&pagecache_lock);
#define PROFILE_READAHEAD
#define DEBUG_READAHEAD

/*
 * We combine this with read-ahead to deactivate pages when we
 * think there's sequential IO going on. Note that this is
 * harmless since we don't actually evict the pages from memory
 * but just move them to the inactive list.
 *
 * - make the readahead code smarter
 * - move readahead to the VMA level so we can do the same
 */
static void drop_behind(struct file * file, unsigned long index)
        struct inode *inode = file->f_dentry->d_inode;
        struct address_space *mapping = inode->i_mapping;

        /* Nothing to drop-behind if we're on the first page. */

        if (index > file->f_rawin)
                start = index - file->f_rawin;

        /*
         * Go backwards from index-1 and drop all pages in the
         * readahead window. Since the readahead window may have
         * been increased since the last time we were called, we
         * stop when the page isn't there.
         */
        spin_lock(&pagecache_lock);
        while (--index >= start) {
                hash = page_hash(mapping, index);
                page = __find_page_nolock(mapping, index, *hash);
                        deactivate_page(page);
        spin_unlock(&pagecache_lock);
/*
 * Read-ahead profiling information
 * --------------------------------
 * Every PROFILE_MAXREADCOUNT, the following information is written:
 *   Percentage of asynchronous read-ahead.
 *   Average of read-ahead fields context value.
 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
 * as well.
 */
#ifdef PROFILE_READAHEAD

#define PROFILE_MAXREADCOUNT 1000

static unsigned long total_reada;
static unsigned long total_async;
static unsigned long total_ramax;
static unsigned long total_ralen;
static unsigned long total_rawin;

static void profile_readahead(int async, struct file *filp)
        total_ramax     += filp->f_ramax;
        total_ralen     += filp->f_ralen;
        total_rawin     += filp->f_rawin;

        if (total_reada > PROFILE_MAXREADCOUNT) {
                if (!(total_reada > PROFILE_MAXREADCOUNT)) {
                        restore_flags(flags);

                printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
                        total_ramax/total_reada,
                        total_ralen/total_reada,
                        total_rawin/total_reada,
                        (total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
                printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
                        filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);

                restore_flags(flags);
#endif /* defined PROFILE_READAHEAD */
/*
 * Read-ahead context:
 * -------------------
 * The read ahead context fields of the "struct file" are the following:
 * - f_raend : position of the first byte after the last page we tried to
 *             read ahead.
 * - f_ramax : current read-ahead maximum size.
 * - f_ralen : length of the current IO read block we tried to read-ahead.
 * - f_rawin : length of the current read-ahead window.
 *              if last read-ahead was synchronous then
 *                      f_rawin = f_ralen
 *              otherwise (was asynchronous)
 *                      f_rawin = previous value of f_ralen + f_ralen
 *
 * MIN_READAHEAD : minimum read-ahead size when reading ahead.
 * MAX_READAHEAD : maximum read-ahead size when reading ahead.
 *
 * Synchronous read-ahead benefits:
 * --------------------------------
 * Using a reasonable IO xfer length from peripheral devices increases system
 * performance.
 * Reasonable means, in this context, not too large but not too small.
 * The actual maximum value is:
 *      MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
 *      and 32K if defined (4K page size assumed).
 *
 * Asynchronous read-ahead benefits:
 * ---------------------------------
 * Overlapping the next read request with user process execution increases
 * system performance.
 *
 * We have to guess which further data are needed by the user process.
 * If these data are often not really needed, it's bad for system performance.
 * However, we know that files are often accessed sequentially by
 * application programs, and it seems that it is possible to have some good
 * strategy in that guessing.
 * We only try to read-ahead files that seem to be read sequentially.
 *
 * Asynchronous read-ahead risks:
 * ------------------------------
 * In order to maximize overlapping, we must start some asynchronous read
 * request from the device, as soon as possible.
 * We must be very careful about:
 * - The number of effective pending IO read requests.
 *   ONE seems to be the only reasonable value.
 * - The total memory pool usage for the file access stream.
 *   This maximum memory usage is implicitly 2 IO read chunks:
 *   2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
 *   64k if defined (4K page size assumed).
 */
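
/*
 * Worked example (illustrative numbers only, not taken from this file):
 * suppose MIN_READAHEAD is 3 pages and MAX_READAHEAD is 31 pages.  A
 * sequential reader whose last read-ahead was a synchronous 4-page block
 * ending just before index 20 would carry:
 *
 *      f_raend = 20    (first index after the last read-ahead)
 *      f_ralen = 4     (length of that read-ahead block)
 *      f_rawin = 4     (synchronous: window == f_ralen)
 *
 * If the next access then triggers an asynchronous read-ahead of another
 * 4 pages, f_ralen describes the new block and the window covers the
 * previous block plus the new one:
 *
 *      f_ralen = 4
 *      f_rawin = 4 + 4 = 8
 *      f_raend = first index past the newly read-ahead pages
 */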
static inline int get_max_readahead(struct inode * inode)
        if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
                return MAX_READAHEAD;
        return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
static void generic_file_readahead(int reada_ok,
        struct file * filp, struct inode * inode,
        struct page * page)
        unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
        unsigned long index = page->index;
        unsigned long max_ahead, ahead;

        int max_readahead = get_max_readahead(inode);

        raend = filp->f_raend;

        /*
         * The current page is locked.
         * If the current position is inside the previous read IO request, do not
         * try to reread previously read ahead pages.
         * Otherwise decide whether or not to read ahead some pages synchronously.
         * If we are not going to read ahead, set the read ahead context for this
         * page only.
         */
        if (PageLocked(page)) {
                if (!filp->f_ralen || index >= raend || index + filp->f_rawin < raend) {

                        if (raend < end_index)
                                max_ahead = filp->f_ramax;

                        filp->f_raend = index + filp->f_ralen;
                        filp->f_rawin += filp->f_ralen;

        /*
         * The current page is not locked.
         * If we were reading ahead and,
         * if the current max read ahead size is not zero and,
         * if the current position is inside the last read-ahead IO request,
         * it is the moment to try to read ahead asynchronously.
         * We will later force an unplug of the device in order to force
         * asynchronous read IO.
         */
        else if (reada_ok && filp->f_ramax && raend >= 1 &&
                 index <= raend && index + filp->f_ralen >= raend) {
                /*
                 * Add ONE page to max_ahead in order to try to have about the same IO max size
                 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
                 * Compute the position of the last page we have tried to read in order to
                 * begin to read ahead just at the next page.
                 */
                if (raend < end_index)
                        max_ahead = filp->f_ramax + 1;

                        filp->f_rawin = filp->f_ralen;

        /*
         * Try to read ahead pages.
         * We hope that ll_rw_blk() plug/unplug, coalescence, request sorting and
         * the scheduler will work well enough for us to avoid issuing really bad
         * actual IO requests.
         */
        while (ahead < max_ahead) {
                if ((raend + ahead) >= end_index)
                if (page_cache_read(filp, raend + ahead) < 0)

        /*
         * If we tried to read ahead some pages,
         * If we tried to read ahead asynchronously,
         *   Try to force an unplug of the device in order to start an
         *   asynchronous read IO.
         * Update the read-ahead context.
         * Store the length of the current read-ahead window.
         * Double the current max read ahead size.
         *   That heuristic avoids doing large IO for files that are not really
         *   accessed sequentially.
         */
                        run_task_queue(&tq_disk);

                filp->f_ralen += ahead;
                filp->f_rawin += filp->f_ralen;
                filp->f_raend = raend + ahead + 1;

                filp->f_ramax += filp->f_ramax;

                if (filp->f_ramax > max_readahead)
                        filp->f_ramax = max_readahead;

                /*
                 * Move the pages that have already been passed
                 * to the inactive list.
                 */
                drop_behind(filp, index);

#ifdef PROFILE_READAHEAD
                profile_readahead((reada_ok == 2), filp);
/*
 * This is a generic file read routine, and uses the
 * inode->i_op->readpage() function for the actual low-level
 * stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 */
void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
        struct inode *inode = filp->f_dentry->d_inode;
        struct address_space *mapping = inode->i_mapping;
        unsigned long index, offset;
        struct page *cached_page;

        int max_readahead = get_max_readahead(inode);

        index = *ppos >> PAGE_CACHE_SHIFT;
        offset = *ppos & ~PAGE_CACHE_MASK;

        /*
         * If the current position is outside the previous read-ahead window,
         * we reset the current read-ahead context and set read ahead max to zero
         * (it will be set to just the needed value later),
         * otherwise, we assume that the file accesses are sequential enough to
         * continue read-ahead.
         */
        if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {

        /*
         * Adjust the current value of read-ahead max.
         * If the read operation stays in the first half of the page, force no
         * readahead.
         * Otherwise try to increase read ahead max just enough to do the read request.
         * Then, at least MIN_READAHEAD if read ahead is ok,
         * and at most MAX_READAHEAD in all cases.
         */
        if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
                unsigned long needed;

                needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;

                if (filp->f_ramax < needed)
                        filp->f_ramax = needed;

                if (reada_ok && filp->f_ramax < MIN_READAHEAD)
                        filp->f_ramax = MIN_READAHEAD;
                if (filp->f_ramax > max_readahead)
                        filp->f_ramax = max_readahead;
                struct page *page, **hash;
                unsigned long end_index, nr;

                end_index = inode->i_size >> PAGE_CACHE_SHIFT;
                if (index > end_index)
                nr = PAGE_CACHE_SIZE;
                if (index == end_index) {
                        nr = inode->i_size & ~PAGE_CACHE_MASK;

                /*
                 * Try to find the data in the page cache..
                 */
                hash = page_hash(mapping, index);

                spin_lock(&pagecache_lock);
                page = __find_page_nolock(mapping, index, *hash);
                        goto no_cached_page;

                page_cache_get(page);
                spin_unlock(&pagecache_lock);

                if (!Page_Uptodate(page))
                        goto page_not_up_to_date;
                generic_file_readahead(reada_ok, filp, inode, page);

                /* If users can be writing to this page using arbitrary
                 * virtual addresses, take care about potential aliasing
                 * before reading the page on the kernel side.
                 */
                if (mapping->i_mmap_shared != NULL)
                        flush_dcache_page(page);

                /*
                 * Ok, we have the page, and it's up-to-date, so
                 * now we can copy it to user space...
                 *
                 * The actor routine returns how many bytes were actually used..
                 * NOTE! This may not be the same as how much of a user buffer
                 * we filled up (we may be padding etc), so we can only update
                 * "pos" here (the actor routine has to update the user buffer
                 * pointers and the remaining count).
                 */
                nr = actor(desc, page, offset, nr);
                index += offset >> PAGE_CACHE_SHIFT;
                offset &= ~PAGE_CACHE_MASK;

                page_cache_release(page);
                if (nr && desc->count)

/*
 * Ok, the page was not immediately readable, so let's try to read ahead
 * while we're at it..
 */
page_not_up_to_date:
                generic_file_readahead(reada_ok, filp, inode, page);

                if (Page_Uptodate(page))

                /* Get exclusive access to the page ... */

                /* Did it get unhashed before we got the lock? */
                if (!page->mapping) {
                        page_cache_release(page);

                /* Did somebody else fill it already? */
                if (Page_Uptodate(page)) {

                /* ... and start the actual read. The read will unlock the page. */
                error = mapping->a_ops->readpage(filp, page);

                if (Page_Uptodate(page))

                /* Again, try some read-ahead while waiting for the page to finish.. */
                generic_file_readahead(reada_ok, filp, inode, page);
                if (Page_Uptodate(page))

                /* UHHUH! A synchronous read error occurred. Report it */
                desc->error = error;
                page_cache_release(page);
/*
 * Ok, it wasn't cached, so we need to create a new
 * page..
 *
 * We get here with the page cache lock held.
 */
                spin_unlock(&pagecache_lock);
                cached_page = page_cache_alloc();
                        desc->error = -ENOMEM;

                /*
                 * Somebody may have added the page while we
                 * dropped the page cache lock. Check for that.
                 */
                spin_lock(&pagecache_lock);
                page = __find_page_nolock(mapping, index, *hash);

                /*
                 * Ok, add the new page to the hash-queues...
                 */
                __add_to_page_cache(page, mapping, index, hash);
                spin_unlock(&pagecache_lock);

        *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;

        page_cache_free(cached_page);
        UPDATE_ATIME(inode);
static int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
        unsigned long left, count = desc->count;

        left = __copy_to_user(desc->buf, kaddr + offset, size);
                desc->error = -EFAULT;
        desc->count = count - size;
        desc->written += size;

/*
 * This is the "read()" routine for all filesystems
 * that can use the page cache directly.
 */
ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
        if (access_ok(VERIFY_WRITE, buf, count)) {
                read_descriptor_t desc;

                do_generic_file_read(filp, ppos, &desc, file_read_actor);
                        retval = desc.written;
                        retval = desc.error;

static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
        unsigned long count = desc->count;
        struct file *file = (struct file *) desc->buf;
        mm_segment_t old_fs;

        written = file->f_op->write(file, kaddr + offset, size, &file->f_pos);
                desc->error = written;
        desc->count = count - written;
        desc->written += written;
asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
        struct file * in_file, * out_file;
        struct inode * in_inode, * out_inode;

        /*
         * Get input file, and verify that it is ok..
         */
        in_file = fget(in_fd);
        if (!(in_file->f_mode & FMODE_READ))
        in_inode = in_file->f_dentry->d_inode;
        if (!in_inode->i_mapping->a_ops->readpage)
        retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);

        /*
         * Get output file, and verify that it is ok..
         */
        out_file = fget(out_fd);
        if (!(out_file->f_mode & FMODE_WRITE))
        if (!out_file->f_op || !out_file->f_op->write)
        out_inode = out_file->f_dentry->d_inode;
        retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);

                read_descriptor_t desc;
                loff_t pos = 0, *ppos;

                ppos = &in_file->f_pos;
                        if (get_user(pos, offset))

                desc.buf = (char *) out_file;

                do_generic_file_read(in_file, ppos, &desc, file_send_actor);

                retval = desc.written;
                        retval = desc.error;
                        put_user(pos, offset);
/*
 * Read-ahead and flush behind for MADV_SEQUENTIAL areas.  Since we are
 * sure this is sequential access, we don't need a flexible read-ahead
 * window size -- we can always use a large fixed size window.
 */
static void nopage_sequential_readahead(struct vm_area_struct * vma,
        unsigned long pgoff, unsigned long filesize)
        unsigned long ra_window;

        ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
        ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);

        /* vm_raend is zero if we haven't read ahead in this area yet. */
        if (vma->vm_raend == 0)
                vma->vm_raend = vma->vm_pgoff + ra_window;

        /*
         * If we've just faulted the page half-way through our window,
         * then schedule reads for the next window, and release the
         * pages in the previous window.
         */
        if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
                unsigned long start = vma->vm_pgoff + vma->vm_raend;
                unsigned long end = start + ra_window;

                if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
                        end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;

                while ((start < end) && (start < filesize)) {
                        if (read_cluster_nonblocking(vma->vm_file, start, filesize) < 0)
                        start += CLUSTER_PAGES;
                run_task_queue(&tq_disk);

                /* if we're far enough past the beginning of this area,
                   recycle pages that are in the previous window. */
                if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
                        unsigned long window = ra_window << PAGE_SHIFT;

                        end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
                        end -= window + window;
                        filemap_sync(vma, end - window, window, MS_INVALIDATE);

                vma->vm_raend += ra_window;
/*
 * filemap_nopage() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 */
struct page * filemap_nopage(struct vm_area_struct * area,
        unsigned long address, int no_share)
        struct file *file = area->vm_file;
        struct inode *inode = file->f_dentry->d_inode;
        struct address_space *mapping = inode->i_mapping;
        struct page *page, **hash, *old_page;
        unsigned long size, pgoff;

        pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;

        /*
         * An external ptracer can access pages that normally aren't
         * accessible..
         */
        size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        if ((pgoff >= size) && (area->vm_mm == current->mm))

        /*
         * Do we have something in the page cache already?
         */
        hash = page_hash(mapping, pgoff);

        page = __find_get_page(mapping, pgoff, hash);
                goto no_cached_page;

        /*
         * Ok, found a page in the page cache, now we need to check
         * that it's up-to-date.
         */
        if (!Page_Uptodate(page))
                goto page_not_uptodate;

        /*
         * Try read-ahead for sequential areas.
         */
        if (VM_SequentialReadHint(area))
                nopage_sequential_readahead(area, pgoff, size);

        /*
         * Found the page and have a reference on it, need to check sharing
         * and possibly copy it over to another page..
         */
                struct page *new_page = page_cache_alloc();

                        copy_user_highpage(new_page, old_page, address);
                        flush_page_to_ram(new_page);
                        new_page = NOPAGE_OOM;
                page_cache_release(page);

        flush_page_to_ram(old_page);
);
1500 * If the requested offset is within our file, try to read a whole
1501 * cluster of pages at once.
1503 * Otherwise, we're off the end of a privately mapped file,
1504 * so we need to map a zero page.
1506 if ((pgoff
< size
) && !VM_RandomReadHint(area
))
1507 error
= read_cluster_nonblocking(file
, pgoff
, size
);
1509 error
= page_cache_read(file
, pgoff
);
1512 * The page we want has now been added to the page cache.
1513 * In the unlikely event that someone removed it in the
1514 * meantime, we'll just come back here and read it again.
1520 * An error return from page_cache_read can result if the
1521 * system is low on memory, or a problem occurs while trying
1524 if (error
== -ENOMEM
)
1531 /* Did it get unhashed while we waited for it? */
1532 if (!page
->mapping
) {
1534 page_cache_release(page
);
1538 /* Did somebody else get it up-to-date? */
1539 if (Page_Uptodate(page
)) {
1544 if (!mapping
->a_ops
->readpage(file
, page
)) {
1546 if (Page_Uptodate(page
))
1551 * Umm, take care of errors if the page isn't up-to-date.
1552 * Try to re-read it _once_. We do this synchronously,
1553 * because there really aren't any performance issues here
1554 * and we need to check for errors.
1558 /* Somebody truncated the page on us? */
1559 if (!page
->mapping
) {
1561 page_cache_release(page
);
1565 /* Somebody else successfully read it in? */
1566 if (Page_Uptodate(page
)) {
1570 ClearPageError(page
);
1571 if (!mapping
->a_ops
->readpage(file
, page
)) {
1573 if (Page_Uptodate(page
))
1578 * Things didn't work out. Return zero to tell the
1579 * mm layer so, possibly freeing the page cache page first.
1581 page_cache_release(page
);
/* Called with mm->page_table_lock held to protect against other
 * threads/the swapper from ripping pte's out from under us.
 */
static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
        unsigned long address, unsigned int flags)
        if (pte_present(pte) && ptep_test_and_clear_dirty(ptep)) {
                struct page *page = pte_page(pte);
                flush_tlb_page(vma, address);
                set_page_dirty(page);

static inline int filemap_sync_pte_range(pmd_t * pmd,
        unsigned long address, unsigned long size,
        struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
        if (pmd_bad(*pmd)) {

        pte = pte_offset(pmd, address);
        offset += address & PMD_MASK;
        address &= ~PMD_MASK;
        end = address + size;
                error |= filemap_sync_pte(pte, vma, address + offset, flags);
                address += PAGE_SIZE;
        } while (address && (address < end));
static inline int filemap_sync_pmd_range(pgd_t * pgd,
        unsigned long address, unsigned long size,
        struct vm_area_struct *vma, unsigned int flags)
        unsigned long offset, end;

        if (pgd_bad(*pgd)) {

        pmd = pmd_offset(pgd, address);
        offset = address & PGDIR_MASK;
        address &= ~PGDIR_MASK;
        end = address + size;
        if (end > PGDIR_SIZE)
                error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
                address = (address + PMD_SIZE) & PMD_MASK;
        } while (address && (address < end));

int filemap_sync(struct vm_area_struct * vma, unsigned long address,
        size_t size, unsigned int flags)
        unsigned long end = address + size;

        /* Acquire the lock early; it may be possible to avoid dropping
         * and reacquiring it repeatedly.
         */
        spin_lock(&vma->vm_mm->page_table_lock);

        dir = pgd_offset(vma->vm_mm, address);
        flush_cache_range(vma->vm_mm, end - size, end);
                error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
        } while (address && (address < end));
        flush_tlb_range(vma->vm_mm, end - size, end);

        spin_unlock(&vma->vm_mm->page_table_lock);
/*
 * Shared mappings need to be able to do the right thing at
 * close/unmap/sync. They will also use the private file as
 * backing-store for swapping..
 */
static struct vm_operations_struct file_shared_mmap = {
        nopage:         filemap_nopage,
};

/*
 * Private mappings just need to be able to load in the map.
 *
 * (This is actually used for shared mappings as well, if we
 * know they can't ever get write permissions..)
 */
static struct vm_operations_struct file_private_mmap = {
        nopage:         filemap_nopage,
};

/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
        struct vm_operations_struct * ops;
        struct inode *inode = file->f_dentry->d_inode;

        ops = &file_private_mmap;
        if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
                if (!inode->i_mapping->a_ops->writepage)
                ops = &file_shared_mmap;
        if (!inode->i_sb || !S_ISREG(inode->i_mode))
        if (!inode->i_mapping->a_ops->readpage)
        UPDATE_ATIME(inode);
/*
 * The msync() system call.
 */
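
/*
 * Userspace view (illustrative sketch only, not part of the kernel source):
 * write back the dirty pages of a shared file mapping and wait for the
 * writeback to finish.
 *
 *      #include <sys/mman.h>
 *
 *      int flush_mapping(void *addr, size_t len)
 *      {
 *              return msync(addr, len, MS_SYNC);
 *      }
 *
 * With MS_SYNC, sys_msync() ends up in msync_interval() below, which calls
 * filemap_sync(), filemap_fdatasync(), the file's fsync() method and finally
 * filemap_fdatawait().
 */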
static int msync_interval(struct vm_area_struct * vma,
        unsigned long start, unsigned long end, int flags)
        struct file * file = vma->vm_file;
        if (file && (vma->vm_flags & VM_SHARED)) {
                error = filemap_sync(vma, start, end-start, flags);

                if (!error && (flags & MS_SYNC)) {
                        struct inode * inode = file->f_dentry->d_inode;
                        down(&inode->i_sem);
                        filemap_fdatasync(inode->i_mapping);
                        if (file->f_op && file->f_op->fsync)
                                error = file->f_op->fsync(file, file->f_dentry, 1);
                        filemap_fdatawait(inode->i_mapping);
long sys_msync(unsigned long start
, size_t len
, int flags
)
1759 struct vm_area_struct
* vma
;
1760 int unmapped_error
, error
= -EINVAL
;
1762 down(¤t
->mm
->mmap_sem
);
1763 if (start
& ~PAGE_MASK
)
1765 len
= (len
+ ~PAGE_MASK
) & PAGE_MASK
;
1769 if (flags
& ~(MS_ASYNC
| MS_INVALIDATE
| MS_SYNC
))
1775 * If the interval [start,end) covers some unmapped address ranges,
1776 * just ignore them, but return -EFAULT at the end.
1778 vma
= find_vma(current
->mm
, start
);
1781 /* Still start < end. */
1785 /* Here start < vma->vm_end. */
1786 if (start
< vma
->vm_start
) {
1787 unmapped_error
= -EFAULT
;
1788 start
= vma
->vm_start
;
1790 /* Here vma->vm_start <= start < vma->vm_end. */
1791 if (end
<= vma
->vm_end
) {
1793 error
= msync_interval(vma
, start
, end
, flags
);
1797 error
= unmapped_error
;
1800 /* Here vma->vm_start <= start < vma->vm_end < end. */
1801 error
= msync_interval(vma
, start
, vma
->vm_end
, flags
);
1804 start
= vma
->vm_end
;
1808 up(¤t
->mm
->mmap_sem
);
static inline void setup_read_behavior(struct vm_area_struct * vma,
        int behavior)
        VM_ClearReadHint(vma);
        case MADV_SEQUENTIAL:
                vma->vm_flags |= VM_SEQ_READ;
                vma->vm_flags |= VM_RAND_READ;
static long madvise_fixup_start(struct vm_area_struct * vma,
        unsigned long end, int behavior)
        struct vm_area_struct * n;

        n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
        setup_read_behavior(n, behavior);
        get_file(n->vm_file);
        if (n->vm_ops && n->vm_ops->open)

        lock_vma_mappings(vma);
        spin_lock(&vma->vm_mm->page_table_lock);
        vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
        vma->vm_start = end;
        __insert_vm_struct(current->mm, n);
        spin_unlock(&vma->vm_mm->page_table_lock);
        unlock_vma_mappings(vma);

static long madvise_fixup_end(struct vm_area_struct * vma,
        unsigned long start, int behavior)
        struct vm_area_struct * n;

        n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
        n->vm_start = start;
        n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
        setup_read_behavior(n, behavior);
        get_file(n->vm_file);
        if (n->vm_ops && n->vm_ops->open)

        lock_vma_mappings(vma);
        spin_lock(&vma->vm_mm->page_table_lock);
        vma->vm_end = start;
        __insert_vm_struct(current->mm, n);
        spin_unlock(&vma->vm_mm->page_table_lock);
        unlock_vma_mappings(vma);

static long madvise_fixup_middle(struct vm_area_struct * vma,
        unsigned long start, unsigned long end, int behavior)
        struct vm_area_struct * left, * right;

        left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
        right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
                kmem_cache_free(vm_area_cachep, left);

        left->vm_end = start;
        right->vm_start = end;
        right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;

        right->vm_raend = 0;
        atomic_add(2, &vma->vm_file->f_count);

        if (vma->vm_ops && vma->vm_ops->open) {
                vma->vm_ops->open(left);
                vma->vm_ops->open(right);

        lock_vma_mappings(vma);
        spin_lock(&vma->vm_mm->page_table_lock);
        vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
        vma->vm_start = start;
        setup_read_behavior(vma, behavior);
        __insert_vm_struct(current->mm, left);
        __insert_vm_struct(current->mm, right);
        spin_unlock(&vma->vm_mm->page_table_lock);
        unlock_vma_mappings(vma);
/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct * vma,
        unsigned long start, unsigned long end, int behavior)
        /* This caps the number of vma's this process can own */
        if (vma->vm_mm->map_count > MAX_MAP_COUNT)

        if (start == vma->vm_start) {
                if (end == vma->vm_end) {
                        setup_read_behavior(vma, behavior);
                        error = madvise_fixup_start(vma, end, behavior);
        if (end == vma->vm_end)
                error = madvise_fixup_end(vma, start, behavior);
                error = madvise_fixup_middle(vma, start, end, behavior);
/*
 * Schedule all required I/O operations, then run the disk queue
 * to make sure they are started.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct * vma,
        unsigned long start, unsigned long end)
        long error = -EBADF;
        unsigned long size, rlim_rss;

        /* Doesn't work if there's no mapped file. */
        file = vma->vm_file;
        size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
                                                        PAGE_CACHE_SHIFT;

        start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
        if (end > vma->vm_end)
        end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

        /* Make sure this doesn't exceed the process's max rss. */
        rlim_rss = current->rlim ?  current->rlim[RLIMIT_RSS].rlim_cur :
                                LONG_MAX; /* default: see resource.h */
        if ((vma->vm_mm->rss + (end - start)) > rlim_rss)

        /* round to cluster boundaries if this isn't a "random" area. */
        if (!VM_RandomReadHint(vma)) {
                start = CLUSTER_OFFSET(start);
                end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);

                while ((start < end) && (start < size)) {
                        error = read_cluster_nonblocking(file, start, size);
                        start += CLUSTER_PAGES;
                while ((start < end) && (start < size)) {
                        error = page_cache_read(file, start);

        /* Don't wait for someone else to push these requests. */
        run_task_queue(&tq_disk);
/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for refill_inactive to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * refill_inactive to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed(struct vm_area_struct * vma,
        unsigned long start, unsigned long end)
        if (vma->vm_flags & VM_LOCKED)

        flush_cache_range(vma->vm_mm, start, end);
        zap_page_range(vma->vm_mm, start, end - start);
        flush_tlb_range(vma->vm_mm, start, end);
static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
        unsigned long end, int behavior)
        long error = -EBADF;

        case MADV_SEQUENTIAL:
                error = madvise_behavior(vma, start, end, behavior);
                error = madvise_willneed(vma, start, end);
                error = madvise_dontneed(vma, start, end);
/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *              results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *              on any access, since it is unlikely that the appli-
 *              cation will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *              once, so they can be aggressively read ahead, and
 *              can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *              some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *              so the kernel can free resources associated with it.
 *
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *              "behavior" is not a valid value, or application
 *              is attempting to release locked or shared pages.
 *  -ENOMEM - addresses in the specified range are not currently
 *              mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
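
/*
 * Typical userspace usage (illustrative sketch only, not part of the kernel
 * source): tell the kernel that a large mapped file will be scanned once
 * from start to finish, so it can read ahead aggressively and drop pages
 * behind the reader.
 *
 *      #include <sys/mman.h>
 *      #include <sys/stat.h>
 *      #include <fcntl.h>
 *
 *      void *map_for_scan(const char *path, size_t *lenp)
 *      {
 *              struct stat st;
 *              void *p;
 *              int fd = open(path, O_RDONLY);
 *
 *              if (fd < 0 || fstat(fd, &st) < 0)
 *                      return NULL;
 *              p = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
 *              if (p != MAP_FAILED)
 *                      madvise(p, st.st_size, MADV_SEQUENTIAL);
 *              *lenp = st.st_size;
 *              return p == MAP_FAILED ? NULL : p;
 *      }
 */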
asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
        struct vm_area_struct * vma;
        int unmapped_error = 0;
        int error = -EINVAL;

        down(&current->mm->mmap_sem);

        if (start & ~PAGE_MASK)
        len = (len + ~PAGE_MASK) & PAGE_MASK;

        /*
         * If the interval [start,end) covers some unmapped address
         * ranges, just ignore them, but return -ENOMEM at the end.
         */
        vma = find_vma(current->mm, start);
                /* Still start < end. */
                /* Here start < vma->vm_end. */
                if (start < vma->vm_start) {
                        unmapped_error = -ENOMEM;
                        start = vma->vm_start;

                /* Here vma->vm_start <= start < vma->vm_end. */
                if (end <= vma->vm_end) {
                                error = madvise_vma(vma, start, end,
                                                    behavior);
                                error = unmapped_error;

                /* Here vma->vm_start <= start < vma->vm_end < end. */
                error = madvise_vma(vma, start, vma->vm_end, behavior);
                start = vma->vm_end;

        up(&current->mm->mmap_sem);
/*
 * Later we can get more picky about what "in core" means precisely.
 * For now, simply check to see if the page is in the page cache,
 * and is up to date; i.e. that no page-in operation would be required
 * at this time if an application were to map and access this page.
 */
static unsigned char mincore_page(struct vm_area_struct * vma,
        unsigned long pgoff)
        unsigned char present = 0;
        struct address_space * as = &vma->vm_file->f_dentry->d_inode->i_data;
        struct page * page, ** hash = page_hash(as, pgoff);

        spin_lock(&pagecache_lock);
        page = __find_page_nolock(as, pgoff, *hash);
        if ((page) && (Page_Uptodate(page)))
        spin_unlock(&pagecache_lock);
static long mincore_vma(struct vm_area_struct * vma,
        unsigned long start, unsigned long end, unsigned char * vec)
        long error, i, remaining;
        unsigned char * tmp;

        start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
        if (end > vma->vm_end)
        end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

        tmp = (unsigned char *) __get_free_page(GFP_KERNEL);

        /* (end - start) is # of pages, and also # of bytes in "vec" */
        remaining = (end - start),

        for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
                long thispiece = (remaining < PAGE_SIZE) ?
                                                remaining : PAGE_SIZE;

                while (j < thispiece)
                        tmp[j++] = mincore_page(vma, start++);

                if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {

        free_page((unsigned long) tmp);
/*
 * The mincore(2) system call.
 *
 * mincore() returns the memory residency status of the pages in the
 * current process's address space specified by [addr, addr + len).
 * The status is returned in a vector of bytes.  The least significant
 * bit of each byte is 1 if the referenced page is in memory, otherwise
 * it is zero.
 *
 * Because the status of a page can change after mincore() checks it
 * but before it returns to the application, the returned vector may
 * contain stale information.  Only locked pages are guaranteed to
 * remain in memory.
 *
 *  -EFAULT - vec points to an illegal address
 *  -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
 *              or len has a nonpositive value
 *  -ENOMEM - Addresses in the range [addr, addr + len] are
 *              invalid for the address space of this process, or
 *              specify one or more pages which are not currently
 *              mapped
 *  -EAGAIN - A kernel resource was temporarily unavailable.
 */
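
/*
 * Userspace view (illustrative sketch only, not part of the kernel source):
 * count how many pages of a mapping are currently resident.  The vector has
 * one byte per page of the range.
 *
 *      #include <sys/mman.h>
 *      #include <unistd.h>
 *      #include <stdlib.h>
 *
 *      long resident_pages(void *addr, size_t len)
 *      {
 *              long psize = sysconf(_SC_PAGESIZE);
 *              size_t pages = (len + psize - 1) / psize;
 *              unsigned char *vec = malloc(pages);
 *              long n = 0, i;
 *
 *              if (!vec)
 *                      return -1;
 *              if (mincore(addr, len, vec) < 0) {
 *                      free(vec);
 *                      return -1;
 *              }
 *              for (i = 0; i < pages; i++)
 *                      n += vec[i] & 1;
 *              free(vec);
 *              return n;
 *      }
 */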
asmlinkage long sys_mincore(unsigned long start, size_t len,
        unsigned char * vec)
        struct vm_area_struct * vma;
        int unmapped_error = 0;
        long error = -EINVAL;

        down(&current->mm->mmap_sem);

        if (start & ~PAGE_CACHE_MASK)
        len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK;

        /*
         * If the interval [start,end) covers some unmapped address
         * ranges, just ignore them, but return -ENOMEM at the end.
         */
        vma = find_vma(current->mm, start);
                /* Still start < end. */
                /* Here start < vma->vm_end. */
                if (start < vma->vm_start) {
                        unmapped_error = -ENOMEM;
                        start = vma->vm_start;

                /* Here vma->vm_start <= start < vma->vm_end. */
                if (end <= vma->vm_end) {
                                error = mincore_vma(vma, start, end,
                                                    &vec[index]);
                                error = unmapped_error;

                /* Here vma->vm_start <= start < vma->vm_end < end. */
                error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
                index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
                start = vma->vm_end;

        up(&current->mm->mmap_sem);
struct page *__read_cache_page(struct address_space *mapping,
                                unsigned long index,
                                int (*filler)(void *, struct page *),
                                void *data)
        struct page **hash = page_hash(mapping, index);
        struct page *page, *cached_page = NULL;

        page = __find_get_page(mapping, index, hash);
                cached_page = page_cache_alloc();
                        return ERR_PTR(-ENOMEM);
                if (add_to_page_cache_unique(page, mapping, index, hash))
                err = filler(data, page);
                        page_cache_release(page);
                        page = ERR_PTR(err);
        page_cache_free(cached_page);
/*
 * Read into the page cache.  If a page already exists,
 * and Page_Uptodate() is not set, try to fill the page.
 */
struct page *read_cache_page(struct address_space *mapping,
                                unsigned long index,
                                int (*filler)(void *, struct page *),
                                void *data)
        page = __read_cache_page(mapping, index, filler, data);
        if (IS_ERR(page) || Page_Uptodate(page))

        if (!page->mapping) {
                page_cache_release(page);
        if (Page_Uptodate(page)) {

        err = filler(data, page);
                page_cache_release(page);
                page = ERR_PTR(err);
* __grab_cache_page(struct address_space
*mapping
,
2382 unsigned long index
, struct page
**cached_page
)
2384 struct page
*page
, **hash
= page_hash(mapping
, index
);
2386 page
= __find_lock_page(mapping
, index
, hash
);
2388 if (!*cached_page
) {
2389 *cached_page
= page_cache_alloc();
2393 page
= *cached_page
;
2394 if (add_to_page_cache_unique(page
, mapping
, index
, hash
))
2396 *cached_page
= NULL
;
2402 * Returns locked page at given index in given cache, creating it if needed.
2405 struct page
*grab_cache_page(struct address_space
*mapping
, unsigned long index
)
2407 struct page
*cached_page
= NULL
;
2408 struct page
*page
= __grab_cache_page(mapping
,index
,&cached_page
);
2410 page_cache_free(cached_page
);
static inline void remove_suid(struct inode *inode)
        /* set S_ISGID if S_IXGRP is set, and always set S_ISUID */
        mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;

        /* were any of the uid bits set? */
        mode &= inode->i_mode;
        if (mode && !capable(CAP_FSETID)) {
                inode->i_mode &= ~mode;
                mark_inode_dirty(inode);
/*
 * Write to a file through the page cache.
 *
 * We currently put everything into the page cache prior to writing it.
 * This is not a problem when writing full pages. With partial pages,
 * however, we first have to read the data into the cache, then
 * dirty the page, and finally schedule it for writing. Alternatively, we
 * could write-through just the portion of data that would go into that
 * page, but that would kill performance for applications that write data
 * line by line, and it's prone to race conditions.
 *
 * Note that this routine doesn't try to keep track of dirty pages. Each
 * file system has to do this all by itself, unfortunately.
 */
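
/*
 * Sketch of the per-page step performed by the write loop below (error
 * handling, the short-copy case and the unlocking details are omitted; this
 * is only an illustration of the address_space_operations contract):
 *
 *      page = __grab_cache_page(mapping, index, &cached_page);
 *      mapping->a_ops->prepare_write(file, page, offset, offset + bytes);
 *      copy_from_user(page_address(page) + offset, buf, bytes);
 *      flush_dcache_page(page);
 *      mapping->a_ops->commit_write(file, page, offset, offset + bytes);
 *      page_cache_release(page);
 */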
ssize_t
generic_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
        struct inode *inode = file->f_dentry->d_inode;
        struct address_space *mapping = inode->i_mapping;
        unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
        struct page *page, *cached_page;
        unsigned long written;

        down(&inode->i_sem);

        err = file->f_error;

        if (file->f_flags & O_APPEND)
                pos = inode->i_size;

        /*
         * Check whether we've reached the file size limit.
         */
        if (limit != RLIM_INFINITY) {
                        send_sig(SIGXFSZ, current, 0);
                if (count > limit - pos) {
                        send_sig(SIGXFSZ, current, 0);
                        count = limit - pos;

        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
        mark_inode_dirty_sync(inode);

                unsigned long bytes, index, offset;

                /*
                 * Try to find the page in the cache. If it isn't there,
                 * allocate a free page.
                 */
                offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
                index = pos >> PAGE_CACHE_SHIFT;
                bytes = PAGE_CACHE_SIZE - offset;

                /*
                 * Bring in the user page that we will copy from _first_.
                 * Otherwise there's a nasty deadlock on copying from the
                 * same page as we're writing to, without it being marked
                 * up-to-date.
                 */
                { volatile unsigned char dummy;
                        __get_user(dummy, buf);
                        __get_user(dummy, buf+bytes-1);

                status = -ENOMEM;       /* we'll assign it later anyway */
                page = __grab_cache_page(mapping, index, &cached_page);

                /* We have exclusive IO access to the page.. */
                if (!PageLocked(page)) {

                status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
                kaddr = page_address(page);
                status = copy_from_user(kaddr+offset, buf, bytes);
                flush_dcache_page(page);
                status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);

                /* Mark it unlocked again and drop the page.. */
                deactivate_page(page);
                page_cache_release(page);

        page_cache_free(cached_page);

        /* For now, when the user asks for O_SYNC, we'll actually
         * provide O_DSYNC. */
        if ((status >= 0) && (file->f_flags & O_SYNC))
                status = generic_osync_inode(inode, 1); /* 1 means datasync */

        err = written ? written : status;

        ClearPageUptodate(page);
void __init page_cache_init(unsigned long mempages)
        unsigned long htable_size, order;

        htable_size = mempages;
        htable_size *= sizeof(struct page *);
        for(order = 0; (PAGE_SIZE << order) < htable_size; order++)

                unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);

                while((tmp >>= 1UL) != 0UL)

                page_hash_table = (struct page **)
                        __get_free_pages(GFP_ATOMIC, order);
        } while(page_hash_table == NULL && --order > 0);

        printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
               (1 << page_hash_bits), order, (PAGE_SIZE << order));
        if (!page_hash_table)
                panic("Failed to allocate page hash table\n");
        memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));