/*
 * Copyright (C) 1994, 1995  Linus Torvalds
 *
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/malloc.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/locks.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/swapctl.h>
#include <linux/slab.h>

#include <asm/pgtable.h>
#include <asm/uaccess.h>
/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 */
unsigned long page_cache_size = 0;
struct page * page_hash_table[PAGE_HASH_SIZE];
/*
 * Define a request structure for outstanding page write requests
 * to the background page io daemon
 */
struct pio_request {
	struct pio_request *	next;
	struct file *		file;
	unsigned long		offset;
	unsigned long		page;
};
static struct pio_request *pio_first = NULL, **pio_last = &pio_first;
static kmem_cache_t *pio_request_cache;
static struct wait_queue *pio_wait = NULL;
static inline void make_pio_request(struct file *, unsigned long, unsigned long);
/*
 * Invalidate the pages of an inode, removing all pages that aren't
 * locked down (those are sure to be up-to-date anyway, so we shouldn't
 * invalidate them).
 */
void invalidate_inode_pages(struct inode * inode)
{
	struct page ** p = &inode->i_pages;
	struct page * page;

	while ((page = *p) != NULL) {
		if (PageLocked(page)) {
			p = &page->next;
			continue;
		}
		/* Unlink the page from the inode's page list and from the
		 * hash queue, then drop the cache reference. */
		if ((*p = page->next) != NULL)
			(*p)->prev = page->prev;
		remove_page_from_hash_queue(page);
		page_cache_release(page);
	}
}
/*
 * Truncate the page cache at a set offset, removing the pages
 * that are beyond that offset (and zeroing out partial pages).
 */
void truncate_inode_pages(struct inode * inode, unsigned long start)
{
	struct page ** p;
	struct page * page;

repeat:
	p = &inode->i_pages;
	while ((page = *p) != NULL) {
		unsigned long offset = page->offset;

		/* page wholly truncated - free it */
		if (offset >= start) {
			if (PageLocked(page)) {
				wait_on_page(page);
				goto repeat;
			}
			if ((*p = page->next) != NULL)
				(*p)->prev = page->prev;
			remove_page_from_hash_queue(page);
			page_cache_release(page);
			continue;
		}
		p = &page->next;
		offset = start - offset;
		/* partial truncate, clear end of page */
		if (offset < PAGE_CACHE_SIZE) {
			unsigned long address = page_address(page);
			memset((void *) (offset + address), 0, PAGE_CACHE_SIZE - offset);
			flush_page_to_ram(address);
		}
	}
}
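/*
 * A worked example of the partial-truncate arithmetic above (illustrative,
 * assuming a 4K PAGE_CACHE_SIZE): truncating at start = 0x1800 leaves the
 * page at offset 0x1000 in place, computes offset = 0x1800 - 0x1000 = 0x800,
 * and the memset() then zeroes the last 0x800 bytes of that page, i.e. the
 * part of the page that now lies beyond the new end of file.
 */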
/*
 * Remove a page from the page cache and free it.
 */
void remove_inode_page(struct page *page)
{
	remove_page_from_hash_queue(page);
	remove_page_from_inode_queue(page);
	page_cache_release(page);
}
int shrink_mmap(int priority, int gfp_mask)
{
	static unsigned long clock = 0;
	unsigned long limit = num_physpages;
	struct page * page;
	int count;

	count = limit >> priority;

	page = mem_map + clock;
	do {
		int referenced;

		/* This works even in the presence of PageSkip because
		 * the first two entries at the beginning of a hole will
		 * be marked, not just the first.
		 */
		page++;
		clock++;
		if (clock >= max_mapnr) {
			clock = 0;
			page = mem_map;
		}
		if (PageSkip(page)) {
			/* next_hash is overloaded for PageSkip */
			page = page->next_hash;
			clock = page - mem_map;
		}

		count--;
		referenced = test_and_clear_bit(PG_referenced, &page->flags);

		if (PageLocked(page))
			continue;

		if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
			continue;

		/* We can't free pages unless there's just one user */
		if (atomic_read(&page->count) != 1)
			continue;

		/*
		 * Is it a page swap page? If so, we want to
		 * drop it if it is no longer used, even if it
		 * were to be marked referenced..
		 */
		if (PageSwapCache(page)) {
			if (referenced && swap_count(page->offset) != 1)
				continue;
			delete_from_swap_cache(page);
			return 1;
		}

		if (referenced)
			continue;

		/* Is it a buffer page? */
		if (page->buffers) {
			if (buffer_under_min())
				continue;
			if (!try_to_free_buffers(page))
				continue;
			return 1;
		}

		/* is it a page-cache page? */
		if (page->inode) {
			if (pgcache_under_min())
				continue;
			remove_inode_page(page);
			return 1;
		}
	} while (count > 0);
	return 0;
}
/*
 * Update a page cache copy, when we're doing a "write()" system call
 * See also "update_vm_cache()".
 */
void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count)
{
	unsigned long offset, len;

	offset = (pos & ~PAGE_CACHE_MASK);
	pos = pos & PAGE_CACHE_MASK;
	len = PAGE_CACHE_SIZE - offset;
	do {
		struct page * page;

		if (len > count)
			len = count;
		page = find_page(inode, pos);
		if (page) {
			wait_on_page(page);
			memcpy((void *) (offset + page_address(page)), buf, len);
			page_cache_release(page);
		}
		count -= len;
		buf += len;
		len = PAGE_CACHE_SIZE;
		offset = 0;
		pos += PAGE_CACHE_SIZE;
	} while (count);
}
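/*
 * A worked example of the chunking above (illustrative, 4K PAGE_CACHE_SIZE):
 * a write of 200 bytes at pos 0x1f80 gives offset = 0xf80 and len = 0x80,
 * so the first iteration copies 128 bytes into the tail of the page at
 * 0x1000; the remaining 72 bytes are then copied to offset 0 of the page
 * at 0x2000 on the next iteration.
 */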
static inline void add_to_page_cache(struct page * page,
	struct inode * inode, unsigned long offset,
	struct page **hash)
{
	atomic_inc(&page->count);
	page->flags = (page->flags & ~((1 << PG_uptodate) | (1 << PG_error))) | (1 << PG_referenced);
	page->offset = offset;
	add_page_to_inode_queue(inode, page);
	__add_page_to_hash_queue(page, hash);
}
/*
 * Try to read ahead in the file. "page_cache" is a potentially free page
 * that we could use for the cache (if it is 0 we can try to create one,
 * this is all overlapped with the IO on the previous page finishing anyway)
 */
static unsigned long try_to_read_ahead(struct file * file,
				unsigned long offset, unsigned long page_cache)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct page * page;
	struct page ** hash;

	offset &= PAGE_CACHE_MASK;
	switch (page_cache) {
	case 0:
		page_cache = page_cache_alloc();
		if (!page_cache)
			break;
	default:
		if (offset >= inode->i_size)
			break;
		hash = page_hash(inode, offset);
		page = __find_page(inode, offset, *hash);
		if (!page) {
			/*
			 * Ok, add the new page to the hash-queues...
			 */
			page = page_cache_entry(page_cache);
			add_to_page_cache(page, inode, offset, hash);
			inode->i_op->readpage(file, page);
			page_cache = 0;
		}
		page_cache_release(page);
	}
	return page_cache;
}
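/*
 * Note on the "page_cache" convention used above (descriptive): callers pass
 * in a spare page (or 0) and get back either that same spare page, if it was
 * not needed, or 0, if it was consumed by add_to_page_cache() or could not be
 * allocated at all.  This lets a caller allocate once and thread the spare
 * page through several read-ahead attempts, freeing whatever is left over.
 */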
/*
 * Wait for IO to complete on a locked page.
 *
 * This must be called with the caller "holding" the page,
 * ie with increased "page->count" so that the page won't
 * go away during the wait..
 */
void __wait_on_page(struct page *page)
{
	struct task_struct *tsk = current;
	struct wait_queue wait;

	wait.task = tsk;
	add_wait_queue(&page->wait, &wait);
repeat:
	tsk->state = TASK_UNINTERRUPTIBLE;
	run_task_queue(&tq_disk);
	if (PageLocked(page)) {
		schedule();
		goto repeat;
	}
	tsk->state = TASK_RUNNING;
	remove_wait_queue(&page->wait, &wait);
}
#define PROFILE_READAHEAD
#define DEBUG_READAHEAD

/*
 * Read-ahead profiling information
 * --------------------------------
 * Every PROFILE_MAXREADCOUNT, the following information is written
 * to the syslog:
 *   Percentage of asynchronous read-ahead.
 *   Average of read-ahead fields context value.
 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
 * to the syslog.
 */

#ifdef PROFILE_READAHEAD

#define PROFILE_MAXREADCOUNT 1000

static unsigned long total_reada;
static unsigned long total_async;
static unsigned long total_ramax;
static unsigned long total_ralen;
static unsigned long total_rawin;

static void profile_readahead(int async, struct file *filp)
{
	unsigned long flags;

	++total_reada;
	if (async)
		++total_async;

	total_ramax += filp->f_ramax;
	total_ralen += filp->f_ralen;
	total_rawin += filp->f_rawin;

	if (total_reada > PROFILE_MAXREADCOUNT) {
		save_flags(flags);
		cli();
		if (!(total_reada > PROFILE_MAXREADCOUNT)) {
			restore_flags(flags);
			return;
		}

		printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
			total_ramax/total_reada,
			total_ralen/total_reada,
			total_rawin/total_reada,
			(total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
		printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
			filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
#endif

		total_reada = 0;
		total_async = 0;
		total_ramax = 0;
		total_ralen = 0;
		total_rawin = 0;

		restore_flags(flags);
	}
}
#endif  /* defined PROFILE_READAHEAD */
/*
 * Read-ahead context:
 * -------------------
 * The read-ahead context fields of the "struct file" are the following:
 * - f_raend : position of the first byte after the last page we tried to
 *	       read ahead.
 * - f_ramax : current read-ahead maximum size.
 * - f_ralen : length of the current IO read block we tried to read-ahead.
 * - f_rawin : length of the current read-ahead window.
 *	       if the last read-ahead was synchronous then
 *		       f_rawin = f_ralen
 *	       otherwise (it was asynchronous)
 *		       f_rawin = previous value of f_ralen + f_ralen
 *
 * Read-ahead limits:
 * ------------------
 * MIN_READAHEAD   : minimum read-ahead size when reading ahead.
 * MAX_READAHEAD   : maximum read-ahead size when reading ahead.
 *
 * Synchronous read-ahead benefits:
 * --------------------------------
 * Using a reasonable IO transfer length from peripheral devices increases
 * system performance.
 * Reasonable means, in this context, not too large but not too small.
 * The actual maximum value is:
 *	MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined,
 *	and 32k if it is defined (4K page size assumed).
 *
 * Asynchronous read-ahead benefits:
 * ---------------------------------
 * Overlapping the next read request with user process execution increases
 * system performance.
 *
 * Read-ahead risks:
 * -----------------
 * We have to guess which further data are needed by the user process.
 * If these data are often not really needed, it's bad for system
 * performance.
 * However, we know that files are often accessed sequentially by
 * application programs, so a good guessing strategy seems possible.
 * We only try to read ahead in files that seem to be read sequentially.
 *
 * Asynchronous read-ahead risks:
 * ------------------------------
 * In order to maximize overlapping, we must start some asynchronous read
 * request from the device as soon as possible.
 * We must be very careful about:
 * - The number of effective pending IO read requests.
 *   ONE seems to be the only reasonable value.
 * - The total memory pool usage for the file access stream.
 *   This maximum memory usage is implicitly 2 IO read chunks:
 *   2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
 *   64k if defined (4K page size assumed).
 */
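/*
 * An illustrative walk-through of the fields above (4K pages assumed):
 * after a first, synchronous read-ahead of 8k starting at offset 0 we have
 * f_ralen = 8k, f_rawin = 8k and f_raend = 8k.  If the next read lands
 * inside that window and triggers an asynchronous read-ahead of a further
 * 8k, the new f_ralen is 8k, f_rawin becomes the previous f_ralen plus the
 * new one (16k), and f_raend moves on to 16k.
 */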
static inline int get_max_readahead(struct inode * inode)
{
	if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
		return MAX_READAHEAD;
	return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
}
static inline unsigned long generic_file_readahead(int reada_ok,
	struct file * filp, struct inode * inode,
	unsigned long ppos, struct page * page, unsigned long page_cache)
{
	unsigned long max_ahead, ahead;
	unsigned long raend;
	int max_readahead = get_max_readahead(inode);

	raend = filp->f_raend & PAGE_CACHE_MASK;
	max_ahead = 0;

/*
 * The current page is locked.
 * If the current position is inside the previous read IO request, do not
 * try to reread previously read ahead pages.
 * Otherwise decide whether to read ahead some pages synchronously.
 * If we are not going to read ahead, set the read ahead context for this
 * page only.
 */
	if (PageLocked(page)) {
		if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
			raend = ppos;
			if (raend < inode->i_size)
				max_ahead = filp->f_ramax;
			filp->f_rawin = 0;
			filp->f_ralen = PAGE_CACHE_SIZE;
			if (!max_ahead) {
				filp->f_raend  = ppos + filp->f_ralen;
				filp->f_rawin += filp->f_ralen;
			}
		}
	}
/*
 * The current page is not locked.
 * If we were reading ahead and,
 * if the current max read ahead size is not zero and,
 * if the current position is inside the last read-ahead IO request,
 *   it is the moment to try to read ahead asynchronously.
 * We will later force an unplug of the device in order to force asynchronous read IO.
 */
	else if (reada_ok && filp->f_ramax && raend >= PAGE_CACHE_SIZE &&
		 ppos <= raend && ppos + filp->f_ralen >= raend) {
/*
 * Add ONE page to max_ahead in order to try to have about the same IO max size
 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
 * Compute the position of the last page we have tried to read in order to
 * begin to read ahead just at the next page.
 */
		raend -= PAGE_CACHE_SIZE;
		if (raend < inode->i_size)
			max_ahead = filp->f_ramax + PAGE_CACHE_SIZE;

		if (max_ahead) {
			filp->f_rawin = filp->f_ralen;
			filp->f_ralen = 0;
			reada_ok      = 2;
		}
	}
/*
 * Try to read ahead pages.
 * We hope that ll_rw_blk() plug/unplug, request coalescing and sorting,
 * and the scheduler will do a good enough job to keep the actual IO
 * requests from turning out too badly.
 */
	ahead = 0;
	while (ahead < max_ahead) {
		ahead += PAGE_CACHE_SIZE;
		page_cache = try_to_read_ahead(filp, raend + ahead,
						page_cache);
	}
/*
 * If we tried to read ahead some pages,
 * If we tried to read ahead asynchronously,
 *   Try to force an unplug of the device in order to start an asynchronous
 *   read IO request.
 * Update the read-ahead context.
 * Store the length of the current read-ahead window.
 * Double the current max read ahead size.
 *   This heuristic avoids doing large IO for files that are not really
 *   accessed sequentially.
 */
	if (ahead) {
		if (reada_ok == 2)
			run_task_queue(&tq_disk);

		filp->f_ralen += ahead;
		filp->f_rawin += filp->f_ralen;
		filp->f_raend = raend + ahead + PAGE_CACHE_SIZE;

		filp->f_ramax += filp->f_ramax;

		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;

#ifdef PROFILE_READAHEAD
		profile_readahead((reada_ok == 2), filp);
#endif
	}

	return page_cache;
}
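/*
 * Illustrative note on the doubling above: f_ramax doubles on every
 * successful read-ahead (e.g. 8k -> 16k -> 32k with 4K pages) until it is
 * clipped to the per-device max_readahead value, so only files that keep
 * being read sequentially ever reach the large IO sizes.
 */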
/*
 * "descriptor" for what we're up to with a read.
 * This allows us to use the same read code yet
 * have multiple different users of the data that
 * we read from a file.
 *
 * The simplest case just copies the data to user
 * mode.
 */
typedef struct {
	size_t written;
	size_t count;
	char * buf;
	int error;
} read_descriptor_t;

typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long);
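/*
 * A minimal sketch of an additional actor, purely to illustrate the calling
 * convention -- it is not used anywhere in this file and the name is
 * hypothetical.  An actor consumes up to "size" bytes from "area", updates
 * the descriptor, and returns how many bytes it actually used.
 */
#if 0
static int count_only_actor(read_descriptor_t * desc, const char *area, unsigned long size)
{
	unsigned long count = desc->count;

	/* Pretend to consume the data without copying it anywhere. */
	if (size > count)
		size = count;
	desc->count = count - size;
	desc->written += size;
	return size;
}
#endif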
/*
 * This is a generic file read routine, and uses the
 * inode->i_op->readpage() function for the actual low-level
 * stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 */
static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
{
	struct dentry *dentry = filp->f_dentry;
	struct inode *inode = dentry->d_inode;
	size_t pos, pgpos, page_cache;
	int reada_ok;
	int max_readahead = get_max_readahead(inode);

	page_cache = 0;

	pos = *ppos;
	pgpos = pos & PAGE_CACHE_MASK;
/*
 * If the current position is outside the previous read-ahead window,
 * we reset the current read-ahead context and set read ahead max to zero
 * (will be set to just needed value later),
 * otherwise, we assume that the file accesses are sequential enough to
 * continue read-ahead.
 */
	if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) {
		reada_ok = 0;
		filp->f_raend = 0;
		filp->f_ralen = 0;
		filp->f_ramax = 0;
		filp->f_rawin = 0;
	} else {
		reada_ok = 1;
	}
/*
 * Adjust the current value of read-ahead max.
 * If the read operation stays in the first half page, force no readahead.
 * Otherwise try to increase read ahead max just enough to do the read request.
 * Then, at least MIN_READAHEAD if read ahead is ok,
 * and at most MAX_READAHEAD in all cases.
 */
	if (pos + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
		filp->f_ramax = 0;
	} else {
		unsigned long needed;

		needed = ((pos + desc->count) & PAGE_CACHE_MASK) - pgpos;

		if (filp->f_ramax < needed)
			filp->f_ramax = needed;

		if (reada_ok && filp->f_ramax < MIN_READAHEAD)
			filp->f_ramax = MIN_READAHEAD;
		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;
	}

	for (;;) {
		struct page *page, **hash;

		if (pos >= inode->i_size)
			break;

		/*
		 * Try to find the data in the page cache..
		 */
		hash = page_hash(inode, pos & PAGE_CACHE_MASK);
		page = __find_page(inode, pos & PAGE_CACHE_MASK, *hash);
		if (!page)
			goto no_cached_page;

found_page:
/*
 * Try to read ahead only if the current page is filled or being filled.
 * Otherwise, if we were reading ahead, decrease max read ahead size to
 * the minimum value.
 * In this context, that can only happen on some read error or if
 * the page has been rewritten.
 */
		if (PageUptodate(page) || PageLocked(page))
			page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
		else if (reada_ok && filp->f_ramax > MIN_READAHEAD)
			filp->f_ramax = MIN_READAHEAD;

		wait_on_page(page);

		if (!PageUptodate(page))
			goto page_read_error;

page_ok:
		/*
		 * Ok, we have the page, it's up-to-date and ok,
		 * so now we can finally copy it to user space...
		 */
	{
		unsigned long offset, nr;

		offset = pos & ~PAGE_CACHE_MASK;
		nr = PAGE_CACHE_SIZE - offset;
		if (nr > inode->i_size - pos)
			nr = inode->i_size - pos;

		/*
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		nr = actor(desc, (const char *) (page_address(page) + offset), nr);
		pos += nr;
		page_cache_release(page);
		if (nr && desc->count)
			continue;
		break;
	}

no_cached_page:
		/*
		 * Ok, it wasn't cached, so we need to create a new
		 * page..
		 */
		if (!page_cache) {
			page_cache = page_cache_alloc();
			/*
			 * That could have slept, so go around to the
			 * very beginning of the loop.
			 */
			if (page_cache)
				continue;
			desc->error = -ENOMEM;
			break;
		}

		/*
		 * Ok, add the new page to the hash-queues...
		 */
		page = page_cache_entry(page_cache);
		page_cache = 0;
		add_to_page_cache(page, inode, pos & PAGE_CACHE_MASK, hash);

		/*
		 * Error handling is tricky. If we get a read error,
		 * the cached page stays in the cache (but uptodate=0),
		 * and the next process that accesses it will try to
		 * re-read it. This is needed for NFS etc, where the
		 * identity of the reader can decide if we can read the
		 * page or not..
		 */
/*
 * We have to read the page.
 * If we were reading ahead, we had previously tried to read this page,
 * which means that the page has probably been removed from the cache before
 * the application process needs it, or has been rewritten.
 * Decrease max readahead size to the minimum value in that situation.
 */
		if (reada_ok && filp->f_ramax > MIN_READAHEAD)
			filp->f_ramax = MIN_READAHEAD;

		{
			int error = inode->i_op->readpage(filp, page);
			if (!error)
				goto found_page;
			desc->error = error;
			page_cache_release(page);
			break;
		}

page_read_error:
		/*
		 * We found the page, but it wasn't up-to-date.
		 * Try to re-read it _once_. We do this synchronously,
		 * because this happens only if there were errors.
		 */
		{
			int error = inode->i_op->readpage(filp, page);
			if (!error) {
				wait_on_page(page);
				if (PageUptodate(page) && !PageError(page))
					goto page_ok;
				error = -EIO; /* Some unspecified error occurred.. */
			}
			desc->error = error;
			page_cache_release(page);
			break;
		}
	}

	*ppos = pos;
	filp->f_reada = 1;
	if (page_cache)
		page_cache_free(page_cache);
	UPDATE_ATIME(inode);
}
static int file_read_actor(read_descriptor_t * desc, const char *area, unsigned long size)
{
	unsigned long left;
	unsigned long count = desc->count;

	if (size > count)
		size = count;
	left = __copy_to_user(desc->buf, area, size);
	if (left) {
		size -= left;
		desc->error = -EFAULT;
	}
	desc->count = count - size;
	desc->written += size;
	desc->buf += size;
	return size;
}
/*
 * This is the "read()" routine for all filesystems
 * that can use the page cache directly.
 */
ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
{
	ssize_t retval;

	retval = -EFAULT;
	if (access_ok(VERIFY_WRITE, buf, count)) {
		retval = 0;
		if (count) {
			read_descriptor_t desc;

			desc.written = 0;
			desc.count = count;
			desc.buf = buf;
			desc.error = 0;
			do_generic_file_read(filp, ppos, &desc, file_read_actor);

			retval = desc.written;
			if (!retval)
				retval = desc.error;
		}
	}
	return retval;
}
static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned long size)
{
	ssize_t written;
	unsigned long count = desc->count;
	struct file *file = (struct file *) desc->buf;
	struct inode *inode = file->f_dentry->d_inode;
	mm_segment_t old_fs;

	if (size > count)
		size = count;
	down(&inode->i_sem);
	old_fs = get_fs();
	set_fs(KERNEL_DS);
	written = file->f_op->write(file, area, size, &file->f_pos);
	set_fs(old_fs);
	up(&inode->i_sem);
	if (written < 0) {
		desc->error = written;
		written = 0;
	}
	desc->count = count - written;
	desc->written += written;
	return written;
}
asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
{
	ssize_t retval;
	struct file * in_file, * out_file;
	struct inode * in_inode, * out_inode;

	/*
	 * Get input file, and verify that it is ok..
	 */
	retval = -EBADF;
	in_file = fget(in_fd);
	if (!in_file)
		goto out;
	if (!(in_file->f_mode & FMODE_READ))
		goto fput_in;
	retval = -EINVAL;
	in_inode = in_file->f_dentry->d_inode;
	if (!in_inode)
		goto fput_in;
	if (!in_inode->i_op || !in_inode->i_op->readpage)
		goto fput_in;
	retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
	if (retval)
		goto fput_in;

	/*
	 * Get output file, and verify that it is ok..
	 */
	retval = -EBADF;
	out_file = fget(out_fd);
	if (!out_file)
		goto fput_in;
	if (!(out_file->f_mode & FMODE_WRITE))
		goto fput_out;
	retval = -EINVAL;
	if (!out_file->f_op || !out_file->f_op->write)
		goto fput_out;
	out_inode = out_file->f_dentry->d_inode;
	if (!out_inode)
		goto fput_out;
	retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
	if (retval)
		goto fput_out;

	retval = 0;
	if (count) {
		read_descriptor_t desc;
		loff_t pos = 0, *ppos;

		retval = -EFAULT;
		ppos = &in_file->f_pos;
		if (offset) {
			if (get_user(pos, offset))
				goto fput_out;
			ppos = &pos;
		}

		desc.written = 0;
		desc.count = count;
		desc.buf = (char *) out_file;
		desc.error = 0;
		do_generic_file_read(in_file, ppos, &desc, file_send_actor);

		retval = desc.written;
		if (!retval)
			retval = desc.error;
		if (offset)
			put_user(pos, offset);
	}

fput_out:
	fput(out_file);
fput_in:
	fput(in_file);
out:
	return retval;
}
/*
 * Semantics for shared and private memory areas are different past the end
 * of the file. A shared mapping past the last page of the file is an error
 * and results in a SIGBUS, while a private mapping just maps in a zero page.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 *
 * WSH 06/04/97: fixed a memory leak and moved the allocation of new_page
 * ahead of the wait if we're sure to need it.
 */
static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
{
	struct file * file = area->vm_file;
	struct dentry * dentry = file->f_dentry;
	struct inode * inode = dentry->d_inode;
	unsigned long offset, reada, i;
	struct page * page, **hash;
	unsigned long old_page, new_page;

	new_page = 0;
	offset = (address - area->vm_start + area->vm_offset) & PAGE_MASK;
	if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
		goto no_page;

	/*
	 * Do we have something in the page cache already?
	 */
	hash = page_hash(inode, offset);
	page = __find_page(inode, offset, *hash);
	if (!page)
		goto no_cached_page;

found_page:
	/*
	 * Ok, found a page in the page cache, now we need to check
	 * that it's up-to-date. First check whether we'll need an
	 * extra page -- better to overlap the allocation with the I/O.
	 */
	if (no_share && !new_page) {
		new_page = page_cache_alloc();
		if (!new_page)
			goto failure;
	}

	if (PageLocked(page))
		goto page_locked_wait;
	if (!PageUptodate(page))
		goto page_read_error;

success:
	/*
	 * Found the page, need to check sharing and possibly
	 * copy it over to another page..
	 */
	old_page = page_address(page);
	if (!no_share) {
		/*
		 * Ok, we can share the cached page directly.. Get rid
		 * of any potential extra pages.
		 */
		if (new_page)
			page_cache_free(new_page);

		flush_page_to_ram(old_page);
		return old_page;
	}

	/*
	 * No sharing ... copy to the new page.
	 */
	copy_page(new_page, old_page);
	flush_page_to_ram(new_page);
	page_cache_release(page);
	return new_page;

no_cached_page:
	/*
	 * Try to read in an entire cluster at once.
	 */
	reada   = offset;
	reada >>= PAGE_CACHE_SHIFT + page_cluster;
	reada <<= PAGE_CACHE_SHIFT + page_cluster;

	for (i = 1 << page_cluster; i > 0; --i, reada += PAGE_CACHE_SIZE)
		new_page = try_to_read_ahead(file, reada, new_page);

	if (!new_page)
		new_page = page_cache_alloc();
	if (!new_page)
		goto no_page;

	/*
	 * During getting the above page we might have slept,
	 * so we need to re-check the situation with the page
	 * cache.. The page we just got may be useful if we
	 * can't share, so don't get rid of it here.
	 */
	page = find_page(inode, offset);
	if (page)
		goto found_page;

	/*
	 * Now, create a new page-cache page from the page we got
	 */
	page = page_cache_entry(new_page);
	new_page = 0;
	add_to_page_cache(page, inode, offset, hash);

	if (inode->i_op->readpage(file, page) != 0)
		goto failure;

	goto found_page;

page_locked_wait:
	__wait_on_page(page);
	if (PageUptodate(page))
		goto success;

page_read_error:
	/*
	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	 */
	if (inode->i_op->readpage(file, page) != 0)
		goto failure;
	wait_on_page(page);
	if (PageError(page))
		goto failure;
	if (PageUptodate(page))
		goto success;

	/*
	 * Things didn't work out. Return zero to tell the
	 * mm layer so, possibly freeing the page cache page first.
	 */
failure:
	page_cache_release(page);
	if (new_page)
		page_cache_free(new_page);
no_page:
	return 0;
}
/*
 * Tries to write a shared mapped page to its backing store. May return -EIO
 * if the disk is full.
 */
static inline int do_write_page(struct inode * inode, struct file * file,
	const char * page, unsigned long offset)
{
	int retval;
	unsigned long size;
	loff_t loff = offset;
	mm_segment_t old_fs;

	size = offset + PAGE_SIZE;
	/* refuse to extend file size.. */
	if (S_ISREG(inode->i_mode)) {
		if (size > inode->i_size)
			size = inode->i_size;
		/* Ho humm.. We should have tested for this earlier */
		if (size < offset)
			return -EIO;
	}
	size -= offset;
	old_fs = get_fs();
	set_fs(KERNEL_DS);
	retval = -EIO;
	if (size == file->f_op->write(file, (const char *) page, size, &loff))
		retval = 0;
	set_fs(old_fs);
	return retval;
}
static int filemap_write_page(struct vm_area_struct * vma,
	unsigned long offset,
	unsigned long page,
	int wait)
{
	int result;
	struct file * file;
	struct dentry * dentry;
	struct inode * inode;

	file = vma->vm_file;
	dentry = file->f_dentry;
	inode = dentry->d_inode;
	if (!file->f_op->write)
		return -EIO;

	/*
	 * If a task terminates while we're swapping the page, the vma and
	 * file could be released ... increment the count to be safe.
	 */
	file->f_count++;

	/*
	 * If this is a swapping operation rather than msync(), then
	 * leave the actual IO, and the restoration of the file count,
	 * to the kpiod thread.  Just queue the request for now.
	 */
	if (!wait) {
		make_pio_request(file, offset, page);
		return 0;
	}

	down(&inode->i_sem);
	result = do_write_page(inode, file, (const char *) page, offset);
	up(&inode->i_sem);
	fput(file);
	return result;
}
/*
 * The page cache takes care of races between somebody
 * trying to swap something out and swap something in
 * at the same time..
 */
int filemap_swapout(struct vm_area_struct * vma, struct page * page)
{
	return filemap_write_page(vma, page->offset, page_address(page), 0);
}
static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
	unsigned long address, unsigned int flags)
{
	pte_t pte = *ptep;
	unsigned long page;
	int error;

	if (!(flags & MS_INVALIDATE)) {
		if (!pte_present(pte))
			return 0;
		if (!pte_dirty(pte))
			return 0;
		flush_page_to_ram(pte_page(pte));
		flush_cache_page(vma, address);
		set_pte(ptep, pte_mkclean(pte));
		flush_tlb_page(vma, address);
		page = pte_page(pte);
		atomic_inc(&page_cache_entry(page)->count);
	} else {
		if (pte_none(pte))
			return 0;
		flush_cache_page(vma, address);
		pte_clear(ptep);
		flush_tlb_page(vma, address);
		if (!pte_present(pte)) {
			swap_free(pte_val(pte));
			return 0;
		}
		page = pte_page(pte);
		if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
			page_cache_free(page);
			return 0;
		}
	}
	error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page, 1);
	page_cache_free(page);
	return error;
}
static inline int filemap_sync_pte_range(pmd_t * pmd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
{
	pte_t * pte;
	unsigned long end;
	int error;

	if (pmd_none(*pmd))
		return 0;
	if (pmd_bad(*pmd)) {
		printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
		pmd_clear(pmd);
		return 0;
	}
	pte = pte_offset(pmd, address);
	offset += address & PMD_MASK;
	address &= ~PMD_MASK;
	end = address + size;
	if (end > PMD_SIZE)
		end = PMD_SIZE;
	error = 0;
	do {
		error |= filemap_sync_pte(pte, vma, address + offset, flags);
		address += PAGE_SIZE;
		pte++;
	} while (address < end);
	return error;
}
static inline int filemap_sync_pmd_range(pgd_t * pgd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned int flags)
{
	pmd_t * pmd;
	unsigned long offset, end;
	int error;

	if (pgd_none(*pgd))
		return 0;
	if (pgd_bad(*pgd)) {
		printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
		pgd_clear(pgd);
		return 0;
	}
	pmd = pmd_offset(pgd, address);
	offset = address & PGDIR_MASK;
	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		end = PGDIR_SIZE;
	error = 0;
	do {
		error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return error;
}
static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
	size_t size, unsigned int flags)
{
	pgd_t * dir;
	unsigned long end = address + size;
	int error = 0;

	dir = pgd_offset(vma->vm_mm, address);
	flush_cache_range(vma->vm_mm, end - size, end);
	while (address < end) {
		error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		dir++;
	}
	flush_tlb_range(vma->vm_mm, end - size, end);
	return error;
}
/*
 * This handles (potentially partial) area unmaps..
 */
static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
{
	filemap_sync(vma, start, len, MS_ASYNC);
}
/*
 * Shared mappings need to be able to do the right thing at
 * close/unmap/sync. They will also use the private file as
 * backing-store for swapping..
 */
static struct vm_operations_struct file_shared_mmap = {
	NULL,			/* no special open */
	NULL,			/* no special close */
	filemap_unmap,		/* unmap - we need to sync the pages */
	NULL,			/* no special protect */
	filemap_sync,		/* sync */
	NULL,			/* advise */
	filemap_nopage,		/* nopage */
	NULL,			/* wppage */
	filemap_swapout,	/* swapout */
	NULL,			/* swapin */
};

/*
 * Private mappings just need to be able to load in the map.
 *
 * (This is actually used for shared mappings as well, if we
 * know they can't ever get write permissions..)
 */
static struct vm_operations_struct file_private_mmap = {
	NULL,			/* open */
	NULL,			/* close */
	NULL,			/* unmap */
	NULL,			/* protect */
	NULL,			/* sync */
	NULL,			/* advise */
	filemap_nopage,		/* nopage */
	NULL,			/* wppage */
	NULL,			/* swapout */
	NULL,			/* swapin */
};
/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	struct vm_operations_struct * ops;
	struct inode *inode = file->f_dentry->d_inode;

	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
		ops = &file_shared_mmap;
		/* share_page() can only guarantee proper page sharing if
		 * the offsets are all page aligned. */
		if (vma->vm_offset & (PAGE_SIZE - 1))
			return -EINVAL;
	} else {
		ops = &file_private_mmap;
		if (inode->i_op && inode->i_op->bmap &&
		    (vma->vm_offset & (inode->i_sb->s_blocksize - 1)))
			return -EINVAL;
	}
	if (!inode->i_sb || !S_ISREG(inode->i_mode))
		return -EACCES;
	if (!inode->i_op || !inode->i_op->readpage)
		return -ENOEXEC;
	UPDATE_ATIME(inode);
	vma->vm_ops = ops;
	return 0;
}
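/*
 * Typical use from user space (illustrative only): a read-only private
 * mapping such as
 *
 *	addr = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
 *
 * ends up with vma->vm_ops = &file_private_mmap, so its pages are faulted
 * in on demand through filemap_nopage() above.
 */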
/*
 * The msync() system call.
 */
static int msync_interval(struct vm_area_struct * vma,
	unsigned long start, unsigned long end, int flags)
{
	if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
		int error;
		error = vma->vm_ops->sync(vma, start, end-start, flags);
		if (!error && (flags & MS_SYNC)) {
			struct file * file = vma->vm_file;
			struct dentry * dentry = file->f_dentry;
			struct inode * inode = dentry->d_inode;

			down(&inode->i_sem);
			error = file_fsync(file, dentry);
			up(&inode->i_sem);
		}
		return error;
	}
	return 0;
}
asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
{
	unsigned long end;
	struct vm_area_struct * vma;
	int unmapped_error, error = -EINVAL;

	down(&current->mm->mmap_sem);
	if (start & ~PAGE_MASK)
		goto out;
	len = (len + ~PAGE_MASK) & PAGE_MASK;
	end = start + len;
	if (end < start)
		goto out;
	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
		goto out;
	error = 0;
	if (end == start)
		goto out;
	/*
	 * If the interval [start,end) covers some unmapped address ranges,
	 * just ignore them, but return -EFAULT at the end.
	 */
	vma = find_vma(current->mm, start);
	unmapped_error = 0;
	for (;;) {
		/* Still start < end. */
		error = -EFAULT;
		if (!vma)
			goto out;
		/* Here start < vma->vm_end. */
		if (start < vma->vm_start) {
			unmapped_error = -EFAULT;
			start = vma->vm_start;
		}
		/* Here vma->vm_start <= start < vma->vm_end. */
		if (end <= vma->vm_end) {
			if (start < end) {
				error = msync_interval(vma, start, end, flags);
				if (error)
					goto out;
			}
			error = unmapped_error;
			goto out;
		}
		/* Here vma->vm_start <= start < vma->vm_end < end. */
		error = msync_interval(vma, start, vma->vm_end, flags);
		if (error)
			goto out;
		start = vma->vm_end;
		vma = vma->vm_next;
	}
out:
	up(&current->mm->mmap_sem);
	return error;
}
/*
 * Write to a file through the page cache. This is mainly for the
 * benefit of NFS and possibly other network-based file systems.
 *
 * We currently put everything into the page cache prior to writing it.
 * This is not a problem when writing full pages. With partial pages,
 * however, we first have to read the data into the cache, then
 * dirty the page, and finally schedule it for writing. Alternatively, we
 * could write-through just the portion of data that would go into that
 * page, but that would kill performance for applications that write data
 * line by line, and it's prone to race conditions.
 *
 * Note that this routine doesn't try to keep track of dirty pages. Each
 * file system has to do this all by itself, unfortunately.
 */
ssize_t
generic_file_write(struct file *file, const char *buf,
		   size_t count, loff_t *ppos)
{
	struct dentry	*dentry = file->f_dentry;
	struct inode	*inode = dentry->d_inode;
	unsigned long	pos = *ppos;
	unsigned long	limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
	struct page	*page, **hash;
	unsigned long	page_cache = 0;
	unsigned long	written;
	long		status;
	int		sync;

	if (!inode->i_op || !inode->i_op->updatepage)
		return -EIO;

	if (file->f_error) {
		int error = file->f_error;
		file->f_error = 0;
		return error;
	}

	sync    = file->f_flags & O_SYNC;
	written = 0;

	if (file->f_flags & O_APPEND)
		pos = inode->i_size;

	/*
	 * Check whether we've reached the file size limit.
	 */
	status = -EFBIG;
	if (pos >= limit) {
		send_sig(SIGXFSZ, current, 0);
		goto out;
	}

	status = 0;
	/*
	 * Check whether to truncate the write,
	 * and send the signal if we do.
	 */
	if (count > limit - pos) {
		send_sig(SIGXFSZ, current, 0);
		count = limit - pos;
	}

	while (count) {
		unsigned long bytes, pgpos, offset;
		/*
		 * Try to find the page in the cache. If it isn't there,
		 * allocate a free page.
		 */
		offset = (pos & ~PAGE_CACHE_MASK);
		pgpos = pos & PAGE_CACHE_MASK;
		bytes = PAGE_CACHE_SIZE - offset;
		if (bytes > count)
			bytes = count;

		hash = page_hash(inode, pgpos);
		page = __find_page(inode, pgpos, *hash);
		if (!page) {
			if (!page_cache) {
				page_cache = page_cache_alloc();
				if (page_cache)
					continue;
				status = -ENOMEM;
				break;
			}
			page = page_cache_entry(page_cache);
			add_to_page_cache(page, inode, pgpos, hash);
			page_cache = 0;
		}

		/* Get exclusive IO access to the page.. */
		wait_on_page(page);
		set_bit(PG_locked, &page->flags);

		/*
		 * Do the real work.. If the writer ends up delaying the write,
		 * the writer needs to increment the page use counts until he
		 * is done with the page.
		 */
		bytes -= copy_from_user((u8*)page_address(page) + offset, buf, bytes);
		status = -EFAULT;
		if (bytes)
			status = inode->i_op->updatepage(file, page, offset, bytes, sync);

		/* Mark it unlocked again and drop the page.. */
		clear_bit(PG_locked, &page->flags);
		wake_up(&page->wait);
		page_cache_release(page);

		if (status < 0)
			break;

		written += status;
		count -= status;
		pos += status;
		buf += status;
	}
	*ppos = pos;
	if (pos > inode->i_size)
		inode->i_size = pos;

	if (page_cache)
		page_cache_free(page_cache);
out:
	return written ? written : status;
}
/*
 * Support routines for directory caching using the page cache.
 */

/*
 * Finds the page at the specified offset, installing a new page
 * if requested. The count is incremented and the page is locked.
 *
 * Note: we don't have to worry about races here, as the caller
 * is holding the inode semaphore.
 */
unsigned long get_cached_page(struct inode * inode, unsigned long offset,
				int new)
{
	struct page * page;
	struct page ** hash;
	unsigned long page_cache = 0;

	hash = page_hash(inode, offset);
	page = __find_page(inode, offset, *hash);
	if (!page) {
		if (!new)
			goto out;
		page_cache = page_cache_alloc();
		if (!page_cache)
			goto out;
		clear_page(page_cache);
		page = page_cache_entry(page_cache);
		add_to_page_cache(page, inode, offset, hash);
	}
	if (atomic_read(&page->count) != 2)
		printk(KERN_ERR "get_cached_page: page count=%d\n",
			atomic_read(&page->count));
	if (test_bit(PG_locked, &page->flags))
		printk(KERN_ERR "get_cached_page: page already locked!\n");
	set_bit(PG_locked, &page->flags);
	page_cache = page_address(page);

out:
	return page_cache;
}
/*
 * Unlock and free a page.
 */
void put_cached_page(unsigned long addr)
{
	struct page * page = page_cache_entry(addr);

	if (!test_bit(PG_locked, &page->flags))
		printk("put_cached_page: page not locked!\n");
	if (atomic_read(&page->count) != 2)
		printk("put_cached_page: page count=%d\n",
			atomic_read(&page->count));
	clear_bit(PG_locked, &page->flags);
	wake_up(&page->wait);
	page_cache_release(page);
}
/* Add request for page IO to the queue */

static inline void put_pio_request(struct pio_request *p)
{
	*pio_last = p;
	p->next = NULL;
	pio_last = &p->next;
}

/* Take the first page IO request off the queue */

static inline struct pio_request * get_pio_request(void)
{
	struct pio_request * p = pio_first;
	pio_first = p->next;
	if (!pio_first)
		pio_last = &pio_first;
	return p;
}
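/*
 * Note on the queue representation used above (descriptive): pio_last
 * always points at the location that should receive the next request
 * pointer -- &pio_first while the queue is empty, or the last request's
 * "next" field once it is not -- so put_pio_request() can append in O(1)
 * and get_pio_request() only resets pio_last when it removes the final
 * element.
 */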
/* Make a new page IO request and queue it to the kpiod thread */

static inline void make_pio_request(struct file *file,
				    unsigned long offset,
				    unsigned long page)
{
	struct pio_request *p;

	atomic_inc(&page_cache_entry(page)->count);

	/*
	 * We need to allocate without causing any recursive IO in the
	 * current thread's context.  We might currently be swapping out
	 * as a result of an allocation made while holding a critical
	 * filesystem lock.  To avoid deadlock, we *MUST* not reenter
	 * the filesystem in this thread.
	 *
	 * We can wait for kswapd to free memory, or we can try to free
	 * pages without actually performing further IO, without fear of
	 * deadlock.
	 */
	while ((p = kmem_cache_alloc(pio_request_cache, GFP_BUFFER)) == NULL) {
		if (try_to_free_pages(__GFP_WAIT))
			continue;
		current->state = TASK_INTERRUPTIBLE;
		schedule_timeout(HZ/10);
	}

	p->file   = file;
	p->offset = offset;
	p->page   = page;

	put_pio_request(p);
	wake_up(&pio_wait);
}
/*
 * This is the only thread which is allowed to write out filemap pages
 * while swapping.
 *
 * To avoid deadlock, it is important that we never reenter this thread.
 * Although recursive memory allocations within this thread may result
 * in more page swapping, that swapping will always be done by queuing
 * another IO request to the same thread: we will never actually start
 * that IO request until we have finished with the current one, and so
 * we will not deadlock.
 */
int kpiod(void * unused)
{
	struct task_struct *tsk = current;
	struct wait_queue wait = { tsk, };
	struct inode * inode;
	struct dentry * dentry;
	struct pio_request * p;

	strcpy(tsk->comm, "kpiod");
	sigfillset(&tsk->blocked);
	init_waitqueue(&pio_wait);
	/*
	 * Mark this task as a memory allocator - we don't want to get caught
	 * up in the regular mm freeing frenzy if we have to allocate memory
	 * in order to write stuff out.
	 */
	tsk->flags |= PF_MEMALLOC;

	pio_request_cache = kmem_cache_create("pio_request",
					      sizeof(struct pio_request),
					      0, SLAB_HWCACHE_ALIGN,
					      NULL, NULL);
	if (!pio_request_cache)
		panic ("Could not create pio_request slab cache");

	while (1) {
		tsk->state = TASK_INTERRUPTIBLE;
		add_wait_queue(&pio_wait, &wait);
		if (!pio_first)
			schedule();
		remove_wait_queue(&pio_wait, &wait);
		tsk->state = TASK_RUNNING;

		while (pio_first) {
			p = get_pio_request();
			dentry = p->file->f_dentry;
			inode = dentry->d_inode;

			down(&inode->i_sem);
			do_write_page(inode, p->file,
				      (const char *) p->page, p->offset);
			up(&inode->i_sem);
			fput(p->file);

			page_cache_free(p->page);
			kmem_cache_free(pio_request_cache, p);
		}
	}
}