 * Copyright (C) 1994, 1995  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/malloc.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/locks.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/swapctl.h>

#include <asm/pgtable.h>
#include <asm/uaccess.h>
/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 */

unsigned long page_cache_size = 0;
struct page * page_hash_table[PAGE_HASH_SIZE];
/*
 * Simple routines for both non-shared and shared mappings.
 */

#define release_page(page) __free_page((page))
/*
 * Invalidate the pages of an inode, removing all pages that aren't
 * locked down (those are sure to be up-to-date anyway, so we shouldn't
 * invalidate them).
 */
void invalidate_inode_pages(struct inode * inode)
{
        struct page ** p;
        struct page * page;

        p = &inode->i_pages;
        while ((page = *p) != NULL) {
                if (PageLocked(page)) {
                        /* locked pages are left alone */
                        p = &page->next;
                        continue;
                }
                /* unlink the page from the inode list and the hash queue */
                if ((*p = page->next) != NULL)
                        (*p)->prev = page->prev;
                remove_page_from_hash_queue(page);
                /* ... and finally release the page itself ... */
        }
}
/*
 * Truncate the page cache at a set offset, removing the pages
 * that are beyond that offset (and zeroing out partial pages).
 */
void truncate_inode_pages(struct inode * inode, unsigned long start)
{
        struct page ** p;
        struct page * page;

        p = &inode->i_pages;
        while ((page = *p) != NULL) {
                unsigned long offset = page->offset;

                /* page wholly truncated - free it */
                if (offset >= start) {
                        if (PageLocked(page)) {
                                /* ... wait until the page is unlocked ... */
                        }
                        /* unlink it from the inode list and the hash queue */
                        if ((*p = page->next) != NULL)
                                (*p)->prev = page->prev;
                        remove_page_from_hash_queue(page);
                        /* ... release the page, then look at the next one ... */
                        continue;
                }
                p = &page->next;
                offset = start - offset;
                /* partial truncate, clear end of page */
                if (offset < PAGE_SIZE) {
                        unsigned long address = page_address(page);
                        memset((void *) (offset + address), 0, PAGE_SIZE - offset);
                        flush_page_to_ram(address);
                }
        }
}
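/*
 * Worked example for the partial-truncate arithmetic above (illustrative
 * numbers, not taken from the original source; 4K pages assumed):
 * truncating to start = 0x12345 while a cached page covers file offset
 * 0x12000 gives offset = start - offset = 0x345, so the memset() clears
 * bytes 0x345..0xFFF of that page and the first 0x345 bytes survive.
 */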
/*
 * Remove a page from the page cache and free it.
 */
void remove_inode_page(struct page *page)
{
        remove_page_from_hash_queue(page);
        remove_page_from_inode_queue(page);
        __free_page(page);
}
int shrink_mmap(int priority, int gfp_mask)
{
        static unsigned long clock = 0;
        unsigned long limit = num_physpages;
        struct page * page;
        int count, referenced;

        count = (limit << 1) >> priority;

        page = mem_map + clock;
        do {
                /* This works even in the presence of PageSkip because
                 * the first two entries at the beginning of a hole will
                 * be marked, not just the first.
                 */
                page++;
                clock++;
                if (clock >= max_mapnr) {
                        clock = 0;
                        page = mem_map;
                }
                if (PageSkip(page)) {
                        /* next_hash is overloaded for PageSkip */
                        page = page->next_hash;
                        clock = page - mem_map;
                }

                referenced = test_and_clear_bit(PG_referenced, &page->flags);

                if (PageLocked(page))
                        continue;

                if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
                        continue;

                /* We can't free pages unless there's just one user */
                if (atomic_read(&page->count) != 1)
                        continue;

                /*
                 * Is it a swap-cache page? If so, we want to
                 * drop it if it is no longer used, even if it
                 * were to be marked referenced..
                 */
                if (PageSwapCache(page)) {
                        if (referenced && swap_count(page->offset) != 1)
                                continue;
                        delete_from_swap_cache(page);
                        return 1;
                }

                /* Is it a buffer page? */
                if (page->buffers) {
                        if (buffer_under_min())
                                continue;
                        if (!try_to_free_buffers(page))
                                continue;
                        return 1;
                }

                /* is it a page-cache page? */
                if (page->inode) {
                        if (pgcache_under_min())
                                continue;
                        remove_inode_page(page);
                        return 1;
                }
        } while (--count > 0);

        return 0;
}
/*
 * Update a page cache copy, when we're doing a "write()" system call
 */
void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count)
{
        unsigned long offset, len;

        offset = (pos & ~PAGE_MASK);
        pos = pos & PAGE_MASK;
        len = PAGE_SIZE - offset;
        if (len > count)
                len = count;

        do {
                struct page * page;

                page = find_page(inode, pos);
                if (page) {
                        wait_on_page(page);
                        memcpy((void *) (offset + page_address(page)), buf, len);
                        release_page(page);
                }
                count -= len;
                buf += len;
                len = PAGE_SIZE;
                offset = 0;
                pos += PAGE_SIZE;
        } while (count);
}
static inline void add_to_page_cache(struct page * page,
        struct inode * inode, unsigned long offset,
        struct page **hash)
{
        atomic_inc(&page->count);
        page->flags = (page->flags & ~((1 << PG_uptodate) | (1 << PG_error))) | (1 << PG_referenced);
        page->offset = offset;
        add_page_to_inode_queue(inode, page);
        __add_page_to_hash_queue(page, hash);
}
/*
 * Try to read ahead in the file. "page_cache" is a potentially free page
 * that we could use for the cache (if it is 0 we can try to create one,
 * this is all overlapped with the IO on the previous page finishing anyway)
 */
static unsigned long try_to_read_ahead(struct file * file,
                                unsigned long offset, unsigned long page_cache)
{
        struct inode *inode = file->f_dentry->d_inode;
        struct page * page;
        struct page ** hash;

        offset &= PAGE_MASK;
        switch (page_cache) {
        case 0:
                page_cache = __get_free_page(GFP_USER);
                if (!page_cache)
                        break;
        default:
                if (offset >= inode->i_size)
                        break;
                hash = page_hash(inode, offset);
                page = __find_page(inode, offset, *hash);
                if (!page) {
                        /*
                         * Ok, add the new page to the hash-queues...
                         */
                        page = mem_map + MAP_NR(page_cache);
                        add_to_page_cache(page, inode, offset, hash);
                        inode->i_op->readpage(file, page);
                        page_cache = 0;
                }
                release_page(page);
        }
        return page_cache;
}
/*
 * Wait for IO to complete on a locked page.
 *
 * This must be called with the caller "holding" the page,
 * ie with increased "page->count" so that the page won't
 * go away during the wait..
 */
void __wait_on_page(struct page *page)
{
        struct task_struct *tsk = current;
        struct wait_queue wait;

        wait.task = tsk;
        add_wait_queue(&page->wait, &wait);
repeat:
        tsk->state = TASK_UNINTERRUPTIBLE;
        run_task_queue(&tq_disk);
        if (PageLocked(page)) {
                schedule();
                goto repeat;
        }
        tsk->state = TASK_RUNNING;
        remove_wait_queue(&page->wait, &wait);
}
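/*
 * For reference (a sketch, not part of this file): most callers go through
 * a small wrapper that only sleeps when the page is actually locked.  In
 * kernels of this vintage it lives in <linux/pagemap.h>; treat the exact
 * location and spelling below as an assumption.
 */
static inline void wait_on_page(struct page * page)
{
        if (PageLocked(page))
                __wait_on_page(page);
}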
#define PROFILE_READAHEAD
#define DEBUG_READAHEAD

/*
 * Read-ahead profiling information
 * --------------------------------
 * Every PROFILE_MAXREADCOUNT, the following information is written
 * to the syslog:
 *   Percentage of asynchronous read-ahead.
 *   Average of read-ahead fields context value.
 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
 * to the syslog.
 */

#ifdef PROFILE_READAHEAD

#define PROFILE_MAXREADCOUNT 1000

static unsigned long total_reada;
static unsigned long total_async;
static unsigned long total_ramax;
static unsigned long total_ralen;
static unsigned long total_rawin;

static void profile_readahead(int async, struct file *filp)
{
        unsigned long flags;

        ++total_reada;
        if (async)
                ++total_async;

        total_ramax += filp->f_ramax;
        total_ralen += filp->f_ralen;
        total_rawin += filp->f_rawin;

        if (total_reada > PROFILE_MAXREADCOUNT) {
                save_flags(flags);
                cli();
                /* re-check: another CPU may have printed and reset already */
                if (!(total_reada > PROFILE_MAXREADCOUNT)) {
                        restore_flags(flags);
                        return;
                }

                printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
                        total_ramax/total_reada,
                        total_ralen/total_reada,
                        total_rawin/total_reada,
                        (total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
                printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
                        filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
#endif

                total_reada = 0;
                total_async = 0;
                total_ramax = 0;
                total_ralen = 0;
                total_rawin = 0;

                restore_flags(flags);
        }
}
#endif  /* defined PROFILE_READAHEAD */
/*
 * Read-ahead context:
 * -------------------
 * The read ahead context fields of the "struct file" are the following:
 * - f_raend : position of the first byte after the last page we tried to
 *             read ahead.
 * - f_ramax : current read-ahead maximum size.
 * - f_ralen : length of the current IO read block we tried to read-ahead.
 * - f_rawin : length of the current read-ahead window.
 *              if last read-ahead was synchronous then
 *                      f_rawin = f_ralen
 *              otherwise (was asynchronous)
 *                      f_rawin = previous value of f_ralen + f_ralen
 *
 * Read-ahead limits:
 * ------------------
 * MIN_READAHEAD : minimum read-ahead size when read-ahead.
 * MAX_READAHEAD : maximum read-ahead size when read-ahead.
 *
 * Synchronous read-ahead benefits:
 * --------------------------------
 * Using a reasonable IO transfer length from peripheral devices increases
 * system performance.
 * Reasonable means, in this context, not too large but not too small.
 * The actual maximum value is:
 *      MAX_READAHEAD + PAGE_SIZE = 76k if CONFIG_READA_SMALL is undefined,
 *      and 32k if it is defined (4K page size assumed).
 *
 * Asynchronous read-ahead benefits:
 * ---------------------------------
 * Overlapping the next read request with user process execution increases
 * system performance.
 * We have to guess which further data are needed by the user process.
 * If these data are often not really needed, it's bad for system
 * performance.
 * However, we know that files are often accessed sequentially by
 * application programs, and it seems that it is possible to have some good
 * strategy in that guessing.
 * We only try to read-ahead files that seem to be read sequentially.
 *
 * Asynchronous read-ahead risks:
 * ------------------------------
 * In order to maximize overlapping, we must start an asynchronous read
 * request from the device as soon as possible.
 * We must be very careful about:
 * - The number of effective pending IO read requests.
 *   ONE seems to be the only reasonable value.
 * - The total memory pool usage for the file access stream.
 *   This maximum memory usage is implicitly 2 IO read chunks:
 *   2*(MAX_READAHEAD + PAGE_SIZE) = 152k if CONFIG_READA_SMALL is undefined,
 *   64k if it is defined (4K page size assumed).
 */
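/*
 * Worked example (illustrative, derived from the figures above; it assumes
 * 4K pages and CONFIG_READA_SMALL undefined): MAX_READAHEAD is then 72k,
 * so one synchronous read-ahead chunk is at most 72k + 4k = 76k, and with
 * one chunk being consumed while the next one is read asynchronously the
 * per-stream memory pool peaks at 2 * 76k = 152k.
 */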
static inline int get_max_readahead(struct inode * inode)
{
        if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
                return MAX_READAHEAD;
        return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
}
static inline unsigned long generic_file_readahead(int reada_ok,
        struct file * filp, struct inode * inode,
        unsigned long ppos, struct page * page, unsigned long page_cache)
{
        unsigned long raend, max_ahead, ahead;
        int max_readahead = get_max_readahead(inode);

        raend = filp->f_raend & PAGE_MASK;
        max_ahead = 0;

/*
 * The current page is locked.
 * If the current position is inside the previous read IO request, do not
 * try to reread previously read ahead pages.
 * Otherwise decide or not to read ahead some pages synchronously.
 * If we are not going to read ahead, set the read ahead context for this
 * page only.
 */
        if (PageLocked(page)) {
                if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
                        if (raend < inode->i_size)
                                max_ahead = filp->f_ramax;
                        filp->f_ralen = PAGE_SIZE;
                        if (!max_ahead) {
                                filp->f_raend = ppos + filp->f_ralen;
                                filp->f_rawin += filp->f_ralen;
                        }
                }
        }
/*
 * The current page is not locked.
 * If we were reading ahead and,
 * if the current max read ahead size is not zero and,
 * if the current position is inside the last read-ahead IO request,
 *   it is the moment to try to read ahead asynchronously.
 * We will later force an unplug of the device in order to force an
 * asynchronous read IO.
 */
        else if (reada_ok && filp->f_ramax && raend >= PAGE_SIZE &&
                 ppos <= raend && ppos + filp->f_ralen >= raend) {
/*
 * Add ONE page to max_ahead in order to try to have about the same IO max size
 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_SIZE.
 * Compute the position of the last page we have tried to read in order to
 * begin to read ahead just at the next page.
 */
                raend -= PAGE_SIZE;
                if (raend < inode->i_size)
                        max_ahead = filp->f_ramax + PAGE_SIZE;

                if (max_ahead) {
                        filp->f_rawin = filp->f_ralen;
                        filp->f_ralen = 0;
                        reada_ok = 2;
                }
        }
/*
 * Try to read ahead pages.
 * We hope that ll_rw_blk() plug/unplug, coalescence, request sorting and the
 * scheduler will work well enough for us to avoid too many bad actual IO
 * requests.
 */
        ahead = 0;
        while (ahead < max_ahead) {
                ahead += PAGE_SIZE;
                page_cache = try_to_read_ahead(filp, raend + ahead,
                                               page_cache);
        }
/*
 * If we tried to read ahead some pages,
 * If we tried to read ahead asynchronously,
 *   try to force an unplug of the device in order to start the
 *   asynchronous read IO request.
 * Update the read-ahead context.
 * Store the length of the current read-ahead window.
 * Double the current max read ahead size.
 *   That heuristic avoids doing large IO for files that are not really
 *   accessed sequentially.
 */
        if (ahead) {
                if (reada_ok == 2)
                        run_task_queue(&tq_disk);

                filp->f_ralen += ahead;
                filp->f_rawin += filp->f_ralen;
                filp->f_raend = raend + ahead + PAGE_SIZE;

                filp->f_ramax += filp->f_ramax;

                if (filp->f_ramax > max_readahead)
                        filp->f_ramax = max_readahead;

#ifdef PROFILE_READAHEAD
                profile_readahead((reada_ok == 2), filp);
#endif
        }

        return page_cache;
}
535 * "descriptor" for what we're up to with a read.
536 * This allows us to use the same read code yet
537 * have multiple different users of the data that
538 * we read from a file.
540 * The simplest case just copies the data to user
550 typedef int (*read_actor_t
)(read_descriptor_t
*, const char *, unsigned long);
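/*
 * Illustrative sketch (not part of the original file): a second kind of
 * actor, alongside file_read_actor() and file_send_actor() below, that
 * simply discards the data while accounting for it.  It relies only on the
 * read_descriptor_t fields used elsewhere in this file; the function name
 * itself is made up for the example.
 */
static int file_discard_actor(read_descriptor_t * desc, const char *area, unsigned long size)
{
        unsigned long count = desc->count;

        if (size > count)
                size = count;
        /* pretend we consumed the data without copying it anywhere */
        desc->count = count - size;
        desc->written += size;
        return size;
}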
/*
 * This is a generic file read routine, and uses the
 * inode->i_op->readpage() function for the actual low-level
 * stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 */
static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
{
        struct dentry *dentry = filp->f_dentry;
        struct inode *inode = dentry->d_inode;
        size_t pos, pgpos, page_cache;
        int reada_ok;
        int max_readahead = get_max_readahead(inode);

        page_cache = 0;
        pos = *ppos;
        pgpos = pos & PAGE_MASK;
/*
 * If the current position is outside the previous read-ahead window,
 * we reset the current read-ahead context and set read ahead max to zero
 * (will be set to just needed value later),
 * otherwise, we assume that the file accesses are sequential enough to
 * continue read-ahead.
 */
        if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) {
                reada_ok = 0;
                filp->f_raend = 0;
                filp->f_ralen = 0;
                filp->f_ramax = 0;
                filp->f_rawin = 0;
        } else {
                reada_ok = 1;
        }
/*
 * Adjust the current value of read-ahead max.
 * If the read operation stays in the first half page, force no readahead.
 * Otherwise try to increase read ahead max just enough to do the read request.
 * Then, at least MIN_READAHEAD if read ahead is ok,
 * and at most MAX_READAHEAD in all cases.
 */
        if (pos + desc->count <= (PAGE_SIZE >> 1)) {
                filp->f_ramax = 0;
        } else {
                unsigned long needed;

                needed = ((pos + desc->count) & PAGE_MASK) - pgpos;

                if (filp->f_ramax < needed)
                        filp->f_ramax = needed;

                if (reada_ok && filp->f_ramax < MIN_READAHEAD)
                        filp->f_ramax = MIN_READAHEAD;
                if (filp->f_ramax > max_readahead)
                        filp->f_ramax = max_readahead;
        }

        for (;;) {
                struct page *page, **hash;

                if (pos >= inode->i_size)
                        break;

                /*
                 * Try to find the data in the page cache..
                 */
                hash = page_hash(inode, pos & PAGE_MASK);
                page = __find_page(inode, pos & PAGE_MASK, *hash);
                if (!page)
                        goto no_cached_page;

found_page:
/*
 * Try to read ahead only if the current page is filled or being filled.
 * Otherwise, if we were reading ahead, decrease max read ahead size to
 * the minimum value.
 * In this context, that seems to happen only on some read error or if
 * the page has been rewritten.
 */
                if (PageUptodate(page) || PageLocked(page))
                        page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_MASK, page, page_cache);
                else if (reada_ok && filp->f_ramax > MIN_READAHEAD)
                        filp->f_ramax = MIN_READAHEAD;

                wait_on_page(page);

                if (!PageUptodate(page))
                        goto page_read_error;

success:
                /*
                 * Ok, we have the page, it's up-to-date and ok,
                 * so now we can finally copy it to user space...
                 */
        {
                unsigned long offset, nr;

                offset = pos & ~PAGE_MASK;
                nr = PAGE_SIZE - offset;
                if (nr > inode->i_size - pos)
                        nr = inode->i_size - pos;

                /*
                 * The actor routine returns how many bytes were actually used..
                 * NOTE! This may not be the same as how much of a user buffer
                 * we filled up (we may be padding etc), so we can only update
                 * "pos" here (the actor routine has to update the user buffer
                 * pointers and the remaining count).
                 */
                nr = actor(desc, (const char *) (page_address(page) + offset), nr);
                pos += nr;
                release_page(page);
                if (nr && desc->count)
                        continue;
                break;
        }

no_cached_page:
                /*
                 * Ok, it wasn't cached, so we need to create a new
                 * page..
                 */
                if (!page_cache) {
                        page_cache = __get_free_page(GFP_USER);
                        /*
                         * That could have slept, so go around to the
                         * page cache lookup again..
                         */
                        if (page_cache)
                                continue;
                        desc->error = -ENOMEM;
                        break;
                }

                /*
                 * Ok, add the new page to the hash-queues...
                 */
                page = mem_map + MAP_NR(page_cache);
                page_cache = 0;
                add_to_page_cache(page, inode, pos & PAGE_MASK, hash);

                /*
                 * Error handling is tricky. If we get a read error,
                 * the cached page stays in the cache (but uptodate=0),
                 * and the next process that accesses it will try to
                 * re-read it. This is needed for NFS etc, where the
                 * identity of the reader can decide if we can read the
                 * page or not..
                 */
/*
 * We have to read the page.
 * If we were reading ahead, we had previously tried to read this page.
 * That means that the page has probably been removed from the cache before
 * the application process needs it, or has been rewritten.
 * Decrease max readahead size to the minimum value in that situation.
 */
                if (reada_ok && filp->f_ramax > MIN_READAHEAD)
                        filp->f_ramax = MIN_READAHEAD;

                {
                        int error = inode->i_op->readpage(filp, page);
                        if (!error)
                                goto found_page;
                        desc->error = error;
                        release_page(page);
                        break;
                }

page_read_error:
                /*
                 * We found the page, but it wasn't up-to-date.
                 * Try to re-read it _once_. We do this synchronously,
                 * because this happens only if there were errors.
                 */
                {
                        int error = inode->i_op->readpage(filp, page);
                        if (!error) {
                                wait_on_page(page);
                                if (PageUptodate(page) && !PageError(page))
                                        goto success;
                                error = -EIO; /* Some unspecified error occurred.. */
                        }
                        desc->error = error;
                        release_page(page);
                        break;
                }
        }

        *ppos = pos;
        if (page_cache)
                free_page(page_cache);
}
static int file_read_actor(read_descriptor_t * desc, const char *area, unsigned long size)
{
        unsigned long left;
        unsigned long count = desc->count;

        if (size > count)
                size = count;
        left = __copy_to_user(desc->buf, area, size);
        if (left) {
                size -= left;
                desc->error = -EFAULT;
        }
        desc->count = count - size;
        desc->written += size;
        desc->buf += size;
        return size;
}
/*
 * This is the "read()" routine for all filesystems
 * that can use the page cache directly.
 */
ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
{
        ssize_t retval;

        retval = -EFAULT;
        if (access_ok(VERIFY_WRITE, buf, count)) {
                retval = 0;
                if (count) {
                        read_descriptor_t desc;

                        desc.written = 0;
                        desc.count = count;
                        desc.buf = buf;
                        desc.error = 0;
                        do_generic_file_read(filp, ppos, &desc, file_read_actor);

                        retval = desc.written;
                        if (!retval)
                                retval = desc.error;
                }
        }
        return retval;
}
static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned long size)
{
        ssize_t written;
        unsigned long count = desc->count;
        struct file *file = (struct file *) desc->buf;
        struct inode *inode = file->f_dentry->d_inode;
        mm_segment_t old_fs;

        if (size > count)
                size = count;
        down(&inode->i_sem);
        old_fs = get_fs();
        set_fs(KERNEL_DS);
        written = file->f_op->write(file, area, size, &file->f_pos);
        set_fs(old_fs);
        up(&inode->i_sem);
        if (written < 0) {
                desc->error = written;
                written = 0;
        }
        desc->count = count - written;
        desc->written += written;
        return written;
}
asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
{
        ssize_t retval;
        struct file * in_file, * out_file;
        struct inode * in_inode, * out_inode;

        /*
         * Get input file, and verify that it is ok..
         */
        retval = -EBADF;
        in_file = fget(in_fd);
        if (!in_file)
                goto out;
        if (!(in_file->f_mode & FMODE_READ))
                goto fput_in;
        retval = -EINVAL;
        in_inode = in_file->f_dentry->d_inode;
        if (!in_inode)
                goto fput_in;
        if (!in_inode->i_op || !in_inode->i_op->readpage)
                goto fput_in;
        retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
        if (retval)
                goto fput_in;

        /*
         * Get output file, and verify that it is ok..
         */
        retval = -EBADF;
        out_file = fget(out_fd);
        if (!out_file)
                goto fput_in;
        if (!(out_file->f_mode & FMODE_WRITE))
                goto fput_out;
        retval = -EINVAL;
        if (!out_file->f_op || !out_file->f_op->write)
                goto fput_out;
        out_inode = out_file->f_dentry->d_inode;
        if (!out_inode)
                goto fput_out;
        retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
        if (retval)
                goto fput_out;

        retval = 0;
        if (count) {
                read_descriptor_t desc;
                loff_t pos = 0, *ppos;

                retval = -EFAULT;
                ppos = &in_file->f_pos;
                if (offset) {
                        if (get_user(pos, offset))
                                goto fput_out;
                        ppos = &pos;
                }

                desc.written = 0;
                desc.count = count;
                desc.buf = (char *) out_file;
                desc.error = 0;
                do_generic_file_read(in_file, ppos, &desc, file_send_actor);

                retval = desc.written;
                if (!retval)
                        retval = desc.error;
                if (offset)
                        put_user(pos, offset);
        }

fput_out:
        fput(out_file);
fput_in:
        fput(in_file);
out:
        return retval;
}
/*
 * Semantics for shared and private memory areas are different past the end
 * of the file. A shared mapping past the last page of the file is an error
 * and results in a SIGBUS, while a private mapping just maps in a zero page.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 *
 * WSH 06/04/97: fixed a memory leak and moved the allocation of new_page
 * ahead of the wait if we're sure to need it.
 */
static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
{
        struct file * file = area->vm_file;
        struct dentry * dentry = file->f_dentry;
        struct inode * inode = dentry->d_inode;
        unsigned long offset, reada, i;
        struct page * page, **hash;
        unsigned long old_page, new_page;

        new_page = 0;
        offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
        if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
                goto no_page;

        /*
         * Do we have something in the page cache already?
         */
        hash = page_hash(inode, offset);
        page = __find_page(inode, offset, *hash);
        if (!page)
                goto no_cached_page;

found_page:
        /*
         * Ok, found a page in the page cache, now we need to check
         * that it's up-to-date. First check whether we'll need an
         * extra page -- better to overlap the allocation with the I/O.
         */
        if (no_share && !new_page) {
                new_page = __get_free_page(GFP_USER);
                if (!new_page)
                        goto failure;
        }

        if (PageLocked(page))
                goto page_locked_wait;
        if (!PageUptodate(page))
                goto page_read_error;

success:
        /*
         * Found the page, need to check sharing and possibly
         * copy it over to another page..
         */
        old_page = page_address(page);
        if (!no_share) {
                /*
                 * Ok, we can share the cached page directly.. Get rid
                 * of any potential extra pages.
                 */
                if (new_page)
                        free_page(new_page);

                flush_page_to_ram(old_page);
                return old_page;
        }

        /*
         * No sharing ... copy to the new page.
         */
        copy_page(new_page, old_page);
        flush_page_to_ram(new_page);
        release_page(page);
        return new_page;

no_cached_page:
        /*
         * Try to read in an entire cluster at once.
         */
        reada = offset;
        reada >>= PAGE_SHIFT + page_cluster;
        reada <<= PAGE_SHIFT + page_cluster;

        for (i = 1 << page_cluster; i > 0; --i, reada += PAGE_SIZE)
                new_page = try_to_read_ahead(file, reada, new_page);

        if (!new_page)
                new_page = __get_free_page(GFP_USER);
        if (!new_page)
                goto no_page;

        /*
         * During getting the above page we might have slept,
         * so we need to re-check the situation with the page
         * cache.. The page we just got may be useful if we
         * can't share, so don't get rid of it here.
         */
        page = find_page(inode, offset);
        if (page)
                goto found_page;

        /*
         * Now, create a new page-cache page from the page we got
         */
        page = mem_map + MAP_NR(new_page);
        new_page = 0;
        add_to_page_cache(page, inode, offset, hash);
        if (inode->i_op->readpage(file, page) != 0)
                goto failure;
        goto found_page;

page_locked_wait:
        __wait_on_page(page);
        if (PageUptodate(page))
                goto success;

        /*
         * Umm, take care of errors if the page isn't up-to-date.
         * Try to re-read it _once_. We do this synchronously,
         * because there really aren't any performance issues here
         * and we need to check for errors.
         */
page_read_error:
        if (inode->i_op->readpage(file, page) != 0)
                goto failure;
        wait_on_page(page);
        if (PageError(page))
                goto failure;
        if (PageUptodate(page))
                goto success;

        /*
         * Things didn't work out. Return zero to tell the
         * mm layer so, possibly freeing the page cache page first.
         */
failure:
        release_page(page);
no_page:
        if (new_page)
                free_page(new_page);
        return 0;
}
/*
 * Tries to write a shared mapped page to its backing store. May return -EIO
 * if the disk is full.
 */
static inline int do_write_page(struct inode * inode, struct file * file,
        const char * page, unsigned long offset)
{
        int retval;
        unsigned long size;
        loff_t loff = offset;
        mm_segment_t old_fs;

        size = offset + PAGE_SIZE;
        /* refuse to extend file size.. */
        if (S_ISREG(inode->i_mode)) {
                if (size > inode->i_size)
                        size = inode->i_size;
                /* Ho humm.. We should have tested for this earlier */
                if (size < offset)
                        return -EIO;
        }
        size -= offset;
        old_fs = get_fs();
        set_fs(KERNEL_DS);
        retval = -EIO;
        if (size == file->f_op->write(file, (const char *) page, size, &loff))
                retval = 0;
        set_fs(old_fs);
        return retval;
}
static int filemap_write_page(struct vm_area_struct * vma,
        unsigned long offset,
        unsigned long page)
{
        int result;
        struct file * file;
        struct dentry * dentry;
        struct inode * inode;

        file = vma->vm_file;
        dentry = file->f_dentry;
        inode = dentry->d_inode;
        if (!file->f_op->write)
                return -EIO;

        /*
         * If a task terminates while we're swapping the page, the vma and
         * file could be released ... increment the count to be safe.
         */
        file->f_count++;
        down(&inode->i_sem);
        result = do_write_page(inode, file, (const char *) page, offset);
        up(&inode->i_sem);
        fput(file);
        return result;
}
/*
 * The page cache takes care of races between somebody
 * trying to swap something out and swap something in
 * at the same time..
 */
int filemap_swapout(struct vm_area_struct * vma, struct page * page)
{
        return filemap_write_page(vma, page->offset, page_address(page));
}
static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
        unsigned long address, unsigned int flags)
{
        pte_t pte = *ptep;
        unsigned long page;
        int error;

        if (!(flags & MS_INVALIDATE)) {
                if (!pte_present(pte))
                        return 0;
                if (!pte_dirty(pte))
                        return 0;
                flush_page_to_ram(pte_page(pte));
                flush_cache_page(vma, address);
                set_pte(ptep, pte_mkclean(pte));
                flush_tlb_page(vma, address);
                page = pte_page(pte);
                atomic_inc(&mem_map[MAP_NR(page)].count);
        } else {
                if (pte_none(pte))
                        return 0;
                flush_cache_page(vma, address);
                pte_clear(ptep);
                flush_tlb_page(vma, address);
                if (!pte_present(pte)) {
                        swap_free(pte_val(pte));
                        return 0;
                }
                page = pte_page(pte);
                if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
                        free_page(page);
                        return 0;
                }
        }
        error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
        free_page(page);
        return error;
}
static inline int filemap_sync_pte_range(pmd_t * pmd,
        unsigned long address, unsigned long size,
        struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
{
        pte_t * pte;
        unsigned long end;
        int error;

        if (pmd_none(*pmd))
                return 0;
        if (pmd_bad(*pmd)) {
                printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
                pmd_clear(pmd);
                return 0;
        }
        pte = pte_offset(pmd, address);
        offset += address & PMD_MASK;
        address &= ~PMD_MASK;
        end = address + size;
        if (end > PMD_SIZE)
                end = PMD_SIZE;
        error = 0;
        do {
                error |= filemap_sync_pte(pte, vma, address + offset, flags);
                address += PAGE_SIZE;
                pte++;
        } while (address < end);
        return error;
}
static inline int filemap_sync_pmd_range(pgd_t * pgd,
        unsigned long address, unsigned long size,
        struct vm_area_struct *vma, unsigned int flags)
{
        pmd_t * pmd;
        unsigned long offset, end;
        int error;

        if (pgd_none(*pgd))
                return 0;
        if (pgd_bad(*pgd)) {
                printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
                pgd_clear(pgd);
                return 0;
        }
        pmd = pmd_offset(pgd, address);
        offset = address & PGDIR_MASK;
        address &= ~PGDIR_MASK;
        end = address + size;
        if (end > PGDIR_SIZE)
                end = PGDIR_SIZE;
        error = 0;
        do {
                error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address < end);
        return error;
}
static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
        size_t size, unsigned int flags)
{
        pgd_t * dir;
        unsigned long end = address + size;
        int error = 0;

        dir = pgd_offset(vma->vm_mm, address);
        flush_cache_range(vma->vm_mm, end - size, end);
        while (address < end) {
                error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                dir++;
        }
        flush_tlb_range(vma->vm_mm, end - size, end);
        return error;
}
/*
 * This handles (potentially partial) area unmaps..
 */
static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
{
        filemap_sync(vma, start, len, MS_ASYNC);
}
/*
 * Shared mappings need to be able to do the right thing at
 * close/unmap/sync. They will also use the private file as
 * backing-store for swapping..
 */
static struct vm_operations_struct file_shared_mmap = {
        NULL,                   /* no special open */
        NULL,                   /* no special close */
        filemap_unmap,          /* unmap - we need to sync the pages */
        NULL,                   /* no special protect */
        filemap_sync,           /* sync */
        NULL,                   /* advise */
        filemap_nopage,         /* nopage */
        NULL,                   /* wppage */
        filemap_swapout,        /* swapout */
        NULL,                   /* swapin */
};

/*
 * Private mappings just need to be able to load in the map.
 *
 * (This is actually used for shared mappings as well, if we
 * know they can't ever get write permissions..)
 */
static struct vm_operations_struct file_private_mmap = {
        NULL,                   /* open */
        NULL,                   /* close */
        NULL,                   /* unmap */
        NULL,                   /* protect */
        NULL,                   /* sync */
        NULL,                   /* advise */
        filemap_nopage,         /* nopage */
        NULL,                   /* wppage */
        NULL,                   /* swapout */
        NULL,                   /* swapin */
};
/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
        struct vm_operations_struct * ops;
        struct inode *inode = file->f_dentry->d_inode;

        if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
                ops = &file_shared_mmap;
                /* share_page() can only guarantee proper page sharing if
                 * the offsets are all page aligned. */
                if (vma->vm_offset & (PAGE_SIZE - 1))
                        return -EINVAL;
        } else {
                ops = &file_private_mmap;
                if (vma->vm_offset & (inode->i_sb->s_blocksize - 1))
                        return -EINVAL;
        }
        if (!inode->i_sb || !S_ISREG(inode->i_mode))
                return -EACCES;
        if (!inode->i_op || !inode->i_op->readpage)
                return -ENOEXEC;
        UPDATE_ATIME(inode);
        vma->vm_file = file;
        file->f_count++;
        vma->vm_ops = ops;
        return 0;
}
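/*
 * Usage sketch (not part of this file): a filesystem that keeps regular
 * file data in the page cache can typically point its file_operations at
 * the generic_file_* routines defined in this file.  The struct name and
 * the labeled-initializer style below are illustrative assumptions; a real
 * filesystem's file_operations has additional members.
 */
static struct file_operations example_file_operations = {
        read:   generic_file_read,      /* reads go through the page cache */
        write:  generic_file_write,     /* writes go through the page cache */
        mmap:   generic_file_mmap,      /* mmap() backed by filemap_nopage */
};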
/*
 * The msync() system call.
 */

static int msync_interval(struct vm_area_struct * vma,
        unsigned long start, unsigned long end, int flags)
{
        if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
                int error;
                error = vma->vm_ops->sync(vma, start, end-start, flags);
                if (!error && (flags & MS_SYNC)) {
                        struct file * file = vma->vm_file;
                        if (file) {
                                struct dentry * dentry = file->f_dentry;
                                struct inode * inode = dentry->d_inode;
                                down(&inode->i_sem);
                                error = file_fsync(file, dentry);
                                up(&inode->i_sem);
                        }
                }
                return error;
        }
        return 0;
}
asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
{
        unsigned long end;
        struct vm_area_struct * vma;
        int unmapped_error, error = -EINVAL;

        down(&current->mm->mmap_sem);
        if (start & ~PAGE_MASK)
                goto out;
        len = (len + ~PAGE_MASK) & PAGE_MASK;
        end = start + len;
        if (end < start)
                goto out;
        if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
                goto out;
        error = 0;
        if (end == start)
                goto out;
        /*
         * If the interval [start,end) covers some unmapped address ranges,
         * just ignore them, but return -EFAULT at the end.
         */
        vma = find_vma(current->mm, start);
        unmapped_error = 0;
        for (;;) {
                /* Still start < end. */
                error = -EFAULT;
                if (!vma)
                        goto out;
                /* Here start < vma->vm_end. */
                if (start < vma->vm_start) {
                        unmapped_error = -EFAULT;
                        start = vma->vm_start;
                }
                /* Here vma->vm_start <= start < vma->vm_end. */
                if (end <= vma->vm_end) {
                        if (start < end) {
                                error = msync_interval(vma, start, end, flags);
                                if (error)
                                        goto out;
                        }
                        error = unmapped_error;
                        goto out;
                }
                /* Here vma->vm_start <= start < vma->vm_end < end. */
                error = msync_interval(vma, start, vma->vm_end, flags);
                if (error)
                        goto out;
                start = vma->vm_end;
                vma = vma->vm_next;
        }
out:
        up(&current->mm->mmap_sem);
        return error;
}
/*
 * Write to a file through the page cache. This is mainly for the
 * benefit of NFS and possibly other network-based file systems.
 *
 * We currently put everything into the page cache prior to writing it.
 * This is not a problem when writing full pages. With partial pages,
 * however, we first have to read the data into the cache, then
 * dirty the page, and finally schedule it for writing. Alternatively, we
 * could write-through just the portion of data that would go into that
 * page, but that would kill performance for applications that write data
 * line by line, and it's prone to race conditions.
 *
 * Note that this routine doesn't try to keep track of dirty pages. Each
 * file system has to do this all by itself, unfortunately.
 */
ssize_t
generic_file_write(struct file *file, const char *buf,
                   size_t count, loff_t *ppos)
{
        struct dentry *dentry = file->f_dentry;
        struct inode *inode = dentry->d_inode;
        unsigned long pos = *ppos;
        unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
        struct page *page, **hash;
        unsigned long page_cache = 0;
        unsigned long written;
        long status;
        int sync;

        if (!inode->i_op || !inode->i_op->updatepage)
                return -EIO;

        sync = file->f_flags & O_SYNC;
        written = 0;

        if (file->f_flags & O_APPEND)
                pos = inode->i_size;

        /*
         * Check whether we've reached the file size limit.
         */
        status = -EFBIG;
        if (pos >= limit) {
                send_sig(SIGXFSZ, current, 0);
                goto out;
        }

        status = 0;
        /*
         * Check whether to truncate the write,
         * and send the signal if we do.
         */
        if (count > limit - pos) {
                send_sig(SIGXFSZ, current, 0);
                count = limit - pos;
        }

        while (count) {
                unsigned long bytes, pgpos, offset;
                /*
                 * Try to find the page in the cache. If it isn't there,
                 * allocate a free page.
                 */
                offset = (pos & ~PAGE_MASK);
                pgpos = pos & PAGE_MASK;
                bytes = PAGE_SIZE - offset;
                if (bytes > count)
                        bytes = count;

                hash = page_hash(inode, pgpos);
                page = __find_page(inode, pgpos, *hash);
                if (!page) {
                        if (!page_cache) {
                                page_cache = __get_free_page(GFP_USER);
                                if (page_cache)
                                        continue;
                                status = -ENOMEM;
                                break;
                        }
                        page = mem_map + MAP_NR(page_cache);
                        add_to_page_cache(page, inode, pgpos, hash);
                        page_cache = 0;
                }

                /* Get exclusive IO access to the page.. */
                wait_on_page(page);
                set_bit(PG_locked, &page->flags);

                /*
                 * Do the real work.. If the writer ends up delaying the write,
                 * the writer needs to increment the page use counts until he
                 * is done with the page.
                 */
                bytes -= copy_from_user((u8*)page_address(page) + offset, buf, bytes);
                status = -EFAULT;
                if (bytes)
                        status = inode->i_op->updatepage(file, page, offset, bytes, sync);

                /* Mark it unlocked again and drop the page.. */
                clear_bit(PG_locked, &page->flags);
                wake_up(&page->wait);
                __free_page(page);

                if (status < 0)
                        break;

                written += status;
                count -= status;
                pos += status;
                buf += status;
        }
        *ppos = pos;
        if (pos > inode->i_size)
                inode->i_size = pos;

        if (page_cache)
                free_page(page_cache);
out:
        return written ? written : status;
}
/*
 * Support routines for directory caching using the page cache.
 */

/*
 * Finds the page at the specified offset, installing a new page
 * if requested. The count is incremented and the page is locked.
 *
 * Note: we don't have to worry about races here, as the caller
 * is holding the inode semaphore.
 */
unsigned long get_cached_page(struct inode * inode, unsigned long offset,
                                int new)
{
        struct page * page;
        struct page ** hash;
        unsigned long page_cache = 0;

        hash = page_hash(inode, offset);
        page = __find_page(inode, offset, *hash);
        if (!page) {
                if (!new)
                        goto out;
                page_cache = get_free_page(GFP_USER);
                if (!page_cache)
                        goto out;
                page = mem_map + MAP_NR(page_cache);
                add_to_page_cache(page, inode, offset, hash);
        }
        if (atomic_read(&page->count) != 2)
                printk(KERN_ERR "get_cached_page: page count=%d\n",
                        atomic_read(&page->count));
        if (test_bit(PG_locked, &page->flags))
                printk(KERN_ERR "get_cached_page: page already locked!\n");
        set_bit(PG_locked, &page->flags);
        page_cache = page_address(page);

out:
        return page_cache;
}

/*
 * Unlock and free a page.
 */
void put_cached_page(unsigned long addr)
{
        struct page * page = mem_map + MAP_NR(addr);

        if (!test_bit(PG_locked, &page->flags))
                printk("put_cached_page: page not locked!\n");
        if (atomic_read(&page->count) != 2)
                printk("put_cached_page: page count=%d\n",
                        atomic_read(&page->count));
        clear_bit(PG_locked, &page->flags);
        wake_up(&page->wait);
        __free_page(page);
}