/*
 *	Copyright (C) 1994, 1995  Linus Torvalds
 *
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/malloc.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/locks.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/swapctl.h>

#include <asm/pgtable.h>
#include <asm/uaccess.h>
/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 */
unsigned long page_cache_size = 0;
struct page * page_hash_table[PAGE_HASH_SIZE];

/*
 * Simple routines for both non-shared and shared mappings.
 */

#define release_page(page) __free_page((page))
/*
 * Invalidate the pages of an inode, removing all pages that aren't
 * locked down (those are sure to be up-to-date anyway, so we shouldn't
 * have to worry about them).
 */
void invalidate_inode_pages(struct inode * inode)
{
        ...
        while ((page = *p) != NULL) {
                /* Locked pages are left alone, just skip past them. */
                if (PageLocked(page)) {
                        p = &page->next;
                        continue;
                }
                /* Unlink the page from the inode list... */
                if ((*p = page->next) != NULL)
                        (*p)->prev = page->prev;
                ...
                /* ...and drop it from the hash queues. */
                remove_page_from_hash_queue(page);
                ...
        }
}
/*
 * Truncate the page cache at a set offset, removing the pages
 * that are beyond that offset (and zeroing out partial pages).
 */
void truncate_inode_pages(struct inode * inode, unsigned long start)
{
        ...
        while ((page = *p) != NULL) {
                unsigned long offset = page->offset;

                /* page wholly truncated - free it */
                if (offset >= start) {
                        if (PageLocked(page)) {
                                ...
                        }
                        if ((*p = page->next) != NULL)
                                (*p)->prev = page->prev;
                        ...
                        remove_page_from_hash_queue(page);
                        ...
                }
                ...
                offset = start - offset;
                /* partial truncate, clear end of page */
                if (offset < PAGE_SIZE) {
                        unsigned long address = page_address(page);
                        memset((void *) (offset + address), 0, PAGE_SIZE - offset);
                        flush_page_to_ram(address);
                }
                ...
        }
}
/*
 * Remove a page from the page cache and free it.
 */
void remove_inode_page(struct page *page)
{
        remove_page_from_hash_queue(page);
        remove_page_from_inode_queue(page);
        ...
}
int shrink_mmap(int priority, int gfp_mask)
{
        static unsigned long clock = 0;
        unsigned long limit = num_physpages;
        int count, referenced;
        struct page * page;

        count = limit >> priority;

        page = mem_map + clock;
        do {
                /* This works even in the presence of PageSkip because
                 * the first two entries at the beginning of a hole will
                 * be marked, not just the first.
                 */
                ...
                if (clock >= max_mapnr) {
                        ...
                }
                if (PageSkip(page)) {
                        /* next_hash is overloaded for PageSkip */
                        page = page->next_hash;
                        clock = page - mem_map;
                }

                referenced = test_and_clear_bit(PG_referenced, &page->flags);

                if (PageLocked(page))
                        continue;

                if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
                        continue;

                /* We can't free pages unless there's just one user */
                if (atomic_read(&page->count) != 1)
                        continue;

                /*
                 * Is it a swap-cache page? If so, we want to
                 * drop it if it is no longer used, even if it
                 * were to be marked referenced..
                 */
                if (PageSwapCache(page)) {
                        if (referenced && swap_count(page->offset) != 1)
                                continue;
                        delete_from_swap_cache(page);
                        return 1;
                }

                /* Is it a buffer page? */
                if (page->buffers) {
                        if (buffer_under_min())
                                continue;
                        if (!try_to_free_buffers(page))
                                continue;
                        return 1;
                }

                /* is it a page-cache page? */
                if (page->inode) {
                        if (pgcache_under_min())
                                continue;
                        remove_inode_page(page);
                        return 1;
                }
        } while (count > 0);
        return 0;
}
/*
 * Update a page cache copy, when we're doing a "write()" system call
 * See also "update_vm_cache()".
 */
void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count)
{
        unsigned long offset, len;

        offset = (pos & ~PAGE_MASK);
        pos = pos & PAGE_MASK;
        len = PAGE_SIZE - offset;
        ...
        page = find_page(inode, pos);
        ...
        memcpy((void *) (offset + page_address(page)), buf, len);
        ...
}
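/*
 * Illustrative sketch (not part of the original file): a small userspace
 * model of how a write at "pos" of "count" bytes is split into per-page
 * chunks, the same arithmetic update_vm_cache() applies before copying into
 * each cached page. The MODEL_* constants are assumptions for illustration,
 * standing in for PAGE_SIZE/PAGE_MASK.
 */
#include <stdio.h>

#define MODEL_PAGE_SIZE 4096UL
#define MODEL_PAGE_MASK (~(MODEL_PAGE_SIZE - 1))

static void model_update_vm_cache(unsigned long pos, unsigned long count)
{
        unsigned long offset = pos & ~MODEL_PAGE_MASK;  /* offset within the first page */

        pos = pos & MODEL_PAGE_MASK;                    /* page-aligned position */
        while (count) {
                unsigned long len = MODEL_PAGE_SIZE - offset;

                if (len > count)
                        len = count;
                printf("copy %lu bytes into page at %#lx, offset %lu\n", len, pos, offset);
                count -= len;
                pos += MODEL_PAGE_SIZE;                 /* the next page starts at offset 0 */
                offset = 0;
        }
}

int main(void)
{
        model_update_vm_cache(6000, 5000);              /* spans two page boundaries */
        return 0;
}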
static inline void add_to_page_cache(struct page * page,
        struct inode * inode, unsigned long offset,
        struct page **hash)
{
        atomic_inc(&page->count);
        page->flags = (page->flags & ~((1 << PG_uptodate) | (1 << PG_error))) | (1 << PG_referenced);
        page->offset = offset;
        add_page_to_inode_queue(inode, page);
        __add_page_to_hash_queue(page, hash);
}
/*
 * Try to read ahead in the file. "page_cache" is a potentially free page
 * that we could use for the cache (if it is 0 we can try to create one,
 * this is all overlapped with the IO on the previous page finishing anyway)
 */
static unsigned long try_to_read_ahead(struct file * file,
        unsigned long offset, unsigned long page_cache)
{
        struct inode *inode = file->f_dentry->d_inode;
        ...
        switch (page_cache) {
        case 0:
                page_cache = __get_free_page(GFP_USER);
                ...
        default:
                if (offset >= inode->i_size)
                        break;
                hash = page_hash(inode, offset);
                page = __find_page(inode, offset, *hash);
                if (!page) {
                        /*
                         * Ok, add the new page to the hash-queues...
                         */
                        page = mem_map + MAP_NR(page_cache);
                        add_to_page_cache(page, inode, offset, hash);
                        inode->i_op->readpage(file, page);
                        page_cache = 0;
                }
                ...
        }
        return page_cache;
}
/*
 * Wait for IO to complete on a locked page.
 *
 * This must be called with the caller "holding" the page,
 * ie with increased "page->count" so that the page won't
 * go away during the wait..
 */
void __wait_on_page(struct page *page)
{
        struct task_struct *tsk = current;
        struct wait_queue wait;

        wait.task = tsk;
        add_wait_queue(&page->wait, &wait);
repeat:
        tsk->state = TASK_UNINTERRUPTIBLE;
        run_task_queue(&tq_disk);
        if (PageLocked(page)) {
                schedule();
                goto repeat;
        }
        tsk->state = TASK_RUNNING;
        remove_wait_queue(&page->wait, &wait);
}
#define PROFILE_READAHEAD
#define DEBUG_READAHEAD

/*
 * Read-ahead profiling information
 * --------------------------------
 * Every PROFILE_MAXREADCOUNT, the following information is written
 * to the syslog:
 *
 *	Percentage of asynchronous read-ahead.
 *	Average of read-ahead fields context value.
 *
 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
 * to the syslog as well.
 */

#ifdef PROFILE_READAHEAD

#define PROFILE_MAXREADCOUNT 1000

static unsigned long total_reada;
static unsigned long total_async;
static unsigned long total_ramax;
static unsigned long total_ralen;
static unsigned long total_rawin;

static void profile_readahead(int async, struct file *filp)
{
        unsigned long flags;

        ...
        total_ramax	+= filp->f_ramax;
        total_ralen	+= filp->f_ralen;
        total_rawin	+= filp->f_rawin;

        if (total_reada > PROFILE_MAXREADCOUNT) {
                ...
                if (!(total_reada > PROFILE_MAXREADCOUNT)) {
                        restore_flags(flags);
                        return;
                }

                printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
                        total_ramax/total_reada,
                        total_ralen/total_reada,
                        total_rawin/total_reada,
                        (total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
                printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
                        filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
#endif
                ...
                restore_flags(flags);
        }
}
#endif  /* defined PROFILE_READAHEAD */
/*
 * Read-ahead context:
 * -------------------
 * The read-ahead context fields of the "struct file" are the following:
 * - f_raend : position of the first byte after the last page we tried to
 *             read ahead.
 * - f_ramax : current read-ahead maximum size.
 * - f_ralen : length of the current IO read block we tried to read ahead.
 * - f_rawin : length of the current read-ahead window.
 *             If the last read-ahead was synchronous then
 *                 f_rawin = f_ralen
 *             otherwise (it was asynchronous)
 *                 f_rawin = previous value of f_ralen + f_ralen
 *
 * MIN_READAHEAD : minimum read-ahead size when reading ahead.
 * MAX_READAHEAD : maximum read-ahead size when reading ahead.
 *
 * Synchronous read-ahead benefits:
 * --------------------------------
 * Using a reasonable IO transfer length from peripheral devices increases
 * system performance.
 * Reasonable means, in this context, not too large but not too small.
 * The actual maximum value is:
 *	MAX_READAHEAD + PAGE_SIZE = 76k if CONFIG_READA_SMALL is undefined,
 *	and 32k if it is defined (4k page size assumed).
 *
 * Asynchronous read-ahead benefits:
 * ---------------------------------
 * Overlapping the next read request with the user process's execution
 * increases system performance.
 * We have to guess which further data will be needed by the user process.
 * If these data are often not really needed, the guessing is bad for system
 * performance.
 * However, we know that files are often accessed sequentially by
 * application programs, and it seems that it is possible to have a good
 * strategy for that guessing.
 * We only try to read ahead in files that seem to be read sequentially.
 *
 * Asynchronous read-ahead risks:
 * ------------------------------
 * In order to maximize overlapping, we must start some asynchronous read
 * requests from the device as soon as possible.
 * We must be very careful about:
 * - The number of effective pending IO read requests.
 *   ONE seems to be the only reasonable value.
 * - The total memory pool usage for the file access stream.
 *   This maximum memory usage is implicitly 2 IO read chunks:
 *	2*(MAX_READAHEAD + PAGE_SIZE) = 156k if CONFIG_READA_SMALL is undefined,
 *	64k if it is defined (4k page size assumed).
 */
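/*
 * Illustrative sketch (not part of the original file): a userspace model of
 * the read-ahead bookkeeping described above. The struct fields mirror the
 * f_raend/f_ramax/f_ralen/f_rawin fields; the update rules follow the
 * comment (synchronous: window = current block, asynchronous: window =
 * previous block + current block). Concrete numbers are assumptions chosen
 * only to show how the window grows for a sequential reader.
 */
#include <stdio.h>

#define MODEL_PAGE_SIZE 4096UL

struct model_ra {
        unsigned long raend;    /* first byte after the last page read ahead */
        unsigned long ramax;    /* current read-ahead maximum size */
        unsigned long ralen;    /* length of the current read-ahead IO block */
        unsigned long rawin;    /* length of the current read-ahead window */
};

/* Synchronous read-ahead: a new window starts at the faulting position. */
static void model_sync_readahead(struct model_ra *ra, unsigned long pos, unsigned long ahead)
{
        ra->ralen = MODEL_PAGE_SIZE + ahead;
        ra->raend = pos + ra->ralen;
        ra->rawin = ra->ralen;                  /* window = just this block */
}

/* Asynchronous read-ahead: extend the window past the previous block. */
static void model_async_readahead(struct model_ra *ra, unsigned long ahead)
{
        ra->rawin = ra->ralen;                  /* remember the previous block... */
        ra->ralen = ahead;
        ra->rawin += ra->ralen;                 /* ...so window = previous + current */
        ra->raend += ahead;
        ra->ramax += ra->ramax;                 /* double the max (capped elsewhere) */
}

int main(void)
{
        struct model_ra ra = { 0, 2 * MODEL_PAGE_SIZE, 0, 0 };

        model_sync_readahead(&ra, 0, ra.ramax);
        printf("after sync : raend=%lu ralen=%lu rawin=%lu\n", ra.raend, ra.ralen, ra.rawin);
        model_async_readahead(&ra, ra.ramax);
        printf("after async: raend=%lu ralen=%lu rawin=%lu\n", ra.raend, ra.ralen, ra.rawin);
        return 0;
}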
static inline int get_max_readahead(struct inode * inode)
{
        if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
                return MAX_READAHEAD;
        return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
}
static inline unsigned long generic_file_readahead(int reada_ok,
        struct file * filp, struct inode * inode,
        unsigned long ppos, struct page * page, unsigned long page_cache)
{
        unsigned long max_ahead, ahead;
        unsigned long raend;
        int max_readahead = get_max_readahead(inode);

        raend = filp->f_raend & PAGE_MASK;
        max_ahead = 0;

        /*
         * The current page is locked.
         * If the current position is inside the previous read IO request, do not
         * try to reread previously read-ahead pages.
         * Otherwise decide whether or not to read ahead some pages synchronously.
         * If we are not going to read ahead, set the read-ahead context for this
         * page only.
         */
        if (PageLocked(page)) {
                if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
                        ...
                        if (raend < inode->i_size)
                                max_ahead = filp->f_ramax;
                        ...
                        filp->f_ralen = PAGE_SIZE;
                        ...
                        filp->f_raend = ppos + filp->f_ralen;
                        filp->f_rawin += filp->f_ralen;
                        ...
                }
        }
        /*
         * The current page is not locked.
         * If we were reading ahead and,
         * if the current max read-ahead size is not zero and,
         * if the current position is inside the last read-ahead IO request,
         * it is the moment to try to read ahead asynchronously.
         * We will later force an unplug of the device in order to force the
         * asynchronous read IO.
         */
        else if (reada_ok && filp->f_ramax && raend >= PAGE_SIZE &&
                 ppos <= raend && ppos + filp->f_ralen >= raend) {
                /*
                 * Add ONE page to max_ahead in order to try to have about the same IO max size
                 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_SIZE.
                 * Compute the position of the last page we have tried to read in order to
                 * begin to read ahead just at the next page.
                 */
                ...
                if (raend < inode->i_size)
                        max_ahead = filp->f_ramax + PAGE_SIZE;
                ...
                filp->f_rawin = filp->f_ralen;
                ...
        }
        /*
         * Try to read ahead pages.
         * We hope that ll_rw_blk() plug/unplug, coalescence, request sorting and
         * the scheduler will work well enough for us to avoid too many bad actual
         * IO requests.
         */
        ahead = 0;
        while (ahead < max_ahead) {
                ahead += PAGE_SIZE;
                page_cache = try_to_read_ahead(filp, raend + ahead, page_cache);
        }
        /*
         * If we tried to read ahead some pages,
         * and if we tried to read ahead asynchronously,
         * try to force an unplug of the device in order to start the asynchronous
         * read IO request.
         * Update the read-ahead context.
         * Store the length of the current read-ahead window.
         * Double the current max read-ahead size.
         * This heuristic avoids doing large IO for files that are not really
         * accessed sequentially.
         */
        if (ahead) {
                if (reada_ok == 2)
                        run_task_queue(&tq_disk);

                filp->f_ralen += ahead;
                filp->f_rawin += filp->f_ralen;
                filp->f_raend = raend + ahead + PAGE_SIZE;

                filp->f_ramax += filp->f_ramax;

                if (filp->f_ramax > max_readahead)
                        filp->f_ramax = max_readahead;

#ifdef PROFILE_READAHEAD
                profile_readahead((reada_ok == 2), filp);
#endif
        }

        return page_cache;
}
536 * "descriptor" for what we're up to with a read.
537 * This allows us to use the same read code yet
538 * have multiple different users of the data that
539 * we read from a file.
541 * The simplest case just copies the data to user
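/*
 * Sketch (an assumption, reconstructed from how "desc" is used throughout
 * this file rather than copied from the header): the descriptor bundles the
 * bytes still wanted, the bytes already produced, the destination (a user
 * buffer for read(), or a "struct file *" smuggled through "buf" for
 * sendfile()), and a sticky error code for the caller.
 */
typedef struct {
        size_t written;         /* bytes handed to the consumer so far */
        size_t count;           /* bytes still wanted */
        char * buf;             /* user buffer, or an output struct file * */
        int error;              /* error code, 0 if none */
} read_descriptor_t;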
typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long);
/*
 * This is a generic file read routine, and uses the
 * inode->i_op->readpage() function for the actual low-level
 * stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 */
static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
{
        struct dentry *dentry = filp->f_dentry;
        struct inode *inode = dentry->d_inode;
        size_t pos, pgpos, page_cache;
        int reada_ok;
        int max_readahead = get_max_readahead(inode);

        page_cache = 0;

        pos = *ppos;
        pgpos = pos & PAGE_MASK;

        /*
         * If the current position is outside the previous read-ahead window,
         * we reset the current read-ahead context and set read-ahead max to zero
         * (it will be set to just the needed value later);
         * otherwise, we assume that the file accesses are sequential enough to
         * continue read-ahead.
         */
        if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) {
                reada_ok = 0;
                ...
        } else {
                reada_ok = 1;
        }

        /*
         * Adjust the current value of read-ahead max.
         * If the read operation stays within the first half page, force no readahead.
         * Otherwise try to increase read-ahead max just enough to do the read request.
         * Then, at least MIN_READAHEAD if read-ahead is ok,
         * and at most MAX_READAHEAD in all cases.
         */
        if (pos + desc->count <= (PAGE_SIZE >> 1)) {
                filp->f_ramax = 0;
        } else {
                unsigned long needed;

                needed = ((pos + desc->count) & PAGE_MASK) - pgpos;

                if (filp->f_ramax < needed)
                        filp->f_ramax = needed;

                if (reada_ok && filp->f_ramax < MIN_READAHEAD)
                        filp->f_ramax = MIN_READAHEAD;
                if (filp->f_ramax > max_readahead)
                        filp->f_ramax = max_readahead;
        }

        for (;;) {
                struct page *page, **hash;

                if (pos >= inode->i_size)
                        break;

                /*
                 * Try to find the data in the page cache..
                 */
                hash = page_hash(inode, pos & PAGE_MASK);
                page = __find_page(inode, pos & PAGE_MASK, *hash);
                if (!page)
                        goto no_cached_page;
                ...

                /*
                 * Try to read ahead only if the current page is filled or being filled.
                 * Otherwise, if we were reading ahead, decrease max read-ahead size to
                 * the minimum value.
                 * In this context, that seems to happen only on a read error or if
                 * the page has been rewritten.
                 */
                if (PageUptodate(page) || PageLocked(page))
                        page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_MASK, page, page_cache);
                else if (reada_ok && filp->f_ramax > MIN_READAHEAD)
                        filp->f_ramax = MIN_READAHEAD;

                ...
                if (!PageUptodate(page))
                        goto page_read_error;
                ...

                /*
                 * Ok, we have the page, it's up-to-date and ok,
                 * so now we can finally copy it to user space...
                 */
                {
                        unsigned long offset, nr;

                        offset = pos & ~PAGE_MASK;
                        nr = PAGE_SIZE - offset;
                        if (nr > inode->i_size - pos)
                                nr = inode->i_size - pos;

                        /*
                         * The actor routine returns how many bytes were actually used..
                         * NOTE! This may not be the same as how much of a user buffer
                         * we filled up (we may be padding etc), so we can only update
                         * "pos" here (the actor routine has to update the user buffer
                         * pointers and the remaining count).
                         */
                        nr = actor(desc, (const char *) (page_address(page) + offset), nr);
                        pos += nr;
                        release_page(page);
                        if (nr && desc->count)
                                continue;
                        break;
                }

no_cached_page:
                /*
                 * Ok, it wasn't cached, so we need to create a new
                 * page..
                 */
                if (!page_cache) {
                        page_cache = __get_free_page(GFP_USER);
                        /*
                         * That could have slept, so go around to the
                         * top of the loop and look in the page cache again..
                         */
                        if (page_cache)
                                continue;
                        desc->error = -ENOMEM;
                        break;
                }

                /*
                 * Ok, add the new page to the hash-queues...
                 */
                page = mem_map + MAP_NR(page_cache);
                page_cache = 0;
                add_to_page_cache(page, inode, pos & PAGE_MASK, hash);

                /*
                 * Error handling is tricky. If we get a read error,
                 * the cached page stays in the cache (but uptodate=0),
                 * and the next process that accesses it will try to
                 * re-read it. This is needed for NFS etc, where the
                 * identity of the reader can decide if we can read the
                 * page or not..
                 */
                /*
                 * We have to read the page.
                 * If we were reading ahead, we had previously tried to read this page;
                 * that means the page has probably been removed from the cache before
                 * the application process needed it, or has been rewritten.
                 * Decrease max readahead size to the minimum value in that situation.
                 */
                if (reada_ok && filp->f_ramax > MIN_READAHEAD)
                        filp->f_ramax = MIN_READAHEAD;

                {
                        int error = inode->i_op->readpage(filp, page);
                        ...
                }

page_read_error:
                /*
                 * We found the page, but it wasn't up-to-date.
                 * Try to re-read it _once_. We do this synchronously,
                 * because this happens only if there were errors.
                 */
                {
                        int error = inode->i_op->readpage(filp, page);
                        ...
                        if (PageUptodate(page) && !PageError(page))
                                ...
                        error = -EIO;	/* Some unspecified error occurred.. */
                        desc->error = error;
                        release_page(page);
                        break;
                }
        }

        *ppos = pos;
        ...
        if (page_cache)
                free_page(page_cache);
        ...
}
static int file_read_actor(read_descriptor_t * desc, const char *area, unsigned long size)
{
        unsigned long left;
        unsigned long count = desc->count;

        if (size > count)
                size = count;
        left = __copy_to_user(desc->buf, area, size);
        if (left) {
                size -= left;
                desc->error = -EFAULT;
        }
        desc->count = count - size;
        desc->written += size;
        desc->buf += size;
        return size;
}
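/*
 * Illustrative sketch (not part of the original file): a minimal actor that
 * only counts bytes, showing the read_actor_t contract used by
 * do_generic_file_read() - consume up to "size" bytes from "area", update
 * desc->count and desc->written, and return how many bytes were used.
 */
static int count_only_actor(read_descriptor_t * desc, const char *area, unsigned long size)
{
        if (size > desc->count)
                size = desc->count;
        (void) area;                    /* a real actor would copy or write the data */
        desc->count -= size;
        desc->written += size;
        return size;
}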
/*
 * This is the "read()" routine for all filesystems
 * that can use the page cache directly.
 */
ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
{
        ssize_t retval;

        ...
        if (access_ok(VERIFY_WRITE, buf, count)) {
                read_descriptor_t desc;

                desc.written = 0;
                desc.count = count;
                desc.buf = buf;
                desc.error = 0;
                do_generic_file_read(filp, ppos, &desc, file_read_actor);

                retval = desc.written;
                if (!retval)
                        retval = desc.error;
                ...
        }
        return retval;
}
static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned long size)
{
        ssize_t written;
        unsigned long count = desc->count;
        struct file *file = (struct file *) desc->buf;
        struct inode *inode = file->f_dentry->d_inode;

        if (size > count)
                size = count;
        ...
        written = file->f_op->write(file, area, size, &file->f_pos);
        ...
        if (written < 0) {
                desc->error = written;
                written = 0;
        }
        desc->count = count - written;
        desc->written += written;
        return written;
}
asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
{
        ssize_t retval;
        struct file * in_file, * out_file;
        struct inode * in_inode, * out_inode;

        /*
         * Get input file, and verify that it is ok..
         */
        in_file = fget(in_fd);
        ...
        if (!(in_file->f_mode & FMODE_READ))
                ...
        in_inode = in_file->f_dentry->d_inode;
        ...
        if (!in_inode->i_op || !in_inode->i_op->readpage)
                ...
        retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
        ...

        /*
         * Get output file, and verify that it is ok..
         */
        out_file = fget(out_fd);
        ...
        if (!(out_file->f_mode & FMODE_WRITE))
                ...
        if (!out_file->f_op || !out_file->f_op->write)
                ...
        out_inode = out_file->f_dentry->d_inode;
        ...
        retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
        ...

        if (count) {
                read_descriptor_t desc;
                loff_t pos = 0, *ppos;

                ppos = &in_file->f_pos;
                if (offset) {
                        if (get_user(pos, offset))
                                ...
                        ppos = &pos;
                }

                desc.written = 0;
                desc.count = count;
                desc.buf = (char *) out_file;
                desc.error = 0;
                do_generic_file_read(in_file, ppos, &desc, file_send_actor);

                retval = desc.written;
                if (!retval)
                        retval = desc.error;
                if (offset)
                        put_user(pos, offset);
        }
        ...
        return retval;
}
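/*
 * Illustrative sketch (not part of the original file): a userspace caller of
 * the sendfile() system call implemented above. It copies bytes from an
 * input file to an output descriptor that has a write method (per the checks
 * above) without bouncing the data through a user buffer. The file names are
 * placeholders.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/sendfile.h>

int main(void)
{
        int in_fd = open("input.dat", O_RDONLY);
        int out_fd = open("copy.dat", O_WRONLY | O_CREAT | O_TRUNC, 0644);
        off_t offset = 0;               /* read from the start; updated by the kernel */
        ssize_t n;

        if (in_fd < 0 || out_fd < 0) {
                perror("open");
                return 1;
        }
        /* The kernel reads through the page cache and feeds file_send_actor(). */
        n = sendfile(out_fd, in_fd, &offset, 65536);
        if (n < 0)
                perror("sendfile");
        else
                printf("copied %zd bytes, offset now %lld\n", n, (long long) offset);
        close(in_fd);
        close(out_fd);
        return 0;
}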
/*
 * Semantics for shared and private memory areas are different past the end
 * of the file. A shared mapping past the last page of the file is an error
 * and results in a SIGBUS, while a private mapping just maps in a zero page.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 *
 * WSH 06/04/97: fixed a memory leak and moved the allocation of new_page
 * ahead of the wait if we're sure to need it.
 */
static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
{
        struct file * file = area->vm_file;
        struct dentry * dentry = file->f_dentry;
        struct inode * inode = dentry->d_inode;
        unsigned long offset, reada, i;
        struct page * page, **hash;
        unsigned long old_page, new_page;

        new_page = 0;
        offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
        if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
                goto no_page;

        /*
         * Do we have something in the page cache already?
         */
        hash = page_hash(inode, offset);
        page = __find_page(inode, offset, *hash);
        if (!page)
                goto no_cached_page;

found_page:
        /*
         * Ok, found a page in the page cache, now we need to check
         * that it's up-to-date. First check whether we'll need an
         * extra page -- better to overlap the allocation with the I/O.
         */
        if (no_share && !new_page) {
                new_page = __get_free_page(GFP_USER);
                ...
        }

        if (PageLocked(page))
                goto page_locked_wait;
        if (!PageUptodate(page))
                goto page_read_error;

success:
        /*
         * Found the page, need to check sharing and possibly
         * copy it over to another page..
         */
        old_page = page_address(page);
        if (!no_share) {
                /*
                 * Ok, we can share the cached page directly.. Get rid
                 * of any potential extra pages.
                 */
                if (new_page)
                        free_page(new_page);

                flush_page_to_ram(old_page);
                return old_page;
        }

        /*
         * No sharing ... copy to the new page.
         */
        copy_page(new_page, old_page);
        flush_page_to_ram(new_page);
        release_page(page);
        return new_page;

no_cached_page:
        /*
         * Try to read in an entire cluster at once.
         */
        reada = offset;
        reada >>= PAGE_SHIFT + page_cluster;
        reada <<= PAGE_SHIFT + page_cluster;

        for (i = 1 << page_cluster; i > 0; --i, reada += PAGE_SIZE)
                new_page = try_to_read_ahead(file, reada, new_page);

        if (!new_page)
                new_page = __get_free_page(GFP_USER);
        if (!new_page)
                goto no_page;

        /*
         * During getting the above page we might have slept,
         * so we need to re-check the situation with the page
         * cache.. The page we just got may be useful if we
         * can't share, so don't get rid of it here.
         */
        page = find_page(inode, offset);
        if (page)
                goto found_page;

        /*
         * Now, create a new page-cache page from the page we got
         */
        page = mem_map + MAP_NR(new_page);
        new_page = 0;
        add_to_page_cache(page, inode, offset, hash);

        if (inode->i_op->readpage(file, page) != 0)
                goto failure;

        goto found_page;

page_locked_wait:
        __wait_on_page(page);
        if (PageUptodate(page))
                goto success;

page_read_error:
        /*
         * Umm, take care of errors if the page isn't up-to-date.
         * Try to re-read it _once_. We do this synchronously,
         * because there really aren't any performance issues here
         * and we need to check for errors.
         */
        if (inode->i_op->readpage(file, page) != 0)
                goto failure;
        wait_on_page(page);
        if (PageError(page))
                goto failure;
        if (PageUptodate(page))
                goto success;

        /*
         * Things didn't work out. Return zero to tell the
         * mm layer so, possibly freeing the page cache page first.
         */
failure:
        release_page(page);
        if (new_page)
                free_page(new_page);
no_page:
        return 0;
}
/*
 * Tries to write a shared mapped page to its backing store. May return -EIO
 * if the disk is full.
 */
static inline int do_write_page(struct inode * inode, struct file * file,
        const char * page, unsigned long offset)
{
        int retval;
        unsigned long size;
        loff_t loff = offset;
        mm_segment_t old_fs;

        size = offset + PAGE_SIZE;
        /* refuse to extend file size.. */
        if (S_ISREG(inode->i_mode)) {
                if (size > inode->i_size)
                        size = inode->i_size;
                /* Ho humm.. We should have tested for this earlier */
                if (size < offset)
                        return -EIO;
        }
        size -= offset;
        old_fs = get_fs();
        set_fs(KERNEL_DS);
        retval = -EIO;
        if (size == file->f_op->write(file, (const char *) page, size, &loff))
                retval = 0;
        set_fs(old_fs);
        return retval;
}
static int filemap_write_page(struct vm_area_struct * vma,
        unsigned long offset,
        unsigned long page)
{
        int result;
        struct file * file;
        struct dentry * dentry;
        struct inode * inode;

        file = vma->vm_file;
        dentry = file->f_dentry;
        inode = dentry->d_inode;
        if (!file->f_op->write)
                return -EIO;

        /*
         * If a task terminates while we're swapping the page, the vma and
         * file could be released ... increment the count to be safe.
         */
        ...
        down(&inode->i_sem);
        result = do_write_page(inode, file, (const char *) page, offset);
        up(&inode->i_sem);
        ...
        return result;
}
/*
 * The page cache takes care of races between somebody
 * trying to swap something out and swap something in
 * at the same time..
 */
int filemap_swapout(struct vm_area_struct * vma, struct page * page)
{
        return filemap_write_page(vma, page->offset, page_address(page));
}
static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
        unsigned long address, unsigned int flags)
{
        pte_t pte = *ptep;
        unsigned long page;
        int error;

        if (!(flags & MS_INVALIDATE)) {
                if (!pte_present(pte))
                        return 0;
                if (!pte_dirty(pte))
                        return 0;
                flush_page_to_ram(pte_page(pte));
                flush_cache_page(vma, address);
                set_pte(ptep, pte_mkclean(pte));
                flush_tlb_page(vma, address);
                page = pte_page(pte);
                atomic_inc(&mem_map[MAP_NR(page)].count);
        } else {
                ...
                flush_cache_page(vma, address);
                ...
                flush_tlb_page(vma, address);
                if (!pte_present(pte)) {
                        swap_free(pte_val(pte));
                        return 0;
                }
                page = pte_page(pte);
                if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
                        free_page(page);
                        return 0;
                }
        }
        error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
        free_page(page);
        return error;
}
static inline int filemap_sync_pte_range(pmd_t * pmd,
        unsigned long address, unsigned long size,
        struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
{
        pte_t * pte;
        unsigned long end;
        int error;

        if (pmd_none(*pmd))
                return 0;
        if (pmd_bad(*pmd)) {
                printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
                pmd_clear(pmd);
                return 0;
        }
        pte = pte_offset(pmd, address);
        offset += address & PMD_MASK;
        address &= ~PMD_MASK;
        end = address + size;
        if (end > PMD_SIZE)
                end = PMD_SIZE;
        error = 0;
        do {
                error |= filemap_sync_pte(pte, vma, address + offset, flags);
                address += PAGE_SIZE;
                pte++;
        } while (address < end);
        return error;
}
static inline int filemap_sync_pmd_range(pgd_t * pgd,
        unsigned long address, unsigned long size,
        struct vm_area_struct *vma, unsigned int flags)
{
        pmd_t * pmd;
        unsigned long offset, end;
        int error;

        if (pgd_none(*pgd))
                return 0;
        if (pgd_bad(*pgd)) {
                printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
                pgd_clear(pgd);
                return 0;
        }
        pmd = pmd_offset(pgd, address);
        offset = address & PGDIR_MASK;
        address &= ~PGDIR_MASK;
        end = address + size;
        if (end > PGDIR_SIZE)
                end = PGDIR_SIZE;
        error = 0;
        do {
                error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address < end);
        return error;
}
static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
        size_t size, unsigned int flags)
{
        pgd_t * dir;
        unsigned long end = address + size;
        int error = 0;

        dir = pgd_offset(vma->vm_mm, address);
        flush_cache_range(vma->vm_mm, end - size, end);
        while (address < end) {
                error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                dir++;
        }
        flush_tlb_range(vma->vm_mm, end - size, end);
        return error;
}
/*
 * This handles (potentially partial) area unmaps..
 */
static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
{
        filemap_sync(vma, start, len, MS_ASYNC);
}
/*
 * Shared mappings need to be able to do the right thing at
 * close/unmap/sync. They will also use the private file as
 * backing-store for swapping..
 */
static struct vm_operations_struct file_shared_mmap = {
        NULL,                   /* no special open */
        NULL,                   /* no special close */
        filemap_unmap,          /* unmap - we need to sync the pages */
        NULL,                   /* no special protect */
        filemap_sync,           /* sync */
        NULL,                   /* advise */
        filemap_nopage,         /* nopage */
        NULL,                   /* wppage */
        filemap_swapout,        /* swapout */
        NULL,                   /* swapin */
};
/*
 * Private mappings just need to be able to load in the map.
 *
 * (This is actually used for shared mappings as well, if we
 * know they can't ever get write permissions..)
 */
static struct vm_operations_struct file_private_mmap = {
        NULL,                   /* open */
        NULL,                   /* close */
        NULL,                   /* unmap */
        NULL,                   /* protect */
        NULL,                   /* sync */
        NULL,                   /* advise */
        filemap_nopage,         /* nopage */
        NULL,                   /* wppage */
        NULL,                   /* swapout */
        NULL,                   /* swapin */
};
/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
        struct vm_operations_struct * ops;
        struct inode *inode = file->f_dentry->d_inode;

        if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
                ops = &file_shared_mmap;
                /* share_page() can only guarantee proper page sharing if
                 * the offsets are all page aligned. */
                if (vma->vm_offset & (PAGE_SIZE - 1))
                        return -EINVAL;
        } else {
                ops = &file_private_mmap;
                if (vma->vm_offset & (inode->i_sb->s_blocksize - 1))
                        return -EINVAL;
        }
        if (!inode->i_sb || !S_ISREG(inode->i_mode))
                return -EACCES;
        if (!inode->i_op || !inode->i_op->readpage)
                return -ENOEXEC;
        UPDATE_ATIME(inode);
        vma->vm_file = file;
        ...
        vma->vm_ops = ops;
        return 0;
}
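/*
 * Illustrative sketch (not part of the original file): a userspace mapping
 * that ends up going through generic_file_mmap() and filemap_nopage() - the
 * first touch of each page faults it in through the page cache. The file
 * name is a placeholder.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>

int main(void)
{
        int fd = open("input.dat", O_RDONLY);
        struct stat st;
        char *map;

        if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0) {
                perror("open/fstat");
                return 1;
        }
        /* Private read-only mapping: uses the file_private_mmap operations. */
        map = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
        if (map == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        printf("first byte: 0x%02x\n", (unsigned char) map[0]); /* faults in page 0 */
        munmap(map, st.st_size);
        close(fd);
        return 0;
}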
/*
 * The msync() system call.
 */

static int msync_interval(struct vm_area_struct * vma,
        unsigned long start, unsigned long end, int flags)
{
        if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
                int error;
                error = vma->vm_ops->sync(vma, start, end-start, flags);
                if (!error && (flags & MS_SYNC)) {
                        struct file * file = vma->vm_file;
                        if (file) {
                                struct dentry * dentry = file->f_dentry;
                                struct inode * inode = dentry->d_inode;
                                down(&inode->i_sem);
                                error = file_fsync(file, dentry);
                                up(&inode->i_sem);
                        }
                }
                return error;
        }
        return 0;
}
asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
{
        unsigned long end;
        struct vm_area_struct * vma;
        int unmapped_error, error = -EINVAL;

        down(&current->mm->mmap_sem);
        if (start & ~PAGE_MASK)
                goto out;
        len = (len + ~PAGE_MASK) & PAGE_MASK;
        end = start + len;
        ...
        if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
                goto out;
        ...
        /*
         * If the interval [start,end) covers some unmapped address ranges,
         * just ignore them, but return -EFAULT at the end.
         */
        vma = find_vma(current->mm, start);
        unmapped_error = 0;
        for (;;) {
                /* Still start < end. */
                error = -EFAULT;
                if (!vma)
                        goto out;
                /* Here start < vma->vm_end. */
                if (start < vma->vm_start) {
                        unmapped_error = -EFAULT;
                        start = vma->vm_start;
                }
                /* Here vma->vm_start <= start < vma->vm_end. */
                if (end <= vma->vm_end) {
                        if (start < end) {
                                error = msync_interval(vma, start, end, flags);
                                if (error)
                                        goto out;
                        }
                        error = unmapped_error;
                        goto out;
                }
                /* Here vma->vm_start <= start < vma->vm_end < end. */
                error = msync_interval(vma, start, vma->vm_end, flags);
                if (error)
                        goto out;
                start = vma->vm_end;
                vma = vma->vm_next;
        }
out:
        up(&current->mm->mmap_sem);
        return error;
}
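/*
 * Illustrative sketch (not part of the original file): flushing a shared
 * file mapping with msync(), which reaches msync_interval()/filemap_sync()
 * above. The file name is a placeholder and the file is assumed to be at
 * least one page long.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
        long page = sysconf(_SC_PAGESIZE);
        int fd = open("data.bin", O_RDWR);
        char *map;

        if (fd < 0)
                return 1;
        map = mmap(NULL, page, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (map == MAP_FAILED)
                return 1;
        map[0] = 'x';                                   /* dirty the first page */
        if (msync(map, page, MS_SYNC) < 0)              /* write it back before returning */
                perror("msync");
        munmap(map, page);
        close(fd);
        return 0;
}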
/*
 * Write to a file through the page cache. This is mainly for the
 * benefit of NFS and possibly other network-based file systems.
 *
 * We currently put everything into the page cache prior to writing it.
 * This is not a problem when writing full pages. With partial pages,
 * however, we first have to read the data into the cache, then
 * dirty the page, and finally schedule it for writing. Alternatively, we
 * could write-through just the portion of data that would go into that
 * page, but that would kill performance for applications that write data
 * line by line, and it's prone to race conditions.
 *
 * Note that this routine doesn't try to keep track of dirty pages. Each
 * file system has to do this all by itself, unfortunately.
 */
ssize_t
generic_file_write(struct file *file, const char *buf,
                   size_t count, loff_t *ppos)
{
        struct dentry *dentry = file->f_dentry;
        struct inode *inode = dentry->d_inode;
        unsigned long pos = *ppos;
        unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
        struct page *page, **hash;
        unsigned long page_cache = 0;
        unsigned long written;
        long status = 0, sync;

        if (!inode->i_op || !inode->i_op->updatepage)
                return -EIO;

        sync = file->f_flags & O_SYNC;
        written = 0;

        if (file->f_flags & O_APPEND)
                pos = inode->i_size;

        /*
         * Check whether we've reached the file size limit.
         */
        if (pos >= limit) {
                send_sig(SIGXFSZ, current, 0);
                return -EFBIG;
        }

        /*
         * Check whether to truncate the write,
         * and send the signal if we do.
         */
        if (count > limit - pos) {
                send_sig(SIGXFSZ, current, 0);
                count = limit - pos;
        }

        while (count) {
                unsigned long bytes, pgpos, offset;
                /*
                 * Try to find the page in the cache. If it isn't there,
                 * allocate a free page.
                 */
                offset = (pos & ~PAGE_MASK);
                pgpos = pos & PAGE_MASK;
                bytes = PAGE_SIZE - offset;
                if (bytes > count)
                        bytes = count;

                hash = page_hash(inode, pgpos);
                page = __find_page(inode, pgpos, *hash);
                if (!page) {
                        if (!page_cache) {
                                page_cache = __get_free_page(GFP_USER);
                                if (page_cache)
                                        continue;
                                status = -ENOMEM;
                                break;
                        }
                        page = mem_map + MAP_NR(page_cache);
                        add_to_page_cache(page, inode, pgpos, hash);
                        page_cache = 0;
                }

                /* Get exclusive IO access to the page.. */
                ...
                set_bit(PG_locked, &page->flags);

                /*
                 * Do the real work.. If the writer ends up delaying the write,
                 * the writer needs to increment the page use counts until he
                 * is done with the page.
                 */
                bytes -= copy_from_user((u8 *)page_address(page) + offset, buf, bytes);
                ...
                status = inode->i_op->updatepage(file, page, offset, bytes, sync);

                /* Mark it unlocked again and drop the page.. */
                clear_bit(PG_locked, &page->flags);
                wake_up(&page->wait);
                ...

                if (status < 0)
                        break;

                written += status;
                count -= status;
                pos += status;
                buf += status;
        }
        *ppos = pos;
        if (pos > inode->i_size)
                inode->i_size = pos;

        if (page_cache)
                free_page(page_cache);
        return written ? written : status;
}
/*
 * Support routines for directory caching using the page cache.
 */

/*
 * Finds the page at the specified offset, installing a new page
 * if requested. The count is incremented and the page is locked.
 *
 * Note: we don't have to worry about races here, as the caller
 * is holding the inode semaphore.
 */
unsigned long get_cached_page(struct inode * inode, unsigned long offset,
                                int new)
{
        struct page * page;
        struct page ** hash;
        unsigned long page_cache = 0;

        hash = page_hash(inode, offset);
        page = __find_page(inode, offset, *hash);
        if (!page) {
                if (!new)
                        goto out;
                page_cache = get_free_page(GFP_USER);
                if (!page_cache)
                        goto out;
                page = mem_map + MAP_NR(page_cache);
                add_to_page_cache(page, inode, offset, hash);
        }
        if (atomic_read(&page->count) != 2)
                printk(KERN_ERR "get_cached_page: page count=%d\n",
                        atomic_read(&page->count));
        if (test_bit(PG_locked, &page->flags))
                printk(KERN_ERR "get_cached_page: page already locked!\n");
        set_bit(PG_locked, &page->flags);
        page_cache = page_address(page);

out:
        return page_cache;
}
/*
 * Unlock and free a page.
 */
void put_cached_page(unsigned long addr)
{
        struct page * page = mem_map + MAP_NR(addr);

        if (!test_bit(PG_locked, &page->flags))
                printk("put_cached_page: page not locked!\n");
        if (atomic_read(&page->count) != 2)
                printk("put_cached_page: page count=%d\n",
                        atomic_read(&page->count));
        clear_bit(PG_locked, &page->flags);
        wake_up(&page->wait);
        __free_page(page);
}