/*
 *	linux/mm/filemap.c
 *
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/malloc.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/locks.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/swapctl.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/mm.h>

#include <asm/pgalloc.h>
#include <asm/uaccess.h>

#include <linux/highmem.h>

/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 */

atomic_t page_cache_size = ATOMIC_INIT(0);
unsigned int page_hash_bits;
struct page **page_hash_table;

spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
/*
 * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
 *       the pagemap_lru_lock held.
 */
spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;

#define CLUSTER_PAGES		(1 << page_cluster)
#define CLUSTER_OFFSET(x)	(((x) >> page_cluster) << page_cluster)
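
/*
 * Read-ahead clustering: CLUSTER_PAGES is the number of pages read in one
 * go (1 << page_cluster) and CLUSTER_OFFSET() rounds an index down to the
 * start of its cluster.  With 4K pages, page_cluster = 4 gives the
 * 64k-aligned clusters mentioned in read_cluster_nonblocking() below.
 */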

void __add_page_to_hash_queue(struct page * page, struct page **p)
{
	atomic_inc(&page_cache_size);
	if((page->next_hash = *p) != NULL)
		(*p)->pprev_hash = &page->next_hash;
	*p = page;
	page->pprev_hash = p;
	if (page->buffers)
		PAGE_BUG(page);
}

static void remove_page_from_hash_queue(struct page * page)
{
	if(page->pprev_hash) {
		if(page->next_hash)
			page->next_hash->pprev_hash = page->pprev_hash;
		*page->pprev_hash = page->next_hash;
		page->pprev_hash = NULL;
	}
	atomic_dec(&page_cache_size);
}
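
/*
 * Both hash-queue helpers above are called with the pagecache_lock held;
 * page_cache_size counts every page currently hashed into the page cache.
 */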

/*
 * Remove a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
 * is safe.
 */
void remove_inode_page(struct page *page)
{
	if (!PageLocked(page))
		PAGE_BUG(page);

	spin_lock(&pagecache_lock);
	remove_page_from_inode_queue(page);
	remove_page_from_hash_queue(page);
	page->mapping = NULL;
	spin_unlock(&pagecache_lock);
}
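
/*
 * invalidate_inode_pages() throws away every unlocked page of the mapping
 * without writing it back; locked pages (e.g. pages under I/O) are simply
 * skipped.
 */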

void invalidate_inode_pages(struct inode * inode)
{
	struct list_head *head, *curr;
	struct page * page;

	head = &inode->i_data.pages;
	spin_lock(&pagecache_lock);
	curr = head->next;

	while (curr != head) {
		page = list_entry(curr, struct page, list);
		curr = curr->next;

		/* We cannot invalidate a locked page */
		if (PageLocked(page))
			continue;

		lru_cache_del(page);

		remove_page_from_inode_queue(page);
		remove_page_from_hash_queue(page);
		page->mapping = NULL;
		page_cache_release(page);
	}
	spin_unlock(&pagecache_lock);
}

/*
 * Truncate the page cache at a set offset, removing the pages
 * that are beyond that offset (and zeroing out partial pages).
 */
void truncate_inode_pages(struct inode * inode, loff_t lstart)
{
	struct list_head *head, *curr;
	struct page * page;
	unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
	unsigned long start;

	start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

repeat:
	head = &inode->i_data.pages;
	spin_lock(&pagecache_lock);
	curr = head->next;
	while (curr != head) {
		unsigned long offset;

		page = list_entry(curr, struct page, list);
		curr = curr->next;

		offset = page->index;

		/* page wholly truncated - free it */
		if (offset >= start) {
			get_page(page);
			spin_unlock(&pagecache_lock);

			lock_page(page);

			if (!page->buffers || block_flushpage(page, 0))
				lru_cache_del(page);

			/*
			 * We remove the page from the page cache
			 * _after_ we have destroyed all buffer-cache
			 * references to it. Otherwise some other process
			 * might think this inode page is not in the
			 * page cache and creates a buffer-cache alias
			 * to it causing all sorts of fun problems ...
			 */
			remove_inode_page(page);

			UnlockPage(page);
			page_cache_release(page);
			page_cache_release(page);

			/*
			 * We have done things without the pagecache lock,
			 * so we'll have to repeat the scan.
			 * It's not possible to deadlock here because
			 * we are guaranteed to make progress. (ie. we have
			 * just removed a page)
			 */
			goto repeat;
		}
		/*
		 * there is only one partial page possible.
		 */
		if (!partial)
			continue;

		/* and it's the one preceding the first wholly truncated page */
		if ((offset + 1) != start)
			continue;

		/* partial truncate, clear end of page */
		get_page(page);
		spin_unlock(&pagecache_lock);

		lock_page(page);

		memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
		if (page->buffers)
			block_flushpage(page, partial);

		partial = 0;

		/*
		 * we have dropped the spinlock so we have to
		 * restart.
		 */
		UnlockPage(page);
		page_cache_release(page);
		goto repeat;
	}
	spin_unlock(&pagecache_lock);
}
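
/*
 * shrink_mmap() scans the LRU list from the oldest end and tries to free
 * clean page-cache, swap-cache and buffer pages.  Pages that are kept get
 * sorted onto the local "young"/"old" lists and are spliced back into the
 * LRU before returning; the return value says whether any progress was made.
 */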

int shrink_mmap(int priority, int gfp_mask, zone_t *zone)
{
	int ret = 0, count;
	LIST_HEAD(young);
	LIST_HEAD(old);
	LIST_HEAD(forget);
	struct list_head * page_lru, * dispose;
	struct page * page;

	count = nr_lru_pages / (priority+1);

	spin_lock(&pagemap_lru_lock);

	while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
		page = list_entry(page_lru, struct page, lru);
		list_del(page_lru);

		dispose = &lru_cache;
		if (test_and_clear_bit(PG_referenced, &page->flags))
			/* Roll the page at the top of the lru list,
			 * we could also be more aggressive putting
			 * the page in the young-dispose-list, so
			 * avoiding to free young pages in each pass.
			 */
			goto dispose_continue;

		dispose = &old;
		/* don't account passes over not DMA pages */
		if (zone && (!memclass(page->zone, zone)))
			goto dispose_continue;

		count--;

		dispose = &young;
		if (TryLockPage(page))
			goto dispose_continue;

		/* Release the pagemap_lru lock even if the page is not yet
		   queued in any lru queue since we have just locked down
		   the page so nobody else may SMP race with us running
		   a lru_cache_del() (lru_cache_del() always runs with the
		   page locked down ;). */
		spin_unlock(&pagemap_lru_lock);

		/* avoid unscalable SMP locking */
		if (!page->buffers && page_count(page) > 1)
			goto unlock_noput_continue;

		/* Take the pagecache_lock spinlock held to avoid
		   other tasks to notice the page while we are looking at its
		   page count. If it's a pagecache-page we'll free it
		   in one atomic transaction after checking its page count. */
		spin_lock(&pagecache_lock);

		/* avoid freeing the page while it's locked */
		get_page(page);

		/* Is it a buffer page? */
		if (page->buffers) {
			spin_unlock(&pagecache_lock);
			if (!try_to_free_buffers(page))
				goto unlock_continue;
			/* page was locked, inode can't go away under us */
			if (!page->mapping) {
				atomic_dec(&buffermem_pages);
				goto made_buffer_progress;
			}
			spin_lock(&pagecache_lock);
		}

		/*
		 * We can't free pages unless there's just one user
		 * (count == 2 because we added one ourselves above).
		 */
		if (page_count(page) != 2)
			goto cache_unlock_continue;

		/*
		 * Is it a page swap page? If so, we want to
		 * drop it if it is no longer used, even if it
		 * were to be marked referenced..
		 */
		if (PageSwapCache(page)) {
			spin_unlock(&pagecache_lock);
			__delete_from_swap_cache(page);
			goto made_inode_progress;
		}

		/* is it a page-cache page? */
		if (page->mapping) {
			if (!pgcache_under_min())
			{
				remove_page_from_inode_queue(page);
				remove_page_from_hash_queue(page);
				page->mapping = NULL;
				spin_unlock(&pagecache_lock);
				goto made_inode_progress;
			}
			goto cache_unlock_continue;
		}

		dispose = &forget;
		printk(KERN_ERR "shrink_mmap: unknown LRU page!\n");

cache_unlock_continue:
		spin_unlock(&pagecache_lock);
unlock_continue:
		UnlockPage(page);
		put_page(page);
dispose_relock_continue:
		/* even if the dispose list is local, a truncate_inode_page()
		   may remove a page from its queue so always
		   synchronize with the lru lock while accessing the
		   page->lru field */
		spin_lock(&pagemap_lru_lock);
		list_add(page_lru, dispose);
		continue;

unlock_noput_continue:
		UnlockPage(page);
		goto dispose_relock_continue;

dispose_continue:
		list_add(page_lru, dispose);
	}
	goto out;

made_inode_progress:
	page_cache_release(page);
made_buffer_progress:
	UnlockPage(page);
	put_page(page);
	ret = 1;
	spin_lock(&pagemap_lru_lock);
	/* nr_lru_pages needs the spinlock */
	nr_lru_pages--;

out:
	list_splice(&young, &lru_cache);
	list_splice(&old, lru_cache.prev);

	spin_unlock(&pagemap_lru_lock);

	return ret;
}
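
/*
 * Hash chain lookup.  The caller passes the first page on the hash chain
 * (*hash) and must hold the pagecache_lock; the "goto inside" simply enters
 * the search loop at the test instead of advancing first.  A page matches
 * only if both the mapping and the index agree.
 */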

static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
{
	goto inside;

	for (;;) {
		page = page->next_hash;
inside:
		if (!page)
			goto not_found;
		if (page->mapping != mapping)
			continue;
		if (page->index == offset)
			break;
	}
	set_bit(PG_referenced, &page->flags);
not_found:
	return page;
}

/*
 * By the time this is called, the page is locked and
 * we don't have to worry about any races any more.
 *
 * Start the IO..
 */
static int writeout_one_page(struct page *page)
{
	struct buffer_head *bh, *head = page->buffers;

	bh = head;
	do {
		if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
			continue;

		bh->b_flushtime = 0;
		ll_rw_block(WRITE, 1, &bh);
	} while ((bh = bh->b_this_page) != head);
	return 0;
}

static int waitfor_one_page(struct page *page)
{
	int error = 0;
	struct buffer_head *bh, *head = page->buffers;

	bh = head;
	do {
		wait_on_buffer(bh);
		if (buffer_req(bh) && !buffer_uptodate(bh))
			error = -EIO;
	} while ((bh = bh->b_this_page) != head);
	return error;
}

static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end, int (*fn)(struct page *))
{
	struct list_head *head, *curr;
	struct page *page;
	int retval = 0;

	head = &inode->i_data.pages;

	spin_lock(&pagecache_lock);
	curr = head->next;
	while (curr != head) {
		page = list_entry(curr, struct page, list);
		curr = curr->next;
		if (!page->buffers)
			continue;
		if (page->index >= end)
			continue;
		if (page->index < start)
			continue;

		get_page(page);
		spin_unlock(&pagecache_lock);
		lock_page(page);

		/* The buffers could have been free'd while we waited for the page lock */
		if (page->buffers)
			retval |= fn(page);

		UnlockPage(page);
		spin_lock(&pagecache_lock);
		curr = page->list.next;
		page_cache_release(page);
	}
	spin_unlock(&pagecache_lock);

	return retval;
}

/*
 * Two-stage data sync: first start the IO, then go back and
 * collect the information..
 */
int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
{
	int retval;

	retval = do_buffer_fdatasync(inode, start_idx, end_idx, writeout_one_page);
	retval |= do_buffer_fdatasync(inode, start_idx, end_idx, waitfor_one_page);
	return retval;
}

/*
 * This adds a page to the page cache, starting out as locked,
 * owned by us, referenced, but not uptodate and with no errors.
 */
static inline void __add_to_page_cache(struct page * page,
	struct address_space *mapping, unsigned long offset,
	struct page **hash)
{
	struct page *alias;
	unsigned long flags;

	flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced));
	page->flags = flags | (1 << PG_locked);
	get_page(page);
	page->index = offset;
	add_page_to_inode_queue(mapping, page);
	__add_page_to_hash_queue(page, hash);
	lru_cache_add(page);
	alias = __find_page_nolock(mapping, offset, *hash);
	if (alias != page)
		BUG();
}

void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
{
	spin_lock(&pagecache_lock);
	__add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
	spin_unlock(&pagecache_lock);
}

static int add_to_page_cache_unique(struct page * page,
	struct address_space *mapping, unsigned long offset,
	struct page **hash)
{
	int err;
	struct page *alias;

	spin_lock(&pagecache_lock);
	alias = __find_page_nolock(mapping, offset, *hash);

	err = 1;
	if (!alias) {
		__add_to_page_cache(page,mapping,offset,hash);
		err = 0;
	}

	spin_unlock(&pagecache_lock);
	return err;
}

/*
 * This adds the requested page to the page cache if it isn't already there,
 * and schedules an I/O to read in its contents from disk.
 */
static inline int page_cache_read(struct file * file, unsigned long offset)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct page **hash = page_hash(&inode->i_data, offset);
	struct page *page;

	spin_lock(&pagecache_lock);
	page = __find_page_nolock(&inode->i_data, offset, *hash);
	spin_unlock(&pagecache_lock);
	if (page)
		return 0;

	page = page_cache_alloc();
	if (!page)
		return -ENOMEM;

	if (!add_to_page_cache_unique(page, &inode->i_data, offset, hash)) {
		int error = inode->i_op->readpage(file->f_dentry, page);
		page_cache_release(page);
		return error;
	}
	/*
	 * We arrive here in the unlikely event that someone
	 * raced with us and added our page to the cache first.
	 */
	page_cache_free(page);
	return 0;
}

/*
 * Read in an entire cluster at once.  A cluster is usually a 64k-
 * aligned block that includes the page requested in "offset."
 */
static int read_cluster_nonblocking(struct file * file, unsigned long offset,
	unsigned long filesize)
{
	unsigned long pages = CLUSTER_PAGES;

	offset = CLUSTER_OFFSET(offset);
	while ((pages-- > 0) && (offset < filesize)) {
		int error = page_cache_read(file, offset);
		if (error < 0)
			return error;
		offset ++;
	}

	return 0;
}

/*
 * Wait for a page to get unlocked.
 *
 * This must be called with the caller "holding" the page,
 * ie with increased "page->count" so that the page won't
 * go away during the wait..
 */
void ___wait_on_page(struct page *page)
{
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	add_wait_queue(&page->wait, &wait);
	do {
		run_task_queue(&tq_disk);
		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		if (!PageLocked(page))
			break;
		schedule();
	} while (PageLocked(page));
	tsk->state = TASK_RUNNING;
	remove_wait_queue(&page->wait, &wait);
}

/*
 * Get an exclusive lock on the page..
 */
void lock_page(struct page *page)
{
	while (TryLockPage(page))
		___wait_on_page(page);
}

/*
 * a rather lightweight function, finding and getting a reference to a
 * hashed page atomically, waiting for it if it's locked.
 */
struct page * __find_get_page (struct address_space *mapping,
				unsigned long offset, struct page **hash)
{
	struct page *page;

	/*
	 * We scan the hash list read-only. Addition to and removal from
	 * the hash-list needs a held write-lock.
	 */
repeat:
	spin_lock(&pagecache_lock);
	page = __find_page_nolock(mapping, offset, *hash);
	if (page)
		get_page(page);
	spin_unlock(&pagecache_lock);

	/* Found the page, sleep if locked. */
	if (page && PageLocked(page)) {
		struct task_struct *tsk = current;
		DECLARE_WAITQUEUE(wait, tsk);

		run_task_queue(&tq_disk);

		__set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		add_wait_queue(&page->wait, &wait);

		if (PageLocked(page))
			schedule();
		__set_task_state(tsk, TASK_RUNNING);
		remove_wait_queue(&page->wait, &wait);

		/*
		 * The page might have been unhashed meanwhile. It's
		 * not freed though because we hold a reference to it.
		 * If this is the case then it will be freed _here_,
		 * and we recheck the hash anyway.
		 */
		page_cache_release(page);
		goto repeat;
	}
	/*
	 * It's not locked so we can return the page and we hold
	 * a reference to it.
	 */
	return page;
}

/*
 * Get the lock to a page atomically.
 */
struct page * __find_lock_page (struct address_space *mapping,
				unsigned long offset, struct page **hash)
{
	struct page *page;

	/*
	 * We scan the hash list read-only. Addition to and removal from
	 * the hash-list needs a held write-lock.
	 */
repeat:
	spin_lock(&pagecache_lock);
	page = __find_page_nolock(mapping, offset, *hash);
	if (page)
		get_page(page);
	spin_unlock(&pagecache_lock);

	/* Found the page, sleep if locked. */
	if (page && TryLockPage(page)) {
		struct task_struct *tsk = current;
		DECLARE_WAITQUEUE(wait, tsk);

		run_task_queue(&tq_disk);

		__set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		add_wait_queue(&page->wait, &wait);

		if (PageLocked(page))
			schedule();
		__set_task_state(tsk, TASK_RUNNING);
		remove_wait_queue(&page->wait, &wait);

		/*
		 * The page might have been unhashed meanwhile. It's
		 * not freed though because we hold a reference to it.
		 * If this is the case then it will be freed _here_,
		 * and we recheck the hash anyway.
		 */
		page_cache_release(page);
		goto repeat;
	}
	/*
	 * It's not locked so we can return the page and we hold
	 * a reference to it.
	 */
	return page;
}

#if 0
#define PROFILE_READAHEAD
#define DEBUG_READAHEAD
#endif

/*
 * Read-ahead profiling information
 * --------------------------------
 * Every PROFILE_MAXREADCOUNT, the following information is written
 * to the syslog:
 *   Percentage of asynchronous read-ahead.
 *   Average of read-ahead fields context value.
 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
 * to the syslog.
 */

#ifdef PROFILE_READAHEAD

#define PROFILE_MAXREADCOUNT 1000

static unsigned long total_reada;
static unsigned long total_async;
static unsigned long total_ramax;
static unsigned long total_ralen;
static unsigned long total_rawin;

static void profile_readahead(int async, struct file *filp)
{
	unsigned long flags;

	++total_reada;
	if (async)
		++total_async;

	total_ramax += filp->f_ramax;
	total_ralen += filp->f_ralen;
	total_rawin += filp->f_rawin;

	if (total_reada > PROFILE_MAXREADCOUNT) {
		save_flags(flags);
		cli();
		if (!(total_reada > PROFILE_MAXREADCOUNT)) {
			restore_flags(flags);
			return;
		}

		printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
			total_ramax/total_reada,
			total_ralen/total_reada,
			total_rawin/total_reada,
			(total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
		printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
			filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
#endif

		total_reada = 0;
		total_async = 0;
		total_ramax = 0;
		total_ralen = 0;
		total_rawin = 0;

		restore_flags(flags);
	}
}
#endif  /* defined PROFILE_READAHEAD */

/*
 * Read-ahead context:
 * -------------------
 * The read ahead context fields of the "struct file" are the following:
 * - f_raend : position of the first byte after the last page we tried to
 *	       read ahead.
 * - f_ramax : current read-ahead maximum size.
 * - f_ralen : length of the current IO read block we tried to read-ahead.
 * - f_rawin : length of the current read-ahead window.
 *		if last read-ahead was synchronous then
 *			f_rawin = f_ralen
 *		otherwise (was asynchronous)
 *			f_rawin = previous value of f_ralen + f_ralen
 *
 * Read-ahead limits:
 * ------------------
 * MIN_READAHEAD   : minimum read-ahead size when read-ahead.
 * MAX_READAHEAD   : maximum read-ahead size when read-ahead.
 *
 * Synchronous read-ahead benefits:
 * --------------------------------
 * Using reasonable IO xfer length from peripheral devices increases system
 * performance.
 * Reasonable means, in this context, not too large but not too small.
 * The actual maximum value is:
 *	MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
 *	and 32K if defined (4K page size assumed).
 *
 * Asynchronous read-ahead benefits:
 * ---------------------------------
 * Overlapping the next read request and user process execution increases
 * system performance.
 *
 * Read-ahead risks:
 * -----------------
 * We have to guess which further data are needed by the user process.
 * If these data are often not really needed, it's bad for system
 * performance.
 * However, we know that files are often accessed sequentially by
 * application programs and it seems that it is possible to have some good
 * strategy in that guessing.
 * We only try to read-ahead files that seem to be read sequentially.
 *
 * Asynchronous read-ahead risks:
 * ------------------------------
 * In order to maximize overlapping, we must start some asynchronous read
 * request from the device, as soon as possible.
 * We must be very careful about:
 * - The number of effective pending IO read requests.
 *   ONE seems to be the only reasonable value.
 * - The total memory pool usage for the file access stream.
 *   This maximum memory usage is implicitly 2 IO read chunks:
 *   2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
 *   64k if defined (4K page size assumed).
 */

static inline int get_max_readahead(struct inode * inode)
{
	if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
		return MAX_READAHEAD;
	return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
}
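
/*
 * Example of how the window grows: a reader that keeps hitting its
 * read-ahead window sees f_ramax doubled after every pass that actually
 * read something ahead (filp->f_ramax += filp->f_ramax below), clamped to
 * the per-device limit returned by get_max_readahead().
 */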

static void generic_file_readahead(int reada_ok,
	struct file * filp, struct inode * inode,
	struct page * page)
{
	unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
	unsigned long index = page->index;
	unsigned long max_ahead, ahead;
	unsigned long raend;
	int max_readahead = get_max_readahead(inode);

	raend = filp->f_raend;
	max_ahead = 0;

/*
 * The current page is locked.
 * If the current position is inside the previous read IO request, do not
 * try to reread previously read ahead pages.
 * Otherwise decide whether or not to read ahead some pages synchronously.
 * If we are not going to read ahead, set the read ahead context for this
 * page only.
 */
	if (PageLocked(page)) {
		if (!filp->f_ralen || index >= raend || index + filp->f_ralen < raend) {
			raend = index;
			if (raend < end_index)
				max_ahead = filp->f_ramax;
			filp->f_rawin = 0;
			filp->f_ralen = 1;
			if (!max_ahead) {
				filp->f_raend  = index + filp->f_ralen;
				filp->f_rawin += filp->f_ralen;
			}
		}
	}
/*
 * The current page is not locked.
 * If we were reading ahead and,
 * if the current max read ahead size is not zero and,
 * if the current position is inside the last read-ahead IO request,
 *   it is the moment to try to read ahead asynchronously.
 * We will later force unplug device in order to force asynchronous read IO.
 */
	else if (reada_ok && filp->f_ramax && raend >= 1 &&
		 index <= raend && index + filp->f_ralen >= raend) {
/*
 * Add ONE page to max_ahead in order to try to have about the same IO max size
 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
 * Compute the position of the last page we have tried to read in order to
 * begin to read ahead just at the next page.
 */
		raend -= 1;
		if (raend < end_index)
			max_ahead = filp->f_ramax + 1;

		if (max_ahead) {
			filp->f_rawin = filp->f_ralen;
			filp->f_ralen = 0;
			reada_ok      = 2;
		}
	}
/*
 * Try to read ahead pages.
 * We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the
 * scheduler, will work enough for us to avoid too many bad actual IO requests.
 */
	ahead = 0;
	while (ahead < max_ahead) {
		ahead ++;
		if ((raend + ahead) >= end_index)
			break;
		if (page_cache_read(filp, raend + ahead) < 0)
			break;
	}
/*
 * If we tried to read ahead some pages,
 * If we tried to read ahead asynchronously,
 *   Try to force unplug of the device in order to start an asynchronous
 *   read IO request.
 * Update the read-ahead context.
 * Store the length of the current read-ahead window.
 * Double the current max read ahead size.
 *   That heuristic avoids doing large IO for files that are not really
 *   accessed sequentially.
 */
	if (ahead) {
		if (reada_ok == 2) {
			run_task_queue(&tq_disk);
		}

		filp->f_ralen += ahead;
		filp->f_rawin += filp->f_ralen;
		filp->f_raend = raend + ahead + 1;

		filp->f_ramax += filp->f_ramax;

		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;

#ifdef PROFILE_READAHEAD
		profile_readahead((reada_ok == 2), filp);
#endif
	}

	return;
}
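
/*
 * Note that do_generic_file_read() below never copies data itself: the
 * "actor" callback consumes each up-to-date page and updates the
 * read_descriptor_t.  That is what lets generic_file_read() (copy to user
 * space) and sys_sendfile() (write to another file) share one read loop.
 */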

/*
 * This is a generic file read routine, and uses the
 * inode->i_op->readpage() function for the actual low-level
 * stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 */
void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
{
	struct dentry *dentry = filp->f_dentry;
	struct inode *inode = dentry->d_inode;
	unsigned long index, offset;
	struct page *cached_page;
	int reada_ok;
	int error;
	int max_readahead = get_max_readahead(inode);

	cached_page = NULL;
	index = *ppos >> PAGE_CACHE_SHIFT;
	offset = *ppos & ~PAGE_CACHE_MASK;

/*
 * If the current position is outside the previous read-ahead window,
 * we reset the current read-ahead context and set read ahead max to zero
 * (it will be set to just the needed value later),
 * otherwise, we assume that the file accesses are sequential enough to
 * continue read-ahead.
 */
	if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {
		reada_ok = 0;
		filp->f_raend = 0;
		filp->f_ralen = 0;
		filp->f_ramax = 0;
		filp->f_rawin = 0;
	} else {
		reada_ok = 1;
	}
/*
 * Adjust the current value of read-ahead max.
 * If the read operation stays in the first half of the page, force no readahead.
 * Otherwise try to increase read ahead max just enough to do the read request.
 * Then, at least MIN_READAHEAD if read ahead is ok,
 * and at most MAX_READAHEAD in all cases.
 */
	if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
		filp->f_ramax = 0;
	} else {
		unsigned long needed;

		needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;

		if (filp->f_ramax < needed)
			filp->f_ramax = needed;

		if (reada_ok && filp->f_ramax < MIN_READAHEAD)
			filp->f_ramax = MIN_READAHEAD;
		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;
	}

	for (;;) {
		struct page *page, **hash;
		unsigned long end_index, nr;

		end_index = inode->i_size >> PAGE_CACHE_SHIFT;
		if (index > end_index)
			break;
		nr = PAGE_CACHE_SIZE;
		if (index == end_index) {
			nr = inode->i_size & ~PAGE_CACHE_MASK;
			if (nr <= offset)
				break;
		}

		nr = nr - offset;

		/*
		 * Try to find the data in the page cache..
		 */
		hash = page_hash(&inode->i_data, index);

		spin_lock(&pagecache_lock);
		page = __find_page_nolock(&inode->i_data, index, *hash);
		if (!page)
			goto no_cached_page;
found_page:
		get_page(page);
		spin_unlock(&pagecache_lock);

		if (!Page_Uptodate(page))
			goto page_not_up_to_date;
page_ok:
		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 *
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		nr = actor(desc, page, offset, nr);
		offset += nr;
		index += offset >> PAGE_CACHE_SHIFT;
		offset &= ~PAGE_CACHE_MASK;

		page_cache_release(page);
		if (nr && desc->count)
			continue;
		break;

/*
 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
 */
page_not_up_to_date:
		generic_file_readahead(reada_ok, filp, inode, page);

		if (Page_Uptodate(page))
			goto page_ok;

		/* Get exclusive access to the page ... */
		lock_page(page);
		if (Page_Uptodate(page)) {
			UnlockPage(page);
			goto page_ok;
		}

readpage:
		/* ... and start the actual read. The read will unlock the page. */
		error = inode->i_op->readpage(filp->f_dentry, page);

		if (!error) {
			if (Page_Uptodate(page))
				goto page_ok;

			/* Again, try some read-ahead while waiting for the page to finish.. */
			generic_file_readahead(reada_ok, filp, inode, page);
			wait_on_page(page);
			if (Page_Uptodate(page))
				goto page_ok;
			error = -EIO;
		}

		/* UHHUH! A synchronous read error occurred. Report it */
		desc->error = error;
		page_cache_release(page);
		break;

no_cached_page:
		/*
		 * Ok, it wasn't cached, so we need to create a new
		 * page..
		 *
		 * We get here with the page cache lock held.
		 */
		if (!cached_page) {
			spin_unlock(&pagecache_lock);
			cached_page = page_cache_alloc();
			if (!cached_page) {
				desc->error = -ENOMEM;
				break;
			}

			/*
			 * Somebody may have added the page while we
			 * dropped the page cache lock. Check for that.
			 */
			spin_lock(&pagecache_lock);
			page = __find_page_nolock(&inode->i_data, index, *hash);
			if (page)
				goto found_page;
		}

		/*
		 * Ok, add the new page to the hash-queues...
		 */
		page = cached_page;
		__add_to_page_cache(page, &inode->i_data, index, hash);
		spin_unlock(&pagecache_lock);
		cached_page = NULL;

		goto readpage;
	}

	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
	filp->f_reada = 1;
	if (cached_page)
		page_cache_free(cached_page);
	UPDATE_ATIME(inode);
}

static int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
{
	unsigned long kaddr;
	unsigned long left, count = desc->count;

	if (size > count)
		size = count;

	kaddr = kmap(page);
	left = __copy_to_user(desc->buf, (void *)(kaddr + offset), size);
	kunmap(page);

	if (left) {
		size -= left;
		desc->error = -EFAULT;
	}
	desc->count = count - size;
	desc->written += size;
	desc->buf += size;
	return size;
}

/*
 * This is the "read()" routine for all filesystems
 * that can use the page cache directly.
 */
ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
{
	ssize_t retval;

	retval = -EFAULT;
	if (access_ok(VERIFY_WRITE, buf, count)) {
		retval = 0;

		if (count) {
			read_descriptor_t desc;

			desc.written = 0;
			desc.count = count;
			desc.buf = buf;
			desc.error = 0;
			do_generic_file_read(filp, ppos, &desc, file_read_actor);

			retval = desc.written;
			if (!retval)
				retval = desc.error;
		}
	}
	return retval;
}
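
/*
 * sys_sendfile() reuses do_generic_file_read() with the actor below, which
 * writes each page to the output file.  The set_fs(KERNEL_DS) switch lets
 * the output file's write() accept the kernel-mapped page as its buffer.
 */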

static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
{
	unsigned long kaddr;
	ssize_t written;
	unsigned long count = desc->count;
	struct file *file = (struct file *) desc->buf;
	mm_segment_t old_fs;

	if (size > count)
		size = count;
	old_fs = get_fs();
	set_fs(KERNEL_DS);

	kaddr = kmap(page);
	written = file->f_op->write(file, (char *)kaddr + offset,
						size, &file->f_pos);
	kunmap(page);
	set_fs(old_fs);
	if (written < 0) {
		desc->error = written;
		written = 0;
	}
	desc->count = count - written;
	desc->written += written;
	return written;
}

asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
{
	ssize_t retval;
	struct file * in_file, * out_file;
	struct inode * in_inode, * out_inode;

	/*
	 * Get input file, and verify that it is ok..
	 */
	retval = -EBADF;
	in_file = fget(in_fd);
	if (!in_file)
		goto out;
	if (!(in_file->f_mode & FMODE_READ))
		goto fput_in;
	retval = -EINVAL;
	in_inode = in_file->f_dentry->d_inode;
	if (!in_inode)
		goto fput_in;
	if (!in_inode->i_op || !in_inode->i_op->readpage)
		goto fput_in;
	retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
	if (retval)
		goto fput_in;

	/*
	 * Get output file, and verify that it is ok..
	 */
	retval = -EBADF;
	out_file = fget(out_fd);
	if (!out_file)
		goto fput_in;
	if (!(out_file->f_mode & FMODE_WRITE))
		goto fput_out;
	retval = -EINVAL;
	if (!out_file->f_op || !out_file->f_op->write)
		goto fput_out;
	out_inode = out_file->f_dentry->d_inode;
	if (!out_inode)
		goto fput_out;
	retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
	if (retval)
		goto fput_out;

	retval = 0;
	if (count) {
		read_descriptor_t desc;
		loff_t pos = 0, *ppos;

		retval = -EFAULT;
		ppos = &in_file->f_pos;
		if (offset) {
			if (get_user(pos, offset))
				goto fput_out;
			ppos = &pos;
		}

		desc.written = 0;
		desc.count = count;
		desc.buf = (char *) out_file;
		desc.error = 0;
		do_generic_file_read(in_file, ppos, &desc, file_send_actor);

		retval = desc.written;
		if (!retval)
			retval = desc.error;
		if (offset)
			put_user(pos, offset);
	}

fput_out:
	fput(out_file);
fput_in:
	fput(in_file);
out:
	return retval;
}

/*
 * filemap_nopage() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 */
struct page * filemap_nopage(struct vm_area_struct * area,
	unsigned long address, int no_share)
{
	int error;
	struct file *file = area->vm_file;
	struct dentry *dentry = file->f_dentry;
	struct inode *inode = dentry->d_inode;
	struct page *page, **hash, *old_page;
	unsigned long size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	unsigned long pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;

	/*
	 * Semantics for shared and private memory areas are different
	 * past the end of the file. A shared mapping past the last page
	 * of the file is an error and results in a SIGBUS, while a
	 * private mapping just maps in a zero page.
	 */
	if ((pgoff >= size) &&
		(area->vm_flags & VM_SHARED) && (area->vm_mm == current->mm))
		return NULL;

	/*
	 * Do we have something in the page cache already?
	 */
	hash = page_hash(&inode->i_data, pgoff);
retry_find:
	page = __find_get_page(&inode->i_data, pgoff, hash);
	if (!page)
		goto no_cached_page;

	/*
	 * Ok, found a page in the page cache, now we need to check
	 * that it's up-to-date.
	 */
	if (!Page_Uptodate(page))
		goto page_not_uptodate;

success:
	/*
	 * Found the page and have a reference on it, need to check sharing
	 * and possibly copy it over to another page..
	 */
	old_page = page;
	if (no_share) {
		struct page *new_page = page_cache_alloc();

		if (new_page) {
			copy_highpage(new_page, old_page);
			flush_page_to_ram(new_page);
		} else
			new_page = NOPAGE_OOM;
		page_cache_release(page);
		return new_page;
	}

	flush_page_to_ram(old_page);
	return old_page;

no_cached_page:
	/*
	 * If the requested offset is within our file, try to read a whole
	 * cluster of pages at once.
	 *
	 * Otherwise, we're off the end of a privately mapped file,
	 * so we need to map a zero page.
	 */
	if (pgoff < size)
		error = read_cluster_nonblocking(file, pgoff, size);
	else
		error = page_cache_read(file, pgoff);

	/*
	 * The page we want has now been added to the page cache.
	 * In the unlikely event that someone removed it in the
	 * meantime, we'll just come back here and read it again.
	 */
	if (error >= 0)
		goto retry_find;

	/*
	 * An error return from page_cache_read can result if the
	 * system is low on memory, or a problem occurs while trying
	 * to schedule I/O.
	 */
	if (error == -ENOMEM)
		return NOPAGE_OOM;
	return NULL;

page_not_uptodate:
	lock_page(page);
	if (Page_Uptodate(page)) {
		UnlockPage(page);
		goto success;
	}

	if (!inode->i_op->readpage(file->f_dentry, page)) {
		wait_on_page(page);
		if (Page_Uptodate(page))
			goto success;
	}

	/*
	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	 */
	lock_page(page);
	if (Page_Uptodate(page)) {
		UnlockPage(page);
		goto success;
	}
	ClearPageError(page);
	if (!inode->i_op->readpage(file->f_dentry, page)) {
		wait_on_page(page);
		if (Page_Uptodate(page))
			goto success;
	}

	/*
	 * Things didn't work out. Return zero to tell the
	 * mm layer so, possibly freeing the page cache page first.
	 */
	page_cache_release(page);
	return NULL;
}

/*
 * Tries to write a shared mapped page to its backing store. May return -EIO
 * if the disk is full.
 */
static inline int do_write_page(struct inode * inode, struct file * file,
	struct page * page, unsigned long index)
{
	int retval;
	int (*writepage) (struct dentry *, struct page *);

	/* refuse to extend file size.. */
	if (S_ISREG(inode->i_mode)) {
		unsigned long size_idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

		/* Ho humm.. We should have tested for this earlier */
		if (size_idx <= index)
			return -EIO;
	}
	writepage = inode->i_op->writepage;
	lock_page(page);

	retval = writepage(file->f_dentry, page);

	UnlockPage(page);
	return retval;
}

static int filemap_write_page(struct file *file,
			      unsigned long index,
			      struct page * page,
			      int wait)
{
	int result;
	struct dentry * dentry;
	struct inode * inode;

	dentry = file->f_dentry;
	inode = dentry->d_inode;

	/*
	 * If a task terminates while we're swapping the page, the vma and
	 * file could be released: try_to_swap_out has done a get_file.
	 * vma/file is guaranteed to exist in the unmap/sync cases because
	 * mmap_sem is held.
	 */
	result = do_write_page(inode, file, page, index);
	return result;
}


/*
 * The page cache takes care of races between somebody
 * trying to swap something out and swap something in
 * at the same time..
 */
extern void wakeup_bdflush(int);
int filemap_swapout(struct page * page, struct file * file)
{
	int retval = filemap_write_page(file, page->index, page, 0);
	wakeup_bdflush(0);
	return retval;
}
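
/*
 * msync()/unmap support: filemap_sync() walks the page tables of the range
 * (pgd -> pmd -> pte).  Without MS_INVALIDATE each present, dirty pte is
 * cleaned and the page written back via filemap_write_page(); with
 * MS_INVALIDATE the ptes are cleared as well.
 */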

static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
	unsigned long address, unsigned int flags)
{
	unsigned long pgoff;
	pte_t pte = *ptep;
	struct page *page;
	int error;

	if (!(flags & MS_INVALIDATE)) {
		if (!pte_present(pte))
			return 0;
		if (!pte_dirty(pte))
			return 0;
		flush_page_to_ram(pte_page(pte));
		flush_cache_page(vma, address);
		set_pte(ptep, pte_mkclean(pte));
		flush_tlb_page(vma, address);
		page = pte_page(pte);
		get_page(page);
	} else {
		if (pte_none(pte))
			return 0;
		flush_cache_page(vma, address);
		pte_clear(ptep);
		flush_tlb_page(vma, address);
		if (!pte_present(pte)) {
			swap_free(pte_to_swp_entry(pte));
			return 0;
		}
		page = pte_page(pte);
		if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
			page_cache_free(page);
			return 0;
		}
	}
	pgoff = (address - vma->vm_start) >> PAGE_CACHE_SHIFT;
	pgoff += vma->vm_pgoff;
	if (page->index != pgoff) {
		printk("weirdness: pgoff=%lu index=%lu address=%lu vm_start=%lu vm_pgoff=%lu\n",
			pgoff, page->index, address, vma->vm_start, vma->vm_pgoff);
	}
	error = filemap_write_page(vma->vm_file, pgoff, page, 1);
	page_cache_free(page);
	return error;
}

static inline int filemap_sync_pte_range(pmd_t * pmd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
{
	pte_t * pte;
	unsigned long end;
	int error;

	if (pmd_none(*pmd))
		return 0;
	if (pmd_bad(*pmd)) {
		pmd_ERROR(*pmd);
		pmd_clear(pmd);
		return 0;
	}
	pte = pte_offset(pmd, address);
	offset += address & PMD_MASK;
	address &= ~PMD_MASK;
	end = address + size;
	if (end > PMD_SIZE)
		end = PMD_SIZE;
	error = 0;
	do {
		error |= filemap_sync_pte(pte, vma, address + offset, flags);
		address += PAGE_SIZE;
		pte++;
	} while (address && (address < end));
	return error;
}

static inline int filemap_sync_pmd_range(pgd_t * pgd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned int flags)
{
	pmd_t * pmd;
	unsigned long offset, end;
	int error;

	if (pgd_none(*pgd))
		return 0;
	if (pgd_bad(*pgd)) {
		pgd_ERROR(*pgd);
		pgd_clear(pgd);
		return 0;
	}
	pmd = pmd_offset(pgd, address);
	offset = address & PGDIR_MASK;
	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		end = PGDIR_SIZE;
	error = 0;
	do {
		error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address && (address < end));
	return error;
}

int filemap_sync(struct vm_area_struct * vma, unsigned long address,
	size_t size, unsigned int flags)
{
	pgd_t * dir;
	unsigned long end = address + size;
	int error = 0;

	dir = pgd_offset(vma->vm_mm, address);
	flush_cache_range(vma->vm_mm, end - size, end);
	if (address >= end)
		BUG();
	do {
		error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		dir++;
	} while (address && (address < end));
	flush_tlb_range(vma->vm_mm, end - size, end);
	return error;
}
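
/*
 * Two vm_operations sets follow: writable shared mappings need
 * unmap/sync/swapout support so their pages can be written back, while
 * private (and read-only shared) mappings only need nopage to pull pages in.
 */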

/*
 * This handles (potentially partial) area unmaps..
 */
static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
{
	lock_kernel();
	filemap_sync(vma, start, len, MS_ASYNC);
	unlock_kernel();
}

/*
 * Shared mappings need to be able to do the right thing at
 * close/unmap/sync. They will also use the private file as
 * backing-store for swapping..
 */
static struct vm_operations_struct file_shared_mmap = {
	NULL,			/* no special open */
	NULL,			/* no special close */
	filemap_unmap,		/* unmap - we need to sync the pages */
	NULL,			/* no special protect */
	filemap_sync,		/* sync */
	NULL,			/* advise */
	filemap_nopage,		/* nopage */
	NULL,			/* wppage */
	filemap_swapout		/* swapout */
};

/*
 * Private mappings just need to be able to load in the map.
 *
 * (This is actually used for shared mappings as well, if we
 * know they can't ever get write permissions..)
 */
static struct vm_operations_struct file_private_mmap = {
	NULL,			/* open */
	NULL,			/* close */
	NULL,			/* unmap */
	NULL,			/* protect */
	NULL,			/* sync */
	NULL,			/* advise */
	filemap_nopage,		/* nopage */
	NULL,			/* wppage */
	NULL			/* swapout */
};

/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	struct vm_operations_struct * ops;
	struct inode *inode = file->f_dentry->d_inode;

	ops = &file_private_mmap;
	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
		if (!inode->i_op || !inode->i_op->writepage)
			return -EINVAL;
		ops = &file_shared_mmap;
	}
	if (!inode->i_sb || !S_ISREG(inode->i_mode))
		return -EACCES;
	if (!inode->i_op || !inode->i_op->readpage)
		return -ENOEXEC;
	UPDATE_ATIME(inode);
	vma->vm_ops = ops;
	return 0;
}

/*
 * The msync() system call.
 */

static int msync_interval(struct vm_area_struct * vma,
	unsigned long start, unsigned long end, int flags)
{
	if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
		int error;
		error = vma->vm_ops->sync(vma, start, end-start, flags);
		if (!error && (flags & MS_SYNC)) {
			struct file * file = vma->vm_file;
			if (file) {
				struct dentry * dentry = file->f_dentry;
				error = file_fsync(file, dentry);
			}
		}
		return error;
	}
	return 0;
}

asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
{
	unsigned long end;
	struct vm_area_struct * vma;
	int unmapped_error, error = -EINVAL;

	down(&current->mm->mmap_sem);
	lock_kernel();
	if (start & ~PAGE_MASK)
		goto out;
	len = (len + ~PAGE_MASK) & PAGE_MASK;
	end = start + len;
	if (end < start)
		goto out;
	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
		goto out;
	error = 0;
	if (end == start)
		goto out;
	/*
	 * If the interval [start,end) covers some unmapped address ranges,
	 * just ignore them, but return -EFAULT at the end.
	 */
	vma = find_vma(current->mm, start);
	unmapped_error = 0;
	for (;;) {
		/* Still start < end. */
		error = -EFAULT;
		if (!vma)
			goto out;
		/* Here start < vma->vm_end. */
		if (start < vma->vm_start) {
			unmapped_error = -EFAULT;
			start = vma->vm_start;
		}
		/* Here vma->vm_start <= start < vma->vm_end. */
		if (end <= vma->vm_end) {
			if (start < end) {
				error = msync_interval(vma, start, end, flags);
				if (error)
					goto out;
			}
			error = unmapped_error;
			goto out;
		}
		/* Here vma->vm_start <= start < vma->vm_end < end. */
		error = msync_interval(vma, start, vma->vm_end, flags);
		if (error)
			goto out;
		start = vma->vm_end;
		vma = vma->vm_next;
	}
out:
	unlock_kernel();
	up(&current->mm->mmap_sem);
	return error;
}
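
/*
 * read_cache_page() and grab_cache_page() below are the convenience helpers
 * for filesystems: the former looks the page up and, if it has to create it,
 * fills it with the given "filler" callback; the latter returns a locked
 * page at the given index, creating and inserting it if it was not there yet.
 */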

struct page *read_cache_page(struct address_space *mapping,
				unsigned long index,
				int (*filler)(void *,struct page*),
				void *data)
{
	struct page **hash = page_hash(mapping, index);
	struct page *page, *cached_page = NULL;
	int err;
repeat:
	page = __find_get_page(mapping, index, hash);
	if (!page) {
		if (!cached_page) {
			cached_page = page_cache_alloc();
			if (!cached_page)
				return ERR_PTR(-ENOMEM);
		}
		page = cached_page;
		if (add_to_page_cache_unique(page, mapping, index, hash))
			goto repeat;
		cached_page = NULL;
		err = filler(data, page);
		if (err < 0) {
			page_cache_release(page);
			page = ERR_PTR(err);
		}
	}
	if (cached_page)
		page_cache_free(cached_page);
	return page;
}

static inline struct page * __grab_cache_page(struct address_space *mapping,
				unsigned long index, struct page **cached_page)
{
	struct page *page, **hash = page_hash(mapping, index);
repeat:
	page = __find_lock_page(mapping, index, hash);
	if (!page) {
		if (!*cached_page) {
			*cached_page = page_cache_alloc();
			if (!*cached_page)
				return NULL;
		}
		page = *cached_page;
		if (add_to_page_cache_unique(page, mapping, index, hash))
			goto repeat;
		*cached_page = NULL;
	}
	return page;
}

/*
 * Returns locked page at given index in given cache, creating it if needed.
 */

struct page *grab_cache_page(struct address_space *mapping, unsigned long index)
{
	struct page *cached_page = NULL;
	struct page *page = __grab_cache_page(mapping,index,&cached_page);
	if (cached_page)
		page_cache_free(cached_page);
	return page;
}

/*
 * Write to a file through the page cache. This is mainly for the
 * benefit of NFS and possibly other network-based file systems.
 *
 * We currently put everything into the page cache prior to writing it.
 * This is not a problem when writing full pages. With partial pages,
 * however, we first have to read the data into the cache, then
 * dirty the page, and finally schedule it for writing. Alternatively, we
 * could write-through just the portion of data that would go into that
 * page, but that would kill performance for applications that write data
 * line by line, and it's prone to race conditions.
 *
 * Note that this routine doesn't try to keep track of dirty pages. Each
 * file system has to do this all by itself, unfortunately.
 *							okir@monad.swb.de
 */
ssize_t
generic_file_write(struct file *file, const char *buf,
		   size_t count, loff_t *ppos,
		   writepage_t write_one_page)
{
	struct dentry	*dentry = file->f_dentry;
	struct inode	*inode = dentry->d_inode;
	unsigned long	limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
	loff_t		pos;
	struct page	*page, *cached_page;
	unsigned long	written;
	long		status;
	int		err;

	cached_page = NULL;

	down(&inode->i_sem);

	pos = *ppos;
	err = -EINVAL;
	if (pos < 0)
		goto out;

	err = file->f_error;
	if (err) {
		file->f_error = 0;
		goto out;
	}

	written = 0;

	if (file->f_flags & O_APPEND)
		pos = inode->i_size;

	/*
	 * Check whether we've reached the file size limit.
	 */
	err = -EFBIG;
	if (limit != RLIM_INFINITY) {
		if (pos >= limit) {
			send_sig(SIGXFSZ, current, 0);
			goto out;
		}
		if (count > limit - pos) {
			send_sig(SIGXFSZ, current, 0);
			count = limit - pos;
		}
	}

	status = 0;

	while (count) {
		unsigned long bytes, index, offset;

		/*
		 * Try to find the page in the cache. If it isn't there,
		 * allocate a free page.
		 */
		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
		index = pos >> PAGE_CACHE_SHIFT;
		bytes = PAGE_CACHE_SIZE - offset;
		if (bytes > count)
			bytes = count;

		status = -ENOMEM;	/* we'll assign it later anyway */
		page = __grab_cache_page(&inode->i_data, index, &cached_page);
		if (!page)
			break;

		/* We have exclusive IO access to the page.. */
		if (!PageLocked(page)) {
			PAGE_BUG(page);
		}

		status = write_one_page(file, page, offset, bytes, buf);

		if (status >= 0) {
			written += status;
			count -= status;
			pos += status;
			buf += status;
			if (pos > inode->i_size)
				inode->i_size = pos;
		}
		/* Mark it unlocked again and drop the page.. */
		UnlockPage(page);
		page_cache_release(page);

		if (status < 0)
			break;
	}
	*ppos = pos;

	if (cached_page)
		page_cache_free(cached_page);

	err = written ? written : status;
out:
	up(&inode->i_sem);
	return err;
}
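
/*
 * The hash table below is sized to roughly one bucket pointer per page of
 * memory, rounded up to a power-of-two number of pages; if that allocation
 * fails the order is reduced until __get_free_pages() succeeds.
 */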

void __init page_cache_init(unsigned long mempages)
{
	unsigned long htable_size, order;

	htable_size = mempages;
	htable_size *= sizeof(struct page *);
	for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
		;

	do {
		unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);

		page_hash_bits = 0;
		while((tmp >>= 1UL) != 0UL)
			page_hash_bits++;

		page_hash_table = (struct page **)
			__get_free_pages(GFP_ATOMIC, order);
	} while(page_hash_table == NULL && --order > 0);

	printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
	       (1 << page_hash_bits), order, (PAGE_SIZE << order));
	if (!page_hash_table)
		panic("Failed to allocate page hash table\n");
	memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
}