1 /*
2 * linux/mm/filemap.c
4 * Copyright (C) 1994-1999 Linus Torvalds
5 */
7 /*
8 * This file handles the generic file mmap semantics used by
9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem used to do this differently, for example)
12 #include <linux/malloc.h>
13 #include <linux/shm.h>
14 #include <linux/mman.h>
15 #include <linux/locks.h>
16 #include <linux/pagemap.h>
17 #include <linux/swap.h>
18 #include <linux/smp_lock.h>
19 #include <linux/blkdev.h>
20 #include <linux/file.h>
21 #include <linux/swapctl.h>
22 #include <linux/slab.h>
23 #include <linux/init.h>
24 #include <linux/highmem.h>
26 #include <asm/pgtable.h>
27 #include <asm/uaccess.h>
30 * Shared mappings implemented 30.11.1994. It's not fully working yet,
31 * though.
33 * Shared mappings now work. 15.8.1995 Bruno.
35 * finished 'unifying' the page and buffer cache and SMP-threaded the
36 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
38 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
41 atomic_t page_cache_size = ATOMIC_INIT(0);
42 unsigned int page_hash_bits;
43 struct page **page_hash_table;
45 spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
47 * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
48 * the pagemap_lru_lock held.
50 spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;
52 #define CLUSTER_PAGES (1 << page_cluster)
53 #define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster)
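/*
 * Illustrative example (assuming page_cluster == 4, which matches the 64k
 * clusters mentioned in read_cluster_nonblocking() below, and 4K pages):
 * CLUSTER_PAGES is then 16 and CLUSTER_OFFSET(offset) rounds a page index
 * down to the start of its cluster, e.g. CLUSTER_OFFSET(37) == 32.
 */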
55 void __add_page_to_hash_queue(struct page * page, struct page **p)
57 atomic_inc(&page_cache_size);
58 if((page->next_hash = *p) != NULL)
59 (*p)->pprev_hash = &page->next_hash;
60 *p = page;
61 page->pprev_hash = p;
62 if (page->buffers)
63 PAGE_BUG(page);
66 static void remove_page_from_hash_queue(struct page * page)
68 if(page->pprev_hash) {
69 if(page->next_hash)
70 page->next_hash->pprev_hash = page->pprev_hash;
71 *page->pprev_hash = page->next_hash;
72 page->pprev_hash = NULL;
74 atomic_dec(&page_cache_size);
78 * Remove a page from the page cache and free it. Caller has to make
79 * sure the page is locked and that nobody else uses it - or that usage
80 * is safe.
82 void remove_inode_page(struct page *page)
84 if (!PageLocked(page))
85 PAGE_BUG(page);
87 spin_lock(&pagecache_lock);
88 remove_page_from_inode_queue(page);
89 remove_page_from_hash_queue(page);
90 page->mapping = NULL;
91 spin_unlock(&pagecache_lock);
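/*
 * Invalidate the pages of an inode: walk inode->i_data.pages, skip any page
 * that is currently locked, and drop the rest from the LRU, from the inode
 * queue and from the hash, releasing the page cache reference.
 */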
94 void invalidate_inode_pages(struct inode * inode)
96 struct list_head *head, *curr;
97 struct page * page;
99 head = &inode->i_data.pages;
100 spin_lock(&pagecache_lock);
101 curr = head->next;
103 while (curr != head) {
104 page = list_entry(curr, struct page, list);
105 curr = curr->next;
107 /* We cannot invalidate a locked page */
108 if (PageLocked(page))
109 continue;
111 lru_cache_del(page);
113 remove_page_from_inode_queue(page);
114 remove_page_from_hash_queue(page);
115 page->mapping = NULL;
116 page_cache_release(page);
118 spin_unlock(&pagecache_lock);
122 * Truncate the page cache at a set offset, removing the pages
123 * that are beyond that offset (and zeroing out partial pages).
125 void truncate_inode_pages(struct inode * inode, unsigned long start)
127 struct list_head *head, *curr;
128 struct page * page;
129 unsigned partial = start & (PAGE_CACHE_SIZE - 1);
131 start = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
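/*
 * At this point 'start' has been converted from a byte offset into the
 * index of the first wholly truncated page, while 'partial' holds the byte
 * offset within the last page that is kept (zero if the truncation point
 * is page aligned, i.e. there is no partial page to clear).
 */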
133 repeat:
134 head = &inode->i_data.pages;
135 spin_lock(&pagecache_lock);
136 curr = head->next;
137 while (curr != head) {
138 unsigned long offset;
140 page = list_entry(curr, struct page, list);
141 curr = curr->next;
143 offset = page->pg_offset;
145 /* page wholly truncated - free it */
146 if (offset >= start) {
147 get_page(page);
148 spin_unlock(&pagecache_lock);
150 lock_page(page);
152 if (!inode->i_op->flushpage ||
153 inode->i_op->flushpage(inode, page, 0))
154 lru_cache_del(page);
157 * We remove the page from the page cache
158 * _after_ we have destroyed all buffer-cache
159 * references to it. Otherwise some other process
160 * might think this inode page is not in the
161 * page cache and create a buffer-cache alias
162 * to it causing all sorts of fun problems ...
164 remove_inode_page(page);
166 UnlockPage(page);
167 page_cache_release(page);
168 page_cache_release(page);
171 * We have done things without the pagecache lock,
172 * so we'll have to repeat the scan.
173 * It's not possible to deadlock here because
174 * we are guaranteed to make progress. (ie. we have
175 * just removed a page)
177 goto repeat;
180 * there is only one partial page possible.
182 if (!partial)
183 continue;
185 /* and it's the one preceding the first wholly truncated page */
186 if ((offset + 1) != start)
187 continue;
189 /* partial truncate, clear end of page */
190 get_page(page);
191 spin_unlock(&pagecache_lock);
193 lock_page(page);
195 memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
196 if (inode->i_op->flushpage)
197 inode->i_op->flushpage(inode, page, partial);
199 partial = 0;
202 * we have dropped the spinlock so we have to
203 * restart.
205 UnlockPage(page);
206 page_cache_release(page);
207 goto repeat;
209 spin_unlock(&pagecache_lock);
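/*
 * shrink_mmap() is the page reclaim pass over the LRU list: starting from
 * the oldest end it skips recently referenced pages and pages that do not
 * match the gfp_mask zone constraints, then tries to free buffer heads,
 * swap-cache entries and unused page-cache pages.  'priority' bounds how
 * much of the LRU is scanned in one call; the return value is 1 as soon as
 * some progress has been made, 0 otherwise.
 */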
212 int shrink_mmap(int priority, int gfp_mask)
214 int ret = 0, count;
215 LIST_HEAD(young);
216 LIST_HEAD(old);
217 LIST_HEAD(forget);
218 struct list_head * page_lru, * dispose;
219 struct page * page;
221 count = nr_lru_pages / (priority+1);
223 spin_lock(&pagemap_lru_lock);
225 while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
226 page = list_entry(page_lru, struct page, lru);
227 list_del(page_lru);
229 dispose = &lru_cache;
230 if (test_and_clear_bit(PG_referenced, &page->flags))
231 /* Roll the page to the top of the lru list;
232 * we could also be more aggressive and put
233 * the page in the young-dispose-list, so
234 * that we avoid freeing young pages in each pass.
236 goto dispose_continue;
238 dispose = &old;
239 /* don't count passes over non-DMA pages */
240 if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
241 goto dispose_continue;
242 if (!(gfp_mask & __GFP_HIGHMEM) && PageHighMem(page))
243 goto dispose_continue;
245 count--;
247 dispose = &young;
248 if (TryLockPage(page))
249 goto dispose_continue;
251 /* Release the pagemap_lru lock even if the page is not yet
252 queued in any lru queue, since we have just locked down
253 the page so nobody else may SMP race with us running
254 a lru_cache_del() (lru_cache_del() always runs with the
255 page locked down ;). */
256 spin_unlock(&pagemap_lru_lock);
258 /* avoid unscalable SMP locking */
259 if (!page->buffers && page_count(page) > 1)
260 goto unlock_noput_continue;
262 /* Hold the pagecache_lock spinlock to prevent
263 other tasks from noticing the page while we are looking at its
264 page count. If it's a pagecache page we'll free it
265 in one atomic transaction after checking its page count. */
266 spin_lock(&pagecache_lock);
268 /* avoid freeing the page while it's locked */
269 get_page(page);
271 /* Is it a buffer page? */
272 if (page->buffers) {
273 spin_unlock(&pagecache_lock);
274 if (!try_to_free_buffers(page))
275 goto unlock_continue;
276 /* page was locked, inode can't go away under us */
277 if (!page->mapping) {
278 atomic_dec(&buffermem_pages);
279 goto made_buffer_progress;
281 spin_lock(&pagecache_lock);
285 * We can't free pages unless there's just one user
286 * (count == 2 because we added one ourselves above).
288 if (page_count(page) != 2)
289 goto cache_unlock_continue;
292 * Is it a swap-cache page? If so, we want to
293 * drop it if it is no longer used, even if it
294 * were to be marked referenced..
296 if (PageSwapCache(page)) {
297 spin_unlock(&pagecache_lock);
298 __delete_from_swap_cache(page);
299 goto made_inode_progress;
302 /* is it a page-cache page? */
303 if (page->mapping) {
304 dispose = &old;
305 if (!pgcache_under_min())
307 remove_page_from_inode_queue(page);
308 remove_page_from_hash_queue(page);
309 page->mapping = NULL;
310 spin_unlock(&pagecache_lock);
311 goto made_inode_progress;
313 goto cache_unlock_continue;
316 dispose = &forget;
317 printk(KERN_ERR "shrink_mmap: unknown LRU page!\n");
319 cache_unlock_continue:
320 spin_unlock(&pagecache_lock);
321 unlock_continue:
322 UnlockPage(page);
323 put_page(page);
324 dispose_relock_continue:
325 /* even if the dispose list is local, a truncate_inode_pages()
326 may remove a page from its queue, so always
327 synchronize with the lru lock while accessing the
328 page->lru field */
329 spin_lock(&pagemap_lru_lock);
330 list_add(page_lru, dispose);
331 continue;
333 unlock_noput_continue:
334 UnlockPage(page);
335 goto dispose_relock_continue;
337 dispose_continue:
338 list_add(page_lru, dispose);
340 goto out;
342 made_inode_progress:
343 page_cache_release(page);
344 made_buffer_progress:
345 UnlockPage(page);
346 put_page(page);
347 ret = 1;
348 spin_lock(&pagemap_lru_lock);
349 /* nr_lru_pages needs the spinlock */
350 nr_lru_pages--;
352 out:
353 list_splice(&young, &lru_cache);
354 list_splice(&old, lru_cache.prev);
356 spin_unlock(&pagemap_lru_lock);
358 return ret;
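/*
 * Walk one page-hash chain looking for the page with the given mapping and
 * offset; 'page' is the head of the chain (*hash) and the pagecache_lock
 * must be held.  A hit is marked PG_referenced so the LRU scan keeps it
 * around a little longer.  Returns the page, or NULL if it is not hashed.
 */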
361 static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
363 goto inside;
365 for (;;) {
366 page = page->next_hash;
367 inside:
368 if (!page)
369 goto not_found;
370 if (page->mapping != mapping)
371 continue;
372 if (page->pg_offset == offset)
373 break;
375 set_bit(PG_referenced, &page->flags);
376 not_found:
377 return page;
381 * By the time this is called, the page is locked and
382 * we don't have to worry about any races any more.
384 * Start the IO..
386 static int writeout_one_page(struct page *page)
388 struct buffer_head *bh, *head = page->buffers;
390 bh = head;
391 do {
392 if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
393 continue;
395 bh->b_flushtime = 0;
396 ll_rw_block(WRITE, 1, &bh);
397 } while ((bh = bh->b_this_page) != head);
398 return 0;
401 static int waitfor_one_page(struct page *page)
403 int error = 0;
404 struct buffer_head *bh, *head = page->buffers;
406 bh = head;
407 do {
408 wait_on_buffer(bh);
409 if (buffer_req(bh) && !buffer_uptodate(bh))
410 error = -EIO;
411 } while ((bh = bh->b_this_page) != head);
412 return error;
415 static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end, int (*fn)(struct page *))
417 struct list_head *head, *curr;
418 struct page *page;
419 int retval = 0;
421 head = &inode->i_data.pages;
423 spin_lock(&pagecache_lock);
424 curr = head->next;
425 while (curr != head) {
426 page = list_entry(curr, struct page, list);
427 curr = curr->next;
428 if (!page->buffers)
429 continue;
430 if (page->pg_offset >= end)
431 continue;
432 if (page->pg_offset < start)
433 continue;
435 get_page(page);
436 spin_unlock(&pagecache_lock);
437 lock_page(page);
439 /* The buffers could have been freed while we waited for the page lock */
440 if (page->buffers)
441 retval |= fn(page);
443 UnlockPage(page);
444 spin_lock(&pagecache_lock);
445 curr = page->list.next;
446 page_cache_release(page);
448 spin_unlock(&pagecache_lock);
450 return retval;
454 * Two-stage data sync: first start the IO, then go back and
455 * collect the information..
457 int generic_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end)
459 unsigned long start_idx = start >> PAGE_CACHE_SHIFT;
460 unsigned long end_idx = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
461 int retval;
463 retval = do_buffer_fdatasync(inode, start_idx, end_idx, writeout_one_page);
464 retval |= do_buffer_fdatasync(inode, start_idx, end_idx, waitfor_one_page);
465 return retval;
469 * This adds a page to the page cache, starting out as locked,
470 * owned by us, referenced, but not uptodate and with no errors.
472 static inline void __add_to_page_cache(struct page * page,
473 struct address_space *mapping, unsigned long offset,
474 struct page **hash)
476 struct page *alias;
477 unsigned long flags;
479 flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced));
480 page->flags = flags | (1 << PG_locked);
481 get_page(page);
482 page->pg_offset = offset;
483 add_page_to_inode_queue(mapping, page);
484 __add_page_to_hash_queue(page, hash);
485 lru_cache_add(page);
486 alias = __find_page_nolock(mapping, offset, *hash);
487 if (alias != page)
488 BUG();
491 void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
493 spin_lock(&pagecache_lock);
494 __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
495 spin_unlock(&pagecache_lock);
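/*
 * Like add_to_page_cache(), but only inserts the page if no page is already
 * hashed at that offset.  Returns 0 if the page was added and 1 if another
 * page was already present, in which case the caller keeps ownership of
 * 'page' and has to free it itself (see page_cache_read() below).
 */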
498 int add_to_page_cache_unique(struct page * page,
499 struct address_space *mapping, unsigned long offset,
500 struct page **hash)
502 int err;
503 struct page *alias;
505 spin_lock(&pagecache_lock);
506 alias = __find_page_nolock(mapping, offset, *hash);
508 err = 1;
509 if (!alias) {
510 __add_to_page_cache(page,mapping,offset,hash);
511 err = 0;
514 spin_unlock(&pagecache_lock);
515 return err;
519 * This adds the requested page to the page cache if it isn't already there,
520 * and schedules an I/O to read in its contents from disk.
522 static inline void page_cache_read(struct file * file, unsigned long offset)
524 struct inode *inode = file->f_dentry->d_inode;
525 struct page **hash = page_hash(&inode->i_data, offset);
526 struct page *page;
528 spin_lock(&pagecache_lock);
529 page = __find_page_nolock(&inode->i_data, offset, *hash);
530 spin_unlock(&pagecache_lock);
531 if (page)
532 return;
534 page = page_cache_alloc();
535 if (!page)
536 return;
538 if (!add_to_page_cache_unique(page, &inode->i_data, offset, hash)) {
539 inode->i_op->readpage(file, page);
540 page_cache_release(page);
541 return;
544 * We arrive here in the unlikely event that someone
545 * raced with us and added our page to the cache first.
547 page_cache_free(page);
548 return;
552 * Read in an entire cluster at once. A cluster is usually a 64k-
553 * aligned block that includes the address requested in "offset."
555 static void read_cluster_nonblocking(struct file * file, unsigned long offset)
557 unsigned long filesize = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
558 unsigned long pages = CLUSTER_PAGES;
560 offset = CLUSTER_OFFSET(offset);
561 while ((pages-- > 0) && (offset < filesize)) {
562 page_cache_read(file, offset);
563 offset ++;
566 return;
570 * Wait for a page to get unlocked.
572 * This must be called with the caller "holding" the page,
573 * ie with increased "page->count" so that the page won't
574 * go away during the wait..
576 void ___wait_on_page(struct page *page)
578 struct task_struct *tsk = current;
579 DECLARE_WAITQUEUE(wait, tsk);
581 add_wait_queue(&page->wait, &wait);
582 do {
583 run_task_queue(&tq_disk);
584 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
585 if (!PageLocked(page))
586 break;
587 schedule();
588 } while (PageLocked(page));
589 tsk->state = TASK_RUNNING;
590 remove_wait_queue(&page->wait, &wait);
594 * Get an exclusive lock on the page..
596 void lock_page(struct page *page)
598 while (TryLockPage(page))
599 ___wait_on_page(page);
604 * a rather lightweight function, finding and getting a reference to a
605 * hashed page atomically, waiting for it if it's locked.
607 struct page * __find_get_page (struct address_space *mapping,
608 unsigned long offset, struct page **hash)
610 struct page *page;
613 * We scan the hash list read-only. Addition to and removal from
614 * the hash-list need a held write-lock.
616 repeat:
617 spin_lock(&pagecache_lock);
618 page = __find_page_nolock(mapping, offset, *hash);
619 if (page)
620 get_page(page);
621 spin_unlock(&pagecache_lock);
623 /* Found the page, sleep if locked. */
624 if (page && PageLocked(page)) {
625 struct task_struct *tsk = current;
626 DECLARE_WAITQUEUE(wait, tsk);
628 run_task_queue(&tq_disk);
630 __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
631 add_wait_queue(&page->wait, &wait);
633 if (PageLocked(page))
634 schedule();
635 __set_task_state(tsk, TASK_RUNNING);
636 remove_wait_queue(&page->wait, &wait);
639 * The page might have been unhashed meanwhile. It's
640 * not freed though because we hold a reference to it.
641 * If this is the case then it will be freed _here_,
642 * and we recheck the hash anyway.
644 page_cache_release(page);
645 goto repeat;
648 * It's not locked so we can return the page and we hold
649 * a reference to it.
651 return page;
655 * Get the lock to a page atomically.
657 struct page * __find_lock_page (struct address_space *mapping,
658 unsigned long offset, struct page **hash)
660 struct page *page;
663 * We scan the hash list read-only. Addition to and removal from
664 * the hash-list need a held write-lock.
666 repeat:
667 spin_lock(&pagecache_lock);
668 page = __find_page_nolock(mapping, offset, *hash);
669 if (page)
670 get_page(page);
671 spin_unlock(&pagecache_lock);
673 /* Found the page, sleep if locked. */
674 if (page && TryLockPage(page)) {
675 struct task_struct *tsk = current;
676 DECLARE_WAITQUEUE(wait, tsk);
678 run_task_queue(&tq_disk);
680 __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
681 add_wait_queue(&page->wait, &wait);
683 if (PageLocked(page))
684 schedule();
685 __set_task_state(tsk, TASK_RUNNING);
686 remove_wait_queue(&page->wait, &wait);
689 * The page might have been unhashed meanwhile. It's
690 * not freed though because we hold a reference to it.
691 * If this is the case then it will be freed _here_,
692 * and we recheck the hash anyway.
694 page_cache_release(page);
695 goto repeat;
698 * It's not locked so we can return the page and we hold
699 * a reference to it.
701 return page;
704 #if 0
705 #define PROFILE_READAHEAD
706 #define DEBUG_READAHEAD
707 #endif
710 * Read-ahead profiling information
711 * --------------------------------
712 * Every PROFILE_MAXREADCOUNT reads, the following information is written
713 * to the syslog:
714 * Percentage of asynchronous read-ahead.
715 * Average value of the read-ahead context fields.
716 * If DEBUG_READAHEAD is defined, a snapshot of these fields is also written
717 * to the syslog.
720 #ifdef PROFILE_READAHEAD
722 #define PROFILE_MAXREADCOUNT 1000
724 static unsigned long total_reada;
725 static unsigned long total_async;
726 static unsigned long total_ramax;
727 static unsigned long total_ralen;
728 static unsigned long total_rawin;
730 static void profile_readahead(int async, struct file *filp)
732 unsigned long flags;
734 ++total_reada;
735 if (async)
736 ++total_async;
738 total_ramax += filp->f_ramax;
739 total_ralen += filp->f_ralen;
740 total_rawin += filp->f_rawin;
742 if (total_reada > PROFILE_MAXREADCOUNT) {
743 save_flags(flags);
744 cli();
745 if (!(total_reada > PROFILE_MAXREADCOUNT)) {
746 restore_flags(flags);
747 return;
750 printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
751 total_ramax/total_reada,
752 total_ralen/total_reada,
753 total_rawin/total_reada,
754 (total_async*100)/total_reada);
755 #ifdef DEBUG_READAHEAD
756 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
757 filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
758 #endif
760 total_reada = 0;
761 total_async = 0;
762 total_ramax = 0;
763 total_ralen = 0;
764 total_rawin = 0;
766 restore_flags(flags);
769 #endif /* defined PROFILE_READAHEAD */
772 * Read-ahead context:
773 * -------------------
774 * The read ahead context fields of the "struct file" are the following:
775 * - f_raend : position of the first byte after the last page we tried to
776 * read ahead.
777 * - f_ramax : current read-ahead maximum size.
778 * - f_ralen : length of the current IO read block we tried to read-ahead.
779 * - f_rawin : length of the current read-ahead window.
780 * if last read-ahead was synchronous then
781 * f_rawin = f_ralen
782 * otherwise (was asynchronous)
783 * f_rawin = previous value of f_ralen + f_ralen
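 *
 * A purely illustrative example of the definitions above: after a
 * synchronous read-ahead of 16k starting at file position 64k we have
 * f_ralen = 16k, f_rawin = 16k and f_raend = 80k.  If the next,
 * asynchronous read-ahead then covers a further 32k, f_ralen becomes 32k,
 * f_rawin becomes the previous f_ralen plus the new one (16k + 32k = 48k)
 * and f_raend moves on to 112k.
 *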
785 * Read-ahead limits:
786 * ------------------
787 * MIN_READAHEAD : minimum read-ahead size when reading ahead.
788 * MAX_READAHEAD : maximum read-ahead size when reading ahead.
790 * Synchronous read-ahead benefits:
791 * --------------------------------
792 * Using a reasonable IO transfer length from peripheral devices increases system
793 * performance.
794 * Reasonable means, in this context, neither too large nor too small.
795 * The actual maximum value is:
796 * MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
797 * and 32K if defined (4K page size assumed).
799 * Asynchronous read-ahead benefits:
800 * ---------------------------------
801 * Overlapping the next read request with user process execution increases system
802 * performance.
804 * Read-ahead risks:
805 * -----------------
806 * We have to guess which further data will be needed by the user process.
807 * If these data are often not really needed, it's bad for system
808 * performance.
809 * However, we know that files are often accessed sequentially by
810 * application programs, and it seems possible to have a reasonably good
811 * strategy for that guessing.
812 * We only try to read ahead in files that seem to be read sequentially.
814 * Asynchronous read-ahead risks:
815 * ------------------------------
816 * In order to maximize overlapping, we must start some asynchronous read
817 * request from the device, as soon as possible.
818 * We must be very careful about:
819 * - The number of effective pending IO read requests.
820 * ONE seems to be the only reasonable value.
821 * - The total memory pool usage for the file access stream.
822 * This maximum memory usage is implicitly 2 IO read chunks:
823 * 2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
824 * 64k if defined (4K page size assumed).
827 static inline int get_max_readahead(struct inode * inode)
829 if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
830 return MAX_READAHEAD;
831 return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
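/*
 * generic_file_readahead() decides, for the page at 'ppos', whether to read
 * ahead synchronously (the current page is still locked, i.e. its IO has not
 * completed yet), asynchronously (we are inside the previous read-ahead
 * window), or not at all.  It then issues page_cache_read() calls for the
 * pages beyond the current window, up to max_ahead, and updates the
 * f_ralen/f_rawin/f_raend/f_ramax context in the struct file.
 */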
834 static void generic_file_readahead(int reada_ok,
835 struct file * filp, struct inode * inode,
836 unsigned long ppos, struct page * page)
838 unsigned long max_ahead, ahead;
839 unsigned long raend;
840 int max_readahead = get_max_readahead(inode);
842 raend = filp->f_raend & PAGE_CACHE_MASK;
843 max_ahead = 0;
846 * The current page is locked.
847 * If the current position is inside the previous read IO request, do not
848 * try to reread previously read ahead pages.
849 * Otherwise decide whether or not to read ahead some pages synchronously.
850 * If we are not going to read ahead, set the read ahead context for this
851 * page only.
853 if (PageLocked(page)) {
854 if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
855 raend = ppos;
856 if (raend < inode->i_size)
857 max_ahead = filp->f_ramax;
858 filp->f_rawin = 0;
859 filp->f_ralen = PAGE_CACHE_SIZE;
860 if (!max_ahead) {
861 filp->f_raend = ppos + filp->f_ralen;
862 filp->f_rawin += filp->f_ralen;
867 * The current page is not locked.
868 * If we were reading ahead,
869 * if the current max read-ahead size is not zero, and
870 * if the current position is inside the last read-ahead IO request,
871 * then this is the moment to try to read ahead asynchronously.
872 * We will later force an unplug of the device in order to start the asynchronous read IO.
874 else if (reada_ok && filp->f_ramax && raend >= PAGE_CACHE_SIZE &&
875 ppos <= raend && ppos + filp->f_ralen >= raend) {
877 * Add ONE page to max_ahead in order to try to have about the same IO max size
878 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
879 * Compute the position of the last page we have tried to read in order to
880 * begin to read ahead just at the next page.
882 raend -= PAGE_CACHE_SIZE;
883 if (raend < inode->i_size)
884 max_ahead = filp->f_ramax + PAGE_CACHE_SIZE;
886 if (max_ahead) {
887 filp->f_rawin = filp->f_ralen;
888 filp->f_ralen = 0;
889 reada_ok = 2;
893 * Try to read ahead pages.
894 * We hope that ll_rw_blk() plug/unplug, coalescing, request sorting and the
895 * scheduler will work well enough to let us avoid issuing too many poor actual IO requests.
897 ahead = 0;
898 while (ahead < max_ahead) {
899 ahead += PAGE_CACHE_SIZE;
900 if ((raend + ahead) >= inode->i_size)
901 break;
902 page_cache_read(filp, (raend + ahead) >> PAGE_CACHE_SHIFT);
905 * If we tried to read ahead some pages:
906 * If we tried to read ahead asynchronously,
907 * try to force an unplug of the device in order to start the asynchronous
908 * read IO request.
909 * Update the read-ahead context.
910 * Store the length of the current read-ahead window.
911 * Double the current max read-ahead size.
912 * This heuristic avoids doing large IO for files that are not really
913 * accessed sequentially.
915 if (ahead) {
916 if (reada_ok == 2) {
917 run_task_queue(&tq_disk);
920 filp->f_ralen += ahead;
921 filp->f_rawin += filp->f_ralen;
922 filp->f_raend = raend + ahead + PAGE_CACHE_SIZE;
924 filp->f_ramax += filp->f_ramax;
926 if (filp->f_ramax > max_readahead)
927 filp->f_ramax = max_readahead;
929 #ifdef PROFILE_READAHEAD
930 profile_readahead((reada_ok == 2), filp);
931 #endif
934 return;
939 * This is a generic file read routine, and uses the
940 * inode->i_op->readpage() function for the actual low-level
941 * stuff.
943 * This is really ugly. But the goto's actually try to clarify some
944 * of the logic when it comes to error handling etc.
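 *
 * The read_descriptor_t/actor split lets the same loop serve both read(2),
 * where file_read_actor() copies each chunk to the user buffer, and
 * sendfile(2), where file_send_actor() writes each chunk to another file.
 * The actor returns how many bytes it actually consumed and records any
 * error in desc->error.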
946 void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
948 struct dentry *dentry = filp->f_dentry;
949 struct inode *inode = dentry->d_inode;
950 unsigned long pos, pgpos;
951 struct page *cached_page;
952 int reada_ok;
953 int error;
954 int max_readahead = get_max_readahead(inode);
955 unsigned long pgoff;
957 cached_page = NULL;
958 pos = *ppos;
959 pgpos = pos & PAGE_CACHE_MASK;
960 pgoff = pos >> PAGE_CACHE_SHIFT;
962 * If the current position is outside the previous read-ahead window,
963 * we reset the current read-ahead context and set read ahead max to zero
964 * (it will be set to just the needed value later),
965 * otherwise, we assume that the file accesses are sequential enough to
966 * continue read-ahead.
968 if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) {
969 reada_ok = 0;
970 filp->f_raend = 0;
971 filp->f_ralen = 0;
972 filp->f_ramax = 0;
973 filp->f_rawin = 0;
974 } else {
975 reada_ok = 1;
979 * Adjust the current value of read-ahead max.
980 * If the read operation stays within the first half page, force no readahead.
981 * Otherwise try to increase read-ahead max just enough to do the read request.
982 * Then, use at least MIN_READAHEAD if read-ahead is ok,
983 * and at most MAX_READAHEAD in all cases.
984 if (pos + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
985 filp->f_ramax = 0;
986 } else {
987 unsigned long needed;
989 needed = ((pos + desc->count) & PAGE_CACHE_MASK) - pgpos;
991 if (filp->f_ramax < needed)
992 filp->f_ramax = needed;
994 if (reada_ok && filp->f_ramax < MIN_READAHEAD)
995 filp->f_ramax = MIN_READAHEAD;
996 if (filp->f_ramax > max_readahead)
997 filp->f_ramax = max_readahead;
1000 for (;;) {
1001 struct page *page, **hash;
1003 if (pos >= inode->i_size)
1004 break;
1007 * Try to find the data in the page cache..
1009 hash = page_hash(&inode->i_data, pgoff);
1011 spin_lock(&pagecache_lock);
1012 page = __find_page_nolock(&inode->i_data, pgoff, *hash);
1013 if (!page)
1014 goto no_cached_page;
1015 found_page:
1016 get_page(page);
1017 spin_unlock(&pagecache_lock);
1019 if (!Page_Uptodate(page))
1020 goto page_not_up_to_date;
1021 page_ok:
1023 * Ok, we have the page, and it's up-to-date, so
1024 * now we can copy it to user space...
1027 unsigned long offset, nr;
1029 offset = pos & ~PAGE_CACHE_MASK;
1030 nr = PAGE_CACHE_SIZE - offset;
1031 if (nr > inode->i_size - pos)
1032 nr = inode->i_size - pos;
1035 * The actor routine returns how many bytes were actually used..
1036 * NOTE! This may not be the same as how much of a user buffer
1037 * we filled up (we may be padding etc), so we can only update
1038 * "pos" here (the actor routine has to update the user buffer
1039 * pointers and the remaining count).
1041 nr = actor(desc, page, offset, nr);
1042 pos += nr;
1043 pgoff = pos >> PAGE_CACHE_SHIFT;
1044 page_cache_release(page);
1045 if (nr && desc->count)
1046 continue;
1047 break;
1051 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
1053 page_not_up_to_date:
1054 generic_file_readahead(reada_ok, filp, inode,
1055 pos & PAGE_CACHE_MASK, page);
1057 if (Page_Uptodate(page))
1058 goto page_ok;
1060 /* Get exclusive access to the page ... */
1061 lock_page(page);
1062 if (Page_Uptodate(page)) {
1063 UnlockPage(page);
1064 goto page_ok;
1067 readpage:
1068 /* ... and start the actual read. The read will unlock the page. */
1069 error = inode->i_op->readpage(filp, page);
1071 if (!error) {
1072 if (Page_Uptodate(page))
1073 goto page_ok;
1075 /* Again, try some read-ahead while waiting for the page to finish.. */
1076 generic_file_readahead(reada_ok, filp, inode,
1077 pos & PAGE_CACHE_MASK, page);
1078 wait_on_page(page);
1079 if (Page_Uptodate(page))
1080 goto page_ok;
1081 error = -EIO;
1084 /* UHHUH! A synchronous read error occurred. Report it */
1085 desc->error = error;
1086 page_cache_release(page);
1087 break;
1089 no_cached_page:
1091 * Ok, it wasn't cached, so we need to create a new
1092 * page..
1094 * We get here with the page cache lock held.
1096 if (!cached_page) {
1097 spin_unlock(&pagecache_lock);
1098 cached_page = page_cache_alloc();
1099 if (!cached_page) {
1100 desc->error = -ENOMEM;
1101 break;
1105 * Somebody may have added the page while we
1106 * dropped the page cache lock. Check for that.
1108 spin_lock(&pagecache_lock);
1109 page = __find_page_nolock(&inode->i_data, pgoff, *hash);
1110 if (page)
1111 goto found_page;
1115 * Ok, add the new page to the hash-queues...
1117 page = cached_page;
1118 __add_to_page_cache(page, &inode->i_data, pgoff, hash);
1119 spin_unlock(&pagecache_lock);
1120 cached_page = NULL;
1122 goto readpage;
1125 *ppos = pos;
1126 filp->f_reada = 1;
1127 if (cached_page)
1128 page_cache_free(cached_page);
1129 UPDATE_ATIME(inode);
1132 static int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1134 unsigned long kaddr;
1135 unsigned long left, count = desc->count;
1137 if (size > count)
1138 size = count;
1140 * FIXME: We cannot yet sleep with kmaps held.
1142 kaddr = kmap(page, KM_READ);
1143 left = __copy_to_user(desc->buf, (void *)(kaddr+offset), size);
1144 kunmap(kaddr, KM_READ);
1146 if (left) {
1147 size -= left;
1148 desc->error = -EFAULT;
1150 desc->count = count - size;
1151 desc->written += size;
1152 desc->buf += size;
1153 return size;
1157 * This is the "read()" routine for all filesystems
1158 * that can use the page cache directly.
1160 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1162 ssize_t retval;
1164 retval = -EFAULT;
1165 if (access_ok(VERIFY_WRITE, buf, count)) {
1166 retval = 0;
1168 if (count) {
1169 read_descriptor_t desc;
1171 desc.written = 0;
1172 desc.count = count;
1173 desc.buf = buf;
1174 desc.error = 0;
1175 do_generic_file_read(filp, ppos, &desc, file_read_actor);
1177 retval = desc.written;
1178 if (!retval)
1179 retval = desc.error;
1182 return retval;
1185 static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
1187 unsigned long kaddr;
1188 ssize_t written;
1189 unsigned long count = desc->count;
1190 struct file *file = (struct file *) desc->buf;
1191 mm_segment_t old_fs;
1193 if (size > count)
1194 size = count;
1195 old_fs = get_fs();
1196 set_fs(KERNEL_DS);
1197 kaddr = kmap(page, KM_READ);
1198 written = file->f_op->write(file, (char *)kaddr + offset, size, &file->f_pos);
1199 kunmap(kaddr, KM_READ);
1200 set_fs(old_fs);
1201 if (written < 0) {
1202 desc->error = written;
1203 written = 0;
1205 desc->count = count - written;
1206 desc->written += written;
1207 return written;
1210 asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
1212 ssize_t retval;
1213 struct file * in_file, * out_file;
1214 struct inode * in_inode, * out_inode;
1217 * Get input file, and verify that it is ok..
1219 retval = -EBADF;
1220 in_file = fget(in_fd);
1221 if (!in_file)
1222 goto out;
1223 if (!(in_file->f_mode & FMODE_READ))
1224 goto fput_in;
1225 retval = -EINVAL;
1226 in_inode = in_file->f_dentry->d_inode;
1227 if (!in_inode)
1228 goto fput_in;
1229 if (!in_inode->i_op || !in_inode->i_op->readpage)
1230 goto fput_in;
1231 retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
1232 if (retval)
1233 goto fput_in;
1236 * Get output file, and verify that it is ok..
1238 retval = -EBADF;
1239 out_file = fget(out_fd);
1240 if (!out_file)
1241 goto fput_in;
1242 if (!(out_file->f_mode & FMODE_WRITE))
1243 goto fput_out;
1244 retval = -EINVAL;
1245 if (!out_file->f_op || !out_file->f_op->write)
1246 goto fput_out;
1247 out_inode = out_file->f_dentry->d_inode;
1248 if (!out_inode)
1249 goto fput_out;
1250 retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
1251 if (retval)
1252 goto fput_out;
1254 retval = 0;
1255 if (count) {
1256 read_descriptor_t desc;
1257 loff_t pos = 0, *ppos;
1259 retval = -EFAULT;
1260 ppos = &in_file->f_pos;
1261 if (offset) {
1262 if (get_user(pos, offset))
1263 goto fput_out;
1264 ppos = &pos;
1267 desc.written = 0;
1268 desc.count = count;
1269 desc.buf = (char *) out_file;
1270 desc.error = 0;
1271 do_generic_file_read(in_file, ppos, &desc, file_send_actor);
1273 retval = desc.written;
1274 if (!retval)
1275 retval = desc.error;
1276 if (offset)
1277 put_user(pos, offset);
1280 fput_out:
1281 fput(out_file);
1282 fput_in:
1283 fput(in_file);
1284 out:
1285 return retval;
1289 * filemap_nopage() is invoked via the vma operations vector for a
1290 * mapped memory region to read in file data during a page fault.
1292 * The goto's are kind of ugly, but this streamlines the normal case of having
1293 * it in the page cache, and handles the special cases reasonably without
1294 * having a lot of duplicated code.
1296 * XXX - at some point, this should return unique values to indicate to
1297 * the caller whether this is EIO, OOM, or SIGBUS.
1299 static struct page * filemap_nopage(struct vm_area_struct * area,
1300 unsigned long address, int no_share)
1302 struct file *file = area->vm_file;
1303 struct dentry *dentry = file->f_dentry;
1304 struct inode *inode = dentry->d_inode;
1305 struct page *page, **hash, *old_page;
1306 unsigned long size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1308 unsigned long pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1311 * Semantics for shared and private memory areas are different
1312 * past the end of the file. A shared mapping past the last page
1313 * of the file is an error and results in a SIGBUS, while a
1314 * private mapping just maps in a zero page.
1316 if ((pgoff >= size) &&
1317 (area->vm_flags & VM_SHARED) && (area->vm_mm == current->mm))
1318 return NULL;
1321 * Do we have something in the page cache already?
1323 hash = page_hash(&inode->i_data, pgoff);
1324 retry_find:
1325 page = __find_get_page(&inode->i_data, pgoff, hash);
1326 if (!page)
1327 goto no_cached_page;
1330 * Ok, found a page in the page cache, now we need to check
1331 * that it's up-to-date.
1333 if (!Page_Uptodate(page))
1334 goto page_not_uptodate;
1336 success:
1338 * Found the page and have a reference on it, need to check sharing
1339 * and possibly copy it over to another page..
1341 old_page = page;
1342 if (no_share) {
1343 struct page *new_page = page_cache_alloc();
1345 if (new_page) {
1346 if (PageHighMem(new_page) || PageHighMem(old_page))
1347 BUG();
1348 copy_highpage(new_page, old_page);
1349 flush_page_to_ram(new_page);
1351 page_cache_release(page);
1352 return new_page;
1355 flush_page_to_ram(old_page);
1356 return old_page;
1358 no_cached_page:
1360 * If the requested offset is within our file, try to read a whole
1361 * cluster of pages at once.
1363 * Otherwise, we're off the end of a privately mapped file,
1364 * so we need to map a zero page.
1366 if (pgoff < size)
1367 read_cluster_nonblocking(file, pgoff);
1368 else
1369 page_cache_read(file, pgoff);
1372 * The page we want has now been added to the page cache.
1373 * In the unlikely event that someone removed it in the
1374 * meantime, we'll just come back here and read it again.
1376 goto retry_find;
1378 page_not_uptodate:
1379 lock_page(page);
1380 if (Page_Uptodate(page)) {
1381 UnlockPage(page);
1382 goto success;
1385 if (!inode->i_op->readpage(file, page)) {
1386 wait_on_page(page);
1387 if (Page_Uptodate(page))
1388 goto success;
1392 * Umm, take care of errors if the page isn't up-to-date.
1393 * Try to re-read it _once_. We do this synchronously,
1394 * because there really aren't any performance issues here
1395 * and we need to check for errors.
1397 lock_page(page);
1398 if (Page_Uptodate(page)) {
1399 UnlockPage(page);
1400 goto success;
1402 ClearPageError(page);
1403 if (!inode->i_op->readpage(file, page)) {
1404 wait_on_page(page);
1405 if (Page_Uptodate(page))
1406 goto success;
1410 * Things didn't work out. Return zero to tell the
1411 * mm layer so, possibly freeing the page cache page first.
1413 page_cache_release(page);
1414 return NULL;
1418 * Tries to write a shared mapped page to its backing store. May return -EIO
1419 * if the disk is full.
1421 static inline int do_write_page(struct inode * inode, struct file * file,
1422 struct page * page, unsigned long offset)
1424 int retval;
1425 unsigned long size;
1426 int (*writepage) (struct file *, struct page *);
1428 size = (offset << PAGE_CACHE_SHIFT) + PAGE_CACHE_SIZE;
1429 /* refuse to extend file size.. */
1430 if (S_ISREG(inode->i_mode)) {
1431 if (size > inode->i_size)
1432 size = inode->i_size;
1433 /* Ho humm.. We should have tested for this earlier */
1434 if (size < offset)
1435 return -EIO;
1437 retval = -EIO;
1438 writepage = inode->i_op->writepage;
1439 lock_page(page);
1441 retval = writepage(file, page);
1443 UnlockPage(page);
1444 return retval;
1447 static int filemap_write_page(struct file *file,
1448 unsigned long offset,
1449 struct page * page,
1450 int wait)
1452 int result;
1453 struct dentry * dentry;
1454 struct inode * inode;
1456 dentry = file->f_dentry;
1457 inode = dentry->d_inode;
1460 * If a task terminates while we're swapping the page, the vma
1461 * and file could be released: try_to_swap_out has done a get_file.
1462 * vma/file is guaranteed to exist in the unmap/sync cases because
1463 * mmap_sem is held.
1465 result = do_write_page(inode, file, page, offset);
1466 return result;
1471 * The page cache takes care of races between somebody
1472 * trying to swap something out and swap something in
1473 * at the same time..
1475 extern void wakeup_bdflush(int);
1476 int filemap_swapout(struct page * page, struct file * file)
1478 int retval = filemap_write_page(file, page->pg_offset, page, 0);
1479 wakeup_bdflush(0);
1480 return retval;
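/*
 * Handle one pte for msync()/unmap.  Without MS_INVALIDATE a present, dirty
 * pte is cleaned and its page written back via filemap_write_page().  With
 * MS_INVALIDATE the pte is cleared as well: swap entries are released,
 * clean pages are simply dropped, and dirty pages are written back unless
 * MS_INVALIDATE was the only flag given.
 */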
1483 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
1484 unsigned long address, unsigned int flags)
1486 unsigned long pgoff;
1487 pte_t pte = *ptep;
1488 struct page *page;
1489 int error;
1491 if (!(flags & MS_INVALIDATE)) {
1492 if (!pte_present(pte))
1493 return 0;
1494 if (!pte_dirty(pte))
1495 return 0;
1496 flush_page_to_ram(pte_page(pte));
1497 flush_cache_page(vma, address);
1498 set_pte(ptep, pte_mkclean(pte));
1499 flush_tlb_page(vma, address);
1500 page = pte_page(pte);
1501 get_page(page);
1502 } else {
1503 if (pte_none(pte))
1504 return 0;
1505 flush_cache_page(vma, address);
1506 pte_clear(ptep);
1507 flush_tlb_page(vma, address);
1508 if (!pte_present(pte)) {
1509 swap_free(pte_to_swp_entry(pte));
1510 return 0;
1512 page = pte_page(pte);
1513 if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
1514 page_cache_free(page);
1515 return 0;
1518 if (PageHighMem(page))
1519 BUG();
1520 pgoff = (address - vma->vm_start) >> PAGE_CACHE_SHIFT;
1521 pgoff += vma->vm_pgoff;
1522 if (page->pg_offset != pgoff) {
1523 printk("weirdness: pgoff=%lu pg_offset=%lu address=%lu vm_start=%lu vm_pgoff=%lu\n",
1524 pgoff, page->pg_offset, address, vma->vm_start, vma->vm_pgoff);
1526 error = filemap_write_page(vma->vm_file, pgoff, page, 1);
1527 page_cache_free(page);
1528 return error;
1531 static inline int filemap_sync_pte_range(pmd_t * pmd,
1532 unsigned long address, unsigned long size,
1533 struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
1535 pte_t * pte;
1536 unsigned long end;
1537 int error;
1539 if (pmd_none(*pmd))
1540 return 0;
1541 if (pmd_bad(*pmd)) {
1542 pmd_ERROR(*pmd);
1543 pmd_clear(pmd);
1544 return 0;
1546 pte = pte_offset(pmd, address);
1547 offset += address & PMD_MASK;
1548 address &= ~PMD_MASK;
1549 end = address + size;
1550 if (end > PMD_SIZE)
1551 end = PMD_SIZE;
1552 error = 0;
1553 do {
1554 error |= filemap_sync_pte(pte, vma, address + offset, flags);
1555 address += PAGE_SIZE;
1556 pte++;
1557 } while (address && (address < end));
1558 return error;
1561 static inline int filemap_sync_pmd_range(pgd_t * pgd,
1562 unsigned long address, unsigned long size,
1563 struct vm_area_struct *vma, unsigned int flags)
1565 pmd_t * pmd;
1566 unsigned long offset, end;
1567 int error;
1569 if (pgd_none(*pgd))
1570 return 0;
1571 if (pgd_bad(*pgd)) {
1572 pgd_ERROR(*pgd);
1573 pgd_clear(pgd);
1574 return 0;
1576 pmd = pmd_offset(pgd, address);
1577 offset = address & PGDIR_MASK;
1578 address &= ~PGDIR_MASK;
1579 end = address + size;
1580 if (end > PGDIR_SIZE)
1581 end = PGDIR_SIZE;
1582 error = 0;
1583 do {
1584 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
1585 address = (address + PMD_SIZE) & PMD_MASK;
1586 pmd++;
1587 } while (address && (address < end));
1588 return error;
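/*
 * filemap_sync() drives the page table walk (pgd -> pmd -> pte) over
 * [address, address + size) of the vma, applying filemap_sync_pte() to each
 * entry and flushing the caches and TLB for the range around the walk.  It
 * is used as the vma ->sync operation for msync() and by filemap_unmap()
 * below.
 */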
1591 static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
1592 size_t size, unsigned int flags)
1594 pgd_t * dir;
1595 unsigned long end = address + size;
1596 int error = 0;
1598 dir = pgd_offset(vma->vm_mm, address);
1599 flush_cache_range(vma->vm_mm, end - size, end);
1600 if (address >= end)
1601 BUG();
1602 do {
1603 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
1604 address = (address + PGDIR_SIZE) & PGDIR_MASK;
1605 dir++;
1606 } while (address && (address < end));
1607 flush_tlb_range(vma->vm_mm, end - size, end);
1608 return error;
1612 * This handles (potentially partial) area unmaps..
1614 static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
1616 filemap_sync(vma, start, len, MS_ASYNC);
1620 * Shared mappings need to be able to do the right thing at
1621 * close/unmap/sync. They will also use the private file as
1622 * backing-store for swapping..
1624 static struct vm_operations_struct file_shared_mmap = {
1625 NULL, /* no special open */
1626 NULL, /* no special close */
1627 filemap_unmap, /* unmap - we need to sync the pages */
1628 NULL, /* no special protect */
1629 filemap_sync, /* sync */
1630 NULL, /* advise */
1631 filemap_nopage, /* nopage */
1632 NULL, /* wppage */
1633 filemap_swapout /* swapout */
1637 * Private mappings just need to be able to load in the map.
1639 * (This is actually used for shared mappings as well, if we
1640 * know they can't ever get write permissions..)
1642 static struct vm_operations_struct file_private_mmap = {
1643 NULL, /* open */
1644 NULL, /* close */
1645 NULL, /* unmap */
1646 NULL, /* protect */
1647 NULL, /* sync */
1648 NULL, /* advise */
1649 filemap_nopage, /* nopage */
1650 NULL, /* wppage */
1651 NULL /* swapout */
1654 /* This is used for a general mmap of a disk file */
1656 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1658 struct vm_operations_struct * ops;
1659 struct inode *inode = file->f_dentry->d_inode;
1661 ops = &file_private_mmap;
1662 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1663 if (!inode->i_op || !inode->i_op->writepage)
1664 return -EINVAL;
1665 ops = &file_shared_mmap;
1667 if (!inode->i_sb || !S_ISREG(inode->i_mode))
1668 return -EACCES;
1669 if (!inode->i_op || !inode->i_op->readpage)
1670 return -ENOEXEC;
1671 UPDATE_ATIME(inode);
1672 vma->vm_ops = ops;
1673 return 0;
1678 * The msync() system call.
1681 static int msync_interval(struct vm_area_struct * vma,
1682 unsigned long start, unsigned long end, int flags)
1684 if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
1685 int error;
1686 error = vma->vm_ops->sync(vma, start, end-start, flags);
1687 if (!error && (flags & MS_SYNC)) {
1688 struct file * file = vma->vm_file;
1689 if (file) {
1690 struct dentry * dentry = file->f_dentry;
1691 error = file_fsync(file, dentry);
1694 return error;
1696 return 0;
1699 asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
1701 unsigned long end;
1702 struct vm_area_struct * vma;
1703 int unmapped_error, error = -EINVAL;
1705 down(&current->mm->mmap_sem);
1706 lock_kernel();
1707 if (start & ~PAGE_MASK)
1708 goto out;
1709 len = (len + ~PAGE_MASK) & PAGE_MASK;
1710 end = start + len;
1711 if (end < start)
1712 goto out;
1713 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1714 goto out;
1715 error = 0;
1716 if (end == start)
1717 goto out;
1719 * If the interval [start,end) covers some unmapped address ranges,
1720 * just ignore them, but return -EFAULT at the end.
1722 vma = find_vma(current->mm, start);
1723 unmapped_error = 0;
1724 for (;;) {
1725 /* Still start < end. */
1726 error = -EFAULT;
1727 if (!vma)
1728 goto out;
1729 /* Here start < vma->vm_end. */
1730 if (start < vma->vm_start) {
1731 unmapped_error = -EFAULT;
1732 start = vma->vm_start;
1734 /* Here vma->vm_start <= start < vma->vm_end. */
1735 if (end <= vma->vm_end) {
1736 if (start < end) {
1737 error = msync_interval(vma, start, end, flags);
1738 if (error)
1739 goto out;
1741 error = unmapped_error;
1742 goto out;
1744 /* Here vma->vm_start <= start < vma->vm_end < end. */
1745 error = msync_interval(vma, start, vma->vm_end, flags);
1746 if (error)
1747 goto out;
1748 start = vma->vm_end;
1749 vma = vma->vm_next;
1751 out:
1752 unlock_kernel();
1753 up(&current->mm->mmap_sem);
1754 return error;
1758 * Write to a file through the page cache. This is mainly for the
1759 * benefit of NFS and possibly other network-based file systems.
1761 * We currently put everything into the page cache prior to writing it.
1762 * This is not a problem when writing full pages. With partial pages,
1763 * however, we first have to read the data into the cache, then
1764 * dirty the page, and finally schedule it for writing. Alternatively, we
1765 * could write-through just the portion of data that would go into that
1766 * page, but that would kill performance for applications that write data
1767 * line by line, and it's prone to race conditions.
1769 * Note that this routine doesn't try to keep track of dirty pages. Each
1770 * file system has to do this all by itself, unfortunately.
1771 * okir@monad.swb.de
1773 ssize_t
1774 generic_file_write(struct file *file, const char *buf,
1775 size_t count, loff_t *ppos,
1776 writepage_t write_one_page)
1778 struct dentry *dentry = file->f_dentry;
1779 struct inode *inode = dentry->d_inode;
1780 unsigned long pos = *ppos;
1781 unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
1782 struct page *page, **hash, *cached_page;
1783 unsigned long written;
1784 long status;
1785 int err;
1787 cached_page = NULL;
1789 down(&inode->i_sem);
1790 err = file->f_error;
1791 if (err) {
1792 file->f_error = 0;
1793 goto out;
1796 written = 0;
1798 if (file->f_flags & O_APPEND)
1799 pos = inode->i_size;
1802 * Check whether we've reached the file size limit.
1804 err = -EFBIG;
1805 if (pos >= limit) {
1806 send_sig(SIGXFSZ, current, 0);
1807 goto out;
1810 status = 0;
1812 * Check whether to truncate the write,
1813 * and send the signal if we do.
1815 if (count > limit - pos) {
1816 send_sig(SIGXFSZ, current, 0);
1817 count = limit - pos;
1820 while (count) {
1821 unsigned long bytes, pgoff, offset;
1824 * Try to find the page in the cache. If it isn't there,
1825 * allocate a free page.
1827 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
1828 pgoff = pos >> PAGE_CACHE_SHIFT;
1829 bytes = PAGE_CACHE_SIZE - offset;
1830 if (bytes > count)
1831 bytes = count;
1833 hash = page_hash(&inode->i_data, pgoff);
1834 repeat_find:
1835 page = __find_lock_page(&inode->i_data, pgoff, hash);
1836 if (!page) {
1837 if (!cached_page) {
1838 cached_page = page_cache_alloc();
1839 if (cached_page)
1840 goto repeat_find;
1841 status = -ENOMEM;
1842 break;
1844 page = cached_page;
1845 if (add_to_page_cache_unique(page,&inode->i_data,pgoff,hash))
1846 goto repeat_find;
1848 cached_page = NULL;
1851 /* We have exclusive IO access to the page.. */
1852 if (!PageLocked(page)) {
1853 PAGE_BUG(page);
1856 status = write_one_page(file, page, offset, bytes, buf);
1858 if (status >= 0) {
1859 written += status;
1860 count -= status;
1861 pos += status;
1862 buf += status;
1863 if (pos > inode->i_size)
1864 inode->i_size = pos;
1866 /* Mark it unlocked again and drop the page.. */
1867 UnlockPage(page);
1868 page_cache_release(page);
1870 if (status < 0)
1871 break;
1873 *ppos = pos;
1875 if (cached_page)
1876 page_cache_free(cached_page);
1878 err = written ? written : status;
1879 out:
1880 up(&inode->i_sem);
1881 return err;
1885 * Support routines for directory caching using the page cache.
1889 * Unlock and free a page.
1891 void put_cached_page(unsigned long addr)
1893 struct page * page = page_cache_entry(addr);
1895 UnlockPage(page);
1896 if (page_count(page) != 2)
1897 panic("put_cached_page: page count=%d\n",
1898 page_count(page));
1899 page_cache_release(page);
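/*
 * Size the page-cache hash table at boot time: aim for roughly one bucket
 * pointer per page of memory, rounded up to a power-of-two number of pages
 * and shrunk if the allocation fails.  Illustrative example (assuming
 * 4-byte pointers, 4K pages and mempages = 32768, i.e. 128MB):
 * htable_size = 32768 * 4 = 128K, so order = 5 and page_hash_bits = 15,
 * giving 32768 hash buckets.
 */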
1902 void __init page_cache_init(unsigned long mempages)
1904 unsigned long htable_size, order;
1906 htable_size = mempages;
1907 htable_size *= sizeof(struct page *);
1908 for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
1911 do {
1912 unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
1914 page_hash_bits = 0;
1915 while((tmp >>= 1UL) != 0UL)
1916 page_hash_bits++;
1918 page_hash_table = (struct page **)
1919 __get_free_pages(GFP_ATOMIC, order);
1920 } while(page_hash_table == NULL && --order > 0);
1922 printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
1923 (1 << page_hash_bits), order, (PAGE_SIZE << order));
1924 if (!page_hash_table)
1925 panic("Failed to allocate page hash table\n");
1926 memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));