/*
 *	linux/mm/filemap.c
 *
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/malloc.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/locks.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/swapctl.h>
#include <linux/slab.h>
#include <linux/init.h>

#include <asm/pgtable.h>
#include <asm/uaccess.h>

/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 */

atomic_t page_cache_size = ATOMIC_INIT(0);
unsigned int page_hash_bits;
struct page **page_hash_table;

spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
/*
 * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
 *       the pagemap_lru_lock held.
 */
spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;

#define CLUSTER_PAGES		(1 << page_cluster)
#define CLUSTER_SHIFT		(PAGE_CACHE_SHIFT + page_cluster)
#define CLUSTER_BYTES		(1 << CLUSTER_SHIFT)
#define CLUSTER_OFFSET(x)	(((x) >> CLUSTER_SHIFT) << CLUSTER_SHIFT)
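
/*
 * Page-cache hashing: cached pages are hashed on (inode, offset). The
 * caller looks up the bucket with page_hash() and must hold the
 * pagecache_lock while linking or unlinking pages here.
 */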
void __add_page_to_hash_queue(struct page * page, struct page **p)
{
	atomic_inc(&page_cache_size);
	if((page->next_hash = *p) != NULL)
		(*p)->pprev_hash = &page->next_hash;
	*p = page;
	page->pprev_hash = p;
	if (page->buffers)
		PAGE_BUG(page);
}

static void remove_page_from_hash_queue(struct page * page)
{
	if(page->pprev_hash) {
		if(page->next_hash)
			page->next_hash->pprev_hash = page->pprev_hash;
		*page->pprev_hash = page->next_hash;
		page->pprev_hash = NULL;
	}
	atomic_dec(&page_cache_size);
}
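
/*
 * Unlink a page from its inode's i_pages list. The caller must hold the
 * pagecache_lock.
 */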
static void remove_page_from_inode_queue(struct page * page)
{
	struct inode * inode = page->inode;
	struct page *prev, *next;

	inode->i_nrpages--;
	next = page->next;
	prev = page->prev;
	if (inode->i_pages == page)
		inode->i_pages = next;
	if (next)
		next->prev = prev;
	if (prev)
		prev->next = next;
	page->next = NULL;
	page->prev = NULL;
}

/*
 * Remove a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
 * is safe.
 */
void remove_inode_page(struct page *page)
{
	if (!PageLocked(page))
		PAGE_BUG(page);

	spin_lock(&pagecache_lock);
	remove_page_from_inode_queue(page);
	remove_page_from_hash_queue(page);
	page->inode = NULL;
	spin_unlock(&pagecache_lock);
}
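
/*
 * Throw away all cached pages belonging to an inode. Used when the cached
 * data is known to be stale (network filesystems, for example). Pages that
 * are still busy elsewhere only trigger the diagnostic printk below.
 */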
void invalidate_inode_pages(struct inode * inode)
{
	struct page ** p;
	struct page * page;

repeat:
	spin_lock(&pagecache_lock);
	p = &inode->i_pages;
	while ((page = *p) != NULL) {
		get_page(page);
		if (TryLockPage(page)) {
			spin_unlock(&pagecache_lock);
			wait_on_page(page);
			page_cache_release(page);
			goto repeat;
		}
		if (page_count(page) != 2)
			printk("hm, busy page invalidated? (not necessarily a bug)\n");
		lru_cache_del(page);

		remove_page_from_inode_queue(page);
		remove_page_from_hash_queue(page);
		page->inode = NULL;
		UnlockPage(page);
		page_cache_release(page);
		page_cache_release(page);
	}
	spin_unlock(&pagecache_lock);
}

/*
 * Truncate the page cache at a set offset, removing the pages
 * that are beyond that offset (and zeroing out partial pages).
 */
void truncate_inode_pages(struct inode * inode, unsigned long start)
{
	struct page ** p;
	struct page * page;
	int partial = 0;

repeat:
	spin_lock(&pagecache_lock);
	p = &inode->i_pages;
	while ((page = *p) != NULL) {
		unsigned long offset = page->offset;

		/* page wholly truncated - free it */
		if (offset >= start) {
			get_page(page);
			spin_unlock(&pagecache_lock);

			lock_page(page);

			if (!inode->i_op->flushpage ||
			    inode->i_op->flushpage(inode, page, 0))
				lru_cache_del(page);

			/*
			 * We remove the page from the page cache
			 * _after_ we have destroyed all buffer-cache
			 * references to it. Otherwise some other process
			 * might think this inode page is not in the
			 * page cache and creates a buffer-cache alias
			 * to it causing all sorts of fun problems ...
			 */
			remove_inode_page(page);

			UnlockPage(page);
			page_cache_release(page);
			page_cache_release(page);

			/*
			 * We have done things without the pagecache lock,
			 * so we'll have to repeat the scan.
			 * It's not possible to deadlock here because
			 * we are guaranteed to make progress. (ie. we have
			 * just removed a page)
			 */
			goto repeat;
		}
		p = &page->next;
		/*
		 * there is only one partial page possible.
		 */
		if (partial)
			continue;

		offset = start - offset;
		/* partial truncate, clear end of page */
		if (offset < PAGE_CACHE_SIZE) {
			unsigned long address;

			get_page(page);
			spin_unlock(&pagecache_lock);

			lock_page(page);
			partial = 1;

			address = page_address(page);
			memset((void *) (offset + address), 0, PAGE_CACHE_SIZE - offset);
			flush_page_to_ram(address);

			if (inode->i_op->flushpage)
				inode->i_op->flushpage(inode, page, offset);
			/*
			 * we have dropped the spinlock so we have to
			 * restart.
			 */
			UnlockPage(page);
			page_cache_release(page);
			goto repeat;
		}
	}
	spin_unlock(&pagecache_lock);
}
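
/*
 * shrink_mmap() scans the tail of the page-cache LRU list and tries to
 * free pages back to the allocator. "priority" controls how much of the
 * list is examined per call, "gfp_mask" restricts which pages qualify
 * (e.g. DMA-capable only). Returns 1 if a page was freed, 0 otherwise.
 */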
int shrink_mmap(int priority, int gfp_mask)
{
	int ret = 0, count;
	LIST_HEAD(young);
	LIST_HEAD(old);
	LIST_HEAD(forget);
	struct list_head * page_lru, * dispose;
	struct page * page;

	count = nr_lru_pages / (priority+1);

	spin_lock(&pagemap_lru_lock);

	while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
		page = list_entry(page_lru, struct page, lru);
		list_del(page_lru);

		dispose = &lru_cache;
		if (test_and_clear_bit(PG_referenced, &page->flags))
			/* Roll the page back to the top of the lru list;
			 * we could also be more aggressive and put it on
			 * the young dispose list, thus avoiding freeing
			 * young pages in each pass.
			 */
			goto dispose_continue;

		dispose = &old;
		/* don't count passes over non-DMA pages */
		if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
			goto dispose_continue;
		if (!(gfp_mask & __GFP_BIGMEM) && PageBIGMEM(page))
			goto dispose_continue;

		count--;

		dispose = &young;
		if (TryLockPage(page))
			goto dispose_continue;

		/* Release the pagemap_lru lock even if the page is not yet
		   queued in any lru queue since we have just locked down
		   the page so nobody else may SMP race with us running
		   a lru_cache_del() (lru_cache_del() always runs with the
		   page locked down ;). */
		spin_unlock(&pagemap_lru_lock);

		/* avoid unscalable SMP locking */
		if (!page->buffers && page_count(page) > 1)
			goto unlock_noput_continue;

		/* Take the pagecache_lock spinlock to stop other tasks
		   noticing the page while we are looking at its
		   page count. If it's a pagecache page we'll free it
		   in one atomic transaction after checking its page count. */
		spin_lock(&pagecache_lock);

		/* avoid freeing the page while it's locked */
		get_page(page);

		/* Is it a buffer page? */
		if (page->buffers) {
			spin_unlock(&pagecache_lock);
			if (!try_to_free_buffers(page))
				goto unlock_continue;
			/* page was locked, inode can't go away under us */
			if (!page->inode) {
				atomic_sub(PAGE_CACHE_SIZE, &buffermem);
				goto made_buffer_progress;
			}
			spin_lock(&pagecache_lock);
		}

		/*
		 * We can't free pages unless there's just one user
		 * (count == 2 because we added one ourselves above).
		 */
		if (page_count(page) != 2)
			goto cache_unlock_continue;

		/*
		 * Is it a swap-cache page? If so, we want to
		 * drop it if it is no longer used, even if it
		 * were to be marked referenced..
		 */
		if (PageSwapCache(page)) {
			spin_unlock(&pagecache_lock);
			__delete_from_swap_cache(page);
			goto made_inode_progress;
		}

		/* is it a page-cache page? */
		if (page->inode)
		{
			dispose = &old;
			if (!pgcache_under_min())
			{
				remove_page_from_inode_queue(page);
				remove_page_from_hash_queue(page);
				page->inode = NULL;
				spin_unlock(&pagecache_lock);
				goto made_inode_progress;
			}
			goto cache_unlock_continue;
		}

		dispose = &forget;
		printk(KERN_ERR "shrink_mmap: unknown LRU page!\n");

cache_unlock_continue:
		spin_unlock(&pagecache_lock);
unlock_continue:
		UnlockPage(page);
		put_page(page);
dispose_relock_continue:
		/* even if the dispose list is local, a truncate_inode_pages()
		   may remove a page from its queue so always
		   synchronize with the lru lock while accessing the
		   page->lru field */
		spin_lock(&pagemap_lru_lock);
		list_add(page_lru, dispose);
		continue;

unlock_noput_continue:
		UnlockPage(page);
		goto dispose_relock_continue;

dispose_continue:
		list_add(page_lru, dispose);
	}
	goto out;

made_inode_progress:
	page_cache_release(page);
made_buffer_progress:
	UnlockPage(page);
	put_page(page);
	ret = 1;
	spin_lock(&pagemap_lru_lock);
	/* nr_lru_pages needs the spinlock */
	nr_lru_pages--;

out:
	list_splice(&young, &lru_cache);
	list_splice(&old, lru_cache.prev);

	spin_unlock(&pagemap_lru_lock);

	return ret;
}
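
/*
 * Walk a page-hash chain looking for the page that maps (inode, offset).
 * The caller passes in the head of the bucket and must hold the
 * pagecache_lock. A hit also marks the page referenced for the LRU scan.
 */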
static inline struct page * __find_page_nolock(struct inode * inode, unsigned long offset, struct page *page)
{
	goto inside;

	for (;;) {
		page = page->next_hash;
inside:
		if (!page)
			goto not_found;
		if (page->inode != inode)
			continue;
		if (page->offset == offset)
			break;
	}
	set_bit(PG_referenced, &page->flags);
not_found:
	return page;
}

/*
 * By the time this is called, the page is locked and
 * we don't have to worry about any races any more.
 *
 * Start the IO..
 */
static int writeout_one_page(struct page *page)
{
	struct buffer_head *bh, *head = page->buffers;

	bh = head;
	do {
		if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
			continue;

		bh->b_flushtime = 0;
		ll_rw_block(WRITE, 1, &bh);
	} while ((bh = bh->b_this_page) != head);
	return 0;
}

static int waitfor_one_page(struct page *page)
{
	int error = 0;
	struct buffer_head *bh, *head = page->buffers;

	bh = head;
	do {
		wait_on_buffer(bh);
		if (buffer_req(bh) && !buffer_uptodate(bh))
			error = -EIO;
	} while ((bh = bh->b_this_page) != head);
	return error;
}
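
/*
 * Apply "fn" (writeout_one_page or waitfor_one_page) to every buffer page
 * of the inode that falls inside [start, end). The page lock is taken
 * around each call and the pagecache_lock is dropped while sleeping.
 */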
static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end, int (*fn)(struct page *))
{
	struct page *next;
	int retval = 0;

	start &= PAGE_MASK;

	spin_lock(&pagecache_lock);
	next = inode->i_pages;
	while (next) {
		struct page *page = next;
		next = page->next;
		if (!page->buffers)
			continue;
		if (page->offset >= end)
			continue;
		if (page->offset < start)
			continue;

		get_page(page);
		spin_unlock(&pagecache_lock);
		lock_page(page);

		/* The buffers could have been free'd while we waited for the page lock */
		if (page->buffers)
			retval |= fn(page);

		UnlockPage(page);
		spin_lock(&pagecache_lock);
		next = page->next;
		page_cache_release(page);
	}
	spin_unlock(&pagecache_lock);

	return retval;
}

/*
 * Two-stage data sync: first start the IO, then go back and
 * collect the information..
 */
int generic_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end)
{
	int retval;

	retval = do_buffer_fdatasync(inode, start, end, writeout_one_page);
	retval |= do_buffer_fdatasync(inode, start, end, waitfor_one_page);
	return retval;
}

/*
 * This adds a page to the page cache, starting out as locked,
 * owned by us, referenced, but not uptodate and with no errors.
 */
static inline void __add_to_page_cache(struct page * page,
	struct inode * inode, unsigned long offset,
	struct page **hash)
{
	unsigned long flags;

	flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced));
	page->flags = flags | (1 << PG_locked);
	page->owner = current;	/* REMOVEME */
	get_page(page);
	page->offset = offset;
	add_page_to_inode_queue(inode, page);
	__add_page_to_hash_queue(page, hash);
	lru_cache_add(page);
}

void add_to_page_cache(struct page * page, struct inode * inode, unsigned long offset)
{
	spin_lock(&pagecache_lock);
	__add_to_page_cache(page, inode, offset, page_hash(inode, offset));
	spin_unlock(&pagecache_lock);
}
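
/*
 * Like add_to_page_cache(), but checks the hash chain again under the
 * lock: returns 0 if the page was inserted, 1 if a page for that offset
 * already exists (in which case the caller keeps its own page).
 */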
int add_to_page_cache_unique(struct page * page,
	struct inode * inode, unsigned long offset,
	struct page **hash)
{
	int err;
	struct page *alias;

	spin_lock(&pagecache_lock);
	alias = __find_page_nolock(inode, offset, *hash);

	err = 1;
	if (!alias) {
		__add_to_page_cache(page,inode,offset,hash);
		err = 0;
	}

	spin_unlock(&pagecache_lock);
	return err;
}

/*
 * This adds the requested page to the page cache if it isn't already there,
 * and schedules an I/O to read in its contents from disk.
 */
static inline void page_cache_read(struct file * file, unsigned long offset)
{
	unsigned long new_page;
	struct inode *inode = file->f_dentry->d_inode;
	struct page ** hash = page_hash(inode, offset);
	struct page * page;

	spin_lock(&pagecache_lock);
	page = __find_page_nolock(inode, offset, *hash);
	spin_unlock(&pagecache_lock);
	if (page)
		return;

	new_page = page_cache_alloc();
	if (!new_page)
		return;
	page = page_cache_entry(new_page);

	if (!add_to_page_cache_unique(page, inode, offset, hash)) {
		inode->i_op->readpage(file, page);
		page_cache_release(page);
		return;
	}

	/*
	 * We arrive here in the unlikely event that someone
	 * raced with us and added our page to the cache first.
	 */
	page_cache_free(new_page);
	return;
}

/*
 * Read in an entire cluster at once.  A cluster is usually a 64k-
 * aligned block that includes the address requested in "offset."
 */
static void read_cluster_nonblocking(struct file * file,
	unsigned long offset)
{
	off_t filesize = file->f_dentry->d_inode->i_size;
	unsigned long pages = CLUSTER_PAGES;

	offset = CLUSTER_OFFSET(offset);
	while ((pages-- > 0) && (offset < filesize)) {
		page_cache_read(file, offset);
		offset += PAGE_CACHE_SIZE;
	}

	return;
}

/*
 * Wait for a page to get unlocked.
 *
 * This must be called with the caller "holding" the page,
 * ie with increased "page->count" so that the page won't
 * go away during the wait..
 */
void ___wait_on_page(struct page *page)
{
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	add_wait_queue(&page->wait, &wait);
	do {
		run_task_queue(&tq_disk);
		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		if (!PageLocked(page))
			break;
		schedule();
	} while (PageLocked(page));
	tsk->state = TASK_RUNNING;
	remove_wait_queue(&page->wait, &wait);
}

/*
 * Get an exclusive lock on the page..
 */
void lock_page(struct page *page)
{
	while (TryLockPage(page))
		___wait_on_page(page);
}

/*
 * a rather lightweight function, finding and getting a reference to a
 * hashed page atomically, waiting for it if it's locked.
 */
struct page * __find_get_page (struct inode * inode,
	unsigned long offset, struct page **hash)
{
	struct page *page;

	/*
	 * We scan the hash list read-only. Addition to and removal from
	 * the hash-list needs a held write-lock.
	 */
repeat:
	spin_lock(&pagecache_lock);
	page = __find_page_nolock(inode, offset, *hash);
	if (page)
		get_page(page);
	spin_unlock(&pagecache_lock);

	/* Found the page, sleep if locked. */
	if (page && PageLocked(page)) {
		struct task_struct *tsk = current;
		DECLARE_WAITQUEUE(wait, tsk);

		run_task_queue(&tq_disk);

		__set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		add_wait_queue(&page->wait, &wait);

		if (PageLocked(page))
			schedule();
		__set_task_state(tsk, TASK_RUNNING);
		remove_wait_queue(&page->wait, &wait);

		/*
		 * The page might have been unhashed meanwhile. It's
		 * not freed though because we hold a reference to it.
		 * If this is the case then it will be freed _here_,
		 * and we recheck the hash anyway.
		 */
		page_cache_release(page);
		goto repeat;
	}
	/*
	 * It's not locked so we can return the page and we hold
	 * a reference to it.
	 */
	return page;
}

/*
 * Get the lock to a page atomically.
 */
struct page * __find_lock_page (struct inode * inode,
	unsigned long offset, struct page **hash)
{
	struct page *page;

	/*
	 * We scan the hash list read-only. Addition to and removal from
	 * the hash-list needs a held write-lock.
	 */
repeat:
	spin_lock(&pagecache_lock);
	page = __find_page_nolock(inode, offset, *hash);
	if (page)
		get_page(page);
	spin_unlock(&pagecache_lock);

	/* Found the page, sleep if locked. */
	if (page && TryLockPage(page)) {
		struct task_struct *tsk = current;
		DECLARE_WAITQUEUE(wait, tsk);

		run_task_queue(&tq_disk);

		__set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		add_wait_queue(&page->wait, &wait);

		if (PageLocked(page))
			schedule();
		__set_task_state(tsk, TASK_RUNNING);
		remove_wait_queue(&page->wait, &wait);

		/*
		 * The page might have been unhashed meanwhile. It's
		 * not freed though because we hold a reference to it.
		 * If this is the case then it will be freed _here_,
		 * and we recheck the hash anyway.
		 */
		page_cache_release(page);
		goto repeat;
	}
	/*
	 * It's not locked so we can return the page and we hold
	 * a reference to it.
	 */
	return page;
}

#if 0
#define PROFILE_READAHEAD
#define DEBUG_READAHEAD
#endif

/*
 * Read-ahead profiling information
 * --------------------------------
 * Every PROFILE_MAXREADCOUNT reads, the following information is written
 * to the syslog:
 *   Percentage of asynchronous read-ahead.
 *   Average of the read-ahead context field values.
 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
 * to the syslog.
 */

#ifdef PROFILE_READAHEAD

#define PROFILE_MAXREADCOUNT 1000

static unsigned long total_reada;
static unsigned long total_async;
static unsigned long total_ramax;
static unsigned long total_ralen;
static unsigned long total_rawin;

static void profile_readahead(int async, struct file *filp)
{
	unsigned long flags;

	++total_reada;
	if (async)
		++total_async;

	total_ramax += filp->f_ramax;
	total_ralen += filp->f_ralen;
	total_rawin += filp->f_rawin;

	if (total_reada > PROFILE_MAXREADCOUNT) {
		save_flags(flags);
		cli();
		if (!(total_reada > PROFILE_MAXREADCOUNT)) {
			restore_flags(flags);
			return;
		}

		printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
			total_ramax/total_reada,
			total_ralen/total_reada,
			total_rawin/total_reada,
			(total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
		printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
			filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
#endif

		total_reada = 0;
		total_async = 0;
		total_ramax = 0;
		total_ralen = 0;
		total_rawin = 0;

		restore_flags(flags);
	}
}
#endif  /* defined PROFILE_READAHEAD */

/*
 * Read-ahead context:
 * -------------------
 * The read ahead context fields of the "struct file" are the following:
 * - f_raend : position of the first byte after the last page we tried to
 *             read ahead.
 * - f_ramax : current read-ahead maximum size.
 * - f_ralen : length of the current IO read block we tried to read ahead.
 * - f_rawin : length of the current read-ahead window.
 *             if last read-ahead was synchronous then
 *                 f_rawin = f_ralen
 *             otherwise (was asynchronous)
 *                 f_rawin = previous value of f_ralen + f_ralen
 *
 * Read-ahead limits:
 * ------------------
 * MIN_READAHEAD   : minimum read-ahead size when reading ahead.
 * MAX_READAHEAD   : maximum read-ahead size when reading ahead.
 *
 * Synchronous read-ahead benefits:
 * --------------------------------
 * Using a reasonable IO transfer length from peripheral devices increases
 * system performance.
 * Reasonable means, in this context, not too large but not too small.
 * The actual maximum value is:
 *   MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
 *   and 32K if defined (4K page size assumed).
 *
 * Asynchronous read-ahead benefits:
 * ---------------------------------
 * Overlapping the next read request with user process execution increases
 * system performance.
 *
 * Read-ahead risks:
 * -----------------
 * We have to guess which further data are needed by the user process.
 * If these data are often not really needed, it's bad for system
 * performance.
 * However, we know that files are often accessed sequentially by
 * application programs and it seems that it is possible to have some good
 * strategy in that guessing.
 * We only try to read ahead files that seem to be read sequentially.
 *
 * Asynchronous read-ahead risks:
 * ------------------------------
 * In order to maximize overlapping, we must start some asynchronous read
 * request from the device as soon as possible.
 * We must be very careful about:
 * - The number of effective pending IO read requests.
 *   ONE seems to be the only reasonable value.
 * - The total memory pool usage for the file access stream.
 *   This maximum memory usage is implicitly 2 IO read chunks:
 *   2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
 *   64k if defined (4K page size assumed).
 */
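
/*
 * Per-device read-ahead limit: taken from the max_readahead[][] table for
 * the inode's device if one is registered, MAX_READAHEAD otherwise.
 */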
static inline int get_max_readahead(struct inode * inode)
{
	if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
		return MAX_READAHEAD;
	return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
}

static void generic_file_readahead(int reada_ok,
	struct file * filp, struct inode * inode,
	unsigned long ppos, struct page * page)
{
	unsigned long max_ahead, ahead;
	unsigned long raend;
	int max_readahead = get_max_readahead(inode);

	raend = filp->f_raend & PAGE_CACHE_MASK;
	max_ahead = 0;

/*
 * The current page is locked.
 * If the current position is inside the previous read IO request, do not
 * try to reread previously read ahead pages.
 * Otherwise decide whether or not to read ahead some pages synchronously.
 * If we are not going to read ahead, set the read ahead context for this
 * page only.
 */
	if (PageLocked(page)) {
		if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
			raend = ppos;
			if (raend < inode->i_size)
				max_ahead = filp->f_ramax;
			filp->f_rawin = 0;
			filp->f_ralen = PAGE_CACHE_SIZE;
			if (!max_ahead) {
				filp->f_raend = ppos + filp->f_ralen;
				filp->f_rawin += filp->f_ralen;
			}
		}
	}
/*
 * The current page is not locked.
 * If we were reading ahead and,
 * if the current max read ahead size is not zero and,
 * if the current position is inside the last read-ahead IO request,
 * it is the moment to try to read ahead asynchronously.
 * We will later force unplug of the device in order to force asynchronous read IO.
 */
	else if (reada_ok && filp->f_ramax && raend >= PAGE_CACHE_SIZE &&
		 ppos <= raend && ppos + filp->f_ralen >= raend) {
/*
 * Add ONE page to max_ahead in order to try to have about the same IO max size
 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
 * Compute the position of the last page we have tried to read in order to
 * begin to read ahead just at the next page.
 */
		raend -= PAGE_CACHE_SIZE;
		if (raend < inode->i_size)
			max_ahead = filp->f_ramax + PAGE_CACHE_SIZE;

		if (max_ahead) {
			filp->f_rawin = filp->f_ralen;
			filp->f_ralen = 0;
			reada_ok = 2;
		}
	}
/*
 * Try to read ahead pages.
 * We hope that ll_rw_blk() plug/unplug, coalescence, request sorting and the
 * scheduler will do enough for us to avoid issuing badly sized IO requests.
 */
	ahead = 0;
	while (ahead < max_ahead) {
		ahead += PAGE_CACHE_SIZE;
		page_cache_read(filp, raend + ahead);
	}
/*
 * If we tried to read ahead some pages,
 * If we tried to read ahead asynchronously,
 *   try to force unplug of the device in order to start an asynchronous
 *   read IO request.
 * Update the read-ahead context.
 * Store the length of the current read-ahead window.
 * Double the current max read ahead size.
 *   That heuristic avoids doing large IO for files that are not really
 *   accessed sequentially.
 */
	if (ahead) {
		if (reada_ok == 2) {
			run_task_queue(&tq_disk);
		}

		filp->f_ralen += ahead;
		filp->f_rawin += filp->f_ralen;
		filp->f_raend = raend + ahead + PAGE_CACHE_SIZE;

		filp->f_ramax += filp->f_ramax;

		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;

#ifdef PROFILE_READAHEAD
		profile_readahead((reada_ok == 2), filp);
#endif
	}

	return;
}

/*
 * This is a generic file read routine, and uses the
 * inode->i_op->readpage() function for the actual low-level
 * stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 */
void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
{
	struct dentry *dentry = filp->f_dentry;
	struct inode *inode = dentry->d_inode;
	size_t pos, pgpos, page_cache;
	int reada_ok;
	int error;
	int max_readahead = get_max_readahead(inode);

	page_cache = 0;

	pos = *ppos;
	pgpos = pos & PAGE_CACHE_MASK;
/*
 * If the current position is outside the previous read-ahead window,
 * we reset the current read-ahead context and set read ahead max to zero
 * (will be set to just needed value later),
 * otherwise, we assume that the file accesses are sequential enough to
 * continue read-ahead.
 */
	if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) {
		reada_ok = 0;
		filp->f_raend = 0;
		filp->f_ralen = 0;
		filp->f_ramax = 0;
		filp->f_rawin = 0;
	} else {
		reada_ok = 1;
	}
/*
 * Adjust the current value of read-ahead max.
 * If the read operation stays in the first half of the page, force no readahead.
 * Otherwise try to increase read ahead max just enough to do the read request.
 * Then, at least MIN_READAHEAD if read ahead is ok,
 * and at most MAX_READAHEAD in all cases.
 */
	if (pos + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
		filp->f_ramax = 0;
	} else {
		unsigned long needed;

		needed = ((pos + desc->count) & PAGE_CACHE_MASK) - pgpos;

		if (filp->f_ramax < needed)
			filp->f_ramax = needed;

		if (reada_ok && filp->f_ramax < MIN_READAHEAD)
			filp->f_ramax = MIN_READAHEAD;
		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;
	}

	for (;;) {
		struct page *page, **hash;

		if (pos >= inode->i_size)
			break;

		/*
		 * Try to find the data in the page cache..
		 */
		hash = page_hash(inode, pos & PAGE_CACHE_MASK);

		spin_lock(&pagecache_lock);
		page = __find_page_nolock(inode, pos & PAGE_CACHE_MASK, *hash);
		if (!page)
			goto no_cached_page;
found_page:
		get_page(page);
		spin_unlock(&pagecache_lock);

		if (!Page_Uptodate(page))
			goto page_not_up_to_date;
page_ok:
		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 */
	{
		unsigned long offset, nr;

		offset = pos & ~PAGE_CACHE_MASK;
		nr = PAGE_CACHE_SIZE - offset;
		if (nr > inode->i_size - pos)
			nr = inode->i_size - pos;

		/*
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		nr = actor(desc, (const char *) (page_address(page) + offset), nr);
		pos += nr;
		page_cache_release(page);
		if (nr && desc->count)
			continue;
		break;
	}

/*
 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
 */
page_not_up_to_date:
		generic_file_readahead(reada_ok, filp, inode,
			pos & PAGE_CACHE_MASK, page);

		if (Page_Uptodate(page))
			goto page_ok;

		/* Get exclusive access to the page ... */
		lock_page(page);
		if (Page_Uptodate(page)) {
			UnlockPage(page);
			goto page_ok;
		}

readpage:
		/* ... and start the actual read. The read will unlock the page. */
		error = inode->i_op->readpage(filp, page);

		if (!error) {
			if (Page_Uptodate(page))
				goto page_ok;

			/* Again, try some read-ahead while waiting for the page to finish.. */
			generic_file_readahead(reada_ok, filp, inode,
				pos & PAGE_CACHE_MASK, page);
			wait_on_page(page);
			if (Page_Uptodate(page))
				goto page_ok;
			error = -EIO;
		}

		/* UHHUH! A synchronous read error occurred. Report it */
		desc->error = error;
		page_cache_release(page);
		break;

no_cached_page:
		/*
		 * Ok, it wasn't cached, so we need to create a new
		 * page..
		 *
		 * We get here with the page cache lock held.
		 */
		if (!page_cache) {
			spin_unlock(&pagecache_lock);
			page_cache = page_cache_alloc();
			if (!page_cache) {
				desc->error = -ENOMEM;
				break;
			}

			/*
			 * Somebody may have added the page while we
			 * dropped the page cache lock. Check for that.
			 */
			spin_lock(&pagecache_lock);
			page = __find_page_nolock(inode, pos & PAGE_CACHE_MASK, *hash);
			if (page)
				goto found_page;
		}

		/*
		 * Ok, add the new page to the hash-queues...
		 */
		page = page_cache_entry(page_cache);
		__add_to_page_cache(page, inode, pos & PAGE_CACHE_MASK, hash);
		spin_unlock(&pagecache_lock);

		page_cache = 0;
		goto readpage;
	}

	*ppos = pos;
	filp->f_reada = 1;
	if (page_cache)
		page_cache_free(page_cache);
	UPDATE_ATIME(inode);
}
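
/*
 * Copy data into the user buffer on behalf of generic_file_read(). A short
 * copy records -EFAULT in the descriptor but still accounts for the bytes
 * that were transferred.
 */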
static int file_read_actor(read_descriptor_t * desc, const char *area, unsigned long size)
{
	unsigned long left;
	unsigned long count = desc->count;

	if (size > count)
		size = count;
	left = __copy_to_user(desc->buf, area, size);
	if (left) {
		size -= left;
		desc->error = -EFAULT;
	}
	desc->count = count - size;
	desc->written += size;
	desc->buf += size;
	return size;
}

/*
 * This is the "read()" routine for all filesystems
 * that can use the page cache directly.
 */
ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
{
	ssize_t retval;

	retval = -EFAULT;
	if (access_ok(VERIFY_WRITE, buf, count)) {
		retval = 0;
		if (count) {
			read_descriptor_t desc;

			desc.written = 0;
			desc.count = count;
			desc.buf = buf;
			desc.error = 0;
			do_generic_file_read(filp, ppos, &desc, file_read_actor);

			retval = desc.written;
			if (!retval)
				retval = desc.error;
		}
	}
	return retval;
}
static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned long size)
{
	ssize_t written;
	unsigned long count = desc->count;
	struct file *file = (struct file *) desc->buf;
	mm_segment_t old_fs;

	if (size > count)
		size = count;
	old_fs = get_fs();
	set_fs(KERNEL_DS);
	written = file->f_op->write(file, area, size, &file->f_pos);
	set_fs(old_fs);
	if (written < 0) {
		desc->error = written;
		written = 0;
	}
	desc->count = count - written;
	desc->written += written;
	return written;
}
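
/*
 * sendfile(2): read from in_fd through the page cache and hand each chunk
 * to file_send_actor(), which writes it to out_fd (with KERNEL_DS set so
 * that f_op->write accepts the kernel-space buffer).
 */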
asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
{
	ssize_t retval;
	struct file * in_file, * out_file;
	struct inode * in_inode, * out_inode;

	/*
	 * Get input file, and verify that it is ok..
	 */
	retval = -EBADF;
	in_file = fget(in_fd);
	if (!in_file)
		goto out;
	if (!(in_file->f_mode & FMODE_READ))
		goto fput_in;
	retval = -EINVAL;
	in_inode = in_file->f_dentry->d_inode;
	if (!in_inode)
		goto fput_in;
	if (!in_inode->i_op || !in_inode->i_op->readpage)
		goto fput_in;
	retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
	if (retval)
		goto fput_in;

	/*
	 * Get output file, and verify that it is ok..
	 */
	retval = -EBADF;
	out_file = fget(out_fd);
	if (!out_file)
		goto fput_in;
	if (!(out_file->f_mode & FMODE_WRITE))
		goto fput_out;
	retval = -EINVAL;
	if (!out_file->f_op || !out_file->f_op->write)
		goto fput_out;
	out_inode = out_file->f_dentry->d_inode;
	if (!out_inode)
		goto fput_out;
	retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
	if (retval)
		goto fput_out;

	retval = 0;
	if (count) {
		read_descriptor_t desc;
		loff_t pos = 0, *ppos;

		retval = -EFAULT;
		ppos = &in_file->f_pos;
		if (offset) {
			if (get_user(pos, offset))
				goto fput_out;
			ppos = &pos;
		}

		desc.written = 0;
		desc.count = count;
		desc.buf = (char *) out_file;
		desc.error = 0;
		do_generic_file_read(in_file, ppos, &desc, file_send_actor);

		retval = desc.written;
		if (!retval)
			retval = desc.error;
		if (offset)
			put_user(pos, offset);
	}

fput_out:
	fput(out_file);
fput_in:
	fput(in_file);
out:
	return retval;
}

/*
 * filemap_nopage() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 *
 * XXX - at some point, this should return unique values to indicate to
 *       the caller whether this is EIO, OOM, or SIGBUS.
 */
static unsigned long filemap_nopage(struct vm_area_struct * area,
	unsigned long address, int no_share)
{
	struct file * file = area->vm_file;
	struct dentry * dentry = file->f_dentry;
	struct inode * inode = dentry->d_inode;
	struct page * page, **hash;
	unsigned long old_page, new_page = 0;

	unsigned long offset = address - area->vm_start + area->vm_offset;

	/*
	 * Semantics for shared and private memory areas are different
	 * past the end of the file. A shared mapping past the last page
	 * of the file is an error and results in a SIGBUS, while a
	 * private mapping just maps in a zero page.
	 */
	if ((offset >= inode->i_size) &&
		(area->vm_flags & VM_SHARED) && (area->vm_mm == current->mm))
		return 0;

	/*
	 * Do we have something in the page cache already?
	 */
	hash = page_hash(inode, offset);
retry_find:
	page = __find_get_page(inode, offset, hash);
	if (!page)
		goto no_cached_page;

	/*
	 * Ok, found a page in the page cache, now we need to check
	 * that it's up-to-date.
	 */
	if (!Page_Uptodate(page))
		goto page_not_uptodate;

success:
	/*
	 * Found the page and have a reference on it, need to check sharing
	 * and possibly copy it over to another page..
	 */
	old_page = page_address(page);
	if (!no_share) {
		flush_page_to_ram(old_page);
		return old_page;
	}

	new_page = page_cache_alloc();
	if (new_page) {
		copy_page(new_page, old_page);
		flush_page_to_ram(new_page);
	}
	page_cache_release(page);
	return new_page;

no_cached_page:
	/*
	 * If the requested offset is within our file, try to read a whole
	 * cluster of pages at once.
	 *
	 * Otherwise, we're off the end of a privately mapped file,
	 * so we need to map a zero page.
	 */
	if (offset < inode->i_size)
		read_cluster_nonblocking(file, offset);
	else
		page_cache_read(file, offset);

	/*
	 * The page we want has now been added to the page cache.
	 * In the unlikely event that someone removed it in the
	 * meantime, we'll just come back here and read it again.
	 */
	goto retry_find;

page_not_uptodate:
	lock_page(page);
	if (Page_Uptodate(page)) {
		UnlockPage(page);
		goto success;
	}

	if (!inode->i_op->readpage(file, page)) {
		wait_on_page(page);
		if (Page_Uptodate(page))
			goto success;
	}

	/*
	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	 */
	lock_page(page);
	if (Page_Uptodate(page)) {
		UnlockPage(page);
		goto success;
	}
	ClearPageError(page);
	if (!inode->i_op->readpage(file, page)) {
		wait_on_page(page);
		if (Page_Uptodate(page))
			goto success;
	}

	/*
	 * Things didn't work out. Return zero to tell the
	 * mm layer so, possibly freeing the page cache page first.
	 */
	page_cache_release(page);
	if (new_page)
		page_cache_free(new_page);
	return 0;
}

/*
 * Tries to write a shared mapped page to its backing store. May return -EIO
 * if the disk is full.
 */
static inline int do_write_page(struct inode * inode, struct file * file,
	const char * page_addr, unsigned long offset)
{
	int retval;
	unsigned long size;
	int (*writepage) (struct file *, struct page *);
	struct page * page;

	size = offset + PAGE_SIZE;
	/* refuse to extend file size.. */
	if (S_ISREG(inode->i_mode)) {
		if (size > inode->i_size)
			size = inode->i_size;
		/* Ho humm.. We should have tested for this earlier */
		if (size < offset)
			return -EIO;
	}
	size -= offset;
	retval = -EIO;
	writepage = inode->i_op->writepage;
	page = mem_map + MAP_NR(page_addr);
	lock_page(page);

	retval = writepage(file, page);

	UnlockPage(page);
	return retval;
}
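
/*
 * Write a single page of a shared mapping back to the file it maps. The
 * "wait" argument is currently unused; the file reference is pinned across
 * the write (see the comment below).
 */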
static int filemap_write_page(struct vm_area_struct * vma,
	unsigned long offset,
	unsigned long page,
	int wait)
{
	int result;
	struct file * file;
	struct dentry * dentry;
	struct inode * inode;

	file = vma->vm_file;
	dentry = file->f_dentry;
	inode = dentry->d_inode;

	/*
	 * If a task terminates while we're swapping the page, the vma and
	 * file could be released ... increment the count to be safe.
	 */
	get_file(file);
	result = do_write_page(inode, file, (const char *) page, offset);
	fput(file);
	return result;
}

/*
 * The page cache takes care of races between somebody
 * trying to swap something out and swap something in
 * at the same time..
 */
extern void wakeup_bdflush(int);
int filemap_swapout(struct vm_area_struct * vma, struct page * page)
{
	int retval = filemap_write_page(vma, page->offset, page_address(page), 0);
	wakeup_bdflush(0);
	return retval;
}
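
/*
 * Per-pte worker for syncing a shared mapping: without MS_INVALIDATE it
 * cleans a dirty pte and writes the page back; with MS_INVALIDATE it also
 * clears the pte and drops or writes back the page as appropriate. The
 * pte/pmd/pgd walkers below follow the usual page-table walk pattern.
 */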
static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
	unsigned long address, unsigned int flags)
{
	pte_t pte = *ptep;
	unsigned long pageaddr;
	struct page *page;
	int error;

	if (!(flags & MS_INVALIDATE)) {
		if (!pte_present(pte))
			return 0;
		if (!pte_dirty(pte))
			return 0;
		flush_page_to_ram(pte_page(pte));
		flush_cache_page(vma, address);
		set_pte(ptep, pte_mkclean(pte));
		flush_tlb_page(vma, address);
		pageaddr = pte_page(pte);
		page = page_cache_entry(pageaddr);
		get_page(page);
	} else {
		if (pte_none(pte))
			return 0;
		flush_cache_page(vma, address);
		pte_clear(ptep);
		flush_tlb_page(vma, address);
		if (!pte_present(pte)) {
			swap_free(pte_val(pte));
			return 0;
		}
		pageaddr = pte_page(pte);
		if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
			page_cache_free(pageaddr);
			return 0;
		}
	}
	error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, pageaddr, 1);
	page_cache_free(pageaddr);
	return error;
}

static inline int filemap_sync_pte_range(pmd_t * pmd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
{
	pte_t * pte;
	unsigned long end;
	int error;

	if (pmd_none(*pmd))
		return 0;
	if (pmd_bad(*pmd)) {
		printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
		pmd_clear(pmd);
		return 0;
	}
	pte = pte_offset(pmd, address);
	offset += address & PMD_MASK;
	address &= ~PMD_MASK;
	end = address + size;
	if (end > PMD_SIZE)
		end = PMD_SIZE;
	error = 0;
	do {
		error |= filemap_sync_pte(pte, vma, address + offset, flags);
		address += PAGE_SIZE;
		pte++;
	} while (address < end);
	return error;
}

static inline int filemap_sync_pmd_range(pgd_t * pgd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned int flags)
{
	pmd_t * pmd;
	unsigned long offset, end;
	int error;

	if (pgd_none(*pgd))
		return 0;
	if (pgd_bad(*pgd)) {
		printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
		pgd_clear(pgd);
		return 0;
	}
	pmd = pmd_offset(pgd, address);
	offset = address & PGDIR_MASK;
	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		end = PGDIR_SIZE;
	error = 0;
	do {
		error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return error;
}

static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
	size_t size, unsigned int flags)
{
	pgd_t * dir;
	unsigned long end = address + size;
	int error = 0;

	dir = pgd_offset(vma->vm_mm, address);
	flush_cache_range(vma->vm_mm, end - size, end);
	while (address < end) {
		error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		dir++;
	}
	flush_tlb_range(vma->vm_mm, end - size, end);
	return error;
}

/*
 * This handles (potentially partial) area unmaps..
 */
static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
{
	filemap_sync(vma, start, len, MS_ASYNC);
}

/*
 * Shared mappings need to be able to do the right thing at
 * close/unmap/sync. They will also use the private file as
 * backing-store for swapping..
 */
static struct vm_operations_struct file_shared_mmap = {
	NULL,			/* no special open */
	NULL,			/* no special close */
	filemap_unmap,		/* unmap - we need to sync the pages */
	NULL,			/* no special protect */
	filemap_sync,		/* sync */
	NULL,			/* advise */
	filemap_nopage,		/* nopage */
	NULL,			/* wppage */
	filemap_swapout		/* swapout */
};

/*
 * Private mappings just need to be able to load in the map.
 *
 * (This is actually used for shared mappings as well, if we
 * know they can't ever get write permissions..)
 */
static struct vm_operations_struct file_private_mmap = {
	NULL,			/* open */
	NULL,			/* close */
	NULL,			/* unmap */
	NULL,			/* protect */
	NULL,			/* sync */
	NULL,			/* advise */
	filemap_nopage,		/* nopage */
	NULL,			/* wppage */
	NULL			/* swapout */
};

/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	struct vm_operations_struct * ops;
	struct inode *inode = file->f_dentry->d_inode;

	ops = &file_private_mmap;
	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
		if (!inode->i_op || !inode->i_op->writepage)
			return -EINVAL;
		ops = &file_shared_mmap;
	}
	if (!inode->i_sb || !S_ISREG(inode->i_mode))
		return -EACCES;
	if (!inode->i_op || !inode->i_op->readpage)
		return -ENOEXEC;
	UPDATE_ATIME(inode);
	vma->vm_ops = ops;
	return 0;
}

/*
 * The msync() system call.
 */

static int msync_interval(struct vm_area_struct * vma,
	unsigned long start, unsigned long end, int flags)
{
	if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
		int error;
		error = vma->vm_ops->sync(vma, start, end-start, flags);
		if (!error && (flags & MS_SYNC)) {
			struct file * file = vma->vm_file;
			if (file) {
				struct dentry * dentry = file->f_dentry;
				error = file_fsync(file, dentry);
			}
		}
		return error;
	}
	return 0;
}

asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
{
	unsigned long end;
	struct vm_area_struct * vma;
	int unmapped_error, error = -EINVAL;

	down(&current->mm->mmap_sem);
	lock_kernel();
	if (start & ~PAGE_MASK)
		goto out;
	len = (len + ~PAGE_MASK) & PAGE_MASK;
	end = start + len;
	if (end < start)
		goto out;
	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
		goto out;
	error = 0;
	if (end == start)
		goto out;
	/*
	 * If the interval [start,end) covers some unmapped address ranges,
	 * just ignore them, but return -EFAULT at the end.
	 */
	vma = find_vma(current->mm, start);
	unmapped_error = 0;
	for (;;) {
		/* Still start < end. */
		error = -EFAULT;
		if (!vma)
			goto out;
		/* Here start < vma->vm_end. */
		if (start < vma->vm_start) {
			unmapped_error = -EFAULT;
			start = vma->vm_start;
		}
		/* Here vma->vm_start <= start < vma->vm_end. */
		if (end <= vma->vm_end) {
			if (start < end) {
				error = msync_interval(vma, start, end, flags);
				if (error)
					goto out;
			}
			error = unmapped_error;
			goto out;
		}
		/* Here vma->vm_start <= start < vma->vm_end < end. */
		error = msync_interval(vma, start, vma->vm_end, flags);
		if (error)
			goto out;
		start = vma->vm_end;
		vma = vma->vm_next;
	}
out:
	unlock_kernel();
	up(&current->mm->mmap_sem);
	return error;
}

/*
 * Write to a file through the page cache. This is mainly for the
 * benefit of NFS and possibly other network-based file systems.
 *
 * We currently put everything into the page cache prior to writing it.
 * This is not a problem when writing full pages. With partial pages,
 * however, we first have to read the data into the cache, then
 * dirty the page, and finally schedule it for writing. Alternatively, we
 * could write-through just the portion of data that would go into that
 * page, but that would kill performance for applications that write data
 * line by line, and it's prone to race conditions.
 *
 * Note that this routine doesn't try to keep track of dirty pages. Each
 * file system has to do this all by itself, unfortunately.
 *					okir@monad.swb.de
 */
ssize_t
generic_file_write(struct file *file, const char *buf,
		   size_t count, loff_t *ppos,
		   writepage_t write_one_page)
{
	struct dentry *dentry = file->f_dentry;
	struct inode *inode = dentry->d_inode;
	unsigned long pos = *ppos;
	unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
	struct page *page, **hash;
	unsigned long page_cache = 0;
	unsigned long written;
	long status;
	int err;

	err = file->f_error;
	if (err) {
		file->f_error = 0;
		goto out;
	}

	written = 0;

	if (file->f_flags & O_APPEND)
		pos = inode->i_size;

	/*
	 * Check whether we've reached the file size limit.
	 */
	err = -EFBIG;
	if (pos >= limit) {
		send_sig(SIGXFSZ, current, 0);
		goto out;
	}

	status = 0;
	/*
	 * Check whether to truncate the write,
	 * and send the signal if we do.
	 */
	if (count > limit - pos) {
		send_sig(SIGXFSZ, current, 0);
		count = limit - pos;
	}

	while (count) {
		unsigned long bytes, pgpos, offset;
		/*
		 * Try to find the page in the cache. If it isn't there,
		 * allocate a free page.
		 */
		offset = (pos & ~PAGE_CACHE_MASK);
		pgpos = pos & PAGE_CACHE_MASK;
		bytes = PAGE_CACHE_SIZE - offset;
		if (bytes > count)
			bytes = count;

		hash = page_hash(inode, pgpos);
repeat_find:
		page = __find_lock_page(inode, pgpos, hash);
		if (!page) {
			if (!page_cache) {
				page_cache = page_cache_alloc();
				if (page_cache)
					goto repeat_find;
				status = -ENOMEM;
				break;
			}
			page = page_cache_entry(page_cache);
			if (add_to_page_cache_unique(page,inode,pgpos,hash))
				goto repeat_find;

			page_cache = 0;
		}

		/* We have exclusive IO access to the page.. */
		if (!PageLocked(page)) {
			PAGE_BUG(page);
		} else {
			if (page->owner != current) {
				PAGE_BUG(page);
			}
		}

		status = write_one_page(file, page, offset, bytes, buf);

		if (status >= 0) {
			written += status;
			count -= status;
			pos += status;
			buf += status;
			if (pos > inode->i_size)
				inode->i_size = pos;
		}
		/* Mark it unlocked again and drop the page.. */
		UnlockPage(page);
		page_cache_release(page);

		if (status < 0)
			break;
	}
	*ppos = pos;

	if (page_cache)
		page_cache_free(page_cache);

	err = written ? written : status;
out:
	return err;
}

/*
 * Support routines for directory caching using the page cache.
 */

/*
 * Unlock and free a page.
 */
void put_cached_page(unsigned long addr)
{
	struct page * page = page_cache_entry(addr);

	UnlockPage(page);
	if (page_count(page) != 2)
		panic("put_cached_page: page count=%d\n",
			page_count(page));
	page_cache_release(page);
}
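
/*
 * Called once at boot. Size the page hash table at roughly one pointer per
 * physical page, rounded up to a power-of-two allocation, and fall back to
 * smaller orders if the allocation fails.
 */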
void __init page_cache_init(unsigned long memory_size)
{
	unsigned long htable_size, order;

	htable_size = memory_size >> PAGE_SHIFT;
	htable_size *= sizeof(struct page *);
	for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
		;

	do {
		unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);

		page_hash_bits = 0;
		while((tmp >>= 1UL) != 0UL)
			page_hash_bits++;

		page_hash_table = (struct page **)
			__get_free_pages(GFP_ATOMIC, order);
	} while(page_hash_table == NULL && --order > 0);

	printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
		(1 << page_hash_bits), order, (PAGE_SIZE << order));
	if (!page_hash_table)
		panic("Failed to allocate page hash table\n");
	memset(page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
}