1 /*
2 * linux/mm/filemap.c
4 * Copyright (C) 1994-1999 Linus Torvalds
5 */
7 /*
8 * This file handles the generic file mmap semantics used by
9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem used to do this differently, for example)
12 #include <linux/malloc.h>
13 #include <linux/shm.h>
14 #include <linux/mman.h>
15 #include <linux/locks.h>
16 #include <linux/pagemap.h>
17 #include <linux/swap.h>
18 #include <linux/smp_lock.h>
19 #include <linux/blkdev.h>
20 #include <linux/file.h>
21 #include <linux/swapctl.h>
22 #include <linux/slab.h>
23 #include <linux/init.h>
24 #include <linux/mm.h>
26 #include <asm/pgalloc.h>
27 #include <asm/uaccess.h>
28 #include <asm/mman.h>
30 #include <linux/highmem.h>
33 * Shared mappings implemented 30.11.1994. It's not fully working yet,
34 * though.
36 * Shared mappings now work. 15.8.1995 Bruno.
38 * finished 'unifying' the page and buffer cache and SMP-threaded the
39 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
41 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
44 atomic_t page_cache_size = ATOMIC_INIT(0);
45 unsigned int page_hash_bits;
46 struct page **page_hash_table;
47 struct list_head lru_cache;
49 static spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
51 * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
52 * the pagemap_lru_lock held.
54 spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;
56 #define CLUSTER_PAGES (1 << page_cluster)
57 #define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster)
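/*
 * Worked example (illustrative, assuming 4K pages and page_cluster == 4):
 * CLUSTER_PAGES is then 1 << 4 == 16 pages, i.e. a 64K cluster, and
 * CLUSTER_OFFSET() rounds a page index down to a cluster boundary:
 * CLUSTER_OFFSET(21) == 16, CLUSTER_OFFSET(16) == 16, CLUSTER_OFFSET(15) == 0.
 */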
59 void __add_page_to_hash_queue(struct page * page, struct page **p)
61 atomic_inc(&page_cache_size);
62 if((page->next_hash = *p) != NULL)
63 (*p)->pprev_hash = &page->next_hash;
64 *p = page;
65 page->pprev_hash = p;
66 if (page->buffers)
67 PAGE_BUG(page);
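/*
 * Note on the hash chain layout: pprev_hash points back at whatever pointer
 * references this page - either the hash bucket head or the previous page's
 * next_hash field.  That is what lets remove_page_from_hash_queue() below
 * unlink a page with "*page->pprev_hash = page->next_hash" without ever
 * walking the chain to find its predecessor.
 */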
70 static inline void remove_page_from_hash_queue(struct page * page)
72 if(page->pprev_hash) {
73 if(page->next_hash)
74 page->next_hash->pprev_hash = page->pprev_hash;
75 *page->pprev_hash = page->next_hash;
76 page->pprev_hash = NULL;
78 atomic_dec(&page_cache_size);
81 static inline int sync_page(struct page *page)
83 struct address_space *mapping = page->mapping;
85 if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
86 return mapping->a_ops->sync_page(page);
87 return 0;
91 * Remove a page from the page cache and free it. Caller has to make
92 * sure the page is locked and that nobody else uses it - or that usage
93 * is safe.
95 static inline void __remove_inode_page(struct page *page)
97 remove_page_from_inode_queue(page);
98 remove_page_from_hash_queue(page);
99 page->mapping = NULL;
102 void remove_inode_page(struct page *page)
104 if (!PageLocked(page))
105 PAGE_BUG(page);
107 spin_lock(&pagecache_lock);
108 __remove_inode_page(page);
109 spin_unlock(&pagecache_lock);
113 * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
114 * @inode: the inode which pages we want to invalidate
116 * This function only removes the unlocked pages, if you want to
117 * remove all the pages of one inode, you must call truncate_inode_pages.
120 void invalidate_inode_pages(struct inode * inode)
122 struct list_head *head, *curr;
123 struct page * page;
125 head = &inode->i_mapping->pages;
127 spin_lock(&pagecache_lock);
128 spin_lock(&pagemap_lru_lock);
129 curr = head->next;
131 while (curr != head) {
132 page = list_entry(curr, struct page, list);
133 curr = curr->next;
135 /* We cannot invalidate a locked page */
136 if (TryLockPage(page))
137 continue;
139 __lru_cache_del(page);
140 __remove_inode_page(page);
141 UnlockPage(page);
142 page_cache_release(page);
145 spin_unlock(&pagemap_lru_lock);
146 spin_unlock(&pagecache_lock);
150 * Truncate the page cache at a set offset, removing the pages
151 * that are beyond that offset (and zeroing out partial pages).
153 void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
155 struct list_head *head, *curr;
156 struct page * page;
157 unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
158 unsigned long start;
160 start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
162 repeat:
163 head = &mapping->pages;
164 spin_lock(&pagecache_lock);
165 curr = head->next;
166 while (curr != head) {
167 unsigned long offset;
169 page = list_entry(curr, struct page, list);
170 curr = curr->next;
172 offset = page->index;
174 /* page wholly truncated - free it */
175 if (offset >= start) {
176 if (TryLockPage(page)) {
177 page_cache_get(page);
178 spin_unlock(&pagecache_lock);
179 wait_on_page(page);
180 page_cache_release(page);
181 goto repeat;
183 page_cache_get(page);
184 spin_unlock(&pagecache_lock);
186 if (!page->buffers || block_flushpage(page, 0))
187 lru_cache_del(page);
190 * We remove the page from the page cache
191 * _after_ we have destroyed all buffer-cache
192 * references to it. Otherwise some other process
193 * might think this inode page is not in the
194 * page cache and creates a buffer-cache alias
195 * to it causing all sorts of fun problems ...
197 remove_inode_page(page);
198 ClearPageDirty(page);
200 UnlockPage(page);
201 page_cache_release(page);
202 page_cache_release(page);
205 * We have done things without the pagecache lock,
206 * so we'll have to repeat the scan.
207 * It's not possible to deadlock here because
208 * we are guaranteed to make progress. (ie. we have
209 * just removed a page)
211 goto repeat;
214 * there is only one partial page possible.
216 if (!partial)
217 continue;
219 /* and it's the one preceding the first wholly truncated page */
220 if ((offset + 1) != start)
221 continue;
223 /* partial truncate, clear end of page */
224 if (TryLockPage(page)) {
225 spin_unlock(&pagecache_lock);
226 goto repeat;
228 page_cache_get(page);
229 spin_unlock(&pagecache_lock);
231 memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
232 if (page->buffers)
233 block_flushpage(page, partial);
235 partial = 0;
238 * we have dropped the spinlock so we have to
239 * restart.
241 UnlockPage(page);
242 page_cache_release(page);
243 goto repeat;
245 spin_unlock(&pagecache_lock);
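/*
 * Worked example for the arithmetic above (illustrative, assuming 4K
 * pages, i.e. PAGE_CACHE_SHIFT == 12): truncating at lstart == 10000
 * gives partial == 10000 & 4095 == 1808 and start == (10000 + 4095) >> 12
 * == 3.  Pages with index >= 3 are dropped completely, and page index 2
 * (the one satisfying offset + 1 == start) keeps bytes 0..1807 while
 * bytes 1808..4095 are cleared by memclear_highpage_flush().
 */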
249 * nr_dirty represents the number of dirty pages that we will write async
250 * before doing sync writes. We can only do sync writes if we can
251 * wait for IO (__GFP_IO set).
253 int shrink_mmap(int priority, int gfp_mask)
255 int ret = 0, count, nr_dirty;
256 struct list_head * page_lru;
257 struct page * page = NULL;
259 count = nr_lru_pages / (priority + 1);
260 nr_dirty = priority;
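/*
 * Scaling note (illustrative): the smaller the priority argument, the
 * larger the share of the LRU list we are willing to scan.  With, say,
 * 1024 LRU pages, priority 6 scans at most 1024/7 == 146 of them while
 * priority 0 scans the whole list.  nr_dirty works the same way below:
 * once it drops below zero, buffer pages are written out synchronously
 * (provided __GFP_IO allows waiting for IO).
 */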
262 /* we need pagemap_lru_lock for list_del() ... subtle code below */
263 spin_lock(&pagemap_lru_lock);
264 while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
265 page = list_entry(page_lru, struct page, lru);
266 list_del(page_lru);
268 if (PageTestandClearReferenced(page))
269 goto dispose_continue;
271 count--;
273 * Avoid unscalable SMP locking for pages we can
274 * immediately tell are untouchable..
276 if (!page->buffers && page_count(page) > 1)
277 goto dispose_continue;
279 if (TryLockPage(page))
280 goto dispose_continue;
282 /* Release the pagemap_lru lock even if the page is not yet
283 queued in any lru queue since we have just locked down
284 the page so nobody else may SMP race with us running
285 a lru_cache_del() (lru_cache_del() always runs with the
286 page locked down ;). */
287 spin_unlock(&pagemap_lru_lock);
289 /* avoid freeing the page while it's locked */
290 page_cache_get(page);
293 * Is it a buffer page? Try to clean it up regardless
294 * of zone - it's old.
296 if (page->buffers) {
297 int wait = ((gfp_mask & __GFP_IO) && (nr_dirty-- < 0));
298 if (!try_to_free_buffers(page, wait))
299 goto unlock_continue;
300 /* page was locked, inode can't go away under us */
301 if (!page->mapping) {
302 atomic_dec(&buffermem_pages);
303 goto made_buffer_progress;
307 /* Take the pagecache_lock spinlock to prevent other
308 tasks from noticing the page while we are looking at its
309 page count. If it's a pagecache page we'll free it
310 in one atomic transaction after checking its page count. */
311 spin_lock(&pagecache_lock);
314 * We can't free pages unless there's just one user
315 * (count == 2 because we added one ourselves above).
317 if (page_count(page) != 2)
318 goto cache_unlock_continue;
321 * Is it a swap-cache page? If so, we want to
322 * drop it if it is no longer used, even if it
323 * were to be marked referenced..
325 if (PageSwapCache(page)) {
326 spin_unlock(&pagecache_lock);
327 __delete_from_swap_cache(page);
328 goto made_inode_progress;
332 * Page is from a zone we don't care about.
333 * Don't drop page cache entries in vain.
335 if (page->zone->free_pages > page->zone->pages_high)
336 goto cache_unlock_continue;
338 /* is it a page-cache page? */
339 if (page->mapping) {
340 if (!PageDirty(page) && !pgcache_under_min()) {
341 __remove_inode_page(page);
342 spin_unlock(&pagecache_lock);
343 goto made_inode_progress;
345 goto cache_unlock_continue;
348 printk(KERN_ERR "shrink_mmap: unknown LRU page!\n");
350 cache_unlock_continue:
351 spin_unlock(&pagecache_lock);
352 unlock_continue:
353 spin_lock(&pagemap_lru_lock);
354 UnlockPage(page);
355 page_cache_release(page);
356 dispose_continue:
357 list_add(page_lru, &lru_cache);
359 goto out;
361 made_inode_progress:
362 page_cache_release(page);
363 made_buffer_progress:
364 UnlockPage(page);
365 page_cache_release(page);
366 ret = 1;
367 spin_lock(&pagemap_lru_lock);
368 /* nr_lru_pages needs the spinlock */
369 nr_lru_pages--;
371 out:
372 spin_unlock(&pagemap_lru_lock);
374 return ret;
377 static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
379 goto inside;
381 for (;;) {
382 page = page->next_hash;
383 inside:
384 if (!page)
385 goto not_found;
386 if (page->mapping != mapping)
387 continue;
388 if (page->index == offset)
389 break;
391 SetPageReferenced(page);
392 not_found:
393 return page;
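/*
 * Typical calling pattern (a minimal sketch of how the lookups below use
 * this helper): the caller picks the hash bucket with page_hash() and
 * holds pagecache_lock across the walk, e.g.
 *
 *	struct page **hash = page_hash(mapping, index);
 *	struct page *page;
 *
 *	spin_lock(&pagecache_lock);
 *	page = __find_page_nolock(mapping, index, *hash);
 *	if (page)
 *		page_cache_get(page);
 *	spin_unlock(&pagecache_lock);
 */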
397 * By the time this is called, the page is locked and
398 * we don't have to worry about any races any more.
400 * Start the IO..
402 static int writeout_one_page(struct page *page)
404 struct buffer_head *bh, *head = page->buffers;
406 bh = head;
407 do {
408 if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
409 continue;
411 bh->b_flushtime = 0;
412 ll_rw_block(WRITE, 1, &bh);
413 } while ((bh = bh->b_this_page) != head);
414 return 0;
417 static int waitfor_one_page(struct page *page)
419 int error = 0;
420 struct buffer_head *bh, *head = page->buffers;
422 bh = head;
423 do {
424 wait_on_buffer(bh);
425 if (buffer_req(bh) && !buffer_uptodate(bh))
426 error = -EIO;
427 } while ((bh = bh->b_this_page) != head);
428 return error;
431 static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end, int (*fn)(struct page *))
433 struct list_head *head, *curr;
434 struct page *page;
435 int retval = 0;
437 head = &inode->i_mapping->pages;
439 spin_lock(&pagecache_lock);
440 curr = head->next;
441 while (curr != head) {
442 page = list_entry(curr, struct page, list);
443 curr = curr->next;
444 if (!page->buffers)
445 continue;
446 if (page->index >= end)
447 continue;
448 if (page->index < start)
449 continue;
451 page_cache_get(page);
452 spin_unlock(&pagecache_lock);
453 lock_page(page);
455 /* The buffers could have been free'd while we waited for the page lock */
456 if (page->buffers)
457 retval |= fn(page);
459 UnlockPage(page);
460 spin_lock(&pagecache_lock);
461 curr = page->list.next;
462 page_cache_release(page);
464 spin_unlock(&pagecache_lock);
466 return retval;
470 * Two-stage data sync: first start the IO, then go back and
471 * collect the information..
473 int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
475 int retval;
477 retval = do_buffer_fdatasync(inode, start_idx, end_idx, writeout_one_page);
478 retval |= do_buffer_fdatasync(inode, start_idx, end_idx, waitfor_one_page);
479 return retval;
483 * Add a page to the inode page cache.
485 * The caller must have locked the page and
486 * set all the page flags correctly..
488 void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
490 if (!PageLocked(page))
491 BUG();
493 page_cache_get(page);
494 spin_lock(&pagecache_lock);
495 page->index = index;
496 add_page_to_inode_queue(mapping, page);
497 __add_page_to_hash_queue(page, page_hash(mapping, index));
498 lru_cache_add(page);
499 spin_unlock(&pagecache_lock);
503 * This adds a page to the page cache, starting out as locked,
504 * owned by us, but unreferenced, not uptodate and with no errors.
506 static inline void __add_to_page_cache(struct page * page,
507 struct address_space *mapping, unsigned long offset,
508 struct page **hash)
510 struct page *alias;
511 unsigned long flags;
513 if (PageLocked(page))
514 BUG();
516 flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced));
517 page->flags = flags | (1 << PG_locked);
518 page_cache_get(page);
519 page->index = offset;
520 add_page_to_inode_queue(mapping, page);
521 __add_page_to_hash_queue(page, hash);
522 lru_cache_add(page);
523 alias = __find_page_nolock(mapping, offset, *hash);
524 if (alias != page)
525 BUG();
528 void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
530 spin_lock(&pagecache_lock);
531 __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
532 spin_unlock(&pagecache_lock);
535 static int add_to_page_cache_unique(struct page * page,
536 struct address_space *mapping, unsigned long offset,
537 struct page **hash)
539 int err;
540 struct page *alias;
542 spin_lock(&pagecache_lock);
543 alias = __find_page_nolock(mapping, offset, *hash);
545 err = 1;
546 if (!alias) {
547 __add_to_page_cache(page,mapping,offset,hash);
548 err = 0;
551 spin_unlock(&pagecache_lock);
552 return err;
556 * This adds the requested page to the page cache if it isn't already there,
557 * and schedules an I/O to read in its contents from disk.
559 static inline int page_cache_read(struct file * file, unsigned long offset)
561 struct inode *inode = file->f_dentry->d_inode;
562 struct address_space *mapping = inode->i_mapping;
563 struct page **hash = page_hash(mapping, offset);
564 struct page *page;
566 spin_lock(&pagecache_lock);
567 page = __find_page_nolock(mapping, offset, *hash);
568 spin_unlock(&pagecache_lock);
569 if (page)
570 return 0;
572 page = page_cache_alloc();
573 if (!page)
574 return -ENOMEM;
576 if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
577 int error = mapping->a_ops->readpage(file, page);
578 page_cache_release(page);
579 return error;
582 * We arrive here in the unlikely event that someone
583 * raced with us and added our page to the cache first.
585 page_cache_free(page);
586 return 0;
590 * Read in an entire cluster at once. A cluster is usually a 64k-
591 * aligned block that includes the page requested in "offset."
593 static int read_cluster_nonblocking(struct file * file, unsigned long offset,
594 unsigned long filesize)
596 unsigned long pages = CLUSTER_PAGES;
598 offset = CLUSTER_OFFSET(offset);
599 while ((pages-- > 0) && (offset < filesize)) {
600 int error = page_cache_read(file, offset);
601 if (error < 0)
602 return error;
603 offset ++;
606 return 0;
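/*
 * Example (illustrative, assuming page_cluster == 4): a request for page
 * index 21 in a 100-page file is rounded down to CLUSTER_OFFSET(21) == 16,
 * so page_cache_read() is issued for indices 16..31; the loop stops early
 * if it runs past the end of the file or an allocation/read fails.
 */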
610 * Wait for a page to get unlocked.
612 * This must be called with the caller "holding" the page,
613 * ie with increased "page->count" so that the page won't
614 * go away during the wait..
616 void ___wait_on_page(struct page *page)
618 struct task_struct *tsk = current;
619 DECLARE_WAITQUEUE(wait, tsk);
621 add_wait_queue(&page->wait, &wait);
622 do {
623 sync_page(page);
624 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
625 if (!PageLocked(page))
626 break;
627 schedule();
628 } while (PageLocked(page));
629 tsk->state = TASK_RUNNING;
630 remove_wait_queue(&page->wait, &wait);
634 * Get an exclusive lock on the page..
636 void lock_page(struct page *page)
638 while (TryLockPage(page))
639 ___wait_on_page(page);
644 * a rather lightweight function, finding and getting a reference to a
645 * hashed page atomically, waiting for it if it's locked.
647 struct page * __find_get_page (struct address_space *mapping,
648 unsigned long offset, struct page **hash)
650 struct page *page;
653 * We scan the hash list read-only. Addition to and removal from
654 * the hash-list needs a held write-lock.
656 repeat:
657 spin_lock(&pagecache_lock);
658 page = __find_page_nolock(mapping, offset, *hash);
659 if (page)
660 page_cache_get(page);
661 spin_unlock(&pagecache_lock);
663 /* Found the page, sleep if locked. */
664 if (page && PageLocked(page)) {
665 struct task_struct *tsk = current;
666 DECLARE_WAITQUEUE(wait, tsk);
668 sync_page(page);
670 __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
671 add_wait_queue(&page->wait, &wait);
673 if (PageLocked(page))
674 schedule();
675 __set_task_state(tsk, TASK_RUNNING);
676 remove_wait_queue(&page->wait, &wait);
679 * The page might have been unhashed meanwhile. It's
680 * not freed though because we hold a reference to it.
681 * If this is the case then it will be freed _here_,
682 * and we recheck the hash anyway.
684 page_cache_release(page);
685 goto repeat;
688 * It's not locked so we can return the page and we hold
689 * a reference to it.
691 return page;
695 * Get the lock to a page atomically.
697 struct page * __find_lock_page (struct address_space *mapping,
698 unsigned long offset, struct page **hash)
700 struct page *page;
703 * We scan the hash list read-only. Addition to and removal from
704 * the hash-list needs a held write-lock.
706 repeat:
707 spin_lock(&pagecache_lock);
708 page = __find_page_nolock(mapping, offset, *hash);
709 if (page)
710 page_cache_get(page);
711 spin_unlock(&pagecache_lock);
713 /* Found the page, sleep if locked. */
714 if (page && TryLockPage(page)) {
715 struct task_struct *tsk = current;
716 DECLARE_WAITQUEUE(wait, tsk);
718 sync_page(page);
720 __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
721 add_wait_queue(&page->wait, &wait);
723 if (PageLocked(page))
724 schedule();
725 __set_task_state(tsk, TASK_RUNNING);
726 remove_wait_queue(&page->wait, &wait);
729 * The page might have been unhashed meanwhile. It's
730 * not freed though because we hold a reference to it.
731 * If this is the case then it will be freed _here_,
732 * and we recheck the hash anyway.
734 page_cache_release(page);
735 goto repeat;
738 * It's not locked so we can return the page and we hold
739 * a reference to it.
741 return page;
744 #if 0
745 #define PROFILE_READAHEAD
746 #define DEBUG_READAHEAD
747 #endif
750 * Read-ahead profiling information
751 * --------------------------------
752 * Every PROFILE_MAXREADCOUNT calls, the following information is written
753 * to the syslog:
754 * Percentage of asynchronous read-ahead.
755 * Average of read-ahead fields context value.
756 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
757 * to the syslog.
760 #ifdef PROFILE_READAHEAD
762 #define PROFILE_MAXREADCOUNT 1000
764 static unsigned long total_reada;
765 static unsigned long total_async;
766 static unsigned long total_ramax;
767 static unsigned long total_ralen;
768 static unsigned long total_rawin;
770 static void profile_readahead(int async, struct file *filp)
772 unsigned long flags;
774 ++total_reada;
775 if (async)
776 ++total_async;
778 total_ramax += filp->f_ramax;
779 total_ralen += filp->f_ralen;
780 total_rawin += filp->f_rawin;
782 if (total_reada > PROFILE_MAXREADCOUNT) {
783 save_flags(flags);
784 cli();
785 if (!(total_reada > PROFILE_MAXREADCOUNT)) {
786 restore_flags(flags);
787 return;
790 printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
791 total_ramax/total_reada,
792 total_ralen/total_reada,
793 total_rawin/total_reada,
794 (total_async*100)/total_reada);
795 #ifdef DEBUG_READAHEAD
796 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
797 filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
798 #endif
800 total_reada = 0;
801 total_async = 0;
802 total_ramax = 0;
803 total_ralen = 0;
804 total_rawin = 0;
806 restore_flags(flags);
809 #endif /* defined PROFILE_READAHEAD */
812 * Read-ahead context:
813 * -------------------
814 * The read ahead context fields of the "struct file" are the following:
815 * - f_raend : position of the first byte after the last page we tried to
816 * read ahead.
817 * - f_ramax : current read-ahead maximum size.
818 * - f_ralen : length of the current IO read block we tried to read-ahead.
819 * - f_rawin : length of the current read-ahead window.
820 * if last read-ahead was synchronous then
821 * f_rawin = f_ralen
822 * otherwise (was asynchronous)
823 * f_rawin = previous value of f_ralen + f_ralen
825 * Read-ahead limits:
826 * ------------------
827 * MIN_READAHEAD : minimum read-ahead size when reading ahead.
828 * MAX_READAHEAD : maximum read-ahead size when reading ahead.
830 * Synchronous read-ahead benefits:
831 * --------------------------------
832 * Using a reasonable IO xfer length from peripheral devices increases system
833 * performance.
834 * Reasonable means, in this context, not too large but not too small.
835 * The actual maximum value is:
836 * MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
837 * and 32K if defined (4K page size assumed).
839 * Asynchronous read-ahead benefits:
840 * ---------------------------------
841 * Overlapping the next read request with user process execution increases
842 * system performance.
844 * Read-ahead risks:
845 * -----------------
846 * We have to guess which further data will be needed by the user process.
847 * If these data are often not really needed, it's bad for system
848 * performance.
849 * However, we know that files are often accessed sequentially by
850 * application programs, so it seems possible to make reasonably good
851 * guesses here.
852 * We only try to read ahead in files that seem to be read sequentially.
854 * Asynchronous read-ahead risks:
855 * ------------------------------
856 * In order to maximize overlapping, we must start some asynchronous read
857 * request from the device, as soon as possible.
858 * We must be very careful about:
859 * - The number of effective pending IO read requests.
860 * ONE seems to be the only reasonable value.
861 * - The total memory pool usage for the file access stream.
862 * This maximum memory usage is implicitly 2 IO read chunks:
863 * 2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
864 * 64k if defined (4K page size assumed).
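/*
 * Worked example (illustrative): on the first read of a large file with
 * f_ramax == 4 and the current page still locked (the synchronous case
 * in generic_file_readahead() below), the code sets f_ralen = 1, reads
 * ahead pages 1..4, and leaves f_ralen == 5, f_rawin == 5, f_raend == 5,
 * with f_ramax doubled to 8 and then clamped to the per-device maximum.
 * A later call that finds the current page unlocked and the position
 * still inside that window takes the asynchronous branch instead.
 */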
867 static inline int get_max_readahead(struct inode * inode)
869 if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
870 return MAX_READAHEAD;
871 return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
874 static void generic_file_readahead(int reada_ok,
875 struct file * filp, struct inode * inode,
876 struct page * page)
878 unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
879 unsigned long index = page->index;
880 unsigned long max_ahead, ahead;
881 unsigned long raend;
882 int max_readahead = get_max_readahead(inode);
884 raend = filp->f_raend;
885 max_ahead = 0;
888 * The current page is locked.
889 * If the current position is inside the previous read IO request, do not
890 * try to reread previously read ahead pages.
891 * Otherwise decide or not to read ahead some pages synchronously.
892 * If we are not going to read ahead, set the read ahead context for this
893 * page only.
895 if (PageLocked(page)) {
896 if (!filp->f_ralen || index >= raend || index + filp->f_ralen < raend) {
897 raend = index;
898 if (raend < end_index)
899 max_ahead = filp->f_ramax;
900 filp->f_rawin = 0;
901 filp->f_ralen = 1;
902 if (!max_ahead) {
903 filp->f_raend = index + filp->f_ralen;
904 filp->f_rawin += filp->f_ralen;
909 * The current page is not locked.
910 * If we were reading ahead and,
911 * if the current max read ahead size is not zero and,
912 * if the current position is inside the last read-ahead IO request,
913 * it is the moment to try to read ahead asynchronously.
914 * We will later force an unplug of the device in order to start asynchronous read IO.
916 else if (reada_ok && filp->f_ramax && raend >= 1 &&
917 index <= raend && index + filp->f_ralen >= raend) {
919 * Add ONE page to max_ahead in order to try to have about the same IO max size
920 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
921 * Compute the position of the last page we have tried to read in order to
922 * begin to read ahead just at the next page.
924 raend -= 1;
925 if (raend < end_index)
926 max_ahead = filp->f_ramax + 1;
928 if (max_ahead) {
929 filp->f_rawin = filp->f_ralen;
930 filp->f_ralen = 0;
931 reada_ok = 2;
935 * Try to read ahead pages.
936 * We hope that ll_rw_blk() plug/unplug, coalescing, request sorting and the
937 * scheduler will do a good enough job for us to avoid really bad actual IO requests.
939 ahead = 0;
940 while (ahead < max_ahead) {
941 ahead ++;
942 if ((raend + ahead) >= end_index)
943 break;
944 if (page_cache_read(filp, raend + ahead) < 0)
945 break;
948 * If we tried to read ahead some pages,
949 * If we tried to read ahead asynchronously,
950 * Try to force unplug of the device in order to start an asynchronous
951 * read IO request.
952 * Update the read-ahead context.
953 * Store the length of the current read-ahead window.
954 * Double the current max read ahead size.
955 * That heuristic avoids doing large IO for files that are not really
956 * accessed sequentially.
958 if (ahead) {
959 if (reada_ok == 2) {
960 run_task_queue(&tq_disk);
963 filp->f_ralen += ahead;
964 filp->f_rawin += filp->f_ralen;
965 filp->f_raend = raend + ahead + 1;
967 filp->f_ramax += filp->f_ramax;
969 if (filp->f_ramax > max_readahead)
970 filp->f_ramax = max_readahead;
972 #ifdef PROFILE_READAHEAD
973 profile_readahead((reada_ok == 2), filp);
974 #endif
977 return;
982 * This is a generic file read routine, and uses the
983 * inode->i_op->readpage() function for the actual low-level
984 * stuff.
986 * This is really ugly. But the goto's actually try to clarify some
987 * of the logic when it comes to error handling etc.
989 void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
991 struct inode *inode = filp->f_dentry->d_inode;
992 struct address_space *mapping = inode->i_mapping;
993 unsigned long index, offset;
994 struct page *cached_page;
995 int reada_ok;
996 int error;
997 int max_readahead = get_max_readahead(inode);
999 cached_page = NULL;
1000 index = *ppos >> PAGE_CACHE_SHIFT;
1001 offset = *ppos & ~PAGE_CACHE_MASK;
1004 * If the current position is outside the previous read-ahead window,
1005 * we reset the current read-ahead context and set read ahead max to zero
1006 * (will be set to just needed value later),
1007 * otherwise, we assume that the file accesses are sequential enough to
1008 * continue read-ahead.
1010 if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {
1011 reada_ok = 0;
1012 filp->f_raend = 0;
1013 filp->f_ralen = 0;
1014 filp->f_ramax = 0;
1015 filp->f_rawin = 0;
1016 } else {
1017 reada_ok = 1;
1020 * Adjust the current value of read-ahead max.
1021 * If the read operation stays within the first half page, force no readahead.
1022 * Otherwise try to increase read ahead max just enough to do the read request.
1023 * Then, at least MIN_READAHEAD if read ahead is ok,
1024 * and at most MAX_READAHEAD in all cases.
1026 if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
1027 filp->f_ramax = 0;
1028 } else {
1029 unsigned long needed;
1031 needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;
1033 if (filp->f_ramax < needed)
1034 filp->f_ramax = needed;
1036 if (reada_ok && filp->f_ramax < MIN_READAHEAD)
1037 filp->f_ramax = MIN_READAHEAD;
1038 if (filp->f_ramax > max_readahead)
1039 filp->f_ramax = max_readahead;
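/*
 * Sizing example (illustrative, assuming 4K pages): a 20000-byte read
 * starting 1000 bytes into a page needs ((1000 + 20000) >> 12) + 1 == 6
 * pages, so f_ramax is raised to at least 6 above, then to at least
 * MIN_READAHEAD if read-ahead is active, and finally clamped to the
 * per-device maximum.
 */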
1042 for (;;) {
1043 struct page *page, **hash;
1044 unsigned long end_index, nr;
1046 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1047 if (index > end_index)
1048 break;
1049 nr = PAGE_CACHE_SIZE;
1050 if (index == end_index) {
1051 nr = inode->i_size & ~PAGE_CACHE_MASK;
1052 if (nr <= offset)
1053 break;
1056 nr = nr - offset;
1059 * Try to find the data in the page cache..
1061 hash = page_hash(mapping, index);
1063 spin_lock(&pagecache_lock);
1064 page = __find_page_nolock(mapping, index, *hash);
1065 if (!page)
1066 goto no_cached_page;
1067 found_page:
1068 page_cache_get(page);
1069 spin_unlock(&pagecache_lock);
1071 if (!Page_Uptodate(page))
1072 goto page_not_up_to_date;
1073 page_ok:
1075 * Ok, we have the page, and it's up-to-date, so
1076 * now we can copy it to user space...
1078 * The actor routine returns how many bytes were actually used..
1079 * NOTE! This may not be the same as how much of a user buffer
1080 * we filled up (we may be padding etc), so we can only update
1081 * "pos" here (the actor routine has to update the user buffer
1082 * pointers and the remaining count).
1084 nr = actor(desc, page, offset, nr);
1085 offset += nr;
1086 index += offset >> PAGE_CACHE_SHIFT;
1087 offset &= ~PAGE_CACHE_MASK;
1089 page_cache_release(page);
1090 if (nr && desc->count)
1091 continue;
1092 break;
1095 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
1097 page_not_up_to_date:
1098 generic_file_readahead(reada_ok, filp, inode, page);
1100 if (Page_Uptodate(page))
1101 goto page_ok;
1103 /* Get exclusive access to the page ... */
1104 lock_page(page);
1105 if (Page_Uptodate(page)) {
1106 UnlockPage(page);
1107 goto page_ok;
1110 readpage:
1111 /* ... and start the actual read. The read will unlock the page. */
1112 error = mapping->a_ops->readpage(filp, page);
1114 if (!error) {
1115 if (Page_Uptodate(page))
1116 goto page_ok;
1118 /* Again, try some read-ahead while waiting for the page to finish.. */
1119 generic_file_readahead(reada_ok, filp, inode, page);
1120 wait_on_page(page);
1121 if (Page_Uptodate(page))
1122 goto page_ok;
1123 error = -EIO;
1126 /* UHHUH! A synchronous read error occurred. Report it */
1127 desc->error = error;
1128 page_cache_release(page);
1129 break;
1131 no_cached_page:
1133 * Ok, it wasn't cached, so we need to create a new
1134 * page..
1136 * We get here with the page cache lock held.
1138 if (!cached_page) {
1139 spin_unlock(&pagecache_lock);
1140 cached_page = page_cache_alloc();
1141 if (!cached_page) {
1142 desc->error = -ENOMEM;
1143 break;
1147 * Somebody may have added the page while we
1148 * dropped the page cache lock. Check for that.
1150 spin_lock(&pagecache_lock);
1151 page = __find_page_nolock(mapping, index, *hash);
1152 if (page)
1153 goto found_page;
1157 * Ok, add the new page to the hash-queues...
1159 page = cached_page;
1160 __add_to_page_cache(page, mapping, index, hash);
1161 spin_unlock(&pagecache_lock);
1162 cached_page = NULL;
1164 goto readpage;
1167 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1168 filp->f_reada = 1;
1169 if (cached_page)
1170 page_cache_free(cached_page);
1171 UPDATE_ATIME(inode);
1174 static int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1176 unsigned long kaddr;
1177 unsigned long left, count = desc->count;
1179 if (size > count)
1180 size = count;
1182 kaddr = kmap(page);
1183 left = __copy_to_user(desc->buf, (void *)(kaddr + offset), size);
1184 kunmap(page);
1186 if (left) {
1187 size -= left;
1188 desc->error = -EFAULT;
1190 desc->count = count - size;
1191 desc->written += size;
1192 desc->buf += size;
1193 return size;
1197 * This is the "read()" routine for all filesystems
1198 * that can use the page cache directly.
1200 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1202 ssize_t retval;
1204 retval = -EFAULT;
1205 if (access_ok(VERIFY_WRITE, buf, count)) {
1206 retval = 0;
1208 if (count) {
1209 read_descriptor_t desc;
1211 desc.written = 0;
1212 desc.count = count;
1213 desc.buf = buf;
1214 desc.error = 0;
1215 do_generic_file_read(filp, ppos, &desc, file_read_actor);
1217 retval = desc.written;
1218 if (!retval)
1219 retval = desc.error;
1222 return retval;
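/*
 * Usage sketch (illustrative, not tied to any particular filesystem): a
 * filesystem that keeps its data in the page cache typically just points
 * its file_operations at the generic routines in this file, e.g.
 *
 *	static struct file_operations example_file_operations = {
 *		read:	generic_file_read,
 *		mmap:	generic_file_mmap,
 *	};
 *
 * and supplies the low-level readpage()/writepage() address_space
 * operations that these generics call back into.
 */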
1225 static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
1227 unsigned long kaddr;
1228 ssize_t written;
1229 unsigned long count = desc->count;
1230 struct file *file = (struct file *) desc->buf;
1231 mm_segment_t old_fs;
1233 if (size > count)
1234 size = count;
1235 old_fs = get_fs();
1236 set_fs(KERNEL_DS);
1238 kaddr = kmap(page);
1239 written = file->f_op->write(file, (char *)kaddr + offset,
1240 size, &file->f_pos);
1241 kunmap(page);
1242 set_fs(old_fs);
1243 if (written < 0) {
1244 desc->error = written;
1245 written = 0;
1247 desc->count = count - written;
1248 desc->written += written;
1249 return written;
1252 asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
1254 ssize_t retval;
1255 struct file * in_file, * out_file;
1256 struct inode * in_inode, * out_inode;
1259 * Get input file, and verify that it is ok..
1261 retval = -EBADF;
1262 in_file = fget(in_fd);
1263 if (!in_file)
1264 goto out;
1265 if (!(in_file->f_mode & FMODE_READ))
1266 goto fput_in;
1267 retval = -EINVAL;
1268 in_inode = in_file->f_dentry->d_inode;
1269 if (!in_inode)
1270 goto fput_in;
1271 if (!in_inode->i_mapping->a_ops->readpage)
1272 goto fput_in;
1273 retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
1274 if (retval)
1275 goto fput_in;
1278 * Get output file, and verify that it is ok..
1280 retval = -EBADF;
1281 out_file = fget(out_fd);
1282 if (!out_file)
1283 goto fput_in;
1284 if (!(out_file->f_mode & FMODE_WRITE))
1285 goto fput_out;
1286 retval = -EINVAL;
1287 if (!out_file->f_op || !out_file->f_op->write)
1288 goto fput_out;
1289 out_inode = out_file->f_dentry->d_inode;
1290 if (!out_inode)
1291 goto fput_out;
1292 retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
1293 if (retval)
1294 goto fput_out;
1296 retval = 0;
1297 if (count) {
1298 read_descriptor_t desc;
1299 loff_t pos = 0, *ppos;
1301 retval = -EFAULT;
1302 ppos = &in_file->f_pos;
1303 if (offset) {
1304 if (get_user(pos, offset))
1305 goto fput_out;
1306 ppos = &pos;
1309 desc.written = 0;
1310 desc.count = count;
1311 desc.buf = (char *) out_file;
1312 desc.error = 0;
1313 do_generic_file_read(in_file, ppos, &desc, file_send_actor);
1315 retval = desc.written;
1316 if (!retval)
1317 retval = desc.error;
1318 if (offset)
1319 put_user(pos, offset);
1322 fput_out:
1323 fput(out_file);
1324 fput_in:
1325 fput(in_file);
1326 out:
1327 return retval;
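/*
 * Userspace view (illustrative): the copy loop above is what backs a call
 * such as
 *
 *	off_t pos = 0;
 *	ssize_t n = sendfile(out_fd, in_fd, &pos, count);
 *
 * where in_fd must be a file with a readpage() method (e.g. a regular
 * file) and out_fd anything with a write() method, such as a socket.
 */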
1331 * Read-ahead and flush behind for MADV_SEQUENTIAL areas. Since we are
1332 * sure this is sequential access, we don't need a flexible read-ahead
1333 * window size -- we can always use a large fixed size window.
1335 static void nopage_sequential_readahead(struct vm_area_struct * vma,
1336 unsigned long pgoff, unsigned long filesize)
1338 unsigned long ra_window;
1340 ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
1341 ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);
1343 /* vm_raend is zero if we haven't read ahead in this area yet. */
1344 if (vma->vm_raend == 0)
1345 vma->vm_raend = vma->vm_pgoff + ra_window;
1348 * If we've just faulted the page half-way through our window,
1349 * then schedule reads for the next window, and release the
1350 * pages in the previous window.
1352 if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
1353 unsigned long start = vma->vm_pgoff + vma->vm_raend;
1354 unsigned long end = start + ra_window;
1356 if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
1357 end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;
1358 if (start > end)
1359 return;
1361 while ((start < end) && (start < filesize)) {
1362 if (read_cluster_nonblocking(vma->vm_file,
1363 start, filesize) < 0)
1364 break;
1365 start += CLUSTER_PAGES;
1367 run_task_queue(&tq_disk);
1369 /* if we're far enough past the beginning of this area,
1370 recycle pages that are in the previous window. */
1371 if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
1372 unsigned long window = ra_window << PAGE_SHIFT;
1374 end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
1375 end -= window + window;
1376 filemap_sync(vma, end - window, window, MS_INVALIDATE);
1379 vma->vm_raend += ra_window;
1382 return;
1386 * filemap_nopage() is invoked via the vma operations vector for a
1387 * mapped memory region to read in file data during a page fault.
1389 * The goto's are kind of ugly, but this streamlines the normal case of having
1390 * it in the page cache, and handles the special cases reasonably without
1391 * having a lot of duplicated code.
1393 struct page * filemap_nopage(struct vm_area_struct * area,
1394 unsigned long address, int no_share)
1396 int error;
1397 struct file *file = area->vm_file;
1398 struct inode *inode = file->f_dentry->d_inode;
1399 struct address_space *mapping = inode->i_mapping;
1400 struct page *page, **hash, *old_page;
1401 unsigned long size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1403 unsigned long pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1406 * Semantics for shared and private memory areas are different
1407 * past the end of the file. A shared mapping past the last page
1408 * of the file is an error and results in a SIGBUS, while a
1409 * private mapping just maps in a zero page.
1411 if ((pgoff >= size) && (area->vm_mm == current->mm))
1412 return NULL;
1415 * Do we have something in the page cache already?
1417 hash = page_hash(mapping, pgoff);
1418 retry_find:
1419 page = __find_get_page(mapping, pgoff, hash);
1420 if (!page)
1421 goto no_cached_page;
1424 * Ok, found a page in the page cache, now we need to check
1425 * that it's up-to-date.
1427 if (!Page_Uptodate(page))
1428 goto page_not_uptodate;
1430 success:
1432 * Try read-ahead for sequential areas.
1434 if (VM_SequentialReadHint(area))
1435 nopage_sequential_readahead(area, pgoff, size);
1438 * Found the page and have a reference on it, need to check sharing
1439 * and possibly copy it over to another page..
1441 old_page = page;
1442 if (no_share) {
1443 struct page *new_page = page_cache_alloc();
1445 if (new_page) {
1446 copy_user_highpage(new_page, old_page, address);
1447 flush_page_to_ram(new_page);
1448 } else
1449 new_page = NOPAGE_OOM;
1450 page_cache_release(page);
1451 return new_page;
1454 flush_page_to_ram(old_page);
1455 return old_page;
1457 no_cached_page:
1459 * If the requested offset is within our file, try to read a whole
1460 * cluster of pages at once.
1462 * Otherwise, we're off the end of a privately mapped file,
1463 * so we need to map a zero page.
1465 if ((pgoff < size) && !VM_RandomReadHint(area))
1466 error = read_cluster_nonblocking(file, pgoff, size);
1467 else
1468 error = page_cache_read(file, pgoff);
1471 * The page we want has now been added to the page cache.
1472 * In the unlikely event that someone removed it in the
1473 * meantime, we'll just come back here and read it again.
1475 if (error >= 0)
1476 goto retry_find;
1479 * An error return from page_cache_read can result if the
1480 * system is low on memory, or a problem occurs while trying
1481 * to schedule I/O.
1483 if (error == -ENOMEM)
1484 return NOPAGE_OOM;
1485 return NULL;
1487 page_not_uptodate:
1488 lock_page(page);
1489 if (Page_Uptodate(page)) {
1490 UnlockPage(page);
1491 goto success;
1494 if (!mapping->a_ops->readpage(file, page)) {
1495 wait_on_page(page);
1496 if (Page_Uptodate(page))
1497 goto success;
1501 * Umm, take care of errors if the page isn't up-to-date.
1502 * Try to re-read it _once_. We do this synchronously,
1503 * because there really aren't any performance issues here
1504 * and we need to check for errors.
1506 lock_page(page);
1507 if (Page_Uptodate(page)) {
1508 UnlockPage(page);
1509 goto success;
1511 ClearPageError(page);
1512 if (!mapping->a_ops->readpage(file, page)) {
1513 wait_on_page(page);
1514 if (Page_Uptodate(page))
1515 goto success;
1519 * Things didn't work out. Return zero to tell the
1520 * mm layer so, possibly freeing the page cache page first.
1522 page_cache_release(page);
1523 return NULL;
1526 static int filemap_write_page(struct file *file,
1527 struct page * page,
1528 int wait)
1531 * If a task terminates while we're swapping the page, the vma
1532 * and file could be released: try_to_swap_out has done a get_file.
1533 * vma/file is guaranteed to exist in the unmap/sync cases because
1534 * mmap_sem is held.
1536 return page->mapping->a_ops->writepage(file, page);
1541 * The page cache takes care of races between somebody
1542 * trying to swap something out and swap something in
1543 * at the same time..
1545 extern void wakeup_bdflush(int);
1546 int filemap_swapout(struct page * page, struct file * file)
1548 int retval = filemap_write_page(file, page, 0);
1549 wakeup_bdflush(0);
1550 return retval;
1553 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
1554 unsigned long address, unsigned int flags)
1556 unsigned long pgoff;
1557 pte_t pte = *ptep;
1558 struct page *page;
1559 int error;
1561 if (!(flags & MS_INVALIDATE)) {
1562 if (!pte_present(pte))
1563 return 0;
1564 if (!pte_dirty(pte))
1565 return 0;
1566 flush_page_to_ram(pte_page(pte));
1567 flush_cache_page(vma, address);
1568 set_pte(ptep, pte_mkclean(pte));
1569 flush_tlb_page(vma, address);
1570 page = pte_page(pte);
1571 page_cache_get(page);
1572 } else {
1573 if (pte_none(pte))
1574 return 0;
1575 flush_cache_page(vma, address);
1576 pte_clear(ptep);
1577 flush_tlb_page(vma, address);
1578 if (!pte_present(pte)) {
1579 swap_free(pte_to_swp_entry(pte));
1580 return 0;
1582 page = pte_page(pte);
1583 if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
1584 page_cache_free(page);
1585 return 0;
1588 pgoff = (address - vma->vm_start) >> PAGE_CACHE_SHIFT;
1589 pgoff += vma->vm_pgoff;
1590 if (page->index != pgoff) {
1591 printk("weirdness: pgoff=%lu index=%lu address=%lu vm_start=%lu vm_pgoff=%lu\n",
1592 pgoff, page->index, address, vma->vm_start, vma->vm_pgoff);
1594 lock_page(page);
1595 error = filemap_write_page(vma->vm_file, page, 1);
1596 UnlockPage(page);
1597 page_cache_free(page);
1598 return error;
1601 static inline int filemap_sync_pte_range(pmd_t * pmd,
1602 unsigned long address, unsigned long size,
1603 struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
1605 pte_t * pte;
1606 unsigned long end;
1607 int error;
1609 if (pmd_none(*pmd))
1610 return 0;
1611 if (pmd_bad(*pmd)) {
1612 pmd_ERROR(*pmd);
1613 pmd_clear(pmd);
1614 return 0;
1616 pte = pte_offset(pmd, address);
1617 offset += address & PMD_MASK;
1618 address &= ~PMD_MASK;
1619 end = address + size;
1620 if (end > PMD_SIZE)
1621 end = PMD_SIZE;
1622 error = 0;
1623 do {
1624 error |= filemap_sync_pte(pte, vma, address + offset, flags);
1625 address += PAGE_SIZE;
1626 pte++;
1627 } while (address && (address < end));
1628 return error;
1631 static inline int filemap_sync_pmd_range(pgd_t * pgd,
1632 unsigned long address, unsigned long size,
1633 struct vm_area_struct *vma, unsigned int flags)
1635 pmd_t * pmd;
1636 unsigned long offset, end;
1637 int error;
1639 if (pgd_none(*pgd))
1640 return 0;
1641 if (pgd_bad(*pgd)) {
1642 pgd_ERROR(*pgd);
1643 pgd_clear(pgd);
1644 return 0;
1646 pmd = pmd_offset(pgd, address);
1647 offset = address & PGDIR_MASK;
1648 address &= ~PGDIR_MASK;
1649 end = address + size;
1650 if (end > PGDIR_SIZE)
1651 end = PGDIR_SIZE;
1652 error = 0;
1653 do {
1654 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
1655 address = (address + PMD_SIZE) & PMD_MASK;
1656 pmd++;
1657 } while (address && (address < end));
1658 return error;
1661 int filemap_sync(struct vm_area_struct * vma, unsigned long address,
1662 size_t size, unsigned int flags)
1664 pgd_t * dir;
1665 unsigned long end = address + size;
1666 int error = 0;
1668 dir = pgd_offset(vma->vm_mm, address);
1669 flush_cache_range(vma->vm_mm, end - size, end);
1670 if (address >= end)
1671 BUG();
1672 do {
1673 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
1674 address = (address + PGDIR_SIZE) & PGDIR_MASK;
1675 dir++;
1676 } while (address && (address < end));
1677 flush_tlb_range(vma->vm_mm, end - size, end);
1678 return error;
1682 * This handles (potentially partial) area unmaps..
1684 static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
1686 filemap_sync(vma, start, len, MS_ASYNC);
1690 * Shared mappings need to be able to do the right thing at
1691 * close/unmap/sync. They will also use the private file as
1692 * backing-store for swapping..
1694 static struct vm_operations_struct file_shared_mmap = {
1695 unmap: filemap_unmap, /* unmap - we need to sync the pages */
1696 sync: filemap_sync,
1697 nopage: filemap_nopage,
1698 swapout: filemap_swapout,
1702 * Private mappings just need to be able to load in the map.
1704 * (This is actually used for shared mappings as well, if we
1705 * know they can't ever get write permissions..)
1707 static struct vm_operations_struct file_private_mmap = {
1708 nopage: filemap_nopage,
1711 /* This is used for a general mmap of a disk file */
1713 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1715 struct vm_operations_struct * ops;
1716 struct inode *inode = file->f_dentry->d_inode;
1718 ops = &file_private_mmap;
1719 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1720 if (!inode->i_mapping->a_ops->writepage)
1721 return -EINVAL;
1722 ops = &file_shared_mmap;
1724 if (!inode->i_sb || !S_ISREG(inode->i_mode))
1725 return -EACCES;
1726 if (!inode->i_mapping->a_ops->readpage)
1727 return -ENOEXEC;
1728 UPDATE_ATIME(inode);
1729 vma->vm_ops = ops;
1730 return 0;
1734 * The msync() system call.
1737 static int msync_interval(struct vm_area_struct * vma,
1738 unsigned long start, unsigned long end, int flags)
1740 if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
1741 int error;
1742 error = vma->vm_ops->sync(vma, start, end-start, flags);
1743 if (!error && (flags & MS_SYNC)) {
1744 struct file * file = vma->vm_file;
1745 if (file && file->f_op && file->f_op->fsync) {
1746 down(&file->f_dentry->d_inode->i_sem);
1747 error = file->f_op->fsync(file, file->f_dentry, 1);
1748 up(&file->f_dentry->d_inode->i_sem);
1751 return error;
1753 return 0;
1756 asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
1758 unsigned long end;
1759 struct vm_area_struct * vma;
1760 int unmapped_error, error = -EINVAL;
1762 down(&current->mm->mmap_sem);
1763 if (start & ~PAGE_MASK)
1764 goto out;
1765 len = (len + ~PAGE_MASK) & PAGE_MASK;
1766 end = start + len;
1767 if (end < start)
1768 goto out;
1769 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1770 goto out;
1771 error = 0;
1772 if (end == start)
1773 goto out;
1775 * If the interval [start,end) covers some unmapped address ranges,
1776 * just ignore them, but return -EFAULT at the end.
1778 vma = find_vma(current->mm, start);
1779 unmapped_error = 0;
1780 for (;;) {
1781 /* Still start < end. */
1782 error = -EFAULT;
1783 if (!vma)
1784 goto out;
1785 /* Here start < vma->vm_end. */
1786 if (start < vma->vm_start) {
1787 unmapped_error = -EFAULT;
1788 start = vma->vm_start;
1790 /* Here vma->vm_start <= start < vma->vm_end. */
1791 if (end <= vma->vm_end) {
1792 if (start < end) {
1793 error = msync_interval(vma, start, end, flags);
1794 if (error)
1795 goto out;
1797 error = unmapped_error;
1798 goto out;
1800 /* Here vma->vm_start <= start < vma->vm_end < end. */
1801 error = msync_interval(vma, start, vma->vm_end, flags);
1802 if (error)
1803 goto out;
1804 start = vma->vm_end;
1805 vma = vma->vm_next;
1807 out:
1808 up(&current->mm->mmap_sem);
1809 return error;
1812 static inline void setup_read_behavior(struct vm_area_struct * vma,
1813 int behavior)
1815 VM_ClearReadHint(vma);
1816 switch(behavior) {
1817 case MADV_SEQUENTIAL:
1818 vma->vm_flags |= VM_SEQ_READ;
1819 break;
1820 case MADV_RANDOM:
1821 vma->vm_flags |= VM_RAND_READ;
1822 break;
1823 default:
1824 break;
1826 return;
1829 static long madvise_fixup_start(struct vm_area_struct * vma,
1830 unsigned long end, int behavior)
1832 struct vm_area_struct * n;
1834 n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1835 if (!n)
1836 return -EAGAIN;
1837 *n = *vma;
1838 n->vm_end = end;
1839 setup_read_behavior(n, behavior);
1840 n->vm_raend = 0;
1841 get_file(n->vm_file);
1842 if (n->vm_ops && n->vm_ops->open)
1843 n->vm_ops->open(n);
1844 vmlist_modify_lock(vma->vm_mm);
1845 vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
1846 vma->vm_start = end;
1847 insert_vm_struct(current->mm, n);
1848 vmlist_modify_unlock(vma->vm_mm);
1849 return 0;
1852 static long madvise_fixup_end(struct vm_area_struct * vma,
1853 unsigned long start, int behavior)
1855 struct vm_area_struct * n;
1857 n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1858 if (!n)
1859 return -EAGAIN;
1860 *n = *vma;
1861 n->vm_start = start;
1862 n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
1863 setup_read_behavior(n, behavior);
1864 n->vm_raend = 0;
1865 get_file(n->vm_file);
1866 if (n->vm_ops && n->vm_ops->open)
1867 n->vm_ops->open(n);
1868 vmlist_modify_lock(vma->vm_mm);
1869 vma->vm_end = start;
1870 insert_vm_struct(current->mm, n);
1871 vmlist_modify_unlock(vma->vm_mm);
1872 return 0;
1875 static long madvise_fixup_middle(struct vm_area_struct * vma,
1876 unsigned long start, unsigned long end, int behavior)
1878 struct vm_area_struct * left, * right;
1880 left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1881 if (!left)
1882 return -EAGAIN;
1883 right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1884 if (!right) {
1885 kmem_cache_free(vm_area_cachep, left);
1886 return -EAGAIN;
1888 *left = *vma;
1889 *right = *vma;
1890 left->vm_end = start;
1891 right->vm_start = end;
1892 right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
1893 left->vm_raend = 0;
1894 right->vm_raend = 0;
1895 atomic_add(2, &vma->vm_file->f_count);
1897 if (vma->vm_ops && vma->vm_ops->open) {
1898 vma->vm_ops->open(left);
1899 vma->vm_ops->open(right);
1901 vmlist_modify_lock(vma->vm_mm);
1902 vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
1903 vma->vm_start = start;
1904 vma->vm_end = end;
1905 setup_read_behavior(vma, behavior);
1906 vma->vm_raend = 0;
1907 insert_vm_struct(current->mm, left);
1908 insert_vm_struct(current->mm, right);
1909 vmlist_modify_unlock(vma->vm_mm);
1910 return 0;
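/*
 * Layout after the split above (illustrative): for an advice range
 * [start, end) strictly inside the original vma [vm_start, vm_end), the
 * area ends up as three vmas -
 *
 *	[vm_start, start)  "left"   - keeps the old behavior
 *	[start, end)       original - gets the new read behavior
 *	[end, vm_end)      "right"  - keeps the old behavior
 *
 * with vm_pgoff adjusted on the middle and right pieces so each vma still
 * maps the same file offsets it did before.
 */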
1914 * We can potentially split a vm area into separate
1915 * areas, each area with its own behavior.
1917 static long madvise_behavior(struct vm_area_struct * vma,
1918 unsigned long start, unsigned long end, int behavior)
1920 int error = 0;
1922 /* This caps the number of vma's this process can own */
1923 if (vma->vm_mm->map_count > MAX_MAP_COUNT)
1924 return -ENOMEM;
1926 if (start == vma->vm_start) {
1927 if (end == vma->vm_end) {
1928 setup_read_behavior(vma, behavior);
1929 vma->vm_raend = 0;
1930 } else
1931 error = madvise_fixup_start(vma, end, behavior);
1932 } else {
1933 if (end == vma->vm_end)
1934 error = madvise_fixup_end(vma, start, behavior);
1935 else
1936 error = madvise_fixup_middle(vma, start, end, behavior);
1939 return error;
1943 * Schedule all required I/O operations, then run the disk queue
1944 * to make sure they are started. Do not wait for completion.
1946 static long madvise_willneed(struct vm_area_struct * vma,
1947 unsigned long start, unsigned long end)
1949 long error = -EBADF;
1950 struct file * file;
1951 unsigned long size, rlim_rss;
1953 /* Doesn't work if there's no mapped file. */
1954 if (!vma->vm_file)
1955 return error;
1956 file = vma->vm_file;
1957 size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
1958 PAGE_CACHE_SHIFT;
1960 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1961 if (end > vma->vm_end)
1962 end = vma->vm_end;
1963 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1965 /* Make sure this doesn't exceed the process's max rss. */
1966 error = -EIO;
1967 rlim_rss = current->rlim ? current->rlim[RLIMIT_RSS].rlim_cur :
1968 LONG_MAX; /* default: see resource.h */
1969 if ((vma->vm_mm->rss + (end - start)) > rlim_rss)
1970 return error;
1972 /* round to cluster boundaries if this isn't a "random" area. */
1973 if (!VM_RandomReadHint(vma)) {
1974 start = CLUSTER_OFFSET(start);
1975 end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);
1977 while ((start < end) && (start < size)) {
1978 error = read_cluster_nonblocking(file, start, size);
1979 start += CLUSTER_PAGES;
1980 if (error < 0)
1981 break;
1983 } else {
1984 while ((start < end) && (start < size)) {
1985 error = page_cache_read(file, start);
1986 start++;
1987 if (error < 0)
1988 break;
1992 /* Don't wait for someone else to push these requests. */
1993 run_task_queue(&tq_disk);
1995 return error;
1999 * Application no longer needs these pages. If the pages are dirty,
2000 * it's OK to just throw them away. The app will be more careful about
2001 * data it wants to keep. Be sure to free swap resources too. The
2002 * zap_page_range call sets things up for shrink_mmap to actually free
2003 * these pages later if no one else has touched them in the meantime,
2004 * although we could add these pages to a global reuse list for
2005 * shrink_mmap to pick up before reclaiming other pages.
2007 * NB: This interface discards data rather than pushes it out to swap,
2008 * as some implementations do. This has performance implications for
2009 * applications like large transactional databases which want to discard
2010 * pages in anonymous maps after committing to backing store the data
2011 * that was kept in them. There is no reason to write this data out to
2012 * the swap area if the application is discarding it.
2014 * An interface that causes the system to free clean pages and flush
2015 * dirty pages is already available as msync(MS_INVALIDATE).
2017 static long madvise_dontneed(struct vm_area_struct * vma,
2018 unsigned long start, unsigned long end)
2020 if (vma->vm_flags & VM_LOCKED)
2021 return -EINVAL;
2023 flush_cache_range(vma->vm_mm, start, end);
2024 zap_page_range(vma->vm_mm, start, end - start);
2025 flush_tlb_range(vma->vm_mm, start, end);
2026 return 0;
2029 static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
2030 unsigned long end, int behavior)
2032 long error = -EBADF;
2034 switch (behavior) {
2035 case MADV_NORMAL:
2036 case MADV_SEQUENTIAL:
2037 case MADV_RANDOM:
2038 error = madvise_behavior(vma, start, end, behavior);
2039 break;
2041 case MADV_WILLNEED:
2042 error = madvise_willneed(vma, start, end);
2043 break;
2045 case MADV_DONTNEED:
2046 error = madvise_dontneed(vma, start, end);
2047 break;
2049 default:
2050 error = -EINVAL;
2051 break;
2054 return error;
2058 * The madvise(2) system call.
2060 * Applications can use madvise() to advise the kernel how it should
2061 * handle paging I/O in this VM area. The idea is to help the kernel
2062 * use appropriate read-ahead and caching techniques. The information
2063 * provided is advisory only, and can be safely disregarded by the
2064 * kernel without affecting the correct operation of the application.
2066 * behavior values:
2067 * MADV_NORMAL - the default behavior is to read clusters. This
2068 * results in some read-ahead and read-behind.
2069 * MADV_RANDOM - the system should read the minimum amount of data
2070 * on any access, since it is unlikely that the appli-
2071 * cation will need more than what it asks for.
2072 * MADV_SEQUENTIAL - pages in the given range will probably be accessed
2073 * once, so they can be aggressively read ahead, and
2074 * can be freed soon after they are accessed.
2075 * MADV_WILLNEED - the application is notifying the system to read
2076 * some pages ahead.
2077 * MADV_DONTNEED - the application is finished with the given range,
2078 * so the kernel can free resources associated with it.
2080 * return values:
2081 * zero - success
2082 * -EINVAL - start + len < 0, start is not page-aligned,
2083 * "behavior" is not a valid value, or application
2084 * is attempting to release locked or shared pages.
2085 * -ENOMEM - addresses in the specified range are not currently
2086 * mapped, or are outside the AS of the process.
2087 * -EIO - an I/O error occurred while paging in data.
2088 * -EBADF - map exists, but area maps something that isn't a file.
2089 * -EAGAIN - a kernel resource was temporarily unavailable.
2091 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
2093 unsigned long end;
2094 struct vm_area_struct * vma;
2095 int unmapped_error = 0;
2096 int error = -EINVAL;
2098 down(&current->mm->mmap_sem);
2100 if (start & ~PAGE_MASK)
2101 goto out;
2102 len = (len + ~PAGE_MASK) & PAGE_MASK;
2103 end = start + len;
2104 if (end < start)
2105 goto out;
2107 error = 0;
2108 if (end == start)
2109 goto out;
2112 * If the interval [start,end) covers some unmapped address
2113 * ranges, just ignore them, but return -ENOMEM at the end.
2115 vma = find_vma(current->mm, start);
2116 for (;;) {
2117 /* Still start < end. */
2118 error = -ENOMEM;
2119 if (!vma)
2120 goto out;
2122 /* Here start < vma->vm_end. */
2123 if (start < vma->vm_start) {
2124 unmapped_error = -ENOMEM;
2125 start = vma->vm_start;
2128 /* Here vma->vm_start <= start < vma->vm_end. */
2129 if (end <= vma->vm_end) {
2130 if (start < end) {
2131 error = madvise_vma(vma, start, end,
2132 behavior);
2133 if (error)
2134 goto out;
2136 error = unmapped_error;
2137 goto out;
2140 /* Here vma->vm_start <= start < vma->vm_end < end. */
2141 error = madvise_vma(vma, start, vma->vm_end, behavior);
2142 if (error)
2143 goto out;
2144 start = vma->vm_end;
2145 vma = vma->vm_next;
2148 out:
2149 up(&current->mm->mmap_sem);
2150 return error;
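To make the behavior values listed above concrete, here is a hedged userspace sketch that maps a file for a single sequential pass and advises the kernel accordingly; map_for_streaming and the choice of MAP_PRIVATE are illustrative only.

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

/* Hypothetical helper: map a file that will be read once, front to back. */
static void *map_for_streaming(const char *path, size_t *lenp)
{
	struct stat st;
	void *p;
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return MAP_FAILED;
	if (fstat(fd, &st) < 0) {
		close(fd);
		return MAP_FAILED;
	}
	p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	close(fd);			/* the mapping keeps its own reference */
	if (p != MAP_FAILED) {
		/* advisory only: the kernel may read ahead more aggressively */
		madvise(p, st.st_size, MADV_SEQUENTIAL);
		*lenp = st.st_size;
	}
	return p;
}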
2154 * Later we can get more picky about what "in core" means precisely.
2155 * For now, simply check to see if the page is in the page cache,
2156 * and is up to date; i.e. that no page-in operation would be required
2157 * at this time if an application were to map and access this page.
2159 static unsigned char mincore_page(struct vm_area_struct * vma,
2160 unsigned long pgoff)
2162 unsigned char present = 0;
2163 struct address_space * as = &vma->vm_file->f_dentry->d_inode->i_data;
2164 struct page * page, ** hash = page_hash(as, pgoff);
2166 spin_lock(&pagecache_lock);
2167 page = __find_page_nolock(as, pgoff, *hash);
2168 if (page && Page_Uptodate(page))
2169 present = 1;
2170 spin_unlock(&pagecache_lock);
2172 return present;
2175 static long mincore_vma(struct vm_area_struct * vma,
2176 unsigned long start, unsigned long end, unsigned char * vec)
2178 long error, i, remaining;
2179 unsigned char * tmp;
2181 error = -ENOMEM;
2182 if (!vma->vm_file)
2183 return error;
2185 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2186 if (end > vma->vm_end)
2187 end = vma->vm_end;
2188 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
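	/*
	 * Worked example (illustrative numbers, 4K pages): with vm_start at
	 * 0x08048000 and vm_pgoff 0, a start of 0x0804a000 becomes file page
	 * offset 2, i.e. the third page of the mapped file.
	 */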
2190 error = -EAGAIN;
2191 tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
2192 if (!tmp)
2193 return error;
2195 /* (end - start) is # of pages, and also # of bytes in "vec" */
2196 remaining = (end - start);
2198 error = 0;
2199 for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
2200 int j = 0;
2201 long thispiece = (remaining < PAGE_SIZE) ?
2202 remaining : PAGE_SIZE;
2204 while (j < thispiece)
2205 tmp[j++] = mincore_page(vma, start++);
2207 if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
2208 error = -EFAULT;
2209 break;
2213 free_page((unsigned long) tmp);
2214 return error;
2218 * The mincore(2) system call.
2220 * mincore() returns the memory residency status of the pages in the
2221 * current process's address space specified by [addr, addr + len).
2222 * The status is returned in a vector of bytes. The least significant
2223 * bit of each byte is 1 if the referenced page is in memory, otherwise
2224 * it is zero.
2226 * Because the status of a page can change after mincore() checks it
2227 * but before it returns to the application, the returned vector may
2228 * contain stale information. Only locked pages are guaranteed to
2229 * remain in memory.
2231 * return values:
2232 * zero - success
2233 * -EFAULT - vec points to an illegal address
2234 * -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
2235 * or len has a nonpositive value
2236 * -ENOMEM - Addresses in the range [addr, addr + len] are
2237 * invalid for the address space of this process, or
2238 * specify one or more pages which are not currently
2239 * mapped
2240 * -EAGAIN - A kernel resource was temporarily unavailable.
2242 asmlinkage long sys_mincore(unsigned long start, size_t len,
2243 unsigned char * vec)
2245 int index = 0;
2246 unsigned long end;
2247 struct vm_area_struct * vma;
2248 int unmapped_error = 0;
2249 long error = -EINVAL;
2251 down(&current->mm->mmap_sem);
2253 if (start & ~PAGE_CACHE_MASK)
2254 goto out;
2255 len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK;
2256 end = start + len;
2257 if (end < start)
2258 goto out;
2260 error = 0;
2261 if (end == start)
2262 goto out;
2265 * If the interval [start,end) covers some unmapped address
2266 * ranges, just ignore them, but return -ENOMEM at the end.
2268 vma = find_vma(current->mm, start);
2269 for (;;) {
2270 /* Still start < end. */
2271 error = -ENOMEM;
2272 if (!vma)
2273 goto out;
2275 /* Here start < vma->vm_end. */
2276 if (start < vma->vm_start) {
2277 unmapped_error = -ENOMEM;
2278 start = vma->vm_start;
2281 /* Here vma->vm_start <= start < vma->vm_end. */
2282 if (end <= vma->vm_end) {
2283 if (start < end) {
2284 error = mincore_vma(vma, start, end,
2285 &vec[index]);
2286 if (error)
2287 goto out;
2289 error = unmapped_error;
2290 goto out;
2293 /* Here vma->vm_start <= start < vma->vm_end < end. */
2294 error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
2295 if (error)
2296 goto out;
2297 index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
2298 start = vma->vm_end;
2299 vma = vma->vm_next;
2302 out:
2303 up(&current->mm->mmap_sem);
2304 return error;
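A hedged userspace sketch of consuming the vector format described above; resident_pages is a hypothetical helper, and note that libc prototypes have differed on whether vec is declared char * or unsigned char *.

#include <sys/mman.h>
#include <stdlib.h>
#include <unistd.h>

/* Hypothetical helper: count how many pages of a mapping are resident. */
static long resident_pages(void *addr, size_t len)
{
	long page_size = sysconf(_SC_PAGESIZE);
	size_t i, pages = (len + page_size - 1) / page_size;
	unsigned char *vec = malloc(pages);
	long count = 0;

	if (!vec)
		return -1;
	if (mincore(addr, len, vec) == 0) {
		for (i = 0; i < pages; i++)
			count += vec[i] & 1;	/* LSB == page is resident */
	} else {
		count = -1;
	}
	free(vec);
	return count;
}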
2307 static inline
2308 struct page *__read_cache_page(struct address_space *mapping,
2309 unsigned long index,
2310 int (*filler)(void *,struct page*),
2311 void *data)
2313 struct page **hash = page_hash(mapping, index);
2314 struct page *page, *cached_page = NULL;
2315 int err;
2316 repeat:
2317 page = __find_get_page(mapping, index, hash);
2318 if (!page) {
2319 if (!cached_page) {
2320 cached_page = page_cache_alloc();
2321 if (!cached_page)
2322 return ERR_PTR(-ENOMEM);
2324 page = cached_page;
2325 if (add_to_page_cache_unique(page, mapping, index, hash))
2326 goto repeat;
2327 cached_page = NULL;
2328 err = filler(data, page);
2329 if (err < 0) {
2330 page_cache_release(page);
2331 page = ERR_PTR(err);
2334 if (cached_page)
2335 page_cache_free(cached_page);
2336 return page;
2340 * Read into the page cache. If a page already exists,
2341 * and Page_Uptodate() is not set, try to fill the page.
2343 struct page *read_cache_page(struct address_space *mapping,
2344 unsigned long index,
2345 int (*filler)(void *,struct page*),
2346 void *data)
2348 struct page *page = __read_cache_page(mapping, index, filler, data);
2349 int err;
2351 if (IS_ERR(page) || Page_Uptodate(page))
2352 goto out;
2354 lock_page(page);
2355 if (Page_Uptodate(page)) {
2356 UnlockPage(page);
2357 goto out;
2359 err = filler(data, page);
2360 if (err < 0) {
2361 page_cache_release(page);
2362 page = ERR_PTR(err);
2364 out:
2365 return page;
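To make the filler contract concrete, here is a hedged kernel-side sketch of how a filesystem might pull one metadata page through read_cache_page(); example_filler, example_get_meta_page and the NULL struct file passed to ->readpage are assumptions for illustration, not code from this tree.

/* Hypothetical filler: hand the page to the owning address space's ->readpage. */
static int example_filler(void *data, struct page *page)
{
	return page->mapping->a_ops->readpage((struct file *) data, page);
}

/* Hypothetical caller: return an up-to-date page, or an ERR_PTR on failure. */
static struct page *example_get_meta_page(struct address_space *mapping,
					  unsigned long index)
{
	struct page *page;

	page = read_cache_page(mapping, index, example_filler, NULL);
	if (IS_ERR(page))
		return page;
	wait_on_page(page);		/* ->readpage may complete I/O later */
	if (!Page_Uptodate(page)) {
		page_cache_release(page);
		return ERR_PTR(-EIO);
	}
	return page;
}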
2368 static inline struct page * __grab_cache_page(struct address_space *mapping,
2369 unsigned long index, struct page **cached_page)
2371 struct page *page, **hash = page_hash(mapping, index);
2372 repeat:
2373 page = __find_lock_page(mapping, index, hash);
2374 if (!page) {
2375 if (!*cached_page) {
2376 *cached_page = page_cache_alloc();
2377 if (!*cached_page)
2378 return NULL;
2380 page = *cached_page;
2381 if (add_to_page_cache_unique(page, mapping, index, hash))
2382 goto repeat;
2383 *cached_page = NULL;
2385 return page;
2389 * Returns locked page at given index in given cache, creating it if needed.
2392 struct page *grab_cache_page(struct address_space *mapping, unsigned long index)
2394 struct page *cached_page = NULL;
2395 struct page *page = __grab_cache_page(mapping,index,&cached_page);
2396 if (cached_page)
2397 page_cache_free(cached_page);
2398 return page;
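grab_cache_page() hands the page back locked and with an extra reference, so a caller must drop both; the hypothetical helper below only illustrates that discipline (generic_file_write() further down follows the same pattern via __grab_cache_page()).

/* Hypothetical caller: create or find a page, touch it, drop lock and ref. */
static int example_touch_page(struct address_space *mapping,
			      unsigned long index)
{
	struct page *page = grab_cache_page(mapping, index);

	if (!page)
		return -ENOMEM;
	/* ... the page is locked here; fill or modify it as needed ... */
	UnlockPage(page);
	page_cache_release(page);
	return 0;
}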
2401 static inline void remove_suid(struct inode *inode)
2403 unsigned int mode;
2405 /* set S_ISGID if S_IXGRP is set, and always set S_ISUID */
2406 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
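	/*
	 * The multiplication is an octal trick: S_IXGRP is 00010 and S_ISGID
	 * is 02000, so S_ISGID/S_IXGRP == 0200 and the product above is either
	 * 0 or exactly S_ISGID, depending on whether S_IXGRP was set.
	 */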
2408 /* were any of these set-id bits set in the inode's mode? */
2409 mode &= inode->i_mode;
2410 if (mode && !capable(CAP_FSETID)) {
2411 inode->i_mode &= ~mode;
2412 mark_inode_dirty(inode);
2417 * Write to a file through the page cache.
2419 * We currently put everything into the page cache prior to writing it.
2420 * This is not a problem when writing full pages. With partial pages,
2421 * however, we first have to read the data into the cache, then
2422 * dirty the page, and finally schedule it for writing. Alternatively, we
2423 * could write-through just the portion of data that would go into that
2424 * page, but that would kill performance for applications that write data
2425 * line by line, and it's prone to race conditions.
2427 * Note that this routine doesn't try to keep track of dirty pages. Each
2428 * file system has to do this all by itself, unfortunately.
2429 * okir@monad.swb.de
2431 ssize_t
2432 generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
2434 struct inode *inode = file->f_dentry->d_inode;
2435 struct address_space *mapping = inode->i_mapping;
2436 unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
2437 loff_t pos;
2438 struct page *page, *cached_page;
2439 unsigned long written;
2440 long status;
2441 int err;
2443 cached_page = NULL;
2445 down(&inode->i_sem);
2447 pos = *ppos;
2448 err = -EINVAL;
2449 if (pos < 0)
2450 goto out;
2452 err = file->f_error;
2453 if (err) {
2454 file->f_error = 0;
2455 goto out;
2458 written = 0;
2460 if (file->f_flags & O_APPEND)
2461 pos = inode->i_size;
2464 * Check whether we've reached the file size limit.
2466 err = -EFBIG;
2467 if (limit != RLIM_INFINITY) {
2468 if (pos >= limit) {
2469 send_sig(SIGXFSZ, current, 0);
2470 goto out;
2472 if (count > limit - pos) {
2473 send_sig(SIGXFSZ, current, 0);
2474 count = limit - pos;
2478 status = 0;
2479 if (count) {
2480 remove_suid(inode);
2481 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
2482 mark_inode_dirty(inode);
2485 while (count) {
2486 unsigned long bytes, index, offset;
2487 char *kaddr;
2490 * Try to find the page in the cache. If it isn't there,
2491 * allocate a free page.
2493 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
2494 index = pos >> PAGE_CACHE_SHIFT;
2495 bytes = PAGE_CACHE_SIZE - offset;
2496 if (bytes > count)
2497 bytes = count;
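		/*
		 * Example (assuming a 4K PAGE_CACHE_SIZE): a 100-byte write at
		 * pos 4090 is split into two iterations: offset 4090, bytes 6
		 * in page index 0, then offset 0, bytes 94 in page index 1.
		 */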
2499 status = -ENOMEM; /* we'll assign it later anyway */
2500 page = __grab_cache_page(mapping, index, &cached_page);
2501 if (!page)
2502 break;
2504 /* We have exclusive IO access to the page.. */
2505 if (!PageLocked(page)) {
2506 PAGE_BUG(page);
2509 status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
2510 if (status)
2511 goto unlock;
2512 kaddr = (char*)page_address(page);
2513 status = copy_from_user(kaddr+offset, buf, bytes);
2514 if (status)
2515 goto fail_write;
2516 status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
2517 if (!status)
2518 status = bytes;
2520 if (status >= 0) {
2521 written += status;
2522 count -= status;
2523 pos += status;
2524 buf += status;
2526 unlock:
2527 /* Mark it unlocked again and drop the page.. */
2528 UnlockPage(page);
2529 page_cache_release(page);
2531 if (status < 0)
2532 break;
2534 *ppos = pos;
2536 if (cached_page)
2537 page_cache_free(cached_page);
2539 err = written ? written : status;
2540 out:
2541 up(&inode->i_sem);
2542 return err;
2543 fail_write:
2544 status = -EFAULT;
2545 ClearPageUptodate(page);
2546 kunmap(page);
2547 goto unlock;
2550 void __init page_cache_init(unsigned long mempages)
2552 unsigned long htable_size, order;
2554 htable_size = mempages;
2555 htable_size *= sizeof(struct page *);
2556 for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
2559 do {
2560 unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
2562 page_hash_bits = 0;
2563 while((tmp >>= 1UL) != 0UL)
2564 page_hash_bits++;
2566 page_hash_table = (struct page **)
2567 __get_free_pages(GFP_ATOMIC, order);
2568 } while(page_hash_table == NULL && --order > 0);
2570 printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
2571 (1 << page_hash_bits), order, (PAGE_SIZE << order));
2572 if (!page_hash_table)
2573 panic("Failed to allocate page hash table\n");
2574 memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
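A worked sizing example for the code above, assuming 4K pages, 4-byte struct page pointers and mempages == 32768 (128MB of RAM); the numbers are illustrative only.

/*
 * htable_size == 32768 * 4 == 128KB, so the first loop settles on order 5
 * (PAGE_SIZE << 5 == 128KB).  That allocation holds 32768 bucket pointers,
 * page_hash_bits becomes 15, and the printk reports 32768 entries.
 */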