Linux-2.4.0-test2
[davej-history.git] / mm / filemap.c
blob b1e2b8547fe69cb9c4a6efd92bd39d410b22f71a
1 /*
2 * linux/mm/filemap.c
4 * Copyright (C) 1994-1999 Linus Torvalds
5 */
7 /*
8 * This file handles the generic file mmap semantics used by
9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem used to do this differently, for example)
12 #include <linux/malloc.h>
13 #include <linux/shm.h>
14 #include <linux/mman.h>
15 #include <linux/locks.h>
16 #include <linux/pagemap.h>
17 #include <linux/swap.h>
18 #include <linux/smp_lock.h>
19 #include <linux/blkdev.h>
20 #include <linux/file.h>
21 #include <linux/swapctl.h>
22 #include <linux/slab.h>
23 #include <linux/init.h>
24 #include <linux/mm.h>
26 #include <asm/pgalloc.h>
27 #include <asm/uaccess.h>
28 #include <asm/mman.h>
30 #include <linux/highmem.h>
33 * Shared mappings implemented 30.11.1994. It's not fully working yet,
34 * though.
36 * Shared mappings now work. 15.8.1995 Bruno.
38 * finished 'unifying' the page and buffer cache and SMP-threaded the
39 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
41 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
44 atomic_t page_cache_size = ATOMIC_INIT(0);
45 unsigned int page_hash_bits;
46 struct page **page_hash_table;
47 struct list_head lru_cache;
49 static spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
51 * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
52 * the pagemap_lru_lock held.
54 spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;
56 #define CLUSTER_PAGES (1 << page_cluster)
57 #define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster)
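/*
 * A minimal worked example of the two macros above, assuming page_cluster == 4
 * (16-page, 64k clusters with 4k pages). The numbers and the helper name are
 * illustrative only.
 */
#if 0
static void cluster_arithmetic_example(void)
{
	unsigned long index = 19;			/* page index that faulted */
	unsigned long first = CLUSTER_OFFSET(index);	/* (19 >> 4) << 4 == 16 */
	unsigned long count = CLUSTER_PAGES;		/* 1 << 4 == 16 */

	/* read_cluster_nonblocking() below reads pages [first, first + count),
	 * clamped to the file size. */
}
#endif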
59 void __add_page_to_hash_queue(struct page * page, struct page **p)
61 atomic_inc(&page_cache_size);
62 if((page->next_hash = *p) != NULL)
63 (*p)->pprev_hash = &page->next_hash;
64 *p = page;
65 page->pprev_hash = p;
66 if (page->buffers)
67 PAGE_BUG(page);
70 static inline void remove_page_from_hash_queue(struct page * page)
72 if(page->pprev_hash) {
73 if(page->next_hash)
74 page->next_hash->pprev_hash = page->pprev_hash;
75 *page->pprev_hash = page->next_hash;
76 page->pprev_hash = NULL;
77 }
78 atomic_dec(&page_cache_size);
79 }
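/*
 * The hash chains above use the "pprev" idiom: pprev_hash points at whatever
 * pointer currently points at this page (the hash bucket head or the previous
 * page's next_hash), so unlinking never has to walk the chain. A minimal
 * sketch of the same idiom on a toy node type (names illustrative only):
 */
#if 0
struct toy_node {
	struct toy_node *next, **pprev;
};

static void toy_add(struct toy_node *n, struct toy_node **head)
{
	if ((n->next = *head) != NULL)
		(*head)->pprev = &n->next;
	*head = n;
	n->pprev = head;
}

static void toy_del(struct toy_node *n)
{
	if (n->next)
		n->next->pprev = n->pprev;
	*n->pprev = n->next;
	n->pprev = NULL;
}
#endif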
81 static inline int sync_page(struct page *page)
83 struct address_space *mapping = page->mapping;
85 if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
86 return mapping->a_ops->sync_page(page);
87 return 0;
91 * Remove a page from the page cache and free it. Caller has to make
92 * sure the page is locked and that nobody else uses it - or that usage
93 * is safe.
95 static inline void __remove_inode_page(struct page *page)
97 remove_page_from_inode_queue(page);
98 remove_page_from_hash_queue(page);
99 page->mapping = NULL;
102 void remove_inode_page(struct page *page)
104 if (!PageLocked(page))
105 PAGE_BUG(page);
107 spin_lock(&pagecache_lock);
108 __remove_inode_page(page);
109 spin_unlock(&pagecache_lock);
113 * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
114 * @inode: the inode which pages we want to invalidate
116 * This function only removes the unlocked pages, if you want to
117 * remove all the pages of one inode, you must call truncate_inode_pages.
120 void invalidate_inode_pages(struct inode * inode)
122 struct list_head *head, *curr;
123 struct page * page;
125 head = &inode->i_mapping->pages;
127 spin_lock(&pagecache_lock);
128 spin_lock(&pagemap_lru_lock);
129 curr = head->next;
131 while (curr != head) {
132 page = list_entry(curr, struct page, list);
133 curr = curr->next;
135 /* We cannot invalidate a locked page */
136 if (TryLockPage(page))
137 continue;
139 __lru_cache_del(page);
140 __remove_inode_page(page);
141 UnlockPage(page);
142 page_cache_release(page);
145 spin_unlock(&pagemap_lru_lock);
146 spin_unlock(&pagecache_lock);
150 * Truncate the page cache at a set offset, removing the pages
151 * that are beyond that offset (and zeroing out partial pages).
153 void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
155 struct list_head *head, *curr;
156 struct page * page;
157 unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
158 unsigned long start;
160 start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
162 repeat:
163 head = &mapping->pages;
164 spin_lock(&pagecache_lock);
165 curr = head->next;
166 while (curr != head) {
167 unsigned long offset;
169 page = list_entry(curr, struct page, list);
170 curr = curr->next;
172 offset = page->index;
174 /* page wholly truncated - free it */
175 if (offset >= start) {
176 if (TryLockPage(page)) {
177 page_cache_get(page);
178 spin_unlock(&pagecache_lock);
179 wait_on_page(page);
180 page_cache_release(page);
181 goto repeat;
183 page_cache_get(page);
184 spin_unlock(&pagecache_lock);
186 if (!page->buffers || block_flushpage(page, 0))
187 lru_cache_del(page);
190 * We remove the page from the page cache
191 * _after_ we have destroyed all buffer-cache
192 * references to it. Otherwise some other process
193 * might think this inode page is not in the
194 * page cache and creates a buffer-cache alias
195 * to it causing all sorts of fun problems ...
197 remove_inode_page(page);
199 UnlockPage(page);
200 page_cache_release(page);
201 page_cache_release(page);
204 * We have done things without the pagecache lock,
205 * so we'll have to repeat the scan.
206 * It's not possible to deadlock here because
207 * we are guaranteed to make progress. (ie. we have
208 * just removed a page)
210 goto repeat;
213 * there is only one partial page possible.
215 if (!partial)
216 continue;
218 /* and it's the one preceding the first wholly truncated page */
219 if ((offset + 1) != start)
220 continue;
222 /* partial truncate, clear end of page */
223 if (TryLockPage(page)) {
224 spin_unlock(&pagecache_lock);
225 goto repeat;
227 page_cache_get(page);
228 spin_unlock(&pagecache_lock);
230 memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
231 if (page->buffers)
232 block_flushpage(page, partial);
234 partial = 0;
237 * we have dropped the spinlock so we have to
238 * restart.
240 UnlockPage(page);
241 page_cache_release(page);
242 goto repeat;
244 spin_unlock(&pagecache_lock);
245 }
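/*
 * A worked example of the arithmetic above, assuming PAGE_CACHE_SIZE == 4096
 * (illustrative numbers only): truncating to lstart == 10000 gives
 * partial == 10000 & 4095 == 1808 and start == (10000 + 4095) >> 12 == 3,
 * so pages with index >= 3 are freed outright, while page index 2
 * (file bytes 8192..12287) has bytes 1808..4095 cleared so that only its
 * first 1808 bytes (file bytes 8192..9999) survive.
 */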
248 * nr_dirty represents the number of dirty pages that we will write async
249 * before doing sync writes. We can only do sync writes if we can
250 * wait for IO (__GFP_IO set).
252 int shrink_mmap(int priority, int gfp_mask)
254 int ret = 0, count, nr_dirty;
255 struct list_head * page_lru;
256 struct page * page = NULL;
258 count = nr_lru_pages / (priority + 1);
259 nr_dirty = priority;
261 /* we need pagemap_lru_lock for list_del() ... subtle code below */
262 spin_lock(&pagemap_lru_lock);
263 while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
264 page = list_entry(page_lru, struct page, lru);
265 list_del(page_lru);
267 if (PageTestandClearReferenced(page))
268 goto dispose_continue;
270 count--;
272 * Avoid unscalable SMP locking for pages we can
273 * immediately tell are untouchable..
275 if (!page->buffers && page_count(page) > 1)
276 goto dispose_continue;
278 if (TryLockPage(page))
279 goto dispose_continue;
281 /* Release the pagemap_lru lock even if the page is not yet
282 queued in any lru queue since we have just locked down
283 the page so nobody else may SMP race with us running
284 a lru_cache_del() (lru_cache_del() always runs with the
285 page locked down ;). */
286 spin_unlock(&pagemap_lru_lock);
288 /* avoid freeing the page while it's locked */
289 page_cache_get(page);
292 * Is it a buffer page? Try to clean it up regardless
293 * of zone - it's old.
295 if (page->buffers) {
296 int wait = ((gfp_mask & __GFP_IO) && (nr_dirty-- < 0));
297 if (!try_to_free_buffers(page, wait))
298 goto unlock_continue;
299 /* page was locked, inode can't go away under us */
300 if (!page->mapping) {
301 atomic_dec(&buffermem_pages);
302 goto made_buffer_progress;
306 /* Take the pagecache_lock spinlock to prevent other
307 tasks from noticing the page while we are looking at its
308 page count. If it's a pagecache page we'll free it
309 in one atomic transaction after checking its page count. */
310 spin_lock(&pagecache_lock);
313 * We can't free pages unless there's just one user
314 * (count == 2 because we added one ourselves above).
316 if (page_count(page) != 2)
317 goto cache_unlock_continue;
320 * Is it a page swap page? If so, we want to
321 * drop it if it is no longer used, even if it
322 * were to be marked referenced..
324 if (PageSwapCache(page)) {
325 spin_unlock(&pagecache_lock);
326 __delete_from_swap_cache(page);
327 goto made_inode_progress;
331 * Page is from a zone we don't care about.
332 * Don't drop page cache entries in vain.
334 if (page->zone->free_pages > page->zone->pages_high)
335 goto cache_unlock_continue;
337 /* is it a page-cache page? */
338 if (page->mapping) {
339 if (!PageDirty(page) && !pgcache_under_min()) {
340 __remove_inode_page(page);
341 spin_unlock(&pagecache_lock);
342 goto made_inode_progress;
344 goto cache_unlock_continue;
347 printk(KERN_ERR "shrink_mmap: unknown LRU page!\n");
349 cache_unlock_continue:
350 spin_unlock(&pagecache_lock);
351 unlock_continue:
352 spin_lock(&pagemap_lru_lock);
353 UnlockPage(page);
354 page_cache_release(page);
355 dispose_continue:
356 list_add(page_lru, &lru_cache);
358 goto out;
360 made_inode_progress:
361 page_cache_release(page);
362 made_buffer_progress:
363 UnlockPage(page);
364 page_cache_release(page);
365 ret = 1;
366 spin_lock(&pagemap_lru_lock);
367 /* nr_lru_pages needs the spinlock */
368 nr_lru_pages--;
370 out:
371 spin_unlock(&pagemap_lru_lock);
373 return ret;
376 static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
378 goto inside;
380 for (;;) {
381 page = page->next_hash;
382 inside:
383 if (!page)
384 goto not_found;
385 if (page->mapping != mapping)
386 continue;
387 if (page->index == offset)
388 break;
390 SetPageReferenced(page);
391 not_found:
392 return page;
396 * By the time this is called, the page is locked and
397 * we don't have to worry about any races any more.
399 * Start the IO..
401 static int writeout_one_page(struct page *page)
403 struct buffer_head *bh, *head = page->buffers;
405 bh = head;
406 do {
407 if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
408 continue;
410 bh->b_flushtime = 0;
411 ll_rw_block(WRITE, 1, &bh);
412 } while ((bh = bh->b_this_page) != head);
413 return 0;
416 static int waitfor_one_page(struct page *page)
418 int error = 0;
419 struct buffer_head *bh, *head = page->buffers;
421 bh = head;
422 do {
423 wait_on_buffer(bh);
424 if (buffer_req(bh) && !buffer_uptodate(bh))
425 error = -EIO;
426 } while ((bh = bh->b_this_page) != head);
427 return error;
430 static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end, int (*fn)(struct page *))
432 struct list_head *head, *curr;
433 struct page *page;
434 int retval = 0;
436 head = &inode->i_mapping->pages;
438 spin_lock(&pagecache_lock);
439 curr = head->next;
440 while (curr != head) {
441 page = list_entry(curr, struct page, list);
442 curr = curr->next;
443 if (!page->buffers)
444 continue;
445 if (page->index >= end)
446 continue;
447 if (page->index < start)
448 continue;
450 page_cache_get(page);
451 spin_unlock(&pagecache_lock);
452 lock_page(page);
454 /* The buffers could have been free'd while we waited for the page lock */
455 if (page->buffers)
456 retval |= fn(page);
458 UnlockPage(page);
459 spin_lock(&pagecache_lock);
460 curr = page->list.next;
461 page_cache_release(page);
463 spin_unlock(&pagecache_lock);
465 return retval;
469 * Two-stage data sync: first start the IO, then go back and
470 * collect the information..
472 int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
474 int retval;
476 retval = do_buffer_fdatasync(inode, start_idx, end_idx, writeout_one_page);
477 retval |= do_buffer_fdatasync(inode, start_idx, end_idx, waitfor_one_page);
478 return retval;
479 }
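/*
 * A minimal sketch (not taken from any particular filesystem) of how a caller
 * might flush the pages covering a byte range; the helper name and the
 * byte-to-index conversion are illustrative assumptions.
 */
#if 0
static int example_fdatasync_range(struct inode *inode, loff_t pos, size_t count)
{
	unsigned long start_idx = pos >> PAGE_CACHE_SHIFT;
	unsigned long end_idx = (pos + count + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	/* Pass one queues the dirty buffers, pass two waits for them and
	 * collects any -EIO from the completed writes. */
	return generic_buffer_fdatasync(inode, start_idx, end_idx);
}
#endif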
482 * Add a page to the inode page cache.
484 * The caller must have locked the page and
485 * set all the page flags correctly..
487 void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
489 if (!PageLocked(page))
490 BUG();
492 page_cache_get(page);
493 spin_lock(&pagecache_lock);
494 page->index = index;
495 add_page_to_inode_queue(mapping, page);
496 __add_page_to_hash_queue(page, page_hash(mapping, index));
497 lru_cache_add(page);
498 spin_unlock(&pagecache_lock);
502 * This adds a page to the page cache, starting out as locked,
503 * owned by us, referenced, but not uptodate and with no errors.
505 static inline void __add_to_page_cache(struct page * page,
506 struct address_space *mapping, unsigned long offset,
507 struct page **hash)
509 struct page *alias;
510 unsigned long flags;
512 if (PageLocked(page))
513 BUG();
515 flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty));
516 page->flags = flags | (1 << PG_locked) | (1 << PG_referenced);
517 page_cache_get(page);
518 page->index = offset;
519 add_page_to_inode_queue(mapping, page);
520 __add_page_to_hash_queue(page, hash);
521 lru_cache_add(page);
522 alias = __find_page_nolock(mapping, offset, *hash);
523 if (alias != page)
524 BUG();
527 void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
529 spin_lock(&pagecache_lock);
530 __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
531 spin_unlock(&pagecache_lock);
534 static int add_to_page_cache_unique(struct page * page,
535 struct address_space *mapping, unsigned long offset,
536 struct page **hash)
538 int err;
539 struct page *alias;
541 spin_lock(&pagecache_lock);
542 alias = __find_page_nolock(mapping, offset, *hash);
544 err = 1;
545 if (!alias) {
546 __add_to_page_cache(page,mapping,offset,hash);
547 err = 0;
550 spin_unlock(&pagecache_lock);
551 return err;
555 * This adds the requested page to the page cache if it isn't already there,
556 * and schedules an I/O to read in its contents from disk.
558 static inline int page_cache_read(struct file * file, unsigned long offset)
560 struct inode *inode = file->f_dentry->d_inode;
561 struct address_space *mapping = inode->i_mapping;
562 struct page **hash = page_hash(mapping, offset);
563 struct page *page;
565 spin_lock(&pagecache_lock);
566 page = __find_page_nolock(mapping, offset, *hash);
567 spin_unlock(&pagecache_lock);
568 if (page)
569 return 0;
571 page = page_cache_alloc();
572 if (!page)
573 return -ENOMEM;
575 if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
576 int error = mapping->a_ops->readpage(file, page);
577 page_cache_release(page);
578 return error;
581 * We arrive here in the unlikely event that someone
582 * raced with us and added our page to the cache first.
584 page_cache_free(page);
585 return 0;
589 * Read in an entire cluster at once. A cluster is usually a 64k-
590 * aligned block that includes the page requested in "offset."
592 static int read_cluster_nonblocking(struct file * file, unsigned long offset,
593 unsigned long filesize)
595 unsigned long pages = CLUSTER_PAGES;
597 offset = CLUSTER_OFFSET(offset);
598 while ((pages-- > 0) && (offset < filesize)) {
599 int error = page_cache_read(file, offset);
600 if (error < 0)
601 return error;
602 offset ++;
605 return 0;
609 * Wait for a page to get unlocked.
611 * This must be called with the caller "holding" the page,
612 * ie with increased "page->count" so that the page won't
613 * go away during the wait..
615 void ___wait_on_page(struct page *page)
617 struct task_struct *tsk = current;
618 DECLARE_WAITQUEUE(wait, tsk);
620 add_wait_queue(&page->wait, &wait);
621 do {
622 sync_page(page);
623 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
624 if (!PageLocked(page))
625 break;
626 schedule();
627 } while (PageLocked(page));
628 tsk->state = TASK_RUNNING;
629 remove_wait_queue(&page->wait, &wait);
633 * Get an exclusive lock on the page..
635 void lock_page(struct page *page)
637 while (TryLockPage(page))
638 ___wait_on_page(page);
643 * a rather lightweight function, finding and getting a reference to a
644 * hashed page atomically, waiting for it if it's locked.
646 struct page * __find_get_page (struct address_space *mapping,
647 unsigned long offset, struct page **hash)
649 struct page *page;
652 * We scan the hash list read-only. Addition to and removal from
653 * the hash-list needs a held write-lock.
655 repeat:
656 spin_lock(&pagecache_lock);
657 page = __find_page_nolock(mapping, offset, *hash);
658 if (page)
659 page_cache_get(page);
660 spin_unlock(&pagecache_lock);
662 /* Found the page, sleep if locked. */
663 if (page && PageLocked(page)) {
664 struct task_struct *tsk = current;
665 DECLARE_WAITQUEUE(wait, tsk);
667 sync_page(page);
669 __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
670 add_wait_queue(&page->wait, &wait);
672 if (PageLocked(page))
673 schedule();
674 __set_task_state(tsk, TASK_RUNNING);
675 remove_wait_queue(&page->wait, &wait);
678 * The page might have been unhashed meanwhile. It's
679 * not freed though because we hold a reference to it.
680 * If this is the case then it will be freed _here_,
681 * and we recheck the hash anyway.
683 page_cache_release(page);
684 goto repeat;
687 * It's not locked so we can return the page and we hold
688 * a reference to it.
690 return page;
694 * Get the lock to a page atomically.
696 struct page * __find_lock_page (struct address_space *mapping,
697 unsigned long offset, struct page **hash)
699 struct page *page;
702 * We scan the hash list read-only. Addition to and removal from
703 * the hash-list needs a held write-lock.
705 repeat:
706 spin_lock(&pagecache_lock);
707 page = __find_page_nolock(mapping, offset, *hash);
708 if (page)
709 page_cache_get(page);
710 spin_unlock(&pagecache_lock);
712 /* Found the page, sleep if locked. */
713 if (page && TryLockPage(page)) {
714 struct task_struct *tsk = current;
715 DECLARE_WAITQUEUE(wait, tsk);
717 sync_page(page);
719 __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
720 add_wait_queue(&page->wait, &wait);
722 if (PageLocked(page))
723 schedule();
724 __set_task_state(tsk, TASK_RUNNING);
725 remove_wait_queue(&page->wait, &wait);
728 * The page might have been unhashed meanwhile. It's
729 * not freed though because we hold a reference to it.
730 * If this is the case then it will be freed _here_,
731 * and we recheck the hash anyway.
733 page_cache_release(page);
734 goto repeat;
737 * It's not locked so we can return the page and we hold
738 * a reference to it.
740 return page;
743 #if 0
744 #define PROFILE_READAHEAD
745 #define DEBUG_READAHEAD
746 #endif
749 * Read-ahead profiling information
750 * --------------------------------
751 * Every PROFILE_MAXREADCOUNT, the following information is written
752 * to the syslog:
753 * Percentage of asynchronous read-ahead.
754 * Average value of the read-ahead context fields.
755 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
756 * to the syslog.
759 #ifdef PROFILE_READAHEAD
761 #define PROFILE_MAXREADCOUNT 1000
763 static unsigned long total_reada;
764 static unsigned long total_async;
765 static unsigned long total_ramax;
766 static unsigned long total_ralen;
767 static unsigned long total_rawin;
769 static void profile_readahead(int async, struct file *filp)
771 unsigned long flags;
773 ++total_reada;
774 if (async)
775 ++total_async;
777 total_ramax += filp->f_ramax;
778 total_ralen += filp->f_ralen;
779 total_rawin += filp->f_rawin;
781 if (total_reada > PROFILE_MAXREADCOUNT) {
782 save_flags(flags);
783 cli();
784 if (!(total_reada > PROFILE_MAXREADCOUNT)) {
785 restore_flags(flags);
786 return;
789 printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
790 total_ramax/total_reada,
791 total_ralen/total_reada,
792 total_rawin/total_reada,
793 (total_async*100)/total_reada);
794 #ifdef DEBUG_READAHEAD
795 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
796 filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
797 #endif
799 total_reada = 0;
800 total_async = 0;
801 total_ramax = 0;
802 total_ralen = 0;
803 total_rawin = 0;
805 restore_flags(flags);
808 #endif /* defined PROFILE_READAHEAD */
811 * Read-ahead context:
812 * -------------------
813 * The read ahead context fields of the "struct file" are the following:
814 * - f_raend : position of the first byte after the last page we tried to
815 * read ahead.
816 * - f_ramax : current read-ahead maximum size.
817 * - f_ralen : length of the current IO read block we tried to read-ahead.
818 * - f_rawin : length of the current read-ahead window.
819 * if last read-ahead was synchronous then
820 * f_rawin = f_ralen
821 * otherwise (was asynchronous)
822 * f_rawin = previous value of f_ralen + f_ralen
824 * Read-ahead limits:
825 * ------------------
826 * MIN_READAHEAD : minimum read-ahead size when reading ahead.
827 * MAX_READAHEAD : maximum read-ahead size when reading ahead.
829 * Synchronous read-ahead benefits:
830 * --------------------------------
831 * Using a reasonable IO xfer length from peripheral devices increases system
832 * performance.
833 * Reasonable means, in this context, not too large but not too small.
834 * The actual maximum value is:
835 * MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
836 * and 32K if defined (4K page size assumed).
838 * Asynchronous read-ahead benefits:
839 * ---------------------------------
840 * Overlapping the next read request with user process execution increases system
841 * performance.
843 * Read-ahead risks:
844 * -----------------
845 * We have to guess which further data are needed by the user process.
846 * If these data are often not really needed, it's bad for system
847 * performance.
848 * However, we know that files are often accessed sequentially by
849 * application programs and it seems that it is possible to have some good
850 * strategy in that guessing.
851 * We only try to read ahead files that seem to be read sequentially.
853 * Asynchronous read-ahead risks:
854 * ------------------------------
855 * In order to maximize overlapping, we must start some asynchronous read
856 * request from the device, as soon as possible.
857 * We must be very careful about:
858 * - The number of effective pending IO read requests.
859 * ONE seems to be the only reasonable value.
860 * - The total memory pool usage for the file access stream.
861 * This maximum memory usage is implicitly 2 IO read chunks:
862 * 2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
863 * 64k if defined (4K page size assumed).
864 */
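/*
 * A worked example of the bookkeeping above (illustrative numbers): after a
 * synchronous read-ahead of 4 pages, f_ralen == 4 and f_rawin == 4. If the
 * next read-ahead is asynchronous and covers 8 pages, f_ralen becomes 8 and
 * f_rawin becomes 4 + 8 == 12 (the previous f_ralen plus the new one), while
 * f_raend always ends up just past the last page we tried to read ahead.
 */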
866 static inline int get_max_readahead(struct inode * inode)
868 if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
869 return MAX_READAHEAD;
870 return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
873 static void generic_file_readahead(int reada_ok,
874 struct file * filp, struct inode * inode,
875 struct page * page)
877 unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
878 unsigned long index = page->index;
879 unsigned long max_ahead, ahead;
880 unsigned long raend;
881 int max_readahead = get_max_readahead(inode);
883 raend = filp->f_raend;
884 max_ahead = 0;
887 * The current page is locked.
888 * If the current position is inside the previous read IO request, do not
889 * try to reread previously read ahead pages.
890 * Otherwise, decide whether or not to read ahead some pages synchronously.
891 * If we are not going to read ahead, set the read ahead context for this
892 * page only.
894 if (PageLocked(page)) {
895 if (!filp->f_ralen || index >= raend || index + filp->f_ralen < raend) {
896 raend = index;
897 if (raend < end_index)
898 max_ahead = filp->f_ramax;
899 filp->f_rawin = 0;
900 filp->f_ralen = 1;
901 if (!max_ahead) {
902 filp->f_raend = index + filp->f_ralen;
903 filp->f_rawin += filp->f_ralen;
908 * The current page is not locked.
909 * If we were reading ahead and,
910 * if the current max read ahead size is not zero and,
911 * if the current position is inside the last read-ahead IO request,
912 * it is the moment to try to read ahead asynchronously.
913 * We will later force an unplug of the device in order to start the asynchronous read IO.
915 else if (reada_ok && filp->f_ramax && raend >= 1 &&
916 index <= raend && index + filp->f_ralen >= raend) {
918 * Add ONE page to max_ahead in order to try to have about the same IO max size
919 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
920 * Compute the position of the last page we have tried to read in order to
921 * begin to read ahead just at the next page.
923 raend -= 1;
924 if (raend < end_index)
925 max_ahead = filp->f_ramax + 1;
927 if (max_ahead) {
928 filp->f_rawin = filp->f_ralen;
929 filp->f_ralen = 0;
930 reada_ok = 2;
934 * Try to read ahead pages.
935 * We hope that ll_rw_blk() plug/unplug, coalescing, request sorting and the
936 * scheduler will do well enough for us to avoid overly bad actual IO requests.
938 ahead = 0;
939 while (ahead < max_ahead) {
940 ahead ++;
941 if ((raend + ahead) >= end_index)
942 break;
943 if (page_cache_read(filp, raend + ahead) < 0)
944 break;
947 * If we tried to read ahead some pages and
948 * if we tried to read ahead asynchronously,
949 * try to force an unplug of the device in order to start an asynchronous
950 * read IO request.
951 * Update the read-ahead context:
952 * store the length of the current read-ahead window and
953 * double the current max read ahead size.
954 * This heuristic avoids doing large IO for files that are not really
955 * accessed sequentially.
957 if (ahead) {
958 if (reada_ok == 2) {
959 run_task_queue(&tq_disk);
962 filp->f_ralen += ahead;
963 filp->f_rawin += filp->f_ralen;
964 filp->f_raend = raend + ahead + 1;
966 filp->f_ramax += filp->f_ramax;
968 if (filp->f_ramax > max_readahead)
969 filp->f_ramax = max_readahead;
971 #ifdef PROFILE_READAHEAD
972 profile_readahead((reada_ok == 2), filp);
973 #endif
976 return;
981 * This is a generic file read routine, and uses the
982 * inode->i_op->readpage() function for the actual low-level
983 * stuff.
985 * This is really ugly. But the goto's actually try to clarify some
986 * of the logic when it comes to error handling etc.
988 void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
990 struct inode *inode = filp->f_dentry->d_inode;
991 struct address_space *mapping = inode->i_mapping;
992 unsigned long index, offset;
993 struct page *cached_page;
994 int reada_ok;
995 int error;
996 int max_readahead = get_max_readahead(inode);
998 cached_page = NULL;
999 index = *ppos >> PAGE_CACHE_SHIFT;
1000 offset = *ppos & ~PAGE_CACHE_MASK;
1003 * If the current position is outside the previous read-ahead window,
1004 * we reset the current read-ahead context and set read ahead max to zero
1005 * (it will be set to just the needed value later),
1006 * otherwise, we assume that the file accesses are sequential enough to
1007 * continue read-ahead.
1009 if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {
1010 reada_ok = 0;
1011 filp->f_raend = 0;
1012 filp->f_ralen = 0;
1013 filp->f_ramax = 0;
1014 filp->f_rawin = 0;
1015 } else {
1016 reada_ok = 1;
1019 * Adjust the current value of read-ahead max.
1020 * If the read operation stays in the first half of the page, force no readahead.
1021 * Otherwise try to increase read ahead max just enough to do the read request.
1022 * Then, at least MIN_READAHEAD if read ahead is ok,
1023 * and at most MAX_READAHEAD in all cases.
1025 if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
1026 filp->f_ramax = 0;
1027 } else {
1028 unsigned long needed;
1030 needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;
1032 if (filp->f_ramax < needed)
1033 filp->f_ramax = needed;
1035 if (reada_ok && filp->f_ramax < MIN_READAHEAD)
1036 filp->f_ramax = MIN_READAHEAD;
1037 if (filp->f_ramax > max_readahead)
1038 filp->f_ramax = max_readahead;
1041 for (;;) {
1042 struct page *page, **hash;
1043 unsigned long end_index, nr;
1045 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1046 if (index > end_index)
1047 break;
1048 nr = PAGE_CACHE_SIZE;
1049 if (index == end_index) {
1050 nr = inode->i_size & ~PAGE_CACHE_MASK;
1051 if (nr <= offset)
1052 break;
1055 nr = nr - offset;
1058 * Try to find the data in the page cache..
1060 hash = page_hash(mapping, index);
1062 spin_lock(&pagecache_lock);
1063 page = __find_page_nolock(mapping, index, *hash);
1064 if (!page)
1065 goto no_cached_page;
1066 found_page:
1067 page_cache_get(page);
1068 spin_unlock(&pagecache_lock);
1070 if (!Page_Uptodate(page))
1071 goto page_not_up_to_date;
1072 page_ok:
1074 * Ok, we have the page, and it's up-to-date, so
1075 * now we can copy it to user space...
1077 * The actor routine returns how many bytes were actually used..
1078 * NOTE! This may not be the same as how much of a user buffer
1079 * we filled up (we may be padding etc), so we can only update
1080 * "pos" here (the actor routine has to update the user buffer
1081 * pointers and the remaining count).
1083 nr = actor(desc, page, offset, nr);
1084 offset += nr;
1085 index += offset >> PAGE_CACHE_SHIFT;
1086 offset &= ~PAGE_CACHE_MASK;
1088 page_cache_release(page);
1089 if (nr && desc->count)
1090 continue;
1091 break;
1094 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
1096 page_not_up_to_date:
1097 generic_file_readahead(reada_ok, filp, inode, page);
1099 if (Page_Uptodate(page))
1100 goto page_ok;
1102 /* Get exclusive access to the page ... */
1103 lock_page(page);
1104 if (Page_Uptodate(page)) {
1105 UnlockPage(page);
1106 goto page_ok;
1109 readpage:
1110 /* ... and start the actual read. The read will unlock the page. */
1111 error = mapping->a_ops->readpage(filp, page);
1113 if (!error) {
1114 if (Page_Uptodate(page))
1115 goto page_ok;
1117 /* Again, try some read-ahead while waiting for the page to finish.. */
1118 generic_file_readahead(reada_ok, filp, inode, page);
1119 wait_on_page(page);
1120 if (Page_Uptodate(page))
1121 goto page_ok;
1122 error = -EIO;
1125 /* UHHUH! A synchronous read error occurred. Report it */
1126 desc->error = error;
1127 page_cache_release(page);
1128 break;
1130 no_cached_page:
1132 * Ok, it wasn't cached, so we need to create a new
1133 * page..
1135 * We get here with the page cache lock held.
1137 if (!cached_page) {
1138 spin_unlock(&pagecache_lock);
1139 cached_page = page_cache_alloc();
1140 if (!cached_page) {
1141 desc->error = -ENOMEM;
1142 break;
1146 * Somebody may have added the page while we
1147 * dropped the page cache lock. Check for that.
1149 spin_lock(&pagecache_lock);
1150 page = __find_page_nolock(mapping, index, *hash);
1151 if (page)
1152 goto found_page;
1156 * Ok, add the new page to the hash-queues...
1158 page = cached_page;
1159 __add_to_page_cache(page, mapping, index, hash);
1160 spin_unlock(&pagecache_lock);
1161 cached_page = NULL;
1163 goto readpage;
1166 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1167 filp->f_reada = 1;
1168 if (cached_page)
1169 page_cache_free(cached_page);
1170 UPDATE_ATIME(inode);
1173 static int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1175 unsigned long kaddr;
1176 unsigned long left, count = desc->count;
1178 if (size > count)
1179 size = count;
1181 kaddr = kmap(page);
1182 left = __copy_to_user(desc->buf, (void *)(kaddr + offset), size);
1183 kunmap(page);
1185 if (left) {
1186 size -= left;
1187 desc->error = -EFAULT;
1189 desc->count = count - size;
1190 desc->written += size;
1191 desc->buf += size;
1192 return size;
1196 * This is the "read()" routine for all filesystems
1197 * that can use the page cache directly.
1199 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1201 ssize_t retval;
1203 retval = -EFAULT;
1204 if (access_ok(VERIFY_WRITE, buf, count)) {
1205 retval = 0;
1207 if (count) {
1208 read_descriptor_t desc;
1210 desc.written = 0;
1211 desc.count = count;
1212 desc.buf = buf;
1213 desc.error = 0;
1214 do_generic_file_read(filp, ppos, &desc, file_read_actor);
1216 retval = desc.written;
1217 if (!retval)
1218 retval = desc.error;
1221 return retval;
1224 static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
1226 unsigned long kaddr;
1227 ssize_t written;
1228 unsigned long count = desc->count;
1229 struct file *file = (struct file *) desc->buf;
1230 mm_segment_t old_fs;
1232 if (size > count)
1233 size = count;
1234 old_fs = get_fs();
1235 set_fs(KERNEL_DS);
1237 kaddr = kmap(page);
1238 written = file->f_op->write(file, (char *)kaddr + offset,
1239 size, &file->f_pos);
1240 kunmap(page);
1241 set_fs(old_fs);
1242 if (written < 0) {
1243 desc->error = written;
1244 written = 0;
1246 desc->count = count - written;
1247 desc->written += written;
1248 return written;
1251 asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
1253 ssize_t retval;
1254 struct file * in_file, * out_file;
1255 struct inode * in_inode, * out_inode;
1258 * Get input file, and verify that it is ok..
1260 retval = -EBADF;
1261 in_file = fget(in_fd);
1262 if (!in_file)
1263 goto out;
1264 if (!(in_file->f_mode & FMODE_READ))
1265 goto fput_in;
1266 retval = -EINVAL;
1267 in_inode = in_file->f_dentry->d_inode;
1268 if (!in_inode)
1269 goto fput_in;
1270 if (!in_inode->i_mapping->a_ops->readpage)
1271 goto fput_in;
1272 retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
1273 if (retval)
1274 goto fput_in;
1277 * Get output file, and verify that it is ok..
1279 retval = -EBADF;
1280 out_file = fget(out_fd);
1281 if (!out_file)
1282 goto fput_in;
1283 if (!(out_file->f_mode & FMODE_WRITE))
1284 goto fput_out;
1285 retval = -EINVAL;
1286 if (!out_file->f_op || !out_file->f_op->write)
1287 goto fput_out;
1288 out_inode = out_file->f_dentry->d_inode;
1289 if (!out_inode)
1290 goto fput_out;
1291 retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
1292 if (retval)
1293 goto fput_out;
1295 retval = 0;
1296 if (count) {
1297 read_descriptor_t desc;
1298 loff_t pos = 0, *ppos;
1300 retval = -EFAULT;
1301 ppos = &in_file->f_pos;
1302 if (offset) {
1303 if (get_user(pos, offset))
1304 goto fput_out;
1305 ppos = &pos;
1308 desc.written = 0;
1309 desc.count = count;
1310 desc.buf = (char *) out_file;
1311 desc.error = 0;
1312 do_generic_file_read(in_file, ppos, &desc, file_send_actor);
1314 retval = desc.written;
1315 if (!retval)
1316 retval = desc.error;
1317 if (offset)
1318 put_user(pos, offset);
1321 fput_out:
1322 fput(out_file);
1323 fput_in:
1324 fput(in_file);
1325 out:
1326 return retval;
1327 }
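/*
 * A minimal user-space sketch of the system call implemented above (the
 * helper name is illustrative, and a real caller would loop on short
 * transfers): copy a whole file to another descriptor without bouncing the
 * data through a user buffer.
 */
#if 0
#include <sys/sendfile.h>
#include <sys/stat.h>

static int send_whole_file(int out_fd, int in_fd)
{
	struct stat st;
	off_t off = 0;

	if (fstat(in_fd, &st) < 0)
		return -1;
	/* out_fd only needs a ->write method here, so a regular file works
	 * as well as a socket. */
	if (sendfile(out_fd, in_fd, &off, st.st_size) < 0)
		return -1;
	return 0;
}
#endif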
1330 * Read-ahead and flush behind for MADV_SEQUENTIAL areas. Since we are
1331 * sure this is sequential access, we don't need a flexible read-ahead
1332 * window size -- we can always use a large fixed size window.
1334 static void nopage_sequential_readahead(struct vm_area_struct * vma,
1335 unsigned long pgoff, unsigned long filesize)
1337 unsigned long ra_window;
1339 ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
1340 ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);
1342 /* vm_raend is zero if we haven't read ahead in this area yet. */
1343 if (vma->vm_raend == 0)
1344 vma->vm_raend = vma->vm_pgoff + ra_window;
1347 * If we've just faulted the page half-way through our window,
1348 * then schedule reads for the next window, and release the
1349 * pages in the previous window.
1351 if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
1352 unsigned long start = vma->vm_pgoff + vma->vm_raend;
1353 unsigned long end = start + ra_window;
1355 if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
1356 end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;
1357 if (start > end)
1358 return;
1360 while ((start < end) && (start < filesize)) {
1361 if (read_cluster_nonblocking(vma->vm_file,
1362 start, filesize) < 0)
1363 break;
1364 start += CLUSTER_PAGES;
1366 run_task_queue(&tq_disk);
1368 /* if we're far enough past the beginning of this area,
1369 recycle pages that are in the previous window. */
1370 if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
1371 unsigned long window = ra_window << PAGE_SHIFT;
1373 end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
1374 end -= window + window;
1375 filemap_sync(vma, end - window, window, MS_INVALIDATE);
1378 vma->vm_raend += ra_window;
1381 return;
1382 }
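/*
 * A worked example of the window arithmetic above (illustrative numbers):
 * with get_max_readahead() returning 31 pages and CLUSTER_PAGES == 16,
 * ra_window is rounded up to 32 pages. The first fault in the area sets
 * vm_raend = vm_pgoff + 32; once a later fault lands 16 pages (half a
 * window) before vm_raend, the next window is scheduled cluster by cluster
 * and vm_raend advances by another 32 pages.
 */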
1385 * filemap_nopage() is invoked via the vma operations vector for a
1386 * mapped memory region to read in file data during a page fault.
1388 * The goto's are kind of ugly, but this streamlines the normal case of having
1389 * it in the page cache, and handles the special cases reasonably without
1390 * having a lot of duplicated code.
1392 struct page * filemap_nopage(struct vm_area_struct * area,
1393 unsigned long address, int no_share)
1395 int error;
1396 struct file *file = area->vm_file;
1397 struct inode *inode = file->f_dentry->d_inode;
1398 struct address_space *mapping = inode->i_mapping;
1399 struct page *page, **hash, *old_page;
1400 unsigned long size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1402 unsigned long pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1405 * Semantics for shared and private memory areas are different
1406 * past the end of the file. A shared mapping past the last page
1407 * of the file is an error and results in a SIGBUS, while a
1408 * private mapping just maps in a zero page.
1410 if ((pgoff >= size) && (area->vm_mm == current->mm))
1411 return NULL;
1414 * Do we have something in the page cache already?
1416 hash = page_hash(mapping, pgoff);
1417 retry_find:
1418 page = __find_get_page(mapping, pgoff, hash);
1419 if (!page)
1420 goto no_cached_page;
1423 * Ok, found a page in the page cache, now we need to check
1424 * that it's up-to-date.
1426 if (!Page_Uptodate(page))
1427 goto page_not_uptodate;
1429 success:
1431 * Try read-ahead for sequential areas.
1433 if (VM_SequentialReadHint(area))
1434 nopage_sequential_readahead(area, pgoff, size);
1437 * Found the page and have a reference on it, need to check sharing
1438 * and possibly copy it over to another page..
1440 old_page = page;
1441 if (no_share) {
1442 struct page *new_page = page_cache_alloc();
1444 if (new_page) {
1445 copy_user_highpage(new_page, old_page, address);
1446 flush_page_to_ram(new_page);
1447 } else
1448 new_page = NOPAGE_OOM;
1449 page_cache_release(page);
1450 return new_page;
1453 flush_page_to_ram(old_page);
1454 return old_page;
1456 no_cached_page:
1458 * If the requested offset is within our file, try to read a whole
1459 * cluster of pages at once.
1461 * Otherwise, we're off the end of a privately mapped file,
1462 * so we need to map a zero page.
1464 if ((pgoff < size) && !VM_RandomReadHint(area))
1465 error = read_cluster_nonblocking(file, pgoff, size);
1466 else
1467 error = page_cache_read(file, pgoff);
1470 * The page we want has now been added to the page cache.
1471 * In the unlikely event that someone removed it in the
1472 * meantime, we'll just come back here and read it again.
1474 if (error >= 0)
1475 goto retry_find;
1478 * An error return from page_cache_read can result if the
1479 * system is low on memory, or a problem occurs while trying
1480 * to schedule I/O.
1482 if (error == -ENOMEM)
1483 return NOPAGE_OOM;
1484 return NULL;
1486 page_not_uptodate:
1487 lock_page(page);
1488 if (Page_Uptodate(page)) {
1489 UnlockPage(page);
1490 goto success;
1493 if (!mapping->a_ops->readpage(file, page)) {
1494 wait_on_page(page);
1495 if (Page_Uptodate(page))
1496 goto success;
1500 * Umm, take care of errors if the page isn't up-to-date.
1501 * Try to re-read it _once_. We do this synchronously,
1502 * because there really aren't any performance issues here
1503 * and we need to check for errors.
1505 lock_page(page);
1506 if (Page_Uptodate(page)) {
1507 UnlockPage(page);
1508 goto success;
1510 ClearPageError(page);
1511 if (!mapping->a_ops->readpage(file, page)) {
1512 wait_on_page(page);
1513 if (Page_Uptodate(page))
1514 goto success;
1518 * Things didn't work out. Return zero to tell the
1519 * mm layer so, possibly freeing the page cache page first.
1521 page_cache_release(page);
1522 return NULL;
1525 static int filemap_write_page(struct file *file,
1526 struct page * page,
1527 int wait)
1530 * If a task terminates while we're swapping the page, the vma
1531 * and file could be released: try_to_swap_out has done a get_file.
1532 * vma/file is guaranteed to exist in the unmap/sync cases because
1533 * mmap_sem is held.
1535 return page->mapping->a_ops->writepage(file, page);
1540 * The page cache takes care of races between somebody
1541 * trying to swap something out and swap something in
1542 * at the same time..
1544 extern void wakeup_bdflush(int);
1545 int filemap_swapout(struct page * page, struct file * file)
1547 int retval = filemap_write_page(file, page, 0);
1548 wakeup_bdflush(0);
1549 return retval;
1552 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
1553 unsigned long address, unsigned int flags)
1555 unsigned long pgoff;
1556 pte_t pte = *ptep;
1557 struct page *page;
1558 int error;
1560 if (!(flags & MS_INVALIDATE)) {
1561 if (!pte_present(pte))
1562 return 0;
1563 if (!pte_dirty(pte))
1564 return 0;
1565 flush_page_to_ram(pte_page(pte));
1566 flush_cache_page(vma, address);
1567 set_pte(ptep, pte_mkclean(pte));
1568 flush_tlb_page(vma, address);
1569 page = pte_page(pte);
1570 page_cache_get(page);
1571 } else {
1572 if (pte_none(pte))
1573 return 0;
1574 flush_cache_page(vma, address);
1575 pte_clear(ptep);
1576 flush_tlb_page(vma, address);
1577 if (!pte_present(pte)) {
1578 swap_free(pte_to_swp_entry(pte));
1579 return 0;
1581 page = pte_page(pte);
1582 if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
1583 page_cache_free(page);
1584 return 0;
1587 pgoff = (address - vma->vm_start) >> PAGE_CACHE_SHIFT;
1588 pgoff += vma->vm_pgoff;
1589 if (page->index != pgoff) {
1590 printk("weirdness: pgoff=%lu index=%lu address=%lu vm_start=%lu vm_pgoff=%lu\n",
1591 pgoff, page->index, address, vma->vm_start, vma->vm_pgoff);
1593 lock_page(page);
1594 error = filemap_write_page(vma->vm_file, page, 1);
1595 UnlockPage(page);
1596 page_cache_free(page);
1597 return error;
1600 static inline int filemap_sync_pte_range(pmd_t * pmd,
1601 unsigned long address, unsigned long size,
1602 struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
1604 pte_t * pte;
1605 unsigned long end;
1606 int error;
1608 if (pmd_none(*pmd))
1609 return 0;
1610 if (pmd_bad(*pmd)) {
1611 pmd_ERROR(*pmd);
1612 pmd_clear(pmd);
1613 return 0;
1615 pte = pte_offset(pmd, address);
1616 offset += address & PMD_MASK;
1617 address &= ~PMD_MASK;
1618 end = address + size;
1619 if (end > PMD_SIZE)
1620 end = PMD_SIZE;
1621 error = 0;
1622 do {
1623 error |= filemap_sync_pte(pte, vma, address + offset, flags);
1624 address += PAGE_SIZE;
1625 pte++;
1626 } while (address && (address < end));
1627 return error;
1630 static inline int filemap_sync_pmd_range(pgd_t * pgd,
1631 unsigned long address, unsigned long size,
1632 struct vm_area_struct *vma, unsigned int flags)
1634 pmd_t * pmd;
1635 unsigned long offset, end;
1636 int error;
1638 if (pgd_none(*pgd))
1639 return 0;
1640 if (pgd_bad(*pgd)) {
1641 pgd_ERROR(*pgd);
1642 pgd_clear(pgd);
1643 return 0;
1645 pmd = pmd_offset(pgd, address);
1646 offset = address & PGDIR_MASK;
1647 address &= ~PGDIR_MASK;
1648 end = address + size;
1649 if (end > PGDIR_SIZE)
1650 end = PGDIR_SIZE;
1651 error = 0;
1652 do {
1653 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
1654 address = (address + PMD_SIZE) & PMD_MASK;
1655 pmd++;
1656 } while (address && (address < end));
1657 return error;
1660 int filemap_sync(struct vm_area_struct * vma, unsigned long address,
1661 size_t size, unsigned int flags)
1663 pgd_t * dir;
1664 unsigned long end = address + size;
1665 int error = 0;
1667 dir = pgd_offset(vma->vm_mm, address);
1668 flush_cache_range(vma->vm_mm, end - size, end);
1669 if (address >= end)
1670 BUG();
1671 do {
1672 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
1673 address = (address + PGDIR_SIZE) & PGDIR_MASK;
1674 dir++;
1675 } while (address && (address < end));
1676 flush_tlb_range(vma->vm_mm, end - size, end);
1677 return error;
1681 * This handles (potentially partial) area unmaps..
1683 static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
1685 lock_kernel();
1686 filemap_sync(vma, start, len, MS_ASYNC);
1687 unlock_kernel();
1691 * Shared mappings need to be able to do the right thing at
1692 * close/unmap/sync. They will also use the private file as
1693 * backing-store for swapping..
1695 static struct vm_operations_struct file_shared_mmap = {
1696 unmap: filemap_unmap, /* unmap - we need to sync the pages */
1697 sync: filemap_sync,
1698 nopage: filemap_nopage,
1699 swapout: filemap_swapout,
1703 * Private mappings just need to be able to load in the map.
1705 * (This is actually used for shared mappings as well, if we
1706 * know they can't ever get write permissions..)
1708 static struct vm_operations_struct file_private_mmap = {
1709 nopage: filemap_nopage,
1712 /* This is used for a general mmap of a disk file */
1714 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1716 struct vm_operations_struct * ops;
1717 struct inode *inode = file->f_dentry->d_inode;
1719 ops = &file_private_mmap;
1720 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1721 if (!inode->i_mapping->a_ops->writepage)
1722 return -EINVAL;
1723 ops = &file_shared_mmap;
1725 if (!inode->i_sb || !S_ISREG(inode->i_mode))
1726 return -EACCES;
1727 if (!inode->i_mapping->a_ops->readpage)
1728 return -ENOEXEC;
1729 UPDATE_ATIME(inode);
1730 vma->vm_ops = ops;
1731 return 0;
1732 }
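/*
 * A filesystem that goes through the page cache typically reaches this
 * routine via its file_operations. A minimal sketch (the structure name is
 * illustrative, not taken from any particular filesystem):
 */
#if 0
static struct file_operations example_file_ops = {
	read:		generic_file_read,
	mmap:		generic_file_mmap,
};
#endif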
1735 * The msync() system call.
1738 static int msync_interval(struct vm_area_struct * vma,
1739 unsigned long start, unsigned long end, int flags)
1741 if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
1742 int error;
1743 error = vma->vm_ops->sync(vma, start, end-start, flags);
1744 if (!error && (flags & MS_SYNC)) {
1745 struct file * file = vma->vm_file;
1746 if (file && file->f_op && file->f_op->fsync)
1747 error = file->f_op->fsync(file, file->f_dentry);
1749 return error;
1751 return 0;
1754 asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
1756 unsigned long end;
1757 struct vm_area_struct * vma;
1758 int unmapped_error, error = -EINVAL;
1760 down(&current->mm->mmap_sem);
1761 lock_kernel();
1762 if (start & ~PAGE_MASK)
1763 goto out;
1764 len = (len + ~PAGE_MASK) & PAGE_MASK;
1765 end = start + len;
1766 if (end < start)
1767 goto out;
1768 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1769 goto out;
1770 error = 0;
1771 if (end == start)
1772 goto out;
1774 * If the interval [start,end) covers some unmapped address ranges,
1775 * just ignore them, but return -EFAULT at the end.
1777 vma = find_vma(current->mm, start);
1778 unmapped_error = 0;
1779 for (;;) {
1780 /* Still start < end. */
1781 error = -EFAULT;
1782 if (!vma)
1783 goto out;
1784 /* Here start < vma->vm_end. */
1785 if (start < vma->vm_start) {
1786 unmapped_error = -EFAULT;
1787 start = vma->vm_start;
1789 /* Here vma->vm_start <= start < vma->vm_end. */
1790 if (end <= vma->vm_end) {
1791 if (start < end) {
1792 error = msync_interval(vma, start, end, flags);
1793 if (error)
1794 goto out;
1796 error = unmapped_error;
1797 goto out;
1799 /* Here vma->vm_start <= start < vma->vm_end < end. */
1800 error = msync_interval(vma, start, vma->vm_end, flags);
1801 if (error)
1802 goto out;
1803 start = vma->vm_end;
1804 vma = vma->vm_next;
1806 out:
1807 unlock_kernel();
1808 up(&current->mm->mmap_sem);
1809 return error;
1812 static inline void setup_read_behavior(struct vm_area_struct * vma,
1813 int behavior)
1815 VM_ClearReadHint(vma);
1816 switch(behavior) {
1817 case MADV_SEQUENTIAL:
1818 vma->vm_flags |= VM_SEQ_READ;
1819 break;
1820 case MADV_RANDOM:
1821 vma->vm_flags |= VM_RAND_READ;
1822 break;
1823 default:
1824 break;
1826 return;
1829 static long madvise_fixup_start(struct vm_area_struct * vma,
1830 unsigned long end, int behavior)
1832 struct vm_area_struct * n;
1834 n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1835 if (!n)
1836 return -EAGAIN;
1837 *n = *vma;
1838 n->vm_end = end;
1839 setup_read_behavior(n, behavior);
1840 n->vm_raend = 0;
1841 get_file(n->vm_file);
1842 if (n->vm_ops && n->vm_ops->open)
1843 n->vm_ops->open(n);
1844 vmlist_modify_lock(vma->vm_mm);
1845 vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
1846 vma->vm_start = end;
1847 insert_vm_struct(current->mm, n);
1848 vmlist_modify_unlock(vma->vm_mm);
1849 return 0;
1852 static long madvise_fixup_end(struct vm_area_struct * vma,
1853 unsigned long start, int behavior)
1855 struct vm_area_struct * n;
1857 n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1858 if (!n)
1859 return -EAGAIN;
1860 *n = *vma;
1861 n->vm_start = start;
1862 n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
1863 setup_read_behavior(n, behavior);
1864 n->vm_raend = 0;
1865 get_file(n->vm_file);
1866 if (n->vm_ops && n->vm_ops->open)
1867 n->vm_ops->open(n);
1868 vmlist_modify_lock(vma->vm_mm);
1869 vma->vm_end = start;
1870 insert_vm_struct(current->mm, n);
1871 vmlist_modify_unlock(vma->vm_mm);
1872 return 0;
1875 static long madvise_fixup_middle(struct vm_area_struct * vma,
1876 unsigned long start, unsigned long end, int behavior)
1878 struct vm_area_struct * left, * right;
1880 left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1881 if (!left)
1882 return -EAGAIN;
1883 right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1884 if (!right) {
1885 kmem_cache_free(vm_area_cachep, left);
1886 return -EAGAIN;
1888 *left = *vma;
1889 *right = *vma;
1890 left->vm_end = start;
1891 right->vm_start = end;
1892 right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
1893 left->vm_raend = 0;
1894 right->vm_raend = 0;
1895 atomic_add(2, &vma->vm_file->f_count);
1897 if (vma->vm_ops && vma->vm_ops->open) {
1898 vma->vm_ops->open(left);
1899 vma->vm_ops->open(right);
1901 vmlist_modify_lock(vma->vm_mm);
1902 vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
1903 vma->vm_start = start;
1904 vma->vm_end = end;
1905 setup_read_behavior(vma, behavior);
1906 vma->vm_raend = 0;
1907 insert_vm_struct(current->mm, left);
1908 insert_vm_struct(current->mm, right);
1909 vmlist_modify_unlock(vma->vm_mm);
1910 return 0;
1914 * We can potentially split a vm area into separate
1915 * areas, each area with its own behavior.
1917 static long madvise_behavior(struct vm_area_struct * vma,
1918 unsigned long start, unsigned long end, int behavior)
1920 int error = 0;
1922 /* This caps the number of vma's this process can own */
1923 if (vma->vm_mm->map_count > MAX_MAP_COUNT)
1924 return -ENOMEM;
1926 if (start == vma->vm_start) {
1927 if (end == vma->vm_end) {
1928 setup_read_behavior(vma, behavior);
1929 vma->vm_raend = 0;
1930 } else
1931 error = madvise_fixup_start(vma, end, behavior);
1932 } else {
1933 if (end == vma->vm_end)
1934 error = madvise_fixup_end(vma, start, behavior);
1935 else
1936 error = madvise_fixup_middle(vma, start, end, behavior);
1939 return error;
1940 }
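/*
 * For example (illustrative addresses): an madvise(MADV_RANDOM) over the
 * middle third of a vma takes the madvise_fixup_middle() path, leaving three
 * vmas: [vm_start, start) and [end, vm_end) keep the old read hint, while
 * [start, end) gets VM_RAND_READ; all three have vm_raend reset to 0 so the
 * read-ahead state starts fresh.
 */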
1943 * Schedule all required I/O operations, then run the disk queue
1944 * to make sure they are started. Do not wait for completion.
1946 static long madvise_willneed(struct vm_area_struct * vma,
1947 unsigned long start, unsigned long end)
1949 long error = -EBADF;
1950 struct file * file;
1951 unsigned long size, rlim_rss;
1953 /* Doesn't work if there's no mapped file. */
1954 if (!vma->vm_file)
1955 return error;
1956 file = vma->vm_file;
1957 size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
1958 PAGE_CACHE_SHIFT;
1960 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1961 if (end > vma->vm_end)
1962 end = vma->vm_end;
1963 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1965 /* Make sure this doesn't exceed the process's max rss. */
1966 error = -EIO;
1967 rlim_rss = current->rlim ? current->rlim[RLIMIT_RSS].rlim_cur :
1968 LONG_MAX; /* default: see resource.h */
1969 if ((vma->vm_mm->rss + (end - start)) > rlim_rss)
1970 return error;
1972 /* round to cluster boundaries if this isn't a "random" area. */
1973 if (!VM_RandomReadHint(vma)) {
1974 start = CLUSTER_OFFSET(start);
1975 end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);
1977 while ((start < end) && (start < size)) {
1978 error = read_cluster_nonblocking(file, start, size);
1979 start += CLUSTER_PAGES;
1980 if (error < 0)
1981 break;
1983 } else {
1984 while ((start < end) && (start < size)) {
1985 error = page_cache_read(file, start);
1986 start++;
1987 if (error < 0)
1988 break;
1992 /* Don't wait for someone else to push these requests. */
1993 run_task_queue(&tq_disk);
1995 return error;
1999 * Application no longer needs these pages. If the pages are dirty,
2000 * it's OK to just throw them away. The app will be more careful about
2001 * data it wants to keep. Be sure to free swap resources too. The
2002 * zap_page_range call sets things up for shrink_mmap to actually free
2003 * these pages later if no one else has touched them in the meantime,
2004 * although we could add these pages to a global reuse list for
2005 * shrink_mmap to pick up before reclaiming other pages.
2007 * NB: This interface discards data rather than pushes it out to swap,
2008 * as some implementations do. This has performance implications for
2009 * applications like large transactional databases which want to discard
2010 * pages in anonymous maps after committing to backing store the data
2011 * that was kept in them. There is no reason to write this data out to
2012 * the swap area if the application is discarding it.
2014 * An interface that causes the system to free clean pages and flush
2015 * dirty pages is already available as msync(MS_INVALIDATE).
2017 static long madvise_dontneed(struct vm_area_struct * vma,
2018 unsigned long start, unsigned long end)
2020 if (vma->vm_flags & VM_LOCKED)
2021 return -EINVAL;
2023 lock_kernel(); /* is this really necessary? */
2025 flush_cache_range(vma->vm_mm, start, end);
2026 zap_page_range(vma->vm_mm, start, end - start);
2027 flush_tlb_range(vma->vm_mm, start, end);
2029 unlock_kernel();
2030 return 0;
2033 static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
2034 unsigned long end, int behavior)
2036 long error = -EBADF;
2038 switch (behavior) {
2039 case MADV_NORMAL:
2040 case MADV_SEQUENTIAL:
2041 case MADV_RANDOM:
2042 error = madvise_behavior(vma, start, end, behavior);
2043 break;
2045 case MADV_WILLNEED:
2046 error = madvise_willneed(vma, start, end);
2047 break;
2049 case MADV_DONTNEED:
2050 error = madvise_dontneed(vma, start, end);
2051 break;
2053 default:
2054 error = -EINVAL;
2055 break;
2058 return error;
2062 * The madvise(2) system call.
2064 * Applications can use madvise() to advise the kernel how it should
2065 * handle paging I/O in this VM area. The idea is to help the kernel
2066 * use appropriate read-ahead and caching techniques. The information
2067 * provided is advisory only, and can be safely disregarded by the
2068 * kernel without affecting the correct operation of the application.
2070 * behavior values:
2071 * MADV_NORMAL - the default behavior is to read clusters. This
2072 * results in some read-ahead and read-behind.
2073 * MADV_RANDOM - the system should read the minimum amount of data
2074 * on any access, since it is unlikely that the appli-
2075 * cation will need more than what it asks for.
2076 * MADV_SEQUENTIAL - pages in the given range will probably be accessed
2077 * once, so they can be aggressively read ahead, and
2078 * can be freed soon after they are accessed.
2079 * MADV_WILLNEED - the application is notifying the system to read
2080 * some pages ahead.
2081 * MADV_DONTNEED - the application is finished with the given range,
2082 * so the kernel can free resources associated with it.
2084 * return values:
2085 * zero - success
2086 * -EINVAL - start + len < 0, start is not page-aligned,
2087 * "behavior" is not a valid value, or application
2088 * is attempting to release locked or shared pages.
2089 * -ENOMEM - addresses in the specified range are not currently
2090 * mapped, or are outside the AS of the process.
2091 * -EIO - an I/O error occurred while paging in data.
2092 * -EBADF - map exists, but area maps something that isn't a file.
2093 * -EAGAIN - a kernel resource was temporarily unavailable.
2095 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
2097 unsigned long end;
2098 struct vm_area_struct * vma;
2099 int unmapped_error = 0;
2100 int error = -EINVAL;
2102 down(&current->mm->mmap_sem);
2104 if (start & ~PAGE_MASK)
2105 goto out;
2106 len = (len + ~PAGE_MASK) & PAGE_MASK;
2107 end = start + len;
2108 if (end < start)
2109 goto out;
2111 error = 0;
2112 if (end == start)
2113 goto out;
2116 * If the interval [start,end) covers some unmapped address
2117 * ranges, just ignore them, but return -ENOMEM at the end.
2119 vma = find_vma(current->mm, start);
2120 for (;;) {
2121 /* Still start < end. */
2122 error = -ENOMEM;
2123 if (!vma)
2124 goto out;
2126 /* Here start < vma->vm_end. */
2127 if (start < vma->vm_start) {
2128 unmapped_error = -ENOMEM;
2129 start = vma->vm_start;
2132 /* Here vma->vm_start <= start < vma->vm_end. */
2133 if (end <= vma->vm_end) {
2134 if (start < end) {
2135 error = madvise_vma(vma, start, end,
2136 behavior);
2137 if (error)
2138 goto out;
2140 error = unmapped_error;
2141 goto out;
2144 /* Here vma->vm_start <= start < vma->vm_end < end. */
2145 error = madvise_vma(vma, start, vma->vm_end, behavior);
2146 if (error)
2147 goto out;
2148 start = vma->vm_end;
2149 vma = vma->vm_next;
2152 out:
2153 up(&current->mm->mmap_sem);
2154 return error;
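/*
 * Illustrative sketch only (userspace): how an application might combine the
 * advice values documented above when streaming through a large file-backed
 * mapping exactly once.  Assumes a C library that exposes the madvise()
 * wrapper; "fd" and "len" are taken to describe an already-open file.
 */
#include <sys/mman.h>

static int stream_once(int fd, size_t len)
{
	char *p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);

	if (p == MAP_FAILED)
		return -1;
	madvise(p, len, MADV_SEQUENTIAL);	/* aggressive read-ahead */
	/* ... process p[0] .. p[len - 1] in order ... */
	madvise(p, len, MADV_DONTNEED);		/* done with these pages */
	return munmap(p, len);
}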
2158 * Later we can get more picky about what "in core" means precisely.
2159 * For now, simply check to see if the page is in the page cache,
2160 * and is up to date; i.e. that no page-in operation would be required
2161 * at this time if an application were to map and access this page.
2163 static unsigned char mincore_page(struct vm_area_struct * vma,
2164 unsigned long pgoff)
2166 unsigned char present = 0;
2167 struct address_space * as = &vma->vm_file->f_dentry->d_inode->i_data;
2168 struct page * page, ** hash = page_hash(as, pgoff);
2170 spin_lock(&pagecache_lock);
2171 page = __find_page_nolock(as, pgoff, *hash);
2172 if ((page) && (Page_Uptodate(page)))
2173 present = 1;
2174 spin_unlock(&pagecache_lock);
2176 return present;
2179 static long mincore_vma(struct vm_area_struct * vma,
2180 unsigned long start, unsigned long end, unsigned char * vec)
2182 long error, i, remaining;
2183 unsigned char * tmp;
2185 error = -ENOMEM;
2186 if (!vma->vm_file)
2187 return error;
2189 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2190 if (end > vma->vm_end)
2191 end = vma->vm_end;
2192 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2194 error = -EAGAIN;
2195 tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
2196 if (!tmp)
2197 return error;
2199 /* (end - start) is # of pages, and also # of bytes in "vec" */
2200 remaining = (end - start);
2202 error = 0;
2203 for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
2204 int j = 0;
2205 long thispiece = (remaining < PAGE_SIZE) ?
2206 remaining : PAGE_SIZE;
2208 while (j < thispiece)
2209 tmp[j++] = mincore_page(vma, start++);
2211 if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
2212 error = -EFAULT;
2213 break;
2217 free_page((unsigned long) tmp);
2218 return error;
2222 * The mincore(2) system call.
2224 * mincore() returns the memory residency status of the pages in the
2225 * current process's address space specified by [addr, addr + len).
2226 * The status is returned in a vector of bytes. The least significant
2227 * bit of each byte is 1 if the referenced page is in memory, otherwise
2228 * it is zero.
2230 * Because the status of a page can change after mincore() checks it
2231 * but before it returns to the application, the returned vector may
2232 * contain stale information. Only locked pages are guaranteed to
2233 * remain in memory.
2235 * return values:
2236 * zero - success
2237 * -EFAULT - vec points to an illegal address
2238 * -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
2239 * or addr + len overflows
2240 * -ENOMEM - Addresses in the range [addr, addr + len) are
2241 * invalid for the address space of this process, or
2242 * specify one or more pages which are not currently
2243 * mapped
2244 * -EAGAIN - A kernel resource was temporarily unavailable.
2246 asmlinkage long sys_mincore(unsigned long start, size_t len,
2247 unsigned char * vec)
2249 int index = 0;
2250 unsigned long end;
2251 struct vm_area_struct * vma;
2252 int unmapped_error = 0;
2253 long error = -EINVAL;
2255 down(&current->mm->mmap_sem);
2257 if (start & ~PAGE_CACHE_MASK)
2258 goto out;
2259 len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK;
2260 end = start + len;
2261 if (end < start)
2262 goto out;
2264 error = 0;
2265 if (end == start)
2266 goto out;
2269 * If the interval [start,end) covers some unmapped address
2270 * ranges, just ignore them, but return -ENOMEM at the end.
2272 vma = find_vma(current->mm, start);
2273 for (;;) {
2274 /* Still start < end. */
2275 error = -ENOMEM;
2276 if (!vma)
2277 goto out;
2279 /* Here start < vma->vm_end. */
2280 if (start < vma->vm_start) {
2281 unmapped_error = -ENOMEM;
2282 start = vma->vm_start;
2285 /* Here vma->vm_start <= start < vma->vm_end. */
2286 if (end <= vma->vm_end) {
2287 if (start < end) {
2288 error = mincore_vma(vma, start, end,
2289 &vec[index]);
2290 if (error)
2291 goto out;
2293 error = unmapped_error;
2294 goto out;
2297 /* Here vma->vm_start <= start < vma->vm_end < end. */
2298 error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
2299 if (error)
2300 goto out;
2301 index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
2302 start = vma->vm_end;
2303 vma = vma->vm_next;
2306 out:
2307 up(&current->mm->mmap_sem);
2308 return error;
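/*
 * Illustrative sketch only (userspace): counting how many pages of a mapping
 * are resident, using the vector format documented above (bit 0 of each byte
 * means "in core").  Assumes a C library mincore() wrapper; the function
 * name is an assumption.
 */
#include <sys/mman.h>
#include <unistd.h>
#include <stdlib.h>

static long resident_pages(void *addr, size_t len)
{
	size_t page = getpagesize();
	size_t i, pages = (len + page - 1) / page;
	unsigned char *vec = malloc(pages);
	long n = 0;

	if (!vec)
		return -1;
	if (mincore(addr, len, vec) < 0) {
		free(vec);
		return -1;
	}
	for (i = 0; i < pages; i++)
		n += vec[i] & 1;
	free(vec);
	return n;
}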
2311 static inline
2312 struct page *__read_cache_page(struct address_space *mapping,
2313 unsigned long index,
2314 int (*filler)(void *,struct page*),
2315 void *data)
2317 struct page **hash = page_hash(mapping, index);
2318 struct page *page, *cached_page = NULL;
2319 int err;
2320 repeat:
2321 page = __find_get_page(mapping, index, hash);
2322 if (!page) {
2323 if (!cached_page) {
2324 cached_page = page_cache_alloc();
2325 if (!cached_page)
2326 return ERR_PTR(-ENOMEM);
2328 page = cached_page;
2329 if (add_to_page_cache_unique(page, mapping, index, hash))
2330 goto repeat;
2331 cached_page = NULL;
2332 err = filler(data, page);
2333 if (err < 0) {
2334 page_cache_release(page);
2335 page = ERR_PTR(err);
2338 if (cached_page)
2339 page_cache_free(cached_page);
2340 return page;
2344 * Read into the page cache. If a page already exists,
2345 * and Page_Uptodate() is not set, try to fill the page.
2347 struct page *read_cache_page(struct address_space *mapping,
2348 unsigned long index,
2349 int (*filler)(void *,struct page*),
2350 void *data)
2352 struct page *page = __read_cache_page(mapping, index, filler, data);
2353 int err;
2355 if (IS_ERR(page) || Page_Uptodate(page))
2356 goto out;
2358 lock_page(page);
2359 if (Page_Uptodate(page)) {
2360 UnlockPage(page);
2361 goto out;
2363 err = filler(data, page);
2364 if (err < 0) {
2365 page_cache_release(page);
2366 page = ERR_PTR(err);
2368 out:
2369 return page;
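/*
 * Illustrative sketch only: a typical caller of read_cache_page() in a
 * filesystem that keeps directory contents in the page cache.  The
 * "examplefs_" names are assumptions; the filler simply invokes the
 * mapping's own readpage() method (the struct file argument is unused by
 * block-based readpage implementations).
 */
static int examplefs_dir_filler(void *data, struct page *page)
{
	struct address_space *mapping = (struct address_space *) data;
	return mapping->a_ops->readpage(NULL, page);
}

static struct page *examplefs_get_dir_page(struct inode *dir, unsigned long n)
{
	struct address_space *mapping = dir->i_mapping;
	struct page *page = read_cache_page(mapping, n,
					    examplefs_dir_filler, mapping);

	if (IS_ERR(page))
		return page;
	wait_on_page(page);	/* the filler may have started async I/O */
	if (!Page_Uptodate(page)) {
		page_cache_release(page);
		return ERR_PTR(-EIO);
	}
	return page;
}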
2372 static inline struct page * __grab_cache_page(struct address_space *mapping,
2373 unsigned long index, struct page **cached_page)
2375 struct page *page, **hash = page_hash(mapping, index);
2376 repeat:
2377 page = __find_lock_page(mapping, index, hash);
2378 if (!page) {
2379 if (!*cached_page) {
2380 *cached_page = page_cache_alloc();
2381 if (!*cached_page)
2382 return NULL;
2384 page = *cached_page;
2385 if (add_to_page_cache_unique(page, mapping, index, hash))
2386 goto repeat;
2387 *cached_page = NULL;
2389 return page;
2393 * Returns locked page at given index in given cache, creating it if needed.
2396 struct page *grab_cache_page(struct address_space *mapping, unsigned long index)
2398 struct page *cached_page = NULL;
2399 struct page *page = __grab_cache_page(mapping,index,&cached_page);
2400 if (cached_page)
2401 page_cache_free(cached_page);
2402 return page;
2405 static inline void remove_suid(struct inode *inode)
2407 unsigned int mode;
2409 /* set S_ISGID if S_IXGRP is set, and always set S_ISUID */
2410 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
2412 /* keep only those bits that are actually set on the inode */
2413 mode &= inode->i_mode;
2414 if (mode && !capable(CAP_FSETID)) {
2415 inode->i_mode &= ~mode;
2416 mark_inode_dirty(inode);
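/*
 * Illustrative sketch only: the bit arithmetic above written out the long
 * way.  (S_ISGID/S_IXGRP is exactly the factor that shifts the group-exec
 * bit up into the setgid position, so the product selects S_ISGID only when
 * S_IXGRP is set; S_ISGID without S_IXGRP marks mandatory locking and is
 * deliberately left alone.)  The helper name is an assumption.
 */
static inline unsigned int suid_bits_to_clear(unsigned int i_mode)
{
	unsigned int kill = S_ISUID;		/* always consider setuid    */

	if (i_mode & S_IXGRP)
		kill |= S_ISGID;		/* setgid only if group-exec */
	return kill & i_mode;			/* only bits actually set    */
}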
2421 * Write to a file through the page cache.
2423 * We currently put everything into the page cache prior to writing it.
2424 * This is not a problem when writing full pages. With partial pages,
2425 * however, we first have to read the data into the cache, then
2426 * dirty the page, and finally schedule it for writing. Alternatively, we
2427 * could write-through just the portion of data that would go into that
2428 * page, but that would kill performance for applications that write data
2429 * line by line, and it's prone to race conditions.
2431 * Note that this routine doesn't try to keep track of dirty pages. Each
2432 * file system has to do this all by itself, unfortunately.
2433 * okir@monad.swb.de
2435 ssize_t
2436 generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
2438 struct inode *inode = file->f_dentry->d_inode;
2439 struct address_space *mapping = inode->i_mapping;
2440 unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
2441 loff_t pos;
2442 struct page *page, *cached_page;
2443 unsigned long written;
2444 long status;
2445 int err;
2447 cached_page = NULL;
2449 down(&inode->i_sem);
2451 pos = *ppos;
2452 err = -EINVAL;
2453 if (pos < 0)
2454 goto out;
2456 err = file->f_error;
2457 if (err) {
2458 file->f_error = 0;
2459 goto out;
2462 written = 0;
2464 if (file->f_flags & O_APPEND)
2465 pos = inode->i_size;
2468 * Check whether we've reached the file size limit.
2470 err = -EFBIG;
2471 if (limit != RLIM_INFINITY) {
2472 if (pos >= limit) {
2473 send_sig(SIGXFSZ, current, 0);
2474 goto out;
2476 if (count > limit - pos) {
2477 send_sig(SIGXFSZ, current, 0);
2478 count = limit - pos;
2482 status = 0;
2483 if (count) {
2484 remove_suid(inode);
2485 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
2486 mark_inode_dirty(inode);
2489 while (count) {
2490 unsigned long bytes, index, offset;
2491 char *kaddr;
2494 * Try to find the page in the cache. If it isn't there,
2495 * allocate a free page.
2497 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
2498 index = pos >> PAGE_CACHE_SHIFT;
2499 bytes = PAGE_CACHE_SIZE - offset;
2500 if (bytes > count)
2501 bytes = count;
2503 status = -ENOMEM; /* we'll assign it later anyway */
2504 page = __grab_cache_page(mapping, index, &cached_page);
2505 if (!page)
2506 break;
2508 /* We have exclusive IO access to the page.. */
2509 if (!PageLocked(page)) {
2510 PAGE_BUG(page);
2513 status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
2514 if (status)
2515 goto unlock;
2516 kaddr = (char*)page_address(page);
2517 status = copy_from_user(kaddr+offset, buf, bytes);
2518 if (status)
2519 goto fail_write;
2520 status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
2521 if (!status)
2522 status = bytes;
2524 if (status >= 0) {
2525 written += status;
2526 count -= status;
2527 pos += status;
2528 buf += status;
2530 unlock:
2531 /* Mark it unlocked again and drop the page.. */
2532 UnlockPage(page);
2533 page_cache_release(page);
2535 if (status < 0)
2536 break;
2538 *ppos = pos;
2540 if (cached_page)
2541 page_cache_free(cached_page);
2543 err = written ? written : status;
2544 out:
2545 up(&inode->i_sem);
2546 return err;
2547 fail_write:
2548 status = -EFAULT;
2549 ClearPageUptodate(page);
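	/*
	 * ->prepare_write() is expected to have kmapped the page (the
	 * block_prepare_write() helpers do); ->commit_write() would normally
	 * drop that mapping, but it was never reached on this path.
	 */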
2550 kunmap(page);
2551 goto unlock;
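/*
 * Illustrative sketch only: how a block-based filesystem might plug into the
 * write path above.  It reuses generic_file_write() as its write method and
 * supplies prepare_write()/commit_write() through the generic buffer-cache
 * helpers.  The "examplefs_" names and examplefs_get_block() are
 * assumptions; other methods are omitted.
 */
extern int examplefs_get_block(struct inode *, long, struct buffer_head *, int);

static int examplefs_prepare_write(struct file *file, struct page *page,
				   unsigned from, unsigned to)
{
	return block_prepare_write(page, from, to, examplefs_get_block);
}

static struct address_space_operations examplefs_aops = {
	sync_page:	block_sync_page,
	prepare_write:	examplefs_prepare_write,
	commit_write:	generic_commit_write,
	/* readpage, writepage, bmap, ... omitted */
};

static struct file_operations examplefs_file_ops = {
	read:		generic_file_read,
	write:		generic_file_write,	/* the loop above */
	mmap:		generic_file_mmap,
};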
2554 void __init page_cache_init(unsigned long mempages)
2556 unsigned long htable_size, order;
2558 htable_size = mempages;
2559 htable_size *= sizeof(struct page *);
2560 for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
2561 ;
2563 do {
2564 unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
2566 page_hash_bits = 0;
2567 while((tmp >>= 1UL) != 0UL)
2568 page_hash_bits++;
2570 page_hash_table = (struct page **)
2571 __get_free_pages(GFP_ATOMIC, order);
2572 } while(page_hash_table == NULL && --order > 0);
2574 printk("Page-cache hash table entries: %d (order: %lu, %lu bytes)\n",
2575 (1 << page_hash_bits), order, (PAGE_SIZE << order));
2576 if (!page_hash_table)
2577 panic("Failed to allocate page hash table\n");
2578 memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
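/*
 * Worked example of the sizing above, with assumed numbers (4 KB pages,
 * 4-byte pointers, mempages = 16384, i.e. a 64 MB machine):
 *
 *	htable_size    = 16384 * 4 = 65536 bytes
 *	order          = 4          (first order with PAGE_SIZE << order >= 65536)
 *	entries        = 65536 / 4 = 16384
 *	page_hash_bits = 14         (16384 == 1 << 14)
 *
 * so, assuming the order-4 allocation succeeds, the boot message reads
 * "Page-cache hash table entries: 16384 (order: 4, 65536 bytes)".
 */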