1 /*
2 * linux/mm/filemap.c
4 * Copyright (C) 1994-1999 Linus Torvalds
5 */
7 /*
8 * This file handles the generic file mmap semantics used by
9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem used to do this differently, for example)
12 #include <linux/malloc.h>
13 #include <linux/shm.h>
14 #include <linux/mman.h>
15 #include <linux/locks.h>
16 #include <linux/pagemap.h>
17 #include <linux/swap.h>
18 #include <linux/smp_lock.h>
19 #include <linux/blkdev.h>
20 #include <linux/file.h>
21 #include <linux/swapctl.h>
22 #include <linux/slab.h>
23 #include <linux/init.h>
24 #include <linux/mm.h>
26 #include <asm/pgalloc.h>
27 #include <asm/uaccess.h>
28 #include <asm/mman.h>
30 #include <linux/highmem.h>
33 * Shared mappings implemented 30.11.1994. It's not fully working yet,
34 * though.
36 * Shared mappings now work. 15.8.1995 Bruno.
38 * finished 'unifying' the page and buffer cache and SMP-threaded the
39 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
41 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
44 atomic_t page_cache_size = ATOMIC_INIT(0);
45 unsigned int page_hash_bits;
46 struct page **page_hash_table;
47 struct list_head lru_cache;
49 static spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
51 * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
52 * the pagemap_lru_lock held.
54 spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;
56 #define CLUSTER_PAGES (1 << page_cluster)
57 #define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster)
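/*
 * Illustrative sketch (not part of the original file): how the cluster
 * macros above round a page index down to a cluster boundary.  The
 * page_cluster value used here (4, i.e. 16 pages = 64k with 4K pages) is
 * only an assumption for the example; the kernel tunes it at boot.
 */
#include <stdio.h>

static unsigned int page_cluster_demo = 4;

#define DEMO_CLUSTER_PAGES      (1 << page_cluster_demo)
#define DEMO_CLUSTER_OFFSET(x)  (((x) >> page_cluster_demo) << page_cluster_demo)

int main(void)
{
        unsigned long offset = 37;      /* arbitrary page index */

        /* 37 >> 4 == 2, 2 << 4 == 32: page 37 belongs to the cluster starting at page 32 */
        printf("cluster pages: %d\n", DEMO_CLUSTER_PAGES);
        printf("cluster start for page %lu: %lu\n",
               offset, DEMO_CLUSTER_OFFSET(offset));
        return 0;
}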
59 void __add_page_to_hash_queue(struct page * page, struct page **p)
61 atomic_inc(&page_cache_size);
62 if((page->next_hash = *p) != NULL)
63 (*p)->pprev_hash = &page->next_hash;
64 *p = page;
65 page->pprev_hash = p;
66 if (page->buffers)
67 PAGE_BUG(page);
70 static inline void remove_page_from_hash_queue(struct page * page)
72 if(page->pprev_hash) {
73 if(page->next_hash)
74 page->next_hash->pprev_hash = page->pprev_hash;
75 *page->pprev_hash = page->next_hash;
76 page->pprev_hash = NULL;
78 atomic_dec(&page_cache_size);
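/*
 * Illustrative sketch (not part of the original file): the next_hash /
 * pprev_hash idiom used above, in a minimal standalone form.  "node" and
 * the function names are made up for the example; only the linking logic
 * mirrors __add_page_to_hash_queue()/remove_page_from_hash_queue().
 * Storing a pointer to the previous "next" pointer lets a node unlink
 * itself without knowing whether it sits at the bucket head.
 */
struct node {
        struct node *next_hash;
        struct node **pprev_hash;       /* points at whatever points at us */
};

static void demo_hash_add(struct node *n, struct node **bucket)
{
        if ((n->next_hash = *bucket) != NULL)
                (*bucket)->pprev_hash = &n->next_hash;
        *bucket = n;
        n->pprev_hash = bucket;
}

static void demo_hash_del(struct node *n)
{
        if (n->pprev_hash) {
                if (n->next_hash)
                        n->next_hash->pprev_hash = n->pprev_hash;
                *n->pprev_hash = n->next_hash;  /* works for head or middle of the chain */
                n->pprev_hash = NULL;
        }
}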
81 static inline int sync_page(struct page *page)
83 struct address_space *mapping = page->mapping;
85 if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
86 return mapping->a_ops->sync_page(page);
87 return 0;
91 * Remove a page from the page cache and free it. Caller has to make
92 * sure the page is locked and that nobody else uses it - or that usage
93 * is safe.
95 static inline void __remove_inode_page(struct page *page)
97 remove_page_from_inode_queue(page);
98 remove_page_from_hash_queue(page);
99 page->mapping = NULL;
102 void remove_inode_page(struct page *page)
104 if (!PageLocked(page))
105 PAGE_BUG(page);
107 spin_lock(&pagecache_lock);
108 __remove_inode_page(page);
109 spin_unlock(&pagecache_lock);
113 * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
114 * @inode: the inode whose pages we want to invalidate
116 * This function only removes the unlocked pages; if you want to
117 * remove all the pages of one inode, you must call truncate_inode_pages.
120 void invalidate_inode_pages(struct inode * inode)
122 struct list_head *head, *curr;
123 struct page * page;
125 head = &inode->i_mapping->pages;
127 spin_lock(&pagecache_lock);
128 spin_lock(&pagemap_lru_lock);
129 curr = head->next;
131 while (curr != head) {
132 page = list_entry(curr, struct page, list);
133 curr = curr->next;
135 /* We cannot invalidate a locked page */
136 if (TryLockPage(page))
137 continue;
139 __lru_cache_del(page);
140 __remove_inode_page(page);
141 UnlockPage(page);
142 page_cache_release(page);
145 spin_unlock(&pagemap_lru_lock);
146 spin_unlock(&pagecache_lock);
150 * Truncate the page cache at a set offset, removing the pages
151 * that are beyond that offset (and zeroing out partial pages).
153 void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
155 struct list_head *head, *curr;
156 struct page * page;
157 unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
158 unsigned long start;
160 start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
162 repeat:
163 head = &mapping->pages;
164 spin_lock(&pagecache_lock);
165 curr = head->next;
166 while (curr != head) {
167 unsigned long offset;
169 page = list_entry(curr, struct page, list);
170 curr = curr->next;
172 offset = page->index;
174 /* page wholly truncated - free it */
175 if (offset >= start) {
176 if (TryLockPage(page)) {
177 page_cache_get(page);
178 spin_unlock(&pagecache_lock);
179 wait_on_page(page);
180 page_cache_release(page);
181 goto repeat;
183 page_cache_get(page);
184 spin_unlock(&pagecache_lock);
186 if (!page->buffers || block_flushpage(page, 0))
187 lru_cache_del(page);
190 * We remove the page from the page cache
191 * _after_ we have destroyed all buffer-cache
192 * references to it. Otherwise some other process
193 * might think this inode page is not in the
194 * page cache and creates a buffer-cache alias
195 * to it causing all sorts of fun problems ...
197 remove_inode_page(page);
198 ClearPageDirty(page);
200 UnlockPage(page);
201 page_cache_release(page);
202 page_cache_release(page);
205 * We have done things without the pagecache lock,
206 * so we'll have to repeat the scan.
207 * It's not possible to deadlock here because
208 * we are guaranteed to make progress. (ie. we have
209 * just removed a page)
211 goto repeat;
214 * there is only one partial page possible.
216 if (!partial)
217 continue;
219 /* and it's the one preceding the first wholly truncated page */
220 if ((offset + 1) != start)
221 continue;
223 /* partial truncate, clear end of page */
224 if (TryLockPage(page)) {
225 spin_unlock(&pagecache_lock);
226 goto repeat;
228 page_cache_get(page);
229 spin_unlock(&pagecache_lock);
231 memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
232 if (page->buffers)
233 block_flushpage(page, partial);
235 partial = 0;
238 * we have dropped the spinlock so we have to
239 * restart.
241 UnlockPage(page);
242 page_cache_release(page);
243 goto repeat;
245 spin_unlock(&pagecache_lock);
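/*
 * Illustrative sketch (not part of the original file): the "start" and
 * "partial" arithmetic at the top of truncate_inode_pages(), assuming a
 * 4K PAGE_CACHE_SIZE (PAGE_CACHE_SHIFT == 12).  The sample length is
 * arbitrary.
 */
#include <stdio.h>

#define DEMO_SHIFT 12
#define DEMO_SIZE  (1UL << DEMO_SHIFT)

int main(void)
{
        unsigned long long lstart = 10000;      /* new file size in bytes */
        unsigned partial = lstart & (DEMO_SIZE - 1);
        unsigned long start = (lstart + DEMO_SIZE - 1) >> DEMO_SHIFT;

        /* pages with index >= 3 are dropped; bytes 1808..4095 of page 2 are zeroed */
        printf("first wholly truncated page index: %lu\n", start);
        printf("bytes kept in the partial page:    %u\n", partial);
        return 0;
}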
249 * nr_dirty represents the number of dirty pages that we will write async
250 * before doing sync writes. We can only do sync writes if we can
251 * wait for IO (__GFP_IO set).
253 int shrink_mmap(int priority, int gfp_mask)
255 int ret = 0, count, nr_dirty;
256 struct list_head * page_lru;
257 struct page * page = NULL;
259 count = nr_lru_pages / (priority + 1);
260 nr_dirty = priority;
262 /* we need pagemap_lru_lock for list_del() ... subtle code below */
263 spin_lock(&pagemap_lru_lock);
264 while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
265 page = list_entry(page_lru, struct page, lru);
266 list_del(page_lru);
268 if (PageTestandClearReferenced(page))
269 goto dispose_continue;
271 count--;
273 * Avoid unscalable SMP locking for pages we can
274 * immediately tell are untouchable..
276 if (!page->buffers && page_count(page) > 1)
277 goto dispose_continue;
279 if (TryLockPage(page))
280 goto dispose_continue;
282 /* Release the pagemap_lru lock even if the page is not yet
283 queued in any lru queue since we have just locked down
284 the page so nobody else may SMP race with us running
285 a lru_cache_del() (lru_cache_del() always runs with the
286 page locked down ;). */
287 spin_unlock(&pagemap_lru_lock);
289 /* avoid freeing the page while it's locked */
290 page_cache_get(page);
293 * Is it a buffer page? Try to clean it up regardless
294 * of zone - it's old.
296 if (page->buffers) {
297 int wait;
299 * 0 - free it if can do so without IO
300 * 1 - start write-out of dirty buffers
301 * 2 - wait for locked buffers
303 wait = (gfp_mask & __GFP_IO) ? (nr_dirty-- < 0) ? 2 : 1 : 0;
304 if (!try_to_free_buffers(page, wait))
305 goto unlock_continue;
306 /* page was locked, inode can't go away under us */
307 if (!page->mapping) {
308 atomic_dec(&buffermem_pages);
309 goto made_buffer_progress;
313 /* Hold the pagecache_lock spinlock to prevent
314 other tasks from noticing the page while we are looking at its
315 page count. If it's a pagecache-page we'll free it
316 in one atomic transaction after checking its page count. */
317 spin_lock(&pagecache_lock);
320 * We can't free pages unless there's just one user
321 * (count == 2 because we added one ourselves above).
323 if (page_count(page) != 2)
324 goto cache_unlock_continue;
327 * Is it a swap cache page? If so, we want to
328 * drop it if it is no longer used, even if it
329 * were to be marked referenced..
331 if (PageSwapCache(page)) {
332 spin_unlock(&pagecache_lock);
333 __delete_from_swap_cache(page);
334 goto made_inode_progress;
338 * Page is from a zone we don't care about.
339 * Don't drop page cache entries in vain.
341 if (page->zone->free_pages > page->zone->pages_high)
342 goto cache_unlock_continue;
344 /* is it a page-cache page? */
345 if (page->mapping) {
346 if (!PageDirty(page) && !pgcache_under_min()) {
347 __remove_inode_page(page);
348 spin_unlock(&pagecache_lock);
349 goto made_inode_progress;
351 goto cache_unlock_continue;
354 printk(KERN_ERR "shrink_mmap: unknown LRU page!\n");
356 cache_unlock_continue:
357 spin_unlock(&pagecache_lock);
358 unlock_continue:
359 spin_lock(&pagemap_lru_lock);
360 UnlockPage(page);
361 page_cache_release(page);
362 dispose_continue:
363 list_add(page_lru, &lru_cache);
365 goto out;
367 made_inode_progress:
368 page_cache_release(page);
369 made_buffer_progress:
370 UnlockPage(page);
371 page_cache_release(page);
372 ret = 1;
373 spin_lock(&pagemap_lru_lock);
374 /* nr_lru_pages needs the spinlock */
375 nr_lru_pages--;
377 out:
378 spin_unlock(&pagemap_lru_lock);
380 return ret;
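/*
 * Illustrative sketch (not part of the original file): how the "priority"
 * argument of shrink_mmap() above scales the amount of LRU scanning.  The
 * nr_lru_pages value is made up for the example.
 */
#include <stdio.h>

int main(void)
{
        unsigned long nr_lru_pages = 70000;     /* assumed LRU size */
        int priority;

        /* priority 6 (little memory pressure) scans 1/7 of the list,
         * priority 0 (desperate) scans all of it */
        for (priority = 6; priority >= 0; priority--)
                printf("priority %d: scan up to %lu pages\n",
                       priority, nr_lru_pages / (priority + 1));
        return 0;
}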
383 static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
385 goto inside;
387 for (;;) {
388 page = page->next_hash;
389 inside:
390 if (!page)
391 goto not_found;
392 if (page->mapping != mapping)
393 continue;
394 if (page->index == offset)
395 break;
397 SetPageReferenced(page);
398 not_found:
399 return page;
403 * By the time this is called, the page is locked and
404 * we don't have to worry about any races any more.
406 * Start the IO..
408 static int writeout_one_page(struct page *page)
410 struct buffer_head *bh, *head = page->buffers;
412 bh = head;
413 do {
414 if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
415 continue;
417 bh->b_flushtime = jiffies;
418 ll_rw_block(WRITE, 1, &bh);
419 } while ((bh = bh->b_this_page) != head);
420 return 0;
423 static int waitfor_one_page(struct page *page)
425 int error = 0;
426 struct buffer_head *bh, *head = page->buffers;
428 bh = head;
429 do {
430 wait_on_buffer(bh);
431 if (buffer_req(bh) && !buffer_uptodate(bh))
432 error = -EIO;
433 } while ((bh = bh->b_this_page) != head);
434 return error;
437 static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end, int (*fn)(struct page *))
439 struct list_head *head, *curr;
440 struct page *page;
441 int retval = 0;
443 head = &inode->i_mapping->pages;
445 spin_lock(&pagecache_lock);
446 curr = head->next;
447 while (curr != head) {
448 page = list_entry(curr, struct page, list);
449 curr = curr->next;
450 if (!page->buffers)
451 continue;
452 if (page->index >= end)
453 continue;
454 if (page->index < start)
455 continue;
457 page_cache_get(page);
458 spin_unlock(&pagecache_lock);
459 lock_page(page);
461 /* The buffers could have been free'd while we waited for the page lock */
462 if (page->buffers)
463 retval |= fn(page);
465 UnlockPage(page);
466 spin_lock(&pagecache_lock);
467 curr = page->list.next;
468 page_cache_release(page);
470 spin_unlock(&pagecache_lock);
472 return retval;
476 * Two-stage data sync: first start the IO, then go back and
477 * collect the information..
479 int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
481 int retval;
483 retval = do_buffer_fdatasync(inode, start_idx, end_idx, writeout_one_page);
484 retval |= do_buffer_fdatasync(inode, start_idx, end_idx, waitfor_one_page);
485 return retval;
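/*
 * Illustrative sketch (not part of the original file): how a filesystem's
 * fsync method might use generic_buffer_fdatasync() above to flush a byte
 * range.  "example_fsync_range" and its byte-range arguments are
 * hypothetical; the point is only the conversion from byte offsets to
 * page-cache indices.
 */
static int example_fsync_range(struct inode *inode, loff_t start, loff_t end)
{
        unsigned long start_idx = start >> PAGE_CACHE_SHIFT;
        unsigned long end_idx = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

        /* first pass starts the writes, second pass waits and collects errors */
        return generic_buffer_fdatasync(inode, start_idx, end_idx);
}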
489 * Add a page to the inode page cache.
491 * The caller must have locked the page and
492 * set all the page flags correctly..
494 void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
496 if (!PageLocked(page))
497 BUG();
499 page_cache_get(page);
500 spin_lock(&pagecache_lock);
501 page->index = index;
502 add_page_to_inode_queue(mapping, page);
503 __add_page_to_hash_queue(page, page_hash(mapping, index));
504 lru_cache_add(page);
505 spin_unlock(&pagecache_lock);
509 * This adds a page to the page cache, starting out as locked,
510 * owned by us, but unreferenced, not uptodate and with no errors.
512 static inline void __add_to_page_cache(struct page * page,
513 struct address_space *mapping, unsigned long offset,
514 struct page **hash)
516 unsigned long flags;
518 if (PageLocked(page))
519 BUG();
521 flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced));
522 page->flags = flags | (1 << PG_locked);
523 page_cache_get(page);
524 page->index = offset;
525 add_page_to_inode_queue(mapping, page);
526 __add_page_to_hash_queue(page, hash);
527 lru_cache_add(page);
530 void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
532 spin_lock(&pagecache_lock);
533 __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
534 spin_unlock(&pagecache_lock);
537 static int add_to_page_cache_unique(struct page * page,
538 struct address_space *mapping, unsigned long offset,
539 struct page **hash)
541 int err;
542 struct page *alias;
544 spin_lock(&pagecache_lock);
545 alias = __find_page_nolock(mapping, offset, *hash);
547 err = 1;
548 if (!alias) {
549 __add_to_page_cache(page,mapping,offset,hash);
550 err = 0;
553 spin_unlock(&pagecache_lock);
554 return err;
558 * This adds the requested page to the page cache if it isn't already there,
559 * and schedules an I/O to read in its contents from disk.
561 static inline int page_cache_read(struct file * file, unsigned long offset)
563 struct inode *inode = file->f_dentry->d_inode;
564 struct address_space *mapping = inode->i_mapping;
565 struct page **hash = page_hash(mapping, offset);
566 struct page *page;
568 spin_lock(&pagecache_lock);
569 page = __find_page_nolock(mapping, offset, *hash);
570 spin_unlock(&pagecache_lock);
571 if (page)
572 return 0;
574 page = page_cache_alloc();
575 if (!page)
576 return -ENOMEM;
578 if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
579 int error = mapping->a_ops->readpage(file, page);
580 page_cache_release(page);
581 return error;
584 * We arrive here in the unlikely event that someone
585 * raced with us and added our page to the cache first.
587 page_cache_free(page);
588 return 0;
592 * Read in an entire cluster at once. A cluster is usually a 64k-
593 * aligned block that includes the page requested in "offset."
595 static int read_cluster_nonblocking(struct file * file, unsigned long offset,
596 unsigned long filesize)
598 unsigned long pages = CLUSTER_PAGES;
600 offset = CLUSTER_OFFSET(offset);
601 while ((pages-- > 0) && (offset < filesize)) {
602 int error = page_cache_read(file, offset);
603 if (error < 0)
604 return error;
605 offset ++;
608 return 0;
612 * Wait for a page to get unlocked.
614 * This must be called with the caller "holding" the page,
615 * ie with increased "page->count" so that the page won't
616 * go away during the wait..
618 void ___wait_on_page(struct page *page)
620 struct task_struct *tsk = current;
621 DECLARE_WAITQUEUE(wait, tsk);
623 add_wait_queue(&page->wait, &wait);
624 do {
625 sync_page(page);
626 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
627 if (!PageLocked(page))
628 break;
629 schedule();
630 } while (PageLocked(page));
631 tsk->state = TASK_RUNNING;
632 remove_wait_queue(&page->wait, &wait);
636 * Get an exclusive lock on the page..
638 void lock_page(struct page *page)
640 while (TryLockPage(page))
641 ___wait_on_page(page);
646 * a rather lightweight function, finding and getting a reference to a
647 * hashed page atomically, waiting for it if it's locked.
649 struct page * __find_get_page (struct address_space *mapping,
650 unsigned long offset, struct page **hash)
652 struct page *page;
655 * We scan the hash list read-only. Addition to and removal from
656 * the hash-list needs a held write-lock.
658 repeat:
659 spin_lock(&pagecache_lock);
660 page = __find_page_nolock(mapping, offset, *hash);
661 if (page)
662 page_cache_get(page);
663 spin_unlock(&pagecache_lock);
665 /* Found the page, sleep if locked. */
666 if (page && PageLocked(page)) {
667 struct task_struct *tsk = current;
668 DECLARE_WAITQUEUE(wait, tsk);
670 sync_page(page);
672 __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
673 add_wait_queue(&page->wait, &wait);
675 if (PageLocked(page))
676 schedule();
677 __set_task_state(tsk, TASK_RUNNING);
678 remove_wait_queue(&page->wait, &wait);
681 * The page might have been unhashed meanwhile. It's
682 * not freed though because we hold a reference to it.
683 * If this is the case then it will be freed _here_,
684 * and we recheck the hash anyway.
686 page_cache_release(page);
687 goto repeat;
690 * It's not locked so we can return the page and we hold
691 * a reference to it.
693 return page;
697 * Get the lock to a page atomically.
699 struct page * __find_lock_page (struct address_space *mapping,
700 unsigned long offset, struct page **hash)
702 struct page *page;
705 * We scan the hash list read-only. Addition to and removal from
706 * the hash-list needs a held write-lock.
708 repeat:
709 spin_lock(&pagecache_lock);
710 page = __find_page_nolock(mapping, offset, *hash);
711 if (page)
712 page_cache_get(page);
713 spin_unlock(&pagecache_lock);
715 /* Found the page, sleep if locked. */
716 if (page && TryLockPage(page)) {
717 struct task_struct *tsk = current;
718 DECLARE_WAITQUEUE(wait, tsk);
720 sync_page(page);
722 __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
723 add_wait_queue(&page->wait, &wait);
725 if (PageLocked(page))
726 schedule();
727 __set_task_state(tsk, TASK_RUNNING);
728 remove_wait_queue(&page->wait, &wait);
731 * The page might have been unhashed meanwhile. It's
732 * not freed though because we hold a reference to it.
733 * If this is the case then it will be freed _here_,
734 * and we recheck the hash anyway.
736 page_cache_release(page);
737 goto repeat;
740 * It's not locked so we can return the page and we hold
741 * a reference to it.
743 return page;
746 #if 0
747 #define PROFILE_READAHEAD
748 #define DEBUG_READAHEAD
749 #endif
752 * Read-ahead profiling information
753 * --------------------------------
754 * Every PROFILE_MAXREADCOUNT reads, the following information is written
755 * to the syslog:
756 * Percentage of asynchronous read-ahead.
757 * Average values of the read-ahead context fields.
758 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
759 * to the syslog.
762 #ifdef PROFILE_READAHEAD
764 #define PROFILE_MAXREADCOUNT 1000
766 static unsigned long total_reada;
767 static unsigned long total_async;
768 static unsigned long total_ramax;
769 static unsigned long total_ralen;
770 static unsigned long total_rawin;
772 static void profile_readahead(int async, struct file *filp)
774 unsigned long flags;
776 ++total_reada;
777 if (async)
778 ++total_async;
780 total_ramax += filp->f_ramax;
781 total_ralen += filp->f_ralen;
782 total_rawin += filp->f_rawin;
784 if (total_reada > PROFILE_MAXREADCOUNT) {
785 save_flags(flags);
786 cli();
787 if (!(total_reada > PROFILE_MAXREADCOUNT)) {
788 restore_flags(flags);
789 return;
792 printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
793 total_ramax/total_reada,
794 total_ralen/total_reada,
795 total_rawin/total_reada,
796 (total_async*100)/total_reada);
797 #ifdef DEBUG_READAHEAD
798 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
799 filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
800 #endif
802 total_reada = 0;
803 total_async = 0;
804 total_ramax = 0;
805 total_ralen = 0;
806 total_rawin = 0;
808 restore_flags(flags);
811 #endif /* defined PROFILE_READAHEAD */
814 * Read-ahead context:
815 * -------------------
816 * The read ahead context fields of the "struct file" are the following:
817 * - f_raend : position of the first byte after the last page we tried to
818 * read ahead.
819 * - f_ramax : current read-ahead maximum size.
820 * - f_ralen : length of the current IO read block we tried to read-ahead.
821 * - f_rawin : length of the current read-ahead window.
822 * if last read-ahead was synchronous then
823 * f_rawin = f_ralen
824 * otherwise (was asynchronous)
825 * f_rawin = previous value of f_ralen + f_ralen
827 * Read-ahead limits:
828 * ------------------
829 * MIN_READAHEAD : minimum read-ahead size when reading ahead.
830 * MAX_READAHEAD : maximum read-ahead size when reading ahead.
832 * Synchronous read-ahead benefits:
833 * --------------------------------
834 * Using a reasonable IO transfer length from peripheral devices increases system
835 * performance.
836 * Reasonable means, in this context, not too large but not too small.
837 * The actual maximum value is:
838 * MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
839 * and 32K if defined (4K page size assumed).
841 * Asynchronous read-ahead benefits:
842 * ---------------------------------
843 * Overlapping the next read request with user process execution increases system
844 * performance.
846 * Read-ahead risks:
847 * -----------------
848 * We have to guess which further data are needed by the user process.
849 * If these data are often not really needed, it's bad for system
850 * performance.
851 * However, we know that files are often accessed sequentially by
852 * application programs, and it seems possible to have a reasonably good
853 * strategy for that guessing.
854 * We only try to read ahead in files that seem to be read sequentially.
856 * Asynchronous read-ahead risks:
857 * ------------------------------
858 * In order to maximize overlapping, we must start some asynchronous read
859 * request from the device, as soon as possible.
860 * We must be very careful about:
861 * - The number of effective pending IO read requests.
862 * ONE seems to be the only reasonable value.
863 * - The total memory pool usage for the file access stream.
864 * This maximum memory usage is implicitly 2 IO read chunks:
865 * 2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
866 * 64k if defined (4K page size assumed).
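/*
 * Illustrative sketch (not part of the original file): the window-doubling
 * part of the heuristic described above, in isolation.  Starting from a
 * small window, f_ramax doubles after each successful read-ahead burst and
 * is clamped to the per-device maximum.  The starting value and the maximum
 * used here are assumptions for the example only.
 */
#include <stdio.h>

int main(void)
{
        unsigned long f_ramax = 2;              /* assumed initial window, in pages */
        unsigned long max_readahead = 31;       /* assumed per-device cap, in pages */
        int burst;

        for (burst = 0; burst < 6; burst++) {
                printf("burst %d: read-ahead window = %lu pages\n", burst, f_ramax);
                f_ramax += f_ramax;             /* mirrors "filp->f_ramax += filp->f_ramax" */
                if (f_ramax > max_readahead)
                        f_ramax = max_readahead;
        }
        return 0;
}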
869 static inline int get_max_readahead(struct inode * inode)
871 if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
872 return MAX_READAHEAD;
873 return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
876 static void generic_file_readahead(int reada_ok,
877 struct file * filp, struct inode * inode,
878 struct page * page)
880 unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
881 unsigned long index = page->index;
882 unsigned long max_ahead, ahead;
883 unsigned long raend;
884 int max_readahead = get_max_readahead(inode);
886 raend = filp->f_raend;
887 max_ahead = 0;
890 * The current page is locked.
891 * If the current position is inside the previous read IO request, do not
892 * try to reread previously read ahead pages.
893 * Otherwise decide whether or not to read ahead some pages synchronously.
894 * If we are not going to read ahead, set the read ahead context for this
895 * page only.
897 if (PageLocked(page)) {
898 if (!filp->f_ralen || index >= raend || index + filp->f_rawin < raend) {
899 raend = index;
900 if (raend < end_index)
901 max_ahead = filp->f_ramax;
902 filp->f_rawin = 0;
903 filp->f_ralen = 1;
904 if (!max_ahead) {
905 filp->f_raend = index + filp->f_ralen;
906 filp->f_rawin += filp->f_ralen;
911 * The current page is not locked.
912 * If we were reading ahead and,
913 * if the current max read ahead size is not zero and,
914 * if the current position is inside the last read-ahead IO request,
915 * it is the moment to try to read ahead asynchronously.
916 * We will later force an unplug of the device in order to start the asynchronous read IO.
918 else if (reada_ok && filp->f_ramax && raend >= 1 &&
919 index <= raend && index + filp->f_ralen >= raend) {
921 * Add ONE page to max_ahead in order to try to have about the same IO max size
922 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
923 * Compute the position of the last page we have tried to read in order to
924 * begin to read ahead just at the next page.
926 raend -= 1;
927 if (raend < end_index)
928 max_ahead = filp->f_ramax + 1;
930 if (max_ahead) {
931 filp->f_rawin = filp->f_ralen;
932 filp->f_ralen = 0;
933 reada_ok = 2;
937 * Try to read ahead pages.
938 * We hope that ll_rw_blk() plug/unplug, coalescing, request sorting and the
939 * scheduler will do a good enough job for us to avoid overly bad actual IO requests.
941 ahead = 0;
942 while (ahead < max_ahead) {
943 ahead ++;
944 if ((raend + ahead) >= end_index)
945 break;
946 if (page_cache_read(filp, raend + ahead) < 0)
947 break;
950 * If we tried to read ahead some pages, and
951 * if we tried to read ahead asynchronously,
952 * try to force an unplug of the device in order to start the asynchronous
953 * read IO request.
954 * Update the read-ahead context:
955 * store the length of the current read-ahead window and
956 * double the current max read ahead size.
957 * This heuristic avoids doing large IO for files that are not really
958 * accessed sequentially.
960 if (ahead) {
961 if (reada_ok == 2) {
962 run_task_queue(&tq_disk);
965 filp->f_ralen += ahead;
966 filp->f_rawin += filp->f_ralen;
967 filp->f_raend = raend + ahead + 1;
969 filp->f_ramax += filp->f_ramax;
971 if (filp->f_ramax > max_readahead)
972 filp->f_ramax = max_readahead;
974 #ifdef PROFILE_READAHEAD
975 profile_readahead((reada_ok == 2), filp);
976 #endif
979 return;
984 * This is a generic file read routine, and uses the
985 * inode->i_op->readpage() function for the actual low-level
986 * stuff.
988 * This is really ugly. But the goto's actually try to clarify some
989 * of the logic when it comes to error handling etc.
991 void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
993 struct inode *inode = filp->f_dentry->d_inode;
994 struct address_space *mapping = inode->i_mapping;
995 unsigned long index, offset;
996 struct page *cached_page;
997 int reada_ok;
998 int error;
999 int max_readahead = get_max_readahead(inode);
1001 cached_page = NULL;
1002 index = *ppos >> PAGE_CACHE_SHIFT;
1003 offset = *ppos & ~PAGE_CACHE_MASK;
1006 * If the current position is outside the previous read-ahead window,
1007 * we reset the current read-ahead context and set read ahead max to zero
1008 * (it will be set to just the value needed later);
1009 * otherwise, we assume that the file accesses are sequential enough to
1010 * continue read-ahead.
1012 if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {
1013 reada_ok = 0;
1014 filp->f_raend = 0;
1015 filp->f_ralen = 0;
1016 filp->f_ramax = 0;
1017 filp->f_rawin = 0;
1018 } else {
1019 reada_ok = 1;
1022 * Adjust the current value of read-ahead max.
1023 * If the read operation stays within the first half page, force no readahead.
1024 * Otherwise try to increase read ahead max just enough to do the read request,
1025 * then use at least MIN_READAHEAD if read-ahead is ok,
1026 * and at most MAX_READAHEAD in all cases.
1028 if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
1029 filp->f_ramax = 0;
1030 } else {
1031 unsigned long needed;
1033 needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;
1035 if (filp->f_ramax < needed)
1036 filp->f_ramax = needed;
1038 if (reada_ok && filp->f_ramax < MIN_READAHEAD)
1039 filp->f_ramax = MIN_READAHEAD;
1040 if (filp->f_ramax > max_readahead)
1041 filp->f_ramax = max_readahead;
1044 for (;;) {
1045 struct page *page, **hash;
1046 unsigned long end_index, nr;
1048 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1049 if (index > end_index)
1050 break;
1051 nr = PAGE_CACHE_SIZE;
1052 if (index == end_index) {
1053 nr = inode->i_size & ~PAGE_CACHE_MASK;
1054 if (nr <= offset)
1055 break;
1058 nr = nr - offset;
1061 * Try to find the data in the page cache..
1063 hash = page_hash(mapping, index);
1065 spin_lock(&pagecache_lock);
1066 page = __find_page_nolock(mapping, index, *hash);
1067 if (!page)
1068 goto no_cached_page;
1069 found_page:
1070 page_cache_get(page);
1071 spin_unlock(&pagecache_lock);
1073 if (!Page_Uptodate(page))
1074 goto page_not_up_to_date;
1075 generic_file_readahead(reada_ok, filp, inode, page);
1076 page_ok:
1077 /* If users can be writing to this page using arbitrary
1078 * virtual addresses, take care about potential aliasing
1079 * before reading the page on the kernel side.
1081 if (page->mapping->i_mmap_shared != NULL)
1082 flush_dcache_page(page);
1085 * Ok, we have the page, and it's up-to-date, so
1086 * now we can copy it to user space...
1088 * The actor routine returns how many bytes were actually used..
1089 * NOTE! This may not be the same as how much of a user buffer
1090 * we filled up (we may be padding etc), so we can only update
1091 * "pos" here (the actor routine has to update the user buffer
1092 * pointers and the remaining count).
1094 nr = actor(desc, page, offset, nr);
1095 offset += nr;
1096 index += offset >> PAGE_CACHE_SHIFT;
1097 offset &= ~PAGE_CACHE_MASK;
1099 page_cache_release(page);
1100 if (nr && desc->count)
1101 continue;
1102 break;
1105 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
1107 page_not_up_to_date:
1108 generic_file_readahead(reada_ok, filp, inode, page);
1110 if (Page_Uptodate(page))
1111 goto page_ok;
1113 /* Get exclusive access to the page ... */
1114 lock_page(page);
1115 if (Page_Uptodate(page)) {
1116 UnlockPage(page);
1117 goto page_ok;
1120 readpage:
1121 /* ... and start the actual read. The read will unlock the page. */
1122 error = mapping->a_ops->readpage(filp, page);
1124 if (!error) {
1125 if (Page_Uptodate(page))
1126 goto page_ok;
1128 /* Again, try some read-ahead while waiting for the page to finish.. */
1129 generic_file_readahead(reada_ok, filp, inode, page);
1130 wait_on_page(page);
1131 if (Page_Uptodate(page))
1132 goto page_ok;
1133 error = -EIO;
1136 /* UHHUH! A synchronous read error occurred. Report it */
1137 desc->error = error;
1138 page_cache_release(page);
1139 break;
1141 no_cached_page:
1143 * Ok, it wasn't cached, so we need to create a new
1144 * page..
1146 * We get here with the page cache lock held.
1148 if (!cached_page) {
1149 spin_unlock(&pagecache_lock);
1150 cached_page = page_cache_alloc();
1151 if (!cached_page) {
1152 desc->error = -ENOMEM;
1153 break;
1157 * Somebody may have added the page while we
1158 * dropped the page cache lock. Check for that.
1160 spin_lock(&pagecache_lock);
1161 page = __find_page_nolock(mapping, index, *hash);
1162 if (page)
1163 goto found_page;
1167 * Ok, add the new page to the hash-queues...
1169 page = cached_page;
1170 __add_to_page_cache(page, mapping, index, hash);
1171 spin_unlock(&pagecache_lock);
1172 cached_page = NULL;
1174 goto readpage;
1177 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1178 filp->f_reada = 1;
1179 if (cached_page)
1180 page_cache_free(cached_page);
1181 UPDATE_ATIME(inode);
1184 static int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1186 unsigned long kaddr;
1187 unsigned long left, count = desc->count;
1189 if (size > count)
1190 size = count;
1192 kaddr = kmap(page);
1193 left = __copy_to_user(desc->buf, (void *)(kaddr + offset), size);
1194 kunmap(page);
1196 if (left) {
1197 size -= left;
1198 desc->error = -EFAULT;
1200 desc->count = count - size;
1201 desc->written += size;
1202 desc->buf += size;
1203 return size;
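/*
 * Illustrative sketch (not part of the original file): the read_descriptor_t
 * "actor" interface lets callers reuse do_generic_file_read() for things
 * other than copying into a user buffer.  This hypothetical actor just
 * accounts for the bytes without looking at them; desc->buf is unused.
 * file_read_actor() above and file_send_actor() below are the two real
 * users in this file.
 */
static int example_count_actor(read_descriptor_t *desc, struct page *page,
                               unsigned long offset, unsigned long size)
{
        unsigned long count = desc->count;

        if (size > count)
                size = count;
        /* pretend we consumed the data */
        desc->count = count - size;
        desc->written += size;
        return size;
}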
1207 * This is the "read()" routine for all filesystems
1208 * that can use the page cache directly.
1210 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1212 ssize_t retval;
1214 retval = -EFAULT;
1215 if (access_ok(VERIFY_WRITE, buf, count)) {
1216 retval = 0;
1218 if (count) {
1219 read_descriptor_t desc;
1221 desc.written = 0;
1222 desc.count = count;
1223 desc.buf = buf;
1224 desc.error = 0;
1225 do_generic_file_read(filp, ppos, &desc, file_read_actor);
1227 retval = desc.written;
1228 if (!retval)
1229 retval = desc.error;
1232 return retval;
1235 static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
1237 unsigned long kaddr;
1238 ssize_t written;
1239 unsigned long count = desc->count;
1240 struct file *file = (struct file *) desc->buf;
1241 mm_segment_t old_fs;
1243 if (size > count)
1244 size = count;
1245 old_fs = get_fs();
1246 set_fs(KERNEL_DS);
1248 kaddr = kmap(page);
1249 written = file->f_op->write(file, (char *)kaddr + offset,
1250 size, &file->f_pos);
1251 kunmap(page);
1252 set_fs(old_fs);
1253 if (written < 0) {
1254 desc->error = written;
1255 written = 0;
1257 desc->count = count - written;
1258 desc->written += written;
1259 return written;
1262 asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
1264 ssize_t retval;
1265 struct file * in_file, * out_file;
1266 struct inode * in_inode, * out_inode;
1269 * Get input file, and verify that it is ok..
1271 retval = -EBADF;
1272 in_file = fget(in_fd);
1273 if (!in_file)
1274 goto out;
1275 if (!(in_file->f_mode & FMODE_READ))
1276 goto fput_in;
1277 retval = -EINVAL;
1278 in_inode = in_file->f_dentry->d_inode;
1279 if (!in_inode)
1280 goto fput_in;
1281 if (!in_inode->i_mapping->a_ops->readpage)
1282 goto fput_in;
1283 retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
1284 if (retval)
1285 goto fput_in;
1288 * Get output file, and verify that it is ok..
1290 retval = -EBADF;
1291 out_file = fget(out_fd);
1292 if (!out_file)
1293 goto fput_in;
1294 if (!(out_file->f_mode & FMODE_WRITE))
1295 goto fput_out;
1296 retval = -EINVAL;
1297 if (!out_file->f_op || !out_file->f_op->write)
1298 goto fput_out;
1299 out_inode = out_file->f_dentry->d_inode;
1300 if (!out_inode)
1301 goto fput_out;
1302 retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
1303 if (retval)
1304 goto fput_out;
1306 retval = 0;
1307 if (count) {
1308 read_descriptor_t desc;
1309 loff_t pos = 0, *ppos;
1311 retval = -EFAULT;
1312 ppos = &in_file->f_pos;
1313 if (offset) {
1314 if (get_user(pos, offset))
1315 goto fput_out;
1316 ppos = &pos;
1319 desc.written = 0;
1320 desc.count = count;
1321 desc.buf = (char *) out_file;
1322 desc.error = 0;
1323 do_generic_file_read(in_file, ppos, &desc, file_send_actor);
1325 retval = desc.written;
1326 if (!retval)
1327 retval = desc.error;
1328 if (offset)
1329 put_user(pos, offset);
1332 fput_out:
1333 fput(out_file);
1334 fput_in:
1335 fput(in_file);
1336 out:
1337 return retval;
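/*
 * Illustrative sketch (not part of the original file): calling the
 * sendfile(2) system call implemented above from userspace.  The file
 * names are made up; error handling is minimal.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/sendfile.h>
#include <sys/stat.h>

int main(void)
{
        int in_fd = open("/tmp/source.dat", O_RDONLY);          /* hypothetical input */
        int out_fd = open("/tmp/copy.dat", O_WRONLY | O_CREAT, 0644);
        struct stat st;
        off_t offset = 0;

        if (in_fd < 0 || out_fd < 0 || fstat(in_fd, &st) < 0)
                return 1;

        /* the kernel reads pages of in_fd through the page cache and
         * feeds them to out_fd's write method, as in sys_sendfile() above */
        if (sendfile(out_fd, in_fd, &offset, st.st_size) < 0)
                perror("sendfile");

        close(in_fd);
        close(out_fd);
        return 0;
}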
1341 * Read-ahead and flush behind for MADV_SEQUENTIAL areas. Since we are
1342 * sure this is sequential access, we don't need a flexible read-ahead
1343 * window size -- we can always use a large fixed size window.
1345 static void nopage_sequential_readahead(struct vm_area_struct * vma,
1346 unsigned long pgoff, unsigned long filesize)
1348 unsigned long ra_window;
1350 ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
1351 ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);
1353 /* vm_raend is zero if we haven't read ahead in this area yet. */
1354 if (vma->vm_raend == 0)
1355 vma->vm_raend = vma->vm_pgoff + ra_window;
1358 * If we've just faulted the page half-way through our window,
1359 * then schedule reads for the next window, and release the
1360 * pages in the previous window.
1362 if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
1363 unsigned long start = vma->vm_pgoff + vma->vm_raend;
1364 unsigned long end = start + ra_window;
1366 if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
1367 end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;
1368 if (start > end)
1369 return;
1371 while ((start < end) && (start < filesize)) {
1372 if (read_cluster_nonblocking(vma->vm_file,
1373 start, filesize) < 0)
1374 break;
1375 start += CLUSTER_PAGES;
1377 run_task_queue(&tq_disk);
1379 /* if we're far enough past the beginning of this area,
1380 recycle pages that are in the previous window. */
1381 if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
1382 unsigned long window = ra_window << PAGE_SHIFT;
1384 end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
1385 end -= window + window;
1386 filemap_sync(vma, end - window, window, MS_INVALIDATE);
1389 vma->vm_raend += ra_window;
1392 return;
1396 * filemap_nopage() is invoked via the vma operations vector for a
1397 * mapped memory region to read in file data during a page fault.
1399 * The goto's are kind of ugly, but this streamlines the normal case of having
1400 * it in the page cache, and handles the special cases reasonably without
1401 * having a lot of duplicated code.
1403 struct page * filemap_nopage(struct vm_area_struct * area,
1404 unsigned long address, int no_share)
1406 int error;
1407 struct file *file = area->vm_file;
1408 struct inode *inode = file->f_dentry->d_inode;
1409 struct address_space *mapping = inode->i_mapping;
1410 struct page *page, **hash, *old_page;
1411 unsigned long size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1413 unsigned long pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1416 * Semantics for shared and private memory areas are different
1417 * past the end of the file. A shared mapping past the last page
1418 * of the file is an error and results in a SIGBUS, while a
1419 * private mapping just maps in a zero page.
1421 if ((pgoff >= size) && (area->vm_mm == current->mm))
1422 return NULL;
1425 * Do we have something in the page cache already?
1427 hash = page_hash(mapping, pgoff);
1428 retry_find:
1429 page = __find_get_page(mapping, pgoff, hash);
1430 if (!page)
1431 goto no_cached_page;
1434 * Ok, found a page in the page cache, now we need to check
1435 * that it's up-to-date.
1437 if (!Page_Uptodate(page))
1438 goto page_not_uptodate;
1440 success:
1442 * Try read-ahead for sequential areas.
1444 if (VM_SequentialReadHint(area))
1445 nopage_sequential_readahead(area, pgoff, size);
1448 * Found the page and have a reference on it, need to check sharing
1449 * and possibly copy it over to another page..
1451 old_page = page;
1452 if (no_share) {
1453 struct page *new_page = page_cache_alloc();
1455 if (new_page) {
1456 copy_user_highpage(new_page, old_page, address);
1457 flush_page_to_ram(new_page);
1458 } else
1459 new_page = NOPAGE_OOM;
1460 page_cache_release(page);
1461 return new_page;
1464 flush_page_to_ram(old_page);
1465 return old_page;
1467 no_cached_page:
1469 * If the requested offset is within our file, try to read a whole
1470 * cluster of pages at once.
1472 * Otherwise, we're off the end of a privately mapped file,
1473 * so we need to map a zero page.
1475 if ((pgoff < size) && !VM_RandomReadHint(area))
1476 error = read_cluster_nonblocking(file, pgoff, size);
1477 else
1478 error = page_cache_read(file, pgoff);
1481 * The page we want has now been added to the page cache.
1482 * In the unlikely event that someone removed it in the
1483 * meantime, we'll just come back here and read it again.
1485 if (error >= 0)
1486 goto retry_find;
1489 * An error return from page_cache_read can result if the
1490 * system is low on memory, or a problem occurs while trying
1491 * to schedule I/O.
1493 if (error == -ENOMEM)
1494 return NOPAGE_OOM;
1495 return NULL;
1497 page_not_uptodate:
1498 lock_page(page);
1499 if (Page_Uptodate(page)) {
1500 UnlockPage(page);
1501 goto success;
1504 if (!mapping->a_ops->readpage(file, page)) {
1505 wait_on_page(page);
1506 if (Page_Uptodate(page))
1507 goto success;
1511 * Umm, take care of errors if the page isn't up-to-date.
1512 * Try to re-read it _once_. We do this synchronously,
1513 * because there really aren't any performance issues here
1514 * and we need to check for errors.
1516 lock_page(page);
1517 if (Page_Uptodate(page)) {
1518 UnlockPage(page);
1519 goto success;
1521 ClearPageError(page);
1522 if (!mapping->a_ops->readpage(file, page)) {
1523 wait_on_page(page);
1524 if (Page_Uptodate(page))
1525 goto success;
1529 * Things didn't work out. Return zero to tell the
1530 * mm layer so, possibly freeing the page cache page first.
1532 page_cache_release(page);
1533 return NULL;
1536 static int filemap_write_page(struct file *file,
1537 struct page * page,
1538 int wait)
1541 * If a task terminates while we're swapping the page, the vma
1542 * and file could be released: try_to_swap_out has done a get_file.
1543 * vma/file is guaranteed to exist in the unmap/sync cases because
1544 * mmap_sem is held.
1546 return page->mapping->a_ops->writepage(file, page);
1551 * The page cache takes care of races between somebody
1552 * trying to swap something out and swap something in
1553 * at the same time..
1555 extern void wakeup_bdflush(int);
1556 int filemap_swapout(struct page * page, struct file * file)
1558 int retval = filemap_write_page(file, page, 0);
1559 wakeup_bdflush(0);
1560 return retval;
1563 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
1564 unsigned long address, unsigned int flags)
1566 unsigned long pgoff;
1567 pte_t pte = *ptep;
1568 struct page *page;
1569 int error;
1571 if (!(flags & MS_INVALIDATE)) {
1572 if (!pte_present(pte))
1573 return 0;
1574 if (!pte_dirty(pte))
1575 return 0;
1576 flush_page_to_ram(pte_page(pte));
1577 flush_cache_page(vma, address);
1578 set_pte(ptep, pte_mkclean(pte));
1579 flush_tlb_page(vma, address);
1580 page = pte_page(pte);
1581 page_cache_get(page);
1582 } else {
1583 if (pte_none(pte))
1584 return 0;
1585 flush_cache_page(vma, address);
1586 pte_clear(ptep);
1587 flush_tlb_page(vma, address);
1588 if (!pte_present(pte)) {
1589 swap_free(pte_to_swp_entry(pte));
1590 return 0;
1592 page = pte_page(pte);
1593 if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
1594 page_cache_free(page);
1595 return 0;
1598 pgoff = (address - vma->vm_start) >> PAGE_CACHE_SHIFT;
1599 pgoff += vma->vm_pgoff;
1600 if (page->index != pgoff) {
1601 printk("weirdness: pgoff=%lu index=%lu address=%lu vm_start=%lu vm_pgoff=%lu\n",
1602 pgoff, page->index, address, vma->vm_start, vma->vm_pgoff);
1604 lock_page(page);
1605 error = filemap_write_page(vma->vm_file, page, 1);
1606 UnlockPage(page);
1607 page_cache_free(page);
1608 return error;
1611 static inline int filemap_sync_pte_range(pmd_t * pmd,
1612 unsigned long address, unsigned long size,
1613 struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
1615 pte_t * pte;
1616 unsigned long end;
1617 int error;
1619 if (pmd_none(*pmd))
1620 return 0;
1621 if (pmd_bad(*pmd)) {
1622 pmd_ERROR(*pmd);
1623 pmd_clear(pmd);
1624 return 0;
1626 pte = pte_offset(pmd, address);
1627 offset += address & PMD_MASK;
1628 address &= ~PMD_MASK;
1629 end = address + size;
1630 if (end > PMD_SIZE)
1631 end = PMD_SIZE;
1632 error = 0;
1633 do {
1634 error |= filemap_sync_pte(pte, vma, address + offset, flags);
1635 address += PAGE_SIZE;
1636 pte++;
1637 } while (address && (address < end));
1638 return error;
1641 static inline int filemap_sync_pmd_range(pgd_t * pgd,
1642 unsigned long address, unsigned long size,
1643 struct vm_area_struct *vma, unsigned int flags)
1645 pmd_t * pmd;
1646 unsigned long offset, end;
1647 int error;
1649 if (pgd_none(*pgd))
1650 return 0;
1651 if (pgd_bad(*pgd)) {
1652 pgd_ERROR(*pgd);
1653 pgd_clear(pgd);
1654 return 0;
1656 pmd = pmd_offset(pgd, address);
1657 offset = address & PGDIR_MASK;
1658 address &= ~PGDIR_MASK;
1659 end = address + size;
1660 if (end > PGDIR_SIZE)
1661 end = PGDIR_SIZE;
1662 error = 0;
1663 do {
1664 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
1665 address = (address + PMD_SIZE) & PMD_MASK;
1666 pmd++;
1667 } while (address && (address < end));
1668 return error;
1671 int filemap_sync(struct vm_area_struct * vma, unsigned long address,
1672 size_t size, unsigned int flags)
1674 pgd_t * dir;
1675 unsigned long end = address + size;
1676 int error = 0;
1678 dir = pgd_offset(vma->vm_mm, address);
1679 flush_cache_range(vma->vm_mm, end - size, end);
1680 if (address >= end)
1681 BUG();
1682 do {
1683 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
1684 address = (address + PGDIR_SIZE) & PGDIR_MASK;
1685 dir++;
1686 } while (address && (address < end));
1687 flush_tlb_range(vma->vm_mm, end - size, end);
1688 return error;
1692 * This handles (potentially partial) area unmaps..
1694 static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
1696 filemap_sync(vma, start, len, MS_ASYNC);
1700 * Shared mappings need to be able to do the right thing at
1701 * close/unmap/sync. They will also use the private file as
1702 * backing-store for swapping..
1704 static struct vm_operations_struct file_shared_mmap = {
1705 unmap: filemap_unmap, /* unmap - we need to sync the pages */
1706 sync: filemap_sync,
1707 nopage: filemap_nopage,
1708 swapout: filemap_swapout,
1712 * Private mappings just need to be able to load in the map.
1714 * (This is actually used for shared mappings as well, if we
1715 * know they can't ever get write permissions..)
1717 static struct vm_operations_struct file_private_mmap = {
1718 nopage: filemap_nopage,
1721 /* This is used for a general mmap of a disk file */
1723 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1725 struct vm_operations_struct * ops;
1726 struct inode *inode = file->f_dentry->d_inode;
1728 ops = &file_private_mmap;
1729 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1730 if (!inode->i_mapping->a_ops->writepage)
1731 return -EINVAL;
1732 ops = &file_shared_mmap;
1734 if (!inode->i_sb || !S_ISREG(inode->i_mode))
1735 return -EACCES;
1736 if (!inode->i_mapping->a_ops->readpage)
1737 return -ENOEXEC;
1738 UPDATE_ATIME(inode);
1739 vma->vm_ops = ops;
1740 return 0;
1744 * The msync() system call.
1747 static int msync_interval(struct vm_area_struct * vma,
1748 unsigned long start, unsigned long end, int flags)
1750 if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
1751 int error;
1752 error = vma->vm_ops->sync(vma, start, end-start, flags);
1753 if (!error && (flags & MS_SYNC)) {
1754 struct file * file = vma->vm_file;
1755 if (file && file->f_op && file->f_op->fsync) {
1756 down(&file->f_dentry->d_inode->i_sem);
1757 error = file->f_op->fsync(file, file->f_dentry, 1);
1758 up(&file->f_dentry->d_inode->i_sem);
1761 return error;
1763 return 0;
1766 asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
1768 unsigned long end;
1769 struct vm_area_struct * vma;
1770 int unmapped_error, error = -EINVAL;
1772 down(&current->mm->mmap_sem);
1773 if (start & ~PAGE_MASK)
1774 goto out;
1775 len = (len + ~PAGE_MASK) & PAGE_MASK;
1776 end = start + len;
1777 if (end < start)
1778 goto out;
1779 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1780 goto out;
1781 error = 0;
1782 if (end == start)
1783 goto out;
1785 * If the interval [start,end) covers some unmapped address ranges,
1786 * just ignore them, but return -EFAULT at the end.
1788 vma = find_vma(current->mm, start);
1789 unmapped_error = 0;
1790 for (;;) {
1791 /* Still start < end. */
1792 error = -EFAULT;
1793 if (!vma)
1794 goto out;
1795 /* Here start < vma->vm_end. */
1796 if (start < vma->vm_start) {
1797 unmapped_error = -EFAULT;
1798 start = vma->vm_start;
1800 /* Here vma->vm_start <= start < vma->vm_end. */
1801 if (end <= vma->vm_end) {
1802 if (start < end) {
1803 error = msync_interval(vma, start, end, flags);
1804 if (error)
1805 goto out;
1807 error = unmapped_error;
1808 goto out;
1810 /* Here vma->vm_start <= start < vma->vm_end < end. */
1811 error = msync_interval(vma, start, vma->vm_end, flags);
1812 if (error)
1813 goto out;
1814 start = vma->vm_end;
1815 vma = vma->vm_next;
1817 out:
1818 up(&current->mm->mmap_sem);
1819 return error;
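/*
 * Illustrative sketch (not part of the original file): using msync(2), whose
 * in-kernel implementation is above, to push dirty pages of a shared file
 * mapping to disk.  The file name and mapping size are made up.
 */
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/tmp/example.dat", O_RDWR);      /* hypothetical file */
        size_t len = 4096;
        char *map;

        if (fd < 0)
                return 1;
        map = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (map == MAP_FAILED)
                return 1;

        memcpy(map, "hello", 5);

        /* MS_SYNC: filemap_sync() writes out the dirty pages, then fsync() runs */
        msync(map, len, MS_SYNC);

        munmap(map, len);
        close(fd);
        return 0;
}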
1822 static inline void setup_read_behavior(struct vm_area_struct * vma,
1823 int behavior)
1825 VM_ClearReadHint(vma);
1826 switch(behavior) {
1827 case MADV_SEQUENTIAL:
1828 vma->vm_flags |= VM_SEQ_READ;
1829 break;
1830 case MADV_RANDOM:
1831 vma->vm_flags |= VM_RAND_READ;
1832 break;
1833 default:
1834 break;
1836 return;
1839 static long madvise_fixup_start(struct vm_area_struct * vma,
1840 unsigned long end, int behavior)
1842 struct vm_area_struct * n;
1844 n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1845 if (!n)
1846 return -EAGAIN;
1847 *n = *vma;
1848 n->vm_end = end;
1849 setup_read_behavior(n, behavior);
1850 n->vm_raend = 0;
1851 get_file(n->vm_file);
1852 if (n->vm_ops && n->vm_ops->open)
1853 n->vm_ops->open(n);
1854 vmlist_modify_lock(vma->vm_mm);
1855 vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
1856 vma->vm_start = end;
1857 insert_vm_struct(current->mm, n);
1858 vmlist_modify_unlock(vma->vm_mm);
1859 return 0;
1862 static long madvise_fixup_end(struct vm_area_struct * vma,
1863 unsigned long start, int behavior)
1865 struct vm_area_struct * n;
1867 n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1868 if (!n)
1869 return -EAGAIN;
1870 *n = *vma;
1871 n->vm_start = start;
1872 n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
1873 setup_read_behavior(n, behavior);
1874 n->vm_raend = 0;
1875 get_file(n->vm_file);
1876 if (n->vm_ops && n->vm_ops->open)
1877 n->vm_ops->open(n);
1878 vmlist_modify_lock(vma->vm_mm);
1879 vma->vm_end = start;
1880 insert_vm_struct(current->mm, n);
1881 vmlist_modify_unlock(vma->vm_mm);
1882 return 0;
1885 static long madvise_fixup_middle(struct vm_area_struct * vma,
1886 unsigned long start, unsigned long end, int behavior)
1888 struct vm_area_struct * left, * right;
1890 left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1891 if (!left)
1892 return -EAGAIN;
1893 right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1894 if (!right) {
1895 kmem_cache_free(vm_area_cachep, left);
1896 return -EAGAIN;
1898 *left = *vma;
1899 *right = *vma;
1900 left->vm_end = start;
1901 right->vm_start = end;
1902 right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
1903 left->vm_raend = 0;
1904 right->vm_raend = 0;
1905 atomic_add(2, &vma->vm_file->f_count);
1907 if (vma->vm_ops && vma->vm_ops->open) {
1908 vma->vm_ops->open(left);
1909 vma->vm_ops->open(right);
1911 vmlist_modify_lock(vma->vm_mm);
1912 vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
1913 vma->vm_start = start;
1914 vma->vm_end = end;
1915 setup_read_behavior(vma, behavior);
1916 vma->vm_raend = 0;
1917 insert_vm_struct(current->mm, left);
1918 insert_vm_struct(current->mm, right);
1919 vmlist_modify_unlock(vma->vm_mm);
1920 return 0;
1924 * We can potentially split a vm area into separate
1925 * areas, each area with its own behavior.
1927 static long madvise_behavior(struct vm_area_struct * vma,
1928 unsigned long start, unsigned long end, int behavior)
1930 int error = 0;
1932 /* This caps the number of vma's this process can own */
1933 if (vma->vm_mm->map_count > MAX_MAP_COUNT)
1934 return -ENOMEM;
1936 if (start == vma->vm_start) {
1937 if (end == vma->vm_end) {
1938 setup_read_behavior(vma, behavior);
1939 vma->vm_raend = 0;
1940 } else
1941 error = madvise_fixup_start(vma, end, behavior);
1942 } else {
1943 if (end == vma->vm_end)
1944 error = madvise_fixup_end(vma, start, behavior);
1945 else
1946 error = madvise_fixup_middle(vma, start, end, behavior);
1949 return error;
1953 * Schedule all required I/O operations, then run the disk queue
1954 * to make sure they are started. Do not wait for completion.
1956 static long madvise_willneed(struct vm_area_struct * vma,
1957 unsigned long start, unsigned long end)
1959 long error = -EBADF;
1960 struct file * file;
1961 unsigned long size, rlim_rss;
1963 /* Doesn't work if there's no mapped file. */
1964 if (!vma->vm_file)
1965 return error;
1966 file = vma->vm_file;
1967 size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
1968 PAGE_CACHE_SHIFT;
1970 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1971 if (end > vma->vm_end)
1972 end = vma->vm_end;
1973 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1975 /* Make sure this doesn't exceed the process's max rss. */
1976 error = -EIO;
1977 rlim_rss = current->rlim ? current->rlim[RLIMIT_RSS].rlim_cur :
1978 LONG_MAX; /* default: see resource.h */
1979 if ((vma->vm_mm->rss + (end - start)) > rlim_rss)
1980 return error;
1982 /* round to cluster boundaries if this isn't a "random" area. */
1983 if (!VM_RandomReadHint(vma)) {
1984 start = CLUSTER_OFFSET(start);
1985 end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);
1987 while ((start < end) && (start < size)) {
1988 error = read_cluster_nonblocking(file, start, size);
1989 start += CLUSTER_PAGES;
1990 if (error < 0)
1991 break;
1993 } else {
1994 while ((start < end) && (start < size)) {
1995 error = page_cache_read(file, start);
1996 start++;
1997 if (error < 0)
1998 break;
2002 /* Don't wait for someone else to push these requests. */
2003 run_task_queue(&tq_disk);
2005 return error;
2009 * Application no longer needs these pages. If the pages are dirty,
2010 * it's OK to just throw them away. The app will be more careful about
2011 * data it wants to keep. Be sure to free swap resources too. The
2012 * zap_page_range call sets things up for shrink_mmap to actually free
2013 * these pages later if no one else has touched them in the meantime,
2014 * although we could add these pages to a global reuse list for
2015 * shrink_mmap to pick up before reclaiming other pages.
2017 * NB: This interface discards data rather than pushes it out to swap,
2018 * as some implementations do. This has performance implications for
2019 * applications like large transactional databases which want to discard
2020 * pages in anonymous maps after committing to backing store the data
2021 * that was kept in them. There is no reason to write this data out to
2022 * the swap area if the application is discarding it.
2024 * An interface that causes the system to free clean pages and flush
2025 * dirty pages is already available as msync(MS_INVALIDATE).
2027 static long madvise_dontneed(struct vm_area_struct * vma,
2028 unsigned long start, unsigned long end)
2030 if (vma->vm_flags & VM_LOCKED)
2031 return -EINVAL;
2033 flush_cache_range(vma->vm_mm, start, end);
2034 zap_page_range(vma->vm_mm, start, end - start);
2035 flush_tlb_range(vma->vm_mm, start, end);
2036 return 0;
2039 static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
2040 unsigned long end, int behavior)
2042 long error = -EBADF;
2044 switch (behavior) {
2045 case MADV_NORMAL:
2046 case MADV_SEQUENTIAL:
2047 case MADV_RANDOM:
2048 error = madvise_behavior(vma, start, end, behavior);
2049 break;
2051 case MADV_WILLNEED:
2052 error = madvise_willneed(vma, start, end);
2053 break;
2055 case MADV_DONTNEED:
2056 error = madvise_dontneed(vma, start, end);
2057 break;
2059 default:
2060 error = -EINVAL;
2061 break;
2064 return error;
2068 * The madvise(2) system call.
2070 * Applications can use madvise() to advise the kernel how it should
2071 * handle paging I/O in this VM area. The idea is to help the kernel
2072 * use appropriate read-ahead and caching techniques. The information
2073 * provided is advisory only, and can be safely disregarded by the
2074 * kernel without affecting the correct operation of the application.
2076 * behavior values:
2077 * MADV_NORMAL - the default behavior is to read clusters. This
2078 * results in some read-ahead and read-behind.
2079 * MADV_RANDOM - the system should read the minimum amount of data
2080 * on any access, since it is unlikely that the appli-
2081 * cation will need more than what it asks for.
2082 * MADV_SEQUENTIAL - pages in the given range will probably be accessed
2083 * once, so they can be aggressively read ahead, and
2084 * can be freed soon after they are accessed.
2085 * MADV_WILLNEED - the application is notifying the system to read
2086 * some pages ahead.
2087 * MADV_DONTNEED - the application is finished with the given range,
2088 * so the kernel can free resources associated with it.
2090 * return values:
2091 * zero - success
2092 * -EINVAL - start + len < 0, start is not page-aligned,
2093 * "behavior" is not a valid value, or application
2094 * is attempting to release locked or shared pages.
2095 * -ENOMEM - addresses in the specified range are not currently
2096 * mapped, or are outside the AS of the process.
2097 * -EIO - an I/O error occurred while paging in data.
2098 * -EBADF - map exists, but area maps something that isn't a file.
2099 * -EAGAIN - a kernel resource was temporarily unavailable.
2101 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
2103 unsigned long end;
2104 struct vm_area_struct * vma;
2105 int unmapped_error = 0;
2106 int error = -EINVAL;
2108 down(&current->mm->mmap_sem);
2110 if (start & ~PAGE_MASK)
2111 goto out;
2112 len = (len + ~PAGE_MASK) & PAGE_MASK;
2113 end = start + len;
2114 if (end < start)
2115 goto out;
2117 error = 0;
2118 if (end == start)
2119 goto out;
2122 * If the interval [start,end) covers some unmapped address
2123 * ranges, just ignore them, but return -ENOMEM at the end.
2125 vma = find_vma(current->mm, start);
2126 for (;;) {
2127 /* Still start < end. */
2128 error = -ENOMEM;
2129 if (!vma)
2130 goto out;
2132 /* Here start < vma->vm_end. */
2133 if (start < vma->vm_start) {
2134 unmapped_error = -ENOMEM;
2135 start = vma->vm_start;
2138 /* Here vma->vm_start <= start < vma->vm_end. */
2139 if (end <= vma->vm_end) {
2140 if (start < end) {
2141 error = madvise_vma(vma, start, end,
2142 behavior);
2143 if (error)
2144 goto out;
2146 error = unmapped_error;
2147 goto out;
2150 /* Here vma->vm_start <= start < vma->vm_end < end. */
2151 error = madvise_vma(vma, start, vma->vm_end, behavior);
2152 if (error)
2153 goto out;
2154 start = vma->vm_end;
2155 vma = vma->vm_next;
2158 out:
2159 up(&current->mm->mmap_sem);
2160 return error;
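/*
 * Sketch only (not part of the original source): a user-space caller using
 * madvise(2) with the behavior values documented above on a read-only file
 * mapping.  "fd" and "len" are assumed to be supplied by the caller; the
 * calls are advisory, so their return values are deliberately ignored.
 */
#if 0
#include <sys/mman.h>
#include <stddef.h>

static void *map_for_sequential_scan(int fd, size_t len)
{
	void *p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);

	if (p == MAP_FAILED)
		return NULL;
	/* One pass over the file: allow aggressive read-ahead. */
	(void) madvise(p, len, MADV_SEQUENTIAL);
	/* Start paging the data in before it is first touched. */
	(void) madvise(p, len, MADV_WILLNEED);
	return p;
}
#endif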
2164 * Later we can get more picky about what "in core" means precisely.
2165 * For now, simply check to see if the page is in the page cache,
2166 * and is up to date; i.e. that no page-in operation would be required
2167 * at this time if an application were to map and access this page.
2169 static unsigned char mincore_page(struct vm_area_struct * vma,
2170 unsigned long pgoff)
2172 unsigned char present = 0;
2173 struct address_space * as = &vma->vm_file->f_dentry->d_inode->i_data;
2174 struct page * page, ** hash = page_hash(as, pgoff);
2176 spin_lock(&pagecache_lock);
2177 page = __find_page_nolock(as, pgoff, *hash);
2178 if ((page) && (Page_Uptodate(page)))
2179 present = 1;
2180 spin_unlock(&pagecache_lock);
2182 return present;
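/*
 * Note that only the page cache of the backing file is consulted here: the
 * process page tables are never examined, and mincore_vma() below refuses
 * anonymous mappings (no vm_file) with -ENOMEM before this point is reached.
 */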
2185 static long mincore_vma(struct vm_area_struct * vma,
2186 unsigned long start, unsigned long end, unsigned char * vec)
2188 long error, i, remaining;
2189 unsigned char * tmp;
2191 error = -ENOMEM;
2192 if (!vma->vm_file)
2193 return error;
2195 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2196 if (end > vma->vm_end)
2197 end = vma->vm_end;
2198 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2200 error = -EAGAIN;
2201 tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
2202 if (!tmp)
2203 return error;
2205 /* (end - start) is # of pages, and also # of bytes in "vec" */
2206 remaining = (end - start);
2208 error = 0;
2209 for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
2210 int j = 0;
2211 long thispiece = (remaining < PAGE_SIZE) ?
2212 remaining : PAGE_SIZE;
2214 while (j < thispiece)
2215 tmp[j++] = mincore_page(vma, start++);
2217 if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
2218 error = -EFAULT;
2219 break;
2223 free_page((unsigned long) tmp);
2224 return error;
2228 * The mincore(2) system call.
2230 * mincore() returns the memory residency status of the pages in the
2231 * current process's address space specified by [addr, addr + len).
2232 * The status is returned in a vector of bytes. The least significant
2233 * bit of each byte is 1 if the referenced page is in memory, otherwise
2234 * it is zero.
2236 * Because the status of a page can change after mincore() checks it
2237 * but before it returns to the application, the returned vector may
2238 * contain stale information. Only locked pages are guaranteed to
2239 * remain in memory.
2241 * return values:
2242 * zero - success
2243 * -EFAULT - vec points to an illegal address
2244 * -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
2245 * or len has a nonpositive value
2246 * -ENOMEM - Addresses in the range [addr, addr + len] are
2247 * invalid for the address space of this process, or
2248 * specify one or more pages which are not currently
2249 * mapped
2250 * -EAGAIN - A kernel resource was temporarily unavailable.
2252 asmlinkage long sys_mincore(unsigned long start, size_t len,
2253 unsigned char * vec)
2255 int index = 0;
2256 unsigned long end;
2257 struct vm_area_struct * vma;
2258 int unmapped_error = 0;
2259 long error = -EINVAL;
2261 down(&current->mm->mmap_sem);
2263 if (start & ~PAGE_CACHE_MASK)
2264 goto out;
2265 len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK;
2266 end = start + len;
2267 if (end < start)
2268 goto out;
2270 error = 0;
2271 if (end == start)
2272 goto out;
2275 * If the interval [start,end) covers some unmapped address
2276 * ranges, just ignore them, but return -ENOMEM at the end.
2278 vma = find_vma(current->mm, start);
2279 for (;;) {
2280 /* Still start < end. */
2281 error = -ENOMEM;
2282 if (!vma)
2283 goto out;
2285 /* Here start < vma->vm_end. */
2286 if (start < vma->vm_start) {
2287 unmapped_error = -ENOMEM;
2288 start = vma->vm_start;
2291 /* Here vma->vm_start <= start < vma->vm_end. */
2292 if (end <= vma->vm_end) {
2293 if (start < end) {
2294 error = mincore_vma(vma, start, end,
2295 &vec[index]);
2296 if (error)
2297 goto out;
2299 error = unmapped_error;
2300 goto out;
2303 /* Here vma->vm_start <= start < vma->vm_end < end. */
2304 error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
2305 if (error)
2306 goto out;
2307 index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
2308 start = vma->vm_end;
2309 vma = vma->vm_next;
2312 out:
2313 up(&current->mm->mmap_sem);
2314 return error;
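/*
 * Sketch only (not part of the original source): user-space use of
 * mincore(2) to count how many pages of an existing mapping are resident,
 * using the least significant bit of each vector byte as documented above.
 * "addr" and "len" are assumed to describe a page-aligned mapping.
 */
#if 0
#include <sys/mman.h>
#include <unistd.h>
#include <stdlib.h>

static long count_resident_pages(void *addr, size_t len)
{
	size_t pagesize = getpagesize();
	size_t pages = (len + pagesize - 1) / pagesize;
	unsigned char *vec = malloc(pages);
	long resident = 0;
	size_t i;

	if (!vec)
		return -1;
	if (mincore(addr, len, vec) != 0) {
		free(vec);
		return -1;
	}
	for (i = 0; i < pages; i++)
		resident += vec[i] & 1;		/* LSB set => page in core */
	free(vec);
	return resident;
}
#endif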
2317 static inline
2318 struct page *__read_cache_page(struct address_space *mapping,
2319 unsigned long index,
2320 int (*filler)(void *,struct page*),
2321 void *data)
2323 struct page **hash = page_hash(mapping, index);
2324 struct page *page, *cached_page = NULL;
2325 int err;
2326 repeat:
2327 page = __find_get_page(mapping, index, hash);
2328 if (!page) {
2329 if (!cached_page) {
2330 cached_page = page_cache_alloc();
2331 if (!cached_page)
2332 return ERR_PTR(-ENOMEM);
2334 page = cached_page;
2335 if (add_to_page_cache_unique(page, mapping, index, hash))
2336 goto repeat;
2337 cached_page = NULL;
2338 err = filler(data, page);
2339 if (err < 0) {
2340 page_cache_release(page);
2341 page = ERR_PTR(err);
2344 if (cached_page)
2345 page_cache_free(cached_page);
2346 return page;
2350 * Read into the page cache. If a page already exists,
2351 * and Page_Uptodate() is not set, try to fill the page.
2353 struct page *read_cache_page(struct address_space *mapping,
2354 unsigned long index,
2355 int (*filler)(void *,struct page*),
2356 void *data)
2358 struct page *page = __read_cache_page(mapping, index, filler, data);
2359 int err;
2361 if (IS_ERR(page) || Page_Uptodate(page))
2362 goto out;
2364 lock_page(page);
2365 if (Page_Uptodate(page)) {
2366 UnlockPage(page);
2367 goto out;
2369 err = filler(data, page);
2370 if (err < 0) {
2371 page_cache_release(page);
2372 page = ERR_PTR(err);
2374 out:
2375 return page;
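/*
 * Sketch only (not part of the original source): a typical filesystem-side
 * caller of read_cache_page().  The filler wraps the mapping's own readpage
 * operation (assumed, as elsewhere in this tree, to take the struct file as
 * its first argument); "read_dir_page" and "read_dir_filler" are
 * hypothetical names.  The caller still has to wait for the I/O and re-check
 * Page_Uptodate, since the filler only starts the read.
 */
#if 0
static int read_dir_filler(void *data, struct page *page)
{
	struct file *file = (struct file *) data;
	return file->f_dentry->d_inode->i_mapping->a_ops->readpage(file, page);
}

static struct page *read_dir_page(struct file *file, unsigned long index)
{
	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
	struct page *page;

	page = read_cache_page(mapping, index, read_dir_filler, file);
	if (IS_ERR(page))
		return page;
	wait_on_page(page);
	if (!Page_Uptodate(page)) {
		page_cache_release(page);
		return ERR_PTR(-EIO);
	}
	return page;
}
#endif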
2378 static inline struct page * __grab_cache_page(struct address_space *mapping,
2379 unsigned long index, struct page **cached_page)
2381 struct page *page, **hash = page_hash(mapping, index);
2382 repeat:
2383 page = __find_lock_page(mapping, index, hash);
2384 if (!page) {
2385 if (!*cached_page) {
2386 *cached_page = page_cache_alloc();
2387 if (!*cached_page)
2388 return NULL;
2390 page = *cached_page;
2391 if (add_to_page_cache_unique(page, mapping, index, hash))
2392 goto repeat;
2393 *cached_page = NULL;
2395 return page;
2399 * Returns locked page at given index in given cache, creating it if needed.
2402 struct page *grab_cache_page(struct address_space *mapping, unsigned long index)
2404 struct page *cached_page = NULL;
2405 struct page *page = __grab_cache_page(mapping,index,&cached_page);
2406 if (cached_page)
2407 page_cache_free(cached_page);
2408 return page;
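/*
 * Sketch only (not part of the original source): the calling convention
 * around grab_cache_page().  The page is returned locked and with an extra
 * reference, so the caller must unlock and release it when done; the body
 * of the work ("fill the page") is left as a placeholder.
 */
#if 0
static int fill_cache_index(struct address_space *mapping, unsigned long index)
{
	struct page *page = grab_cache_page(mapping, index);

	if (!page)
		return -ENOMEM;
	/* ... write into the page here, then mark it valid ... */
	SetPageUptodate(page);
	UnlockPage(page);
	page_cache_release(page);
	return 0;
}
#endif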
2411 static inline void remove_suid(struct inode *inode)
2413 unsigned int mode;
2415 /* set S_ISGID if S_IXGRP is set, and always set S_ISUID */
2416 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
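/*
 * Worked example of the line above, in octal: S_IXGRP is 0010 and S_ISGID
 * is 02000, so S_ISGID/S_IXGRP is 0200.  When the group-execute bit is set,
 * (i_mode & S_IXGRP) * 0200 yields exactly S_ISGID, otherwise 0; S_ISUID is
 * then OR-ed in unconditionally.
 */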
2418 /* was any of the uid bits set? */
2419 mode &= inode->i_mode;
2420 if (mode && !capable(CAP_FSETID)) {
2421 inode->i_mode &= ~mode;
2422 mark_inode_dirty(inode);
2427 * Write to a file through the page cache.
2429 * We currently put everything into the page cache prior to writing it.
2430 * This is not a problem when writing full pages. With partial pages,
2431 * however, we first have to read the data into the cache, then
2432 * dirty the page, and finally schedule it for writing. Alternatively, we
2433 * could write-through just the portion of data that would go into that
2434 * page, but that would kill performance for applications that write data
2435 * line by line, and it's prone to race conditions.
2437 * Note that this routine doesn't try to keep track of dirty pages. Each
2438 * file system has to do this all by itself, unfortunately.
2439 * okir@monad.swb.de
2441 ssize_t
2442 generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
2444 struct inode *inode = file->f_dentry->d_inode;
2445 struct address_space *mapping = inode->i_mapping;
2446 unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
2447 loff_t pos;
2448 struct page *page, *cached_page;
2449 unsigned long written;
2450 long status;
2451 int err;
2453 cached_page = NULL;
2455 down(&inode->i_sem);
2457 pos = *ppos;
2458 err = -EINVAL;
2459 if (pos < 0)
2460 goto out;
2462 err = file->f_error;
2463 if (err) {
2464 file->f_error = 0;
2465 goto out;
2468 written = 0;
2470 if (file->f_flags & O_APPEND)
2471 pos = inode->i_size;
2474 * Check whether we've reached the file size limit.
2476 err = -EFBIG;
2477 if (limit != RLIM_INFINITY) {
2478 if (pos >= limit) {
2479 send_sig(SIGXFSZ, current, 0);
2480 goto out;
2482 if (count > limit - pos) {
2483 send_sig(SIGXFSZ, current, 0);
2484 count = limit - pos;
2488 status = 0;
2489 if (count) {
2490 remove_suid(inode);
2491 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
2492 mark_inode_dirty(inode);
2495 while (count) {
2496 unsigned long bytes, index, offset;
2497 char *kaddr;
2500 * Try to find the page in the cache. If it isn't there,
2501 * allocate a free page.
2503 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
2504 index = pos >> PAGE_CACHE_SHIFT;
2505 bytes = PAGE_CACHE_SIZE - offset;
2506 if (bytes > count)
2507 bytes = count;
2509 status = -ENOMEM; /* we'll assign it later anyway */
2510 page = __grab_cache_page(mapping, index, &cached_page);
2511 if (!page)
2512 break;
2514 /* We have exclusive IO access to the page.. */
2515 if (!PageLocked(page)) {
2516 PAGE_BUG(page);
2519 status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
2520 if (status)
2521 goto unlock;
2522 kaddr = page_address(page);
2523 status = copy_from_user(kaddr+offset, buf, bytes);
2524 flush_dcache_page(page);
2525 if (status)
2526 goto fail_write;
2527 status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
2528 if (!status)
2529 status = bytes;
2531 if (status >= 0) {
2532 written += status;
2533 count -= status;
2534 pos += status;
2535 buf += status;
2537 unlock:
2538 /* Mark it unlocked again and drop the page.. */
2539 UnlockPage(page);
2540 page_cache_release(page);
2542 if (status < 0)
2543 break;
2545 *ppos = pos;
2547 if (cached_page)
2548 page_cache_free(cached_page);
2550 err = written ? written : status;
2551 out:
2552 up(&inode->i_sem);
2553 return err;
2554 fail_write:
2555 status = -EFAULT;
2556 ClearPageUptodate(page);
2557 kunmap(page);
2558 goto unlock;
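/*
 * Sketch only (not part of the original source): the shape of the
 * prepare_write()/commit_write() pair that the loop above drives, for a
 * hypothetical in-memory filesystem whose pages need no block mapping.  A
 * real filesystem must also map and read the affected blocks and mark its
 * buffers dirty; this version merely zero-fills new pages and updates i_size.
 */
#if 0
static int example_prepare_write(struct file *file, struct page *page,
				 unsigned from, unsigned to)
{
	/* Make the bytes outside [from, to) valid before the caller copies
	 * user data into the page. */
	if (!Page_Uptodate(page))
		memset((char *) page_address(page), 0, PAGE_CACHE_SIZE);
	return 0;
}

static int example_commit_write(struct file *file, struct page *page,
				unsigned from, unsigned to)
{
	struct inode *inode = file->f_dentry->d_inode;
	loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + to;

	SetPageUptodate(page);
	if (pos > inode->i_size)
		inode->i_size = pos;
	return 0;
}
#endif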
2561 void __init page_cache_init(unsigned long mempages)
2563 unsigned long htable_size, order;
2565 htable_size = mempages;
2566 htable_size *= sizeof(struct page *);
2567 for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
2568 ;
2570 do {
2571 unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
2573 page_hash_bits = 0;
2574 while((tmp >>= 1UL) != 0UL)
2575 page_hash_bits++;
2577 page_hash_table = (struct page **)
2578 __get_free_pages(GFP_ATOMIC, order);
2579 } while(page_hash_table == NULL && --order > 0);
2581 printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
2582 (1 << page_hash_bits), order, (PAGE_SIZE << order));
2583 if (!page_hash_table)
2584 panic("Failed to allocate page hash table\n");
2585 memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
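/*
 * Worked example of the sizing above: with mempages = 32768 (128MB of 4K
 * pages) and 4-byte page pointers, htable_size is 131072, so the first loop
 * stops at order 5 (PAGE_SIZE << 5 == 131072).  One order-5 allocation holds
 * 32768 bucket pointers, giving page_hash_bits = 15 and the boot message
 * "Page-cache hash table entries: 32768 (order: 5, 131072 bytes)", assuming
 * the order-5 allocation succeeds on the first try.
 */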