1 /*
2 * linux/mm/filemap.c
4 * Copyright (C) 1994-1999 Linus Torvalds
5 */
7 /*
8 * This file handles the generic file mmap semantics used by
9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem used to do this differently, for example)
12 #include <linux/malloc.h>
13 #include <linux/shm.h>
14 #include <linux/mman.h>
15 #include <linux/locks.h>
16 #include <linux/pagemap.h>
17 #include <linux/swap.h>
18 #include <linux/smp_lock.h>
19 #include <linux/blkdev.h>
20 #include <linux/file.h>
21 #include <linux/swapctl.h>
22 #include <linux/slab.h>
23 #include <linux/init.h>
24 #include <linux/mm.h>
26 #include <asm/pgalloc.h>
27 #include <asm/uaccess.h>
28 #include <asm/mman.h>
30 #include <linux/highmem.h>
33 * Shared mappings implemented 30.11.1994. It's not fully working yet,
34 * though.
36 * Shared mappings now work. 15.8.1995 Bruno.
38 * finished 'unifying' the page and buffer cache and SMP-threaded the
39 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
41 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
44 atomic_t page_cache_size = ATOMIC_INIT(0);
45 unsigned int page_hash_bits;
46 struct page **page_hash_table;
47 struct list_head lru_cache;
49 static spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
51 * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
52 * the pagemap_lru_lock held.
54 spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;
56 #define CLUSTER_PAGES (1 << page_cluster)
57 #define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster)
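/*
 * Illustrative sketch (not part of the original file): how the two cluster
 * macros above behave for an assumed page_cluster of 4, i.e. clusters of 16
 * pages.  Guarded with #if 0 so it does not affect the build; the EX_ names
 * are hypothetical stand-ins and the block compiles as a stand-alone
 * user-space program when extracted.
 */
#if 0	/* example only */
#include <stdio.h>

static unsigned int ex_page_cluster = 4;	/* assumed value */

#define EX_CLUSTER_PAGES	(1 << ex_page_cluster)
#define EX_CLUSTER_OFFSET(x)	(((x) >> ex_page_cluster) << ex_page_cluster)

int main(void)
{
	unsigned long offset = 37;

	/* 37 >> 4 == 2, 2 << 4 == 32: rounds down to the cluster start */
	printf("cluster size: %d pages\n", EX_CLUSTER_PAGES);
	printf("cluster start for page %lu: %lu\n", offset,
	       (unsigned long) EX_CLUSTER_OFFSET(offset));
	return 0;
}
#endif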
59 void __add_page_to_hash_queue(struct page * page, struct page **p)
61 atomic_inc(&page_cache_size);
62 if((page->next_hash = *p) != NULL)
63 (*p)->pprev_hash = &page->next_hash;
64 *p = page;
65 page->pprev_hash = p;
66 if (page->buffers)
67 PAGE_BUG(page);
70 static inline void remove_page_from_hash_queue(struct page * page)
72 if(page->pprev_hash) {
73 if(page->next_hash)
74 page->next_hash->pprev_hash = page->pprev_hash;
75 *page->pprev_hash = page->next_hash;
76 page->pprev_hash = NULL;
78 atomic_dec(&page_cache_size);
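/*
 * Illustrative sketch (not part of the original file): the next/pprev
 * linking scheme used by the two hash-queue helpers above, shown on a
 * hypothetical stand-alone "struct node".  The pprev pointer points at
 * whatever location holds the pointer to this node (the bucket head or the
 * previous node's next field), so unlinking never needs to know whether the
 * node is first in the chain.  Guarded with #if 0 so it does not affect the
 * build.
 */
#if 0	/* example only */
#include <stddef.h>

struct node {
	struct node *next;
	struct node **pprev;
};

static void hash_add(struct node *n, struct node **bucket)
{
	if ((n->next = *bucket) != NULL)
		(*bucket)->pprev = &n->next;
	*bucket = n;
	n->pprev = bucket;
}

static void hash_del(struct node *n)
{
	if (n->pprev) {
		if (n->next)
			n->next->pprev = n->pprev;
		*n->pprev = n->next;
		n->pprev = NULL;
	}
}
#endif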
81 static inline int sync_page(struct page *page)
83 struct address_space *mapping = page->mapping;
85 if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
86 return mapping->a_ops->sync_page(page);
87 return 0;
91 * Remove a page from the page cache and free it. Caller has to make
92 * sure the page is locked and that nobody else uses it - or that usage
93 * is safe.
95 static inline void __remove_inode_page(struct page *page)
97 remove_page_from_inode_queue(page);
98 remove_page_from_hash_queue(page);
99 page->mapping = NULL;
102 void remove_inode_page(struct page *page)
104 if (!PageLocked(page))
105 PAGE_BUG(page);
107 spin_lock(&pagecache_lock);
108 __remove_inode_page(page);
109 spin_unlock(&pagecache_lock);
113 * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
114 * @inode: the inode which pages we want to invalidate
116 * This function only removes the unlocked pages; if you want to
117 * remove all the pages of one inode, you must call truncate_inode_pages.
120 void invalidate_inode_pages(struct inode * inode)
122 struct list_head *head, *curr;
123 struct page * page;
125 head = &inode->i_mapping->pages;
127 spin_lock(&pagecache_lock);
128 spin_lock(&pagemap_lru_lock);
129 curr = head->next;
131 while (curr != head) {
132 page = list_entry(curr, struct page, list);
133 curr = curr->next;
135 /* We cannot invalidate a locked page */
136 if (TryLockPage(page))
137 continue;
139 __lru_cache_del(page);
140 __remove_inode_page(page);
141 UnlockPage(page);
142 page_cache_release(page);
145 spin_unlock(&pagemap_lru_lock);
146 spin_unlock(&pagecache_lock);
150 * Truncate the page cache at a set offset, removing the pages
151 * that are beyond that offset (and zeroing out partial pages).
153 void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
155 struct list_head *head, *curr;
156 struct page * page;
157 unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
158 unsigned long start;
160 start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
162 repeat:
163 head = &mapping->pages;
164 spin_lock(&pagecache_lock);
165 curr = head->next;
166 while (curr != head) {
167 unsigned long offset;
169 page = list_entry(curr, struct page, list);
170 curr = curr->next;
172 offset = page->index;
174 /* page wholly truncated - free it */
175 if (offset >= start) {
176 if (TryLockPage(page)) {
177 page_cache_get(page);
178 spin_unlock(&pagecache_lock);
179 wait_on_page(page);
180 page_cache_release(page);
181 goto repeat;
183 page_cache_get(page);
184 spin_unlock(&pagecache_lock);
186 if (!page->buffers || block_flushpage(page, 0))
187 lru_cache_del(page);
190 * We remove the page from the page cache
191 * _after_ we have destroyed all buffer-cache
192 * references to it. Otherwise some other process
193 * might think this inode page is not in the
194 * page cache and create a buffer-cache alias
195 * to it causing all sorts of fun problems ...
197 remove_inode_page(page);
198 ClearPageDirty(page);
200 UnlockPage(page);
201 page_cache_release(page);
202 page_cache_release(page);
205 * We have done things without the pagecache lock,
206 * so we'll have to repeat the scan.
207 * It's not possible to deadlock here because
208 * we are guaranteed to make progress. (ie. we have
209 * just removed a page)
211 goto repeat;
214 * there is only one partial page possible.
216 if (!partial)
217 continue;
219 /* and it's the one preceding the first wholly truncated page */
220 if ((offset + 1) != start)
221 continue;
223 /* partial truncate, clear end of page */
224 if (TryLockPage(page)) {
225 spin_unlock(&pagecache_lock);
226 goto repeat;
228 page_cache_get(page);
229 spin_unlock(&pagecache_lock);
231 memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
232 if (page->buffers)
233 block_flushpage(page, partial);
235 partial = 0;
238 * we have dropped the spinlock so we have to
239 * restart.
241 UnlockPage(page);
242 page_cache_release(page);
243 goto repeat;
245 spin_unlock(&pagecache_lock);
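/*
 * Illustrative sketch (not part of the original file): how
 * truncate_inode_pages() splits the truncation point into the index of the
 * first wholly truncated page ("start") and the number of bytes kept in the
 * partial page before it.  A 4K page-cache page size is assumed for the
 * example values; the EX_ names are hypothetical.  Guarded with #if 0.
 */
#if 0	/* example only */
#include <stdio.h>

#define EX_PAGE_CACHE_SHIFT	12
#define EX_PAGE_CACHE_SIZE	(1UL << EX_PAGE_CACHE_SHIFT)

int main(void)
{
	unsigned long long lstart = 10000;	/* new file size in bytes */
	unsigned partial = lstart & (EX_PAGE_CACHE_SIZE - 1);
	unsigned long start =
		(lstart + EX_PAGE_CACHE_SIZE - 1) >> EX_PAGE_CACHE_SHIFT;

	/* 10000 = 2*4096 + 1808: keep 1808 bytes of page 2, free page 3 on */
	printf("partial bytes kept: %u, first page freed: %lu\n",
	       partial, start);
	return 0;
}
#endif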
249 * nr_dirty represents the number of dirty pages that we will write async
250 * before doing sync writes. We can only do sync writes if we can
251 * wait for IO (__GFP_IO set).
253 int shrink_mmap(int priority, int gfp_mask)
255 int ret = 0, count, nr_dirty;
256 struct list_head * page_lru;
257 struct page * page = NULL;
259 count = nr_lru_pages / (priority + 1);
260 nr_dirty = priority;
262 /* we need pagemap_lru_lock for list_del() ... subtle code below */
263 spin_lock(&pagemap_lru_lock);
264 while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
265 page = list_entry(page_lru, struct page, lru);
266 list_del(page_lru);
268 if (PageTestandClearReferenced(page))
269 goto dispose_continue;
271 count--;
273 * Avoid unscalable SMP locking for pages we can
274 * immediately tell are untouchable..
276 if (!page->buffers && page_count(page) > 1)
277 goto dispose_continue;
279 if (TryLockPage(page))
280 goto dispose_continue;
282 /* Release the pagemap_lru lock even if the page is not yet
283 queued in any lru queue, since we have just locked down
284 the page so nobody else may SMP-race with us running
285 a lru_cache_del() (lru_cache_del() always runs with the
286 page locked down ;). */
287 spin_unlock(&pagemap_lru_lock);
289 /* avoid freeing the page while it's locked */
290 page_cache_get(page);
293 * Is it a buffer page? Try to clean it up regardless
294 * of zone - it's old.
296 if (page->buffers) {
297 int wait;
299 * 0 - free it if we can do so without IO
300 * 1 - start write-out of dirty buffers
301 * 2 - wait for locked buffers
303 wait = (gfp_mask & __GFP_IO) ? (nr_dirty-- < 0) ? 2 : 1 : 0;
304 if (!try_to_free_buffers(page, wait))
305 goto unlock_continue;
306 /* page was locked, inode can't go away under us */
307 if (!page->mapping) {
308 atomic_dec(&buffermem_pages);
309 goto made_buffer_progress;
313 /* Hold the pagecache_lock spinlock to prevent
314 other tasks from noticing the page while we are looking at its
315 page count. If it's a pagecache page we'll free it
316 in one atomic transaction after checking its page count. */
317 spin_lock(&pagecache_lock);
320 * We can't free pages unless there's just one user
321 * (count == 2 because we added one ourselves above).
323 if (page_count(page) != 2)
324 goto cache_unlock_continue;
327 * Is it a swap-cache page? If so, we want to
328 * drop it if it is no longer used, even if it
329 * were to be marked referenced..
331 if (PageSwapCache(page)) {
332 spin_unlock(&pagecache_lock);
333 __delete_from_swap_cache(page);
334 goto made_inode_progress;
338 * Page is from a zone we don't care about.
339 * Don't drop page cache entries in vain.
341 if (page->zone->free_pages > page->zone->pages_high)
342 goto cache_unlock_continue;
344 /* is it a page-cache page? */
345 if (page->mapping) {
346 if (!PageDirty(page) && !pgcache_under_min()) {
347 __remove_inode_page(page);
348 spin_unlock(&pagecache_lock);
349 goto made_inode_progress;
351 goto cache_unlock_continue;
354 printk(KERN_ERR "shrink_mmap: unknown LRU page!\n");
356 cache_unlock_continue:
357 spin_unlock(&pagecache_lock);
358 unlock_continue:
359 spin_lock(&pagemap_lru_lock);
360 UnlockPage(page);
361 page_cache_release(page);
362 dispose_continue:
363 list_add(page_lru, &lru_cache);
365 goto out;
367 made_inode_progress:
368 page_cache_release(page);
369 made_buffer_progress:
370 UnlockPage(page);
371 page_cache_release(page);
372 ret = 1;
373 spin_lock(&pagemap_lru_lock);
374 /* nr_lru_pages needs the spinlock */
375 nr_lru_pages--;
377 out:
378 spin_unlock(&pagemap_lru_lock);
380 return ret;
383 static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
385 goto inside;
387 for (;;) {
388 page = page->next_hash;
389 inside:
390 if (!page)
391 goto not_found;
392 if (page->mapping != mapping)
393 continue;
394 if (page->index == offset)
395 break;
397 SetPageReferenced(page);
398 not_found:
399 return page;
403 * By the time this is called, the page is locked and
404 * we don't have to worry about any races any more.
406 * Start the IO..
408 static int writeout_one_page(struct page *page)
410 struct buffer_head *bh, *head = page->buffers;
412 bh = head;
413 do {
414 if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
415 continue;
417 bh->b_flushtime = 0;
418 ll_rw_block(WRITE, 1, &bh);
419 } while ((bh = bh->b_this_page) != head);
420 return 0;
423 static int waitfor_one_page(struct page *page)
425 int error = 0;
426 struct buffer_head *bh, *head = page->buffers;
428 bh = head;
429 do {
430 wait_on_buffer(bh);
431 if (buffer_req(bh) && !buffer_uptodate(bh))
432 error = -EIO;
433 } while ((bh = bh->b_this_page) != head);
434 return error;
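/*
 * Illustrative sketch (not part of the original file): the circular-list
 * walk used by writeout_one_page() and waitfor_one_page() above.  The
 * buffer_heads of a page form a singly linked ring through b_this_page, so
 * the iteration starts at page->buffers and stops when it comes back around
 * to the head.  The "struct ring_item" type is a hypothetical stand-in.
 * Guarded with #if 0.
 */
#if 0	/* example only */
struct ring_item {
	struct ring_item *next;		/* stand-in for bh->b_this_page */
	int dirty;
};

static int count_dirty(struct ring_item *head)
{
	struct ring_item *it = head;
	int n = 0;

	do {
		if (it->dirty)
			n++;
	} while ((it = it->next) != head);
	return n;
}
#endif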
437 static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end, int (*fn)(struct page *))
439 struct list_head *head, *curr;
440 struct page *page;
441 int retval = 0;
443 head = &inode->i_mapping->pages;
445 spin_lock(&pagecache_lock);
446 curr = head->next;
447 while (curr != head) {
448 page = list_entry(curr, struct page, list);
449 curr = curr->next;
450 if (!page->buffers)
451 continue;
452 if (page->index >= end)
453 continue;
454 if (page->index < start)
455 continue;
457 page_cache_get(page);
458 spin_unlock(&pagecache_lock);
459 lock_page(page);
461 /* The buffers could have been freed while we waited for the page lock */
462 if (page->buffers)
463 retval |= fn(page);
465 UnlockPage(page);
466 spin_lock(&pagecache_lock);
467 curr = page->list.next;
468 page_cache_release(page);
470 spin_unlock(&pagecache_lock);
472 return retval;
476 * Two-stage data sync: first start the IO, then go back and
477 * collect the information..
479 int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
481 int retval;
483 retval = do_buffer_fdatasync(inode, start_idx, end_idx, writeout_one_page);
484 retval |= do_buffer_fdatasync(inode, start_idx, end_idx, waitfor_one_page);
485 return retval;
489 * Add a page to the inode page cache.
491 * The caller must have locked the page and
492 * set all the page flags correctly..
494 void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
496 if (!PageLocked(page))
497 BUG();
499 page_cache_get(page);
500 spin_lock(&pagecache_lock);
501 page->index = index;
502 add_page_to_inode_queue(mapping, page);
503 __add_page_to_hash_queue(page, page_hash(mapping, index));
504 lru_cache_add(page);
505 spin_unlock(&pagecache_lock);
509 * This adds a page to the page cache, starting out as locked,
510 * owned by us, but unreferenced, not uptodate and with no errors.
512 static inline void __add_to_page_cache(struct page * page,
513 struct address_space *mapping, unsigned long offset,
514 struct page **hash)
516 struct page *alias;
517 unsigned long flags;
519 if (PageLocked(page))
520 BUG();
522 flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced));
523 page->flags = flags | (1 << PG_locked);
524 page_cache_get(page);
525 page->index = offset;
526 add_page_to_inode_queue(mapping, page);
527 __add_page_to_hash_queue(page, hash);
528 lru_cache_add(page);
529 alias = __find_page_nolock(mapping, offset, *hash);
530 if (alias != page)
531 BUG();
534 void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
536 spin_lock(&pagecache_lock);
537 __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
538 spin_unlock(&pagecache_lock);
541 static int add_to_page_cache_unique(struct page * page,
542 struct address_space *mapping, unsigned long offset,
543 struct page **hash)
545 int err;
546 struct page *alias;
548 spin_lock(&pagecache_lock);
549 alias = __find_page_nolock(mapping, offset, *hash);
551 err = 1;
552 if (!alias) {
553 __add_to_page_cache(page,mapping,offset,hash);
554 err = 0;
557 spin_unlock(&pagecache_lock);
558 return err;
562 * This adds the requested page to the page cache if it isn't already there,
563 * and schedules an I/O to read in its contents from disk.
565 static inline int page_cache_read(struct file * file, unsigned long offset)
567 struct inode *inode = file->f_dentry->d_inode;
568 struct address_space *mapping = inode->i_mapping;
569 struct page **hash = page_hash(mapping, offset);
570 struct page *page;
572 spin_lock(&pagecache_lock);
573 page = __find_page_nolock(mapping, offset, *hash);
574 spin_unlock(&pagecache_lock);
575 if (page)
576 return 0;
578 page = page_cache_alloc();
579 if (!page)
580 return -ENOMEM;
582 if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
583 int error = mapping->a_ops->readpage(file, page);
584 page_cache_release(page);
585 return error;
588 * We arrive here in the unlikely event that someone
589 * raced with us and added our page to the cache first.
591 page_cache_free(page);
592 return 0;
596 * Read in an entire cluster at once. A cluster is usually a 64k-
597 * aligned block that includes the page requested in "offset."
599 static int read_cluster_nonblocking(struct file * file, unsigned long offset,
600 unsigned long filesize)
602 unsigned long pages = CLUSTER_PAGES;
604 offset = CLUSTER_OFFSET(offset);
605 while ((pages-- > 0) && (offset < filesize)) {
606 int error = page_cache_read(file, offset);
607 if (error < 0)
608 return error;
609 offset ++;
612 return 0;
616 * Wait for a page to get unlocked.
618 * This must be called with the caller "holding" the page,
619 * i.e. with increased "page->count" so that the page won't
620 * go away during the wait..
622 void ___wait_on_page(struct page *page)
624 struct task_struct *tsk = current;
625 DECLARE_WAITQUEUE(wait, tsk);
627 add_wait_queue(&page->wait, &wait);
628 do {
629 sync_page(page);
630 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
631 if (!PageLocked(page))
632 break;
633 schedule();
634 } while (PageLocked(page));
635 tsk->state = TASK_RUNNING;
636 remove_wait_queue(&page->wait, &wait);
640 * Get an exclusive lock on the page..
642 void lock_page(struct page *page)
644 while (TryLockPage(page))
645 ___wait_on_page(page);
650 * a rather lightweight function, finding and getting a reference to a
651 * hashed page atomically, waiting for it if it's locked.
653 struct page * __find_get_page (struct address_space *mapping,
654 unsigned long offset, struct page **hash)
656 struct page *page;
659 * We scan the hash list read-only. Addition to and removal from
660 * the hash-list need a held write-lock.
662 repeat:
663 spin_lock(&pagecache_lock);
664 page = __find_page_nolock(mapping, offset, *hash);
665 if (page)
666 page_cache_get(page);
667 spin_unlock(&pagecache_lock);
669 /* Found the page, sleep if locked. */
670 if (page && PageLocked(page)) {
671 struct task_struct *tsk = current;
672 DECLARE_WAITQUEUE(wait, tsk);
674 sync_page(page);
676 __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
677 add_wait_queue(&page->wait, &wait);
679 if (PageLocked(page))
680 schedule();
681 __set_task_state(tsk, TASK_RUNNING);
682 remove_wait_queue(&page->wait, &wait);
685 * The page might have been unhashed meanwhile. It's
686 * not freed though because we hold a reference to it.
687 * If this is the case then it will be freed _here_,
688 * and we recheck the hash anyway.
690 page_cache_release(page);
691 goto repeat;
694 * It's not locked so we can return the page and we hold
695 * a reference to it.
697 return page;
701 * Get the lock to a page atomically.
703 struct page * __find_lock_page (struct address_space *mapping,
704 unsigned long offset, struct page **hash)
706 struct page *page;
709 * We scan the hash list read-only. Addition to and removal from
710 * the hash-list need a held write-lock.
712 repeat:
713 spin_lock(&pagecache_lock);
714 page = __find_page_nolock(mapping, offset, *hash);
715 if (page)
716 page_cache_get(page);
717 spin_unlock(&pagecache_lock);
719 /* Found the page, sleep if locked. */
720 if (page && TryLockPage(page)) {
721 struct task_struct *tsk = current;
722 DECLARE_WAITQUEUE(wait, tsk);
724 sync_page(page);
726 __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
727 add_wait_queue(&page->wait, &wait);
729 if (PageLocked(page))
730 schedule();
731 __set_task_state(tsk, TASK_RUNNING);
732 remove_wait_queue(&page->wait, &wait);
735 * The page might have been unhashed meanwhile. It's
736 * not freed though because we hold a reference to it.
737 * If this is the case then it will be freed _here_,
738 * and we recheck the hash anyway.
740 page_cache_release(page);
741 goto repeat;
744 * It's not locked so we can return the page and we hold
745 * a reference to it.
747 return page;
750 #if 0
751 #define PROFILE_READAHEAD
752 #define DEBUG_READAHEAD
753 #endif
756 * Read-ahead profiling information
757 * --------------------------------
758 * Every PROFILE_MAXREADCOUNT, the following information is written
759 * to the syslog:
760 * Percentage of asynchronous read-ahead.
761 * Average of read-ahead fields context value.
762 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
763 * to the syslog.
766 #ifdef PROFILE_READAHEAD
768 #define PROFILE_MAXREADCOUNT 1000
770 static unsigned long total_reada;
771 static unsigned long total_async;
772 static unsigned long total_ramax;
773 static unsigned long total_ralen;
774 static unsigned long total_rawin;
776 static void profile_readahead(int async, struct file *filp)
778 unsigned long flags;
780 ++total_reada;
781 if (async)
782 ++total_async;
784 total_ramax += filp->f_ramax;
785 total_ralen += filp->f_ralen;
786 total_rawin += filp->f_rawin;
788 if (total_reada > PROFILE_MAXREADCOUNT) {
789 save_flags(flags);
790 cli();
791 if (!(total_reada > PROFILE_MAXREADCOUNT)) {
792 restore_flags(flags);
793 return;
796 printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
797 total_ramax/total_reada,
798 total_ralen/total_reada,
799 total_rawin/total_reada,
800 (total_async*100)/total_reada);
801 #ifdef DEBUG_READAHEAD
802 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
803 filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
804 #endif
806 total_reada = 0;
807 total_async = 0;
808 total_ramax = 0;
809 total_ralen = 0;
810 total_rawin = 0;
812 restore_flags(flags);
815 #endif /* defined PROFILE_READAHEAD */
818 * Read-ahead context:
819 * -------------------
820 * The read ahead context fields of the "struct file" are the following:
821 * - f_raend : position of the first byte after the last page we tried to
822 * read ahead.
823 * - f_ramax : current read-ahead maximum size.
824 * - f_ralen : length of the current IO read block we tried to read-ahead.
825 * - f_rawin : length of the current read-ahead window.
826 * if last read-ahead was synchronous then
827 * f_rawin = f_ralen
828 * otherwise (was asynchronous)
829 * f_rawin = previous value of f_ralen + f_ralen
831 * Read-ahead limits:
832 * ------------------
833 * MIN_READAHEAD : minimum read-ahead size when reading ahead.
834 * MAX_READAHEAD : maximum read-ahead size when reading ahead.
836 * Synchronous read-ahead benefits:
837 * --------------------------------
838 * Using a reasonable IO transfer length from peripheral devices increases system
839 * performance.
840 * Reasonable means, in this context, not too large but not too small.
841 * The actual maximum value is:
842 * MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
843 * and 32K if defined (4K page size assumed).
845 * Asynchronous read-ahead benefits:
846 * ---------------------------------
847 * Overlapping the next read request with user process execution increases system
848 * performance.
850 * Read-ahead risks:
851 * -----------------
852 * We have to guess which further data are needed by the user process.
853 * If these data are often not really needed, it's bad for system
854 * performance.
855 * However, we know that files are often accessed sequentially by
856 * application programs, so it seems possible to make reasonably good
857 * guesses here.
858 * We only try to read ahead in files that seem to be read sequentially.
860 * Asynchronous read-ahead risks:
861 * ------------------------------
862 * In order to maximize overlapping, we must start some asynchronous read
863 * request from the device, as soon as possible.
864 * We must be very careful about:
865 * - The number of effective pending IO read requests.
866 * ONE seems to be the only reasonable value.
867 * - The total memory pool usage for the file access stream.
868 * This maximum memory usage is implicitly 2 IO read chunks:
869 * 2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
870 * 64k if defined (4K page size assumed).
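/*
 * Illustrative sketch (not part of the original file): a stand-alone
 * restatement of the read-ahead context bookkeeping described above.  The
 * struct and helper names are hypothetical; only the relationship between
 * f_ralen and f_rawin spelled out in the comment is modelled.  Guarded with
 * #if 0.
 */
#if 0	/* example only */
struct ra_example {
	unsigned long raend;	/* just past the last read-ahead page */
	unsigned long ramax;	/* current read-ahead maximum */
	unsigned long ralen;	/* length of the last read-ahead chunk */
	unsigned long rawin;	/* length of the current read-ahead window */
};

/* window length after a read-ahead pass */
static unsigned long ra_window(unsigned long prev_ralen,
			       unsigned long ralen, int was_async)
{
	/* synchronous: the window is just the new chunk;
	 * asynchronous: it also spans the previous chunk */
	return was_async ? prev_ralen + ralen : ralen;
}
#endif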
873 static inline int get_max_readahead(struct inode * inode)
875 if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
876 return MAX_READAHEAD;
877 return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
880 static void generic_file_readahead(int reada_ok,
881 struct file * filp, struct inode * inode,
882 struct page * page)
884 unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
885 unsigned long index = page->index;
886 unsigned long max_ahead, ahead;
887 unsigned long raend;
888 int max_readahead = get_max_readahead(inode);
890 raend = filp->f_raend;
891 max_ahead = 0;
894 * The current page is locked.
895 * If the current position is inside the previous read IO request, do not
896 * try to reread previously read ahead pages.
897 * Otherwise decide whether or not to read ahead some pages synchronously.
898 * If we are not going to read ahead, set the read ahead context for this
899 * page only.
901 if (PageLocked(page)) {
902 if (!filp->f_ralen || index >= raend || index + filp->f_ralen < raend) {
903 raend = index;
904 if (raend < end_index)
905 max_ahead = filp->f_ramax;
906 filp->f_rawin = 0;
907 filp->f_ralen = 1;
908 if (!max_ahead) {
909 filp->f_raend = index + filp->f_ralen;
910 filp->f_rawin += filp->f_ralen;
915 * The current page is not locked.
916 * If we were reading ahead and,
917 * if the current max read ahead size is not zero and,
918 * if the current position is inside the last read-ahead IO request,
919 * it is the moment to try to read ahead asynchronously.
920 * We will later force an unplug of the device in order to start the asynchronous read IO.
922 else if (reada_ok && filp->f_ramax && raend >= 1 &&
923 index <= raend && index + filp->f_ralen >= raend) {
925 * Add ONE page to max_ahead in order to try to have about the same IO max size
926 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
927 * Compute the position of the last page we have tried to read in order to
928 * begin to read ahead just at the next page.
930 raend -= 1;
931 if (raend < end_index)
932 max_ahead = filp->f_ramax + 1;
934 if (max_ahead) {
935 filp->f_rawin = filp->f_ralen;
936 filp->f_ralen = 0;
937 reada_ok = 2;
941 * Try to read ahead pages.
942 * We hope that ll_rw_blk() plug/unplug, coalescing, request sorting and the
943 * scheduler will do a good enough job to keep the actual IO requests from being too bad.
945 ahead = 0;
946 while (ahead < max_ahead) {
947 ahead ++;
948 if ((raend + ahead) >= end_index)
949 break;
950 if (page_cache_read(filp, raend + ahead) < 0)
951 break;
954 * If we tried to read ahead some pages,
955 * If we tried to read ahead asynchronously,
956 * Try to force unplug of the device in order to start an asynchronous
957 * read IO request.
958 * Update the read-ahead context.
959 * Store the length of the current read-ahead window.
960 * Double the current max read ahead size.
961 * This heuristic avoids doing large IO for files that are not really
962 * accessed sequentially.
964 if (ahead) {
965 if (reada_ok == 2) {
966 run_task_queue(&tq_disk);
969 filp->f_ralen += ahead;
970 filp->f_rawin += filp->f_ralen;
971 filp->f_raend = raend + ahead + 1;
973 filp->f_ramax += filp->f_ramax;
975 if (filp->f_ramax > max_readahead)
976 filp->f_ramax = max_readahead;
978 #ifdef PROFILE_READAHEAD
979 profile_readahead((reada_ok == 2), filp);
980 #endif
983 return;
988 * This is a generic file read routine, and uses the
989 * mapping->a_ops->readpage() function for the actual low-level
990 * stuff.
992 * This is really ugly. But the goto's actually try to clarify some
993 * of the logic when it comes to error handling etc.
995 void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
997 struct inode *inode = filp->f_dentry->d_inode;
998 struct address_space *mapping = inode->i_mapping;
999 unsigned long index, offset;
1000 struct page *cached_page;
1001 int reada_ok;
1002 int error;
1003 int max_readahead = get_max_readahead(inode);
1005 cached_page = NULL;
1006 index = *ppos >> PAGE_CACHE_SHIFT;
1007 offset = *ppos & ~PAGE_CACHE_MASK;
1010 * If the current position is outside the previous read-ahead window,
1011 * we reset the current read-ahead context and set read ahead max to zero
1012 * (it will be set to just the needed value later),
1013 * otherwise, we assume that the file accesses are sequential enough to
1014 * continue read-ahead.
1016 if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {
1017 reada_ok = 0;
1018 filp->f_raend = 0;
1019 filp->f_ralen = 0;
1020 filp->f_ramax = 0;
1021 filp->f_rawin = 0;
1022 } else {
1023 reada_ok = 1;
1026 * Adjust the current value of read-ahead max.
1027 * If the read operation stays within the first half page, force no readahead.
1028 * Otherwise try to increase read-ahead max just enough to do the read request.
1029 * Then use at least MIN_READAHEAD if read-ahead is ok,
1030 * and at most MAX_READAHEAD in all cases.
1032 if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
1033 filp->f_ramax = 0;
1034 } else {
1035 unsigned long needed;
1037 needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;
1039 if (filp->f_ramax < needed)
1040 filp->f_ramax = needed;
1042 if (reada_ok && filp->f_ramax < MIN_READAHEAD)
1043 filp->f_ramax = MIN_READAHEAD;
1044 if (filp->f_ramax > max_readahead)
1045 filp->f_ramax = max_readahead;
1048 for (;;) {
1049 struct page *page, **hash;
1050 unsigned long end_index, nr;
1052 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1053 if (index > end_index)
1054 break;
1055 nr = PAGE_CACHE_SIZE;
1056 if (index == end_index) {
1057 nr = inode->i_size & ~PAGE_CACHE_MASK;
1058 if (nr <= offset)
1059 break;
1062 nr = nr - offset;
1065 * Try to find the data in the page cache..
1067 hash = page_hash(mapping, index);
1069 spin_lock(&pagecache_lock);
1070 page = __find_page_nolock(mapping, index, *hash);
1071 if (!page)
1072 goto no_cached_page;
1073 found_page:
1074 page_cache_get(page);
1075 spin_unlock(&pagecache_lock);
1077 if (!Page_Uptodate(page))
1078 goto page_not_up_to_date;
1079 page_ok:
1081 * Ok, we have the page, and it's up-to-date, so
1082 * now we can copy it to user space...
1084 * The actor routine returns how many bytes were actually used..
1085 * NOTE! This may not be the same as how much of a user buffer
1086 * we filled up (we may be padding etc), so we can only update
1087 * "pos" here (the actor routine has to update the user buffer
1088 * pointers and the remaining count).
1090 nr = actor(desc, page, offset, nr);
1091 offset += nr;
1092 index += offset >> PAGE_CACHE_SHIFT;
1093 offset &= ~PAGE_CACHE_MASK;
1095 page_cache_release(page);
1096 if (nr && desc->count)
1097 continue;
1098 break;
1101 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
1103 page_not_up_to_date:
1104 generic_file_readahead(reada_ok, filp, inode, page);
1106 if (Page_Uptodate(page))
1107 goto page_ok;
1109 /* Get exclusive access to the page ... */
1110 lock_page(page);
1111 if (Page_Uptodate(page)) {
1112 UnlockPage(page);
1113 goto page_ok;
1116 readpage:
1117 /* ... and start the actual read. The read will unlock the page. */
1118 error = mapping->a_ops->readpage(filp, page);
1120 if (!error) {
1121 if (Page_Uptodate(page))
1122 goto page_ok;
1124 /* Again, try some read-ahead while waiting for the page to finish.. */
1125 generic_file_readahead(reada_ok, filp, inode, page);
1126 wait_on_page(page);
1127 if (Page_Uptodate(page))
1128 goto page_ok;
1129 error = -EIO;
1132 /* UHHUH! A synchronous read error occurred. Report it */
1133 desc->error = error;
1134 page_cache_release(page);
1135 break;
1137 no_cached_page:
1139 * Ok, it wasn't cached, so we need to create a new
1140 * page..
1142 * We get here with the page cache lock held.
1144 if (!cached_page) {
1145 spin_unlock(&pagecache_lock);
1146 cached_page = page_cache_alloc();
1147 if (!cached_page) {
1148 desc->error = -ENOMEM;
1149 break;
1153 * Somebody may have added the page while we
1154 * dropped the page cache lock. Check for that.
1156 spin_lock(&pagecache_lock);
1157 page = __find_page_nolock(mapping, index, *hash);
1158 if (page)
1159 goto found_page;
1163 * Ok, add the new page to the hash-queues...
1165 page = cached_page;
1166 __add_to_page_cache(page, mapping, index, hash);
1167 spin_unlock(&pagecache_lock);
1168 cached_page = NULL;
1170 goto readpage;
1173 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1174 filp->f_reada = 1;
1175 if (cached_page)
1176 page_cache_free(cached_page);
1177 UPDATE_ATIME(inode);
1180 static int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1182 unsigned long kaddr;
1183 unsigned long left, count = desc->count;
1185 if (size > count)
1186 size = count;
1188 kaddr = kmap(page);
1189 left = __copy_to_user(desc->buf, (void *)(kaddr + offset), size);
1190 kunmap(page);
1192 if (left) {
1193 size -= left;
1194 desc->error = -EFAULT;
1196 desc->count = count - size;
1197 desc->written += size;
1198 desc->buf += size;
1199 return size;
1203 * This is the "read()" routine for all filesystems
1204 * that can use the page cache directly.
1206 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1208 ssize_t retval;
1210 retval = -EFAULT;
1211 if (access_ok(VERIFY_WRITE, buf, count)) {
1212 retval = 0;
1214 if (count) {
1215 read_descriptor_t desc;
1217 desc.written = 0;
1218 desc.count = count;
1219 desc.buf = buf;
1220 desc.error = 0;
1221 do_generic_file_read(filp, ppos, &desc, file_read_actor);
1223 retval = desc.written;
1224 if (!retval)
1225 retval = desc.error;
1228 return retval;
1231 static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
1233 unsigned long kaddr;
1234 ssize_t written;
1235 unsigned long count = desc->count;
1236 struct file *file = (struct file *) desc->buf;
1237 mm_segment_t old_fs;
1239 if (size > count)
1240 size = count;
1241 old_fs = get_fs();
1242 set_fs(KERNEL_DS);
1244 kaddr = kmap(page);
1245 written = file->f_op->write(file, (char *)kaddr + offset,
1246 size, &file->f_pos);
1247 kunmap(page);
1248 set_fs(old_fs);
1249 if (written < 0) {
1250 desc->error = written;
1251 written = 0;
1253 desc->count = count - written;
1254 desc->written += written;
1255 return written;
1258 asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
1260 ssize_t retval;
1261 struct file * in_file, * out_file;
1262 struct inode * in_inode, * out_inode;
1265 * Get input file, and verify that it is ok..
1267 retval = -EBADF;
1268 in_file = fget(in_fd);
1269 if (!in_file)
1270 goto out;
1271 if (!(in_file->f_mode & FMODE_READ))
1272 goto fput_in;
1273 retval = -EINVAL;
1274 in_inode = in_file->f_dentry->d_inode;
1275 if (!in_inode)
1276 goto fput_in;
1277 if (!in_inode->i_mapping->a_ops->readpage)
1278 goto fput_in;
1279 retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
1280 if (retval)
1281 goto fput_in;
1284 * Get output file, and verify that it is ok..
1286 retval = -EBADF;
1287 out_file = fget(out_fd);
1288 if (!out_file)
1289 goto fput_in;
1290 if (!(out_file->f_mode & FMODE_WRITE))
1291 goto fput_out;
1292 retval = -EINVAL;
1293 if (!out_file->f_op || !out_file->f_op->write)
1294 goto fput_out;
1295 out_inode = out_file->f_dentry->d_inode;
1296 if (!out_inode)
1297 goto fput_out;
1298 retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
1299 if (retval)
1300 goto fput_out;
1302 retval = 0;
1303 if (count) {
1304 read_descriptor_t desc;
1305 loff_t pos = 0, *ppos;
1307 retval = -EFAULT;
1308 ppos = &in_file->f_pos;
1309 if (offset) {
1310 if (get_user(pos, offset))
1311 goto fput_out;
1312 ppos = &pos;
1315 desc.written = 0;
1316 desc.count = count;
1317 desc.buf = (char *) out_file;
1318 desc.error = 0;
1319 do_generic_file_read(in_file, ppos, &desc, file_send_actor);
1321 retval = desc.written;
1322 if (!retval)
1323 retval = desc.error;
1324 if (offset)
1325 put_user(pos, offset);
1328 fput_out:
1329 fput(out_file);
1330 fput_in:
1331 fput(in_file);
1332 out:
1333 return retval;
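/*
 * Illustrative user-space sketch (not part of the original file): copying a
 * whole file with the sendfile(2) system call implemented above, which
 * drives do_generic_file_read() with file_send_actor as the actor.  Error
 * handling is kept minimal; the helper name is hypothetical.  Guarded with
 * #if 0 so it does not affect the build.
 */
#if 0	/* example only */
#include <sys/sendfile.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

static int copy_file(const char *src, const char *dst)
{
	struct stat st;
	off_t off = 0;
	int in = open(src, O_RDONLY);
	int out = open(dst, O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (in < 0 || out < 0 || fstat(in, &st) < 0)
		return -1;

	while (off < st.st_size) {
		ssize_t n = sendfile(out, in, &off, st.st_size - off);
		if (n <= 0)
			break;			/* error or unexpected EOF */
	}
	close(out);
	close(in);
	return off == st.st_size ? 0 : -1;
}
#endif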
1337 * Read-ahead and flush behind for MADV_SEQUENTIAL areas. Since we are
1338 * sure this is sequential access, we don't need a flexible read-ahead
1339 * window size -- we can always use a large fixed size window.
1341 static void nopage_sequential_readahead(struct vm_area_struct * vma,
1342 unsigned long pgoff, unsigned long filesize)
1344 unsigned long ra_window;
1346 ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
1347 ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);
1349 /* vm_raend is zero if we haven't read ahead in this area yet. */
1350 if (vma->vm_raend == 0)
1351 vma->vm_raend = vma->vm_pgoff + ra_window;
1354 * If we've just faulted the page half-way through our window,
1355 * then schedule reads for the next window, and release the
1356 * pages in the previous window.
1358 if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
1359 unsigned long start = vma->vm_pgoff + vma->vm_raend;
1360 unsigned long end = start + ra_window;
1362 if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
1363 end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;
1364 if (start > end)
1365 return;
1367 while ((start < end) && (start < filesize)) {
1368 if (read_cluster_nonblocking(vma->vm_file,
1369 start, filesize) < 0)
1370 break;
1371 start += CLUSTER_PAGES;
1373 run_task_queue(&tq_disk);
1375 /* if we're far enough past the beginning of this area,
1376 recycle pages that are in the previous window. */
1377 if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
1378 unsigned long window = ra_window << PAGE_SHIFT;
1380 end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
1381 end -= window + window;
1382 filemap_sync(vma, end - window, window, MS_INVALIDATE);
1385 vma->vm_raend += ra_window;
1388 return;
1392 * filemap_nopage() is invoked via the vma operations vector for a
1393 * mapped memory region to read in file data during a page fault.
1395 * The goto's are kind of ugly, but this streamlines the normal case of having
1396 * it in the page cache, and handles the special cases reasonably without
1397 * having a lot of duplicated code.
1399 struct page * filemap_nopage(struct vm_area_struct * area,
1400 unsigned long address, int no_share)
1402 int error;
1403 struct file *file = area->vm_file;
1404 struct inode *inode = file->f_dentry->d_inode;
1405 struct address_space *mapping = inode->i_mapping;
1406 struct page *page, **hash, *old_page;
1407 unsigned long size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1409 unsigned long pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1412 * Semantics for shared and private memory areas are different
1413 * past the end of the file. A shared mapping past the last page
1414 * of the file is an error and results in a SIGBUS, while a
1415 * private mapping just maps in a zero page.
1417 if ((pgoff >= size) && (area->vm_mm == current->mm))
1418 return NULL;
1421 * Do we have something in the page cache already?
1423 hash = page_hash(mapping, pgoff);
1424 retry_find:
1425 page = __find_get_page(mapping, pgoff, hash);
1426 if (!page)
1427 goto no_cached_page;
1430 * Ok, found a page in the page cache, now we need to check
1431 * that it's up-to-date.
1433 if (!Page_Uptodate(page))
1434 goto page_not_uptodate;
1436 success:
1438 * Try read-ahead for sequential areas.
1440 if (VM_SequentialReadHint(area))
1441 nopage_sequential_readahead(area, pgoff, size);
1444 * Found the page and have a reference on it, need to check sharing
1445 * and possibly copy it over to another page..
1447 old_page = page;
1448 if (no_share) {
1449 struct page *new_page = page_cache_alloc();
1451 if (new_page) {
1452 copy_user_highpage(new_page, old_page, address);
1453 flush_page_to_ram(new_page);
1454 } else
1455 new_page = NOPAGE_OOM;
1456 page_cache_release(page);
1457 return new_page;
1460 flush_page_to_ram(old_page);
1461 return old_page;
1463 no_cached_page:
1465 * If the requested offset is within our file, try to read a whole
1466 * cluster of pages at once.
1468 * Otherwise, we're off the end of a privately mapped file,
1469 * so we need to map a zero page.
1471 if ((pgoff < size) && !VM_RandomReadHint(area))
1472 error = read_cluster_nonblocking(file, pgoff, size);
1473 else
1474 error = page_cache_read(file, pgoff);
1477 * The page we want has now been added to the page cache.
1478 * In the unlikely event that someone removed it in the
1479 * meantime, we'll just come back here and read it again.
1481 if (error >= 0)
1482 goto retry_find;
1485 * An error return from page_cache_read can result if the
1486 * system is low on memory, or a problem occurs while trying
1487 * to schedule I/O.
1489 if (error == -ENOMEM)
1490 return NOPAGE_OOM;
1491 return NULL;
1493 page_not_uptodate:
1494 lock_page(page);
1495 if (Page_Uptodate(page)) {
1496 UnlockPage(page);
1497 goto success;
1500 if (!mapping->a_ops->readpage(file, page)) {
1501 wait_on_page(page);
1502 if (Page_Uptodate(page))
1503 goto success;
1507 * Umm, take care of errors if the page isn't up-to-date.
1508 * Try to re-read it _once_. We do this synchronously,
1509 * because there really aren't any performance issues here
1510 * and we need to check for errors.
1512 lock_page(page);
1513 if (Page_Uptodate(page)) {
1514 UnlockPage(page);
1515 goto success;
1517 ClearPageError(page);
1518 if (!mapping->a_ops->readpage(file, page)) {
1519 wait_on_page(page);
1520 if (Page_Uptodate(page))
1521 goto success;
1525 * Things didn't work out. Return zero to tell the
1526 * mm layer so, possibly freeing the page cache page first.
1528 page_cache_release(page);
1529 return NULL;
1532 static int filemap_write_page(struct file *file,
1533 struct page * page,
1534 int wait)
1537 * If a task terminates while we're swapping the page, the vma
1538 * and file could be released: try_to_swap_out has done a get_file.
1539 * vma/file is guaranteed to exist in the unmap/sync cases because
1540 * mmap_sem is held.
1542 return page->mapping->a_ops->writepage(file, page);
1547 * The page cache takes care of races between somebody
1548 * trying to swap something out and swap something in
1549 * at the same time..
1551 extern void wakeup_bdflush(int);
1552 int filemap_swapout(struct page * page, struct file * file)
1554 int retval = filemap_write_page(file, page, 0);
1555 wakeup_bdflush(0);
1556 return retval;
1559 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
1560 unsigned long address, unsigned int flags)
1562 unsigned long pgoff;
1563 pte_t pte = *ptep;
1564 struct page *page;
1565 int error;
1567 if (!(flags & MS_INVALIDATE)) {
1568 if (!pte_present(pte))
1569 return 0;
1570 if (!pte_dirty(pte))
1571 return 0;
1572 flush_page_to_ram(pte_page(pte));
1573 flush_cache_page(vma, address);
1574 set_pte(ptep, pte_mkclean(pte));
1575 flush_tlb_page(vma, address);
1576 page = pte_page(pte);
1577 page_cache_get(page);
1578 } else {
1579 if (pte_none(pte))
1580 return 0;
1581 flush_cache_page(vma, address);
1582 pte_clear(ptep);
1583 flush_tlb_page(vma, address);
1584 if (!pte_present(pte)) {
1585 swap_free(pte_to_swp_entry(pte));
1586 return 0;
1588 page = pte_page(pte);
1589 if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
1590 page_cache_free(page);
1591 return 0;
1594 pgoff = (address - vma->vm_start) >> PAGE_CACHE_SHIFT;
1595 pgoff += vma->vm_pgoff;
1596 if (page->index != pgoff) {
1597 printk("weirdness: pgoff=%lu index=%lu address=%lu vm_start=%lu vm_pgoff=%lu\n",
1598 pgoff, page->index, address, vma->vm_start, vma->vm_pgoff);
1600 lock_page(page);
1601 error = filemap_write_page(vma->vm_file, page, 1);
1602 UnlockPage(page);
1603 page_cache_free(page);
1604 return error;
1607 static inline int filemap_sync_pte_range(pmd_t * pmd,
1608 unsigned long address, unsigned long size,
1609 struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
1611 pte_t * pte;
1612 unsigned long end;
1613 int error;
1615 if (pmd_none(*pmd))
1616 return 0;
1617 if (pmd_bad(*pmd)) {
1618 pmd_ERROR(*pmd);
1619 pmd_clear(pmd);
1620 return 0;
1622 pte = pte_offset(pmd, address);
1623 offset += address & PMD_MASK;
1624 address &= ~PMD_MASK;
1625 end = address + size;
1626 if (end > PMD_SIZE)
1627 end = PMD_SIZE;
1628 error = 0;
1629 do {
1630 error |= filemap_sync_pte(pte, vma, address + offset, flags);
1631 address += PAGE_SIZE;
1632 pte++;
1633 } while (address && (address < end));
1634 return error;
1637 static inline int filemap_sync_pmd_range(pgd_t * pgd,
1638 unsigned long address, unsigned long size,
1639 struct vm_area_struct *vma, unsigned int flags)
1641 pmd_t * pmd;
1642 unsigned long offset, end;
1643 int error;
1645 if (pgd_none(*pgd))
1646 return 0;
1647 if (pgd_bad(*pgd)) {
1648 pgd_ERROR(*pgd);
1649 pgd_clear(pgd);
1650 return 0;
1652 pmd = pmd_offset(pgd, address);
1653 offset = address & PGDIR_MASK;
1654 address &= ~PGDIR_MASK;
1655 end = address + size;
1656 if (end > PGDIR_SIZE)
1657 end = PGDIR_SIZE;
1658 error = 0;
1659 do {
1660 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
1661 address = (address + PMD_SIZE) & PMD_MASK;
1662 pmd++;
1663 } while (address && (address < end));
1664 return error;
1667 int filemap_sync(struct vm_area_struct * vma, unsigned long address,
1668 size_t size, unsigned int flags)
1670 pgd_t * dir;
1671 unsigned long end = address + size;
1672 int error = 0;
1674 dir = pgd_offset(vma->vm_mm, address);
1675 flush_cache_range(vma->vm_mm, end - size, end);
1676 if (address >= end)
1677 BUG();
1678 do {
1679 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
1680 address = (address + PGDIR_SIZE) & PGDIR_MASK;
1681 dir++;
1682 } while (address && (address < end));
1683 flush_tlb_range(vma->vm_mm, end - size, end);
1684 return error;
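/*
 * Illustrative sketch (not part of the original file): the stepping pattern
 * used by filemap_sync() and its helpers, which advances the address to the
 * next page-directory boundary on each iteration.  A 4MB PGDIR_SIZE
 * (two-level x86 paging) is assumed for the example; the EX_ names are
 * hypothetical.  Guarded with #if 0.
 */
#if 0	/* example only */
#include <stdio.h>

#define EX_PGDIR_SHIFT	22
#define EX_PGDIR_SIZE	(1UL << EX_PGDIR_SHIFT)
#define EX_PGDIR_MASK	(~(EX_PGDIR_SIZE - 1))

int main(void)
{
	unsigned long address = 0x08049000UL;	/* some mapped address */
	unsigned long end = address + 0x900000UL;

	do {
		/* each chunk is handled by one pmd-range call */
		printf("chunk at %#lx\n", address);
		address = (address + EX_PGDIR_SIZE) & EX_PGDIR_MASK;
	} while (address && (address < end));
	return 0;
}
#endif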
1688 * This handles (potentially partial) area unmaps..
1690 static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
1692 filemap_sync(vma, start, len, MS_ASYNC);
1696 * Shared mappings need to be able to do the right thing at
1697 * close/unmap/sync. They will also use the private file as
1698 * backing-store for swapping..
1700 static struct vm_operations_struct file_shared_mmap = {
1701 unmap: filemap_unmap, /* unmap - we need to sync the pages */
1702 sync: filemap_sync,
1703 nopage: filemap_nopage,
1704 swapout: filemap_swapout,
1708 * Private mappings just need to be able to load in the map.
1710 * (This is actually used for shared mappings as well, if we
1711 * know they can't ever get write permissions..)
1713 static struct vm_operations_struct file_private_mmap = {
1714 nopage: filemap_nopage,
1717 /* This is used for a general mmap of a disk file */
1719 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1721 struct vm_operations_struct * ops;
1722 struct inode *inode = file->f_dentry->d_inode;
1724 ops = &file_private_mmap;
1725 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1726 if (!inode->i_mapping->a_ops->writepage)
1727 return -EINVAL;
1728 ops = &file_shared_mmap;
1730 if (!inode->i_sb || !S_ISREG(inode->i_mode))
1731 return -EACCES;
1732 if (!inode->i_mapping->a_ops->readpage)
1733 return -ENOEXEC;
1734 UPDATE_ATIME(inode);
1735 vma->vm_ops = ops;
1736 return 0;
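/*
 * Illustrative user-space sketch (not part of the original file): a private,
 * read-only file mapping of the kind generic_file_mmap() sets up, where the
 * first access to each page goes through filemap_nopage() above.  Guarded
 * with #if 0 so it does not affect the build.
 */
#if 0	/* example only */
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

int main(int argc, char **argv)
{
	struct stat st;
	char *p;
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0)
		return 1;
	p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	/* the first touch of each page faults into filemap_nopage() */
	printf("first byte: 0x%02x\n", (unsigned int)(unsigned char) p[0]);
	munmap(p, st.st_size);
	close(fd);
	return 0;
}
#endif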
1740 * The msync() system call.
1743 static int msync_interval(struct vm_area_struct * vma,
1744 unsigned long start, unsigned long end, int flags)
1746 if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
1747 int error;
1748 error = vma->vm_ops->sync(vma, start, end-start, flags);
1749 if (!error && (flags & MS_SYNC)) {
1750 struct file * file = vma->vm_file;
1751 if (file && file->f_op && file->f_op->fsync) {
1752 down(&file->f_dentry->d_inode->i_sem);
1753 error = file->f_op->fsync(file, file->f_dentry, 1);
1754 up(&file->f_dentry->d_inode->i_sem);
1757 return error;
1759 return 0;
1762 asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
1764 unsigned long end;
1765 struct vm_area_struct * vma;
1766 int unmapped_error, error = -EINVAL;
1768 down(&current->mm->mmap_sem);
1769 if (start & ~PAGE_MASK)
1770 goto out;
1771 len = (len + ~PAGE_MASK) & PAGE_MASK;
1772 end = start + len;
1773 if (end < start)
1774 goto out;
1775 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1776 goto out;
1777 error = 0;
1778 if (end == start)
1779 goto out;
1781 * If the interval [start,end) covers some unmapped address ranges,
1782 * just ignore them, but return -EFAULT at the end.
1784 vma = find_vma(current->mm, start);
1785 unmapped_error = 0;
1786 for (;;) {
1787 /* Still start < end. */
1788 error = -EFAULT;
1789 if (!vma)
1790 goto out;
1791 /* Here start < vma->vm_end. */
1792 if (start < vma->vm_start) {
1793 unmapped_error = -EFAULT;
1794 start = vma->vm_start;
1796 /* Here vma->vm_start <= start < vma->vm_end. */
1797 if (end <= vma->vm_end) {
1798 if (start < end) {
1799 error = msync_interval(vma, start, end, flags);
1800 if (error)
1801 goto out;
1803 error = unmapped_error;
1804 goto out;
1806 /* Here vma->vm_start <= start < vma->vm_end < end. */
1807 error = msync_interval(vma, start, vma->vm_end, flags);
1808 if (error)
1809 goto out;
1810 start = vma->vm_end;
1811 vma = vma->vm_next;
1813 out:
1814 up(&current->mm->mmap_sem);
1815 return error;
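/*
 * Illustrative user-space sketch (not part of the original file): flushing a
 * shared file mapping with msync(2), which reaches msync_interval() and
 * filemap_sync() above; MS_SYNC additionally waits via the file's fsync
 * method.  The helper name is hypothetical and the tag is assumed to fit in
 * the first page.  Guarded with #if 0.
 */
#if 0	/* example only */
#include <sys/mman.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int update_header(const char *path, const char *tag, size_t len)
{
	char *p;
	int fd = open(path, O_RDWR);

	if (fd < 0)
		return -1;
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return -1;
	memcpy(p, tag, len);			/* dirty the first page */
	if (msync(p, 4096, MS_SYNC) < 0)	/* write it back and wait */
		return -1;
	munmap(p, 4096);
	close(fd);
	return 0;
}
#endif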
1818 static inline void setup_read_behavior(struct vm_area_struct * vma,
1819 int behavior)
1821 VM_ClearReadHint(vma);
1822 switch(behavior) {
1823 case MADV_SEQUENTIAL:
1824 vma->vm_flags |= VM_SEQ_READ;
1825 break;
1826 case MADV_RANDOM:
1827 vma->vm_flags |= VM_RAND_READ;
1828 break;
1829 default:
1830 break;
1832 return;
1835 static long madvise_fixup_start(struct vm_area_struct * vma,
1836 unsigned long end, int behavior)
1838 struct vm_area_struct * n;
1840 n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1841 if (!n)
1842 return -EAGAIN;
1843 *n = *vma;
1844 n->vm_end = end;
1845 setup_read_behavior(n, behavior);
1846 n->vm_raend = 0;
1847 get_file(n->vm_file);
1848 if (n->vm_ops && n->vm_ops->open)
1849 n->vm_ops->open(n);
1850 vmlist_modify_lock(vma->vm_mm);
1851 vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
1852 vma->vm_start = end;
1853 insert_vm_struct(current->mm, n);
1854 vmlist_modify_unlock(vma->vm_mm);
1855 return 0;
1858 static long madvise_fixup_end(struct vm_area_struct * vma,
1859 unsigned long start, int behavior)
1861 struct vm_area_struct * n;
1863 n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1864 if (!n)
1865 return -EAGAIN;
1866 *n = *vma;
1867 n->vm_start = start;
1868 n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
1869 setup_read_behavior(n, behavior);
1870 n->vm_raend = 0;
1871 get_file(n->vm_file);
1872 if (n->vm_ops && n->vm_ops->open)
1873 n->vm_ops->open(n);
1874 vmlist_modify_lock(vma->vm_mm);
1875 vma->vm_end = start;
1876 insert_vm_struct(current->mm, n);
1877 vmlist_modify_unlock(vma->vm_mm);
1878 return 0;
1881 static long madvise_fixup_middle(struct vm_area_struct * vma,
1882 unsigned long start, unsigned long end, int behavior)
1884 struct vm_area_struct * left, * right;
1886 left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1887 if (!left)
1888 return -EAGAIN;
1889 right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1890 if (!right) {
1891 kmem_cache_free(vm_area_cachep, left);
1892 return -EAGAIN;
1894 *left = *vma;
1895 *right = *vma;
1896 left->vm_end = start;
1897 right->vm_start = end;
1898 right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
1899 left->vm_raend = 0;
1900 right->vm_raend = 0;
1901 atomic_add(2, &vma->vm_file->f_count);
1903 if (vma->vm_ops && vma->vm_ops->open) {
1904 vma->vm_ops->open(left);
1905 vma->vm_ops->open(right);
1907 vmlist_modify_lock(vma->vm_mm);
1908 vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
1909 vma->vm_start = start;
1910 vma->vm_end = end;
1911 setup_read_behavior(vma, behavior);
1912 vma->vm_raend = 0;
1913 insert_vm_struct(current->mm, left);
1914 insert_vm_struct(current->mm, right);
1915 vmlist_modify_unlock(vma->vm_mm);
1916 return 0;
1920 * We can potentially split a vm area into separate
1921 * areas, each area with its own behavior.
1923 static long madvise_behavior(struct vm_area_struct * vma,
1924 unsigned long start, unsigned long end, int behavior)
1926 int error = 0;
1928 /* This caps the number of vma's this process can own */
1929 if (vma->vm_mm->map_count > MAX_MAP_COUNT)
1930 return -ENOMEM;
1932 if (start == vma->vm_start) {
1933 if (end == vma->vm_end) {
1934 setup_read_behavior(vma, behavior);
1935 vma->vm_raend = 0;
1936 } else
1937 error = madvise_fixup_start(vma, end, behavior);
1938 } else {
1939 if (end == vma->vm_end)
1940 error = madvise_fixup_end(vma, start, behavior);
1941 else
1942 error = madvise_fixup_middle(vma, start, end, behavior);
1945 return error;
1949 * Schedule all required I/O operations, then run the disk queue
1950 * to make sure they are started. Do not wait for completion.
1952 static long madvise_willneed(struct vm_area_struct * vma,
1953 unsigned long start, unsigned long end)
1955 long error = -EBADF;
1956 struct file * file;
1957 unsigned long size, rlim_rss;
1959 /* Doesn't work if there's no mapped file. */
1960 if (!vma->vm_file)
1961 return error;
1962 file = vma->vm_file;
1963 size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
1964 PAGE_CACHE_SHIFT;
1966 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1967 if (end > vma->vm_end)
1968 end = vma->vm_end;
1969 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1971 /* Make sure this doesn't exceed the process's max rss. */
1972 error = -EIO;
1973 rlim_rss = current->rlim ? current->rlim[RLIMIT_RSS].rlim_cur :
1974 LONG_MAX; /* default: see resource.h */
1975 if ((vma->vm_mm->rss + (end - start)) > rlim_rss)
1976 return error;
1978 /* round to cluster boundaries if this isn't a "random" area. */
1979 if (!VM_RandomReadHint(vma)) {
1980 start = CLUSTER_OFFSET(start);
1981 end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);
1983 while ((start < end) && (start < size)) {
1984 error = read_cluster_nonblocking(file, start, size);
1985 start += CLUSTER_PAGES;
1986 if (error < 0)
1987 break;
1989 } else {
1990 while ((start < end) && (start < size)) {
1991 error = page_cache_read(file, start);
1992 start++;
1993 if (error < 0)
1994 break;
1998 /* Don't wait for someone else to push these requests. */
1999 run_task_queue(&tq_disk);
2001 return error;
2005 * Application no longer needs these pages. If the pages are dirty,
2006 * it's OK to just throw them away. The app will be more careful about
2007 * data it wants to keep. Be sure to free swap resources too. The
2008 * zap_page_range call sets things up for shrink_mmap to actually free
2009 * these pages later if no one else has touched them in the meantime,
2010 * although we could add these pages to a global reuse list for
2011 * shrink_mmap to pick up before reclaiming other pages.
2013 * NB: This interface discards data rather than pushes it out to swap,
2014 * as some implementations do. This has performance implications for
2015 * applications like large transactional databases which want to discard
2016 * pages in anonymous maps after committing to backing store the data
2017 * that was kept in them. There is no reason to write this data out to
2018 * the swap area if the application is discarding it.
2020 * An interface that causes the system to free clean pages and flush
2021 * dirty pages is already available as msync(MS_INVALIDATE).
2023 static long madvise_dontneed(struct vm_area_struct * vma,
2024 unsigned long start, unsigned long end)
2026 if (vma->vm_flags & VM_LOCKED)
2027 return -EINVAL;
2029 flush_cache_range(vma->vm_mm, start, end);
2030 zap_page_range(vma->vm_mm, start, end - start);
2031 flush_tlb_range(vma->vm_mm, start, end);
2032 return 0;
2035 static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
2036 unsigned long end, int behavior)
2038 long error = -EBADF;
2040 switch (behavior) {
2041 case MADV_NORMAL:
2042 case MADV_SEQUENTIAL:
2043 case MADV_RANDOM:
2044 error = madvise_behavior(vma, start, end, behavior);
2045 break;
2047 case MADV_WILLNEED:
2048 error = madvise_willneed(vma, start, end);
2049 break;
2051 case MADV_DONTNEED:
2052 error = madvise_dontneed(vma, start, end);
2053 break;
2055 default:
2056 error = -EINVAL;
2057 break;
2060 return error;
2064 * The madvise(2) system call.
2066 * Applications can use madvise() to advise the kernel how it should
2067 * handle paging I/O in this VM area. The idea is to help the kernel
2068 * use appropriate read-ahead and caching techniques. The information
2069 * provided is advisory only, and can be safely disregarded by the
2070 * kernel without affecting the correct operation of the application.
2072 * behavior values:
2073 * MADV_NORMAL - the default behavior is to read clusters. This
2074 * results in some read-ahead and read-behind.
2075 * MADV_RANDOM - the system should read the minimum amount of data
2076 * on any access, since it is unlikely that the appli-
2077 * cation will need more than what it asks for.
2078 * MADV_SEQUENTIAL - pages in the given range will probably be accessed
2079 * once, so they can be aggressively read ahead, and
2080 * can be freed soon after they are accessed.
2081 * MADV_WILLNEED - the application is notifying the system to read
2082 * some pages ahead.
2083 * MADV_DONTNEED - the application is finished with the given range,
2084 * so the kernel can free resources associated with it.
2086 * return values:
2087 * zero - success
2088 * -EINVAL - start + len < 0, start is not page-aligned,
2089 * "behavior" is not a valid value, or application
2090 * is attempting to release locked or shared pages.
2091 * -ENOMEM - addresses in the specified range are not currently
2092 * mapped, or are outside the AS of the process.
2093 * -EIO - an I/O error occurred while paging in data.
2094 * -EBADF - map exists, but area maps something that isn't a file.
2095 * -EAGAIN - a kernel resource was temporarily unavailable.
2097 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
2099 unsigned long end;
2100 struct vm_area_struct * vma;
2101 int unmapped_error = 0;
2102 int error = -EINVAL;
2104 down(&current->mm->mmap_sem);
2106 if (start & ~PAGE_MASK)
2107 goto out;
2108 len = (len + ~PAGE_MASK) & PAGE_MASK;
2109 end = start + len;
2110 if (end < start)
2111 goto out;
2113 error = 0;
2114 if (end == start)
2115 goto out;
2118 * If the interval [start,end) covers some unmapped address
2119 * ranges, just ignore them, but return -ENOMEM at the end.
2121 vma = find_vma(current->mm, start);
2122 for (;;) {
2123 /* Still start < end. */
2124 error = -ENOMEM;
2125 if (!vma)
2126 goto out;
2128 /* Here start < vma->vm_end. */
2129 if (start < vma->vm_start) {
2130 unmapped_error = -ENOMEM;
2131 start = vma->vm_start;
2134 /* Here vma->vm_start <= start < vma->vm_end. */
2135 if (end <= vma->vm_end) {
2136 if (start < end) {
2137 error = madvise_vma(vma, start, end,
2138 behavior);
2139 if (error)
2140 goto out;
2142 error = unmapped_error;
2143 goto out;
2146 /* Here vma->vm_start <= start < vma->vm_end < end. */
2147 error = madvise_vma(vma, start, vma->vm_end, behavior);
2148 if (error)
2149 goto out;
2150 start = vma->vm_end;
2151 vma = vma->vm_next;
2154 out:
2155 up(&current->mm->mmap_sem);
2156 return error;
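/*
 * Illustrative userspace sketch (not part of this file): advising the
 * kernel about the access pattern of a file mapping. Per the checks in
 * sys_madvise() above, "start" must be page aligned and "len" is rounded
 * up to a page multiple; the advice may be disregarded by the kernel.
 * The file name is hypothetical.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	struct stat st;
	int fd = open("/var/tmp/example.dat", O_RDONLY);	/* hypothetical file */
	if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0) {
		perror("open/fstat");
		return 1;
	}

	char *map = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* We expect one sequential pass: allow aggressive read-ahead. */
	if (madvise(map, st.st_size, MADV_SEQUENTIAL) < 0)
		perror("madvise");

	long sum = 0;
	for (off_t i = 0; i < st.st_size; i++)
		sum += map[i];
	printf("checksum: %ld\n", sum);

	munmap(map, st.st_size);
	close(fd);
	return 0;
}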
2160 * Later we can get more picky about what "in core" means precisely.
2161 * For now, simply check to see if the page is in the page cache,
2162 * and is up to date; i.e. that no page-in operation would be required
2163 * at this time if an application were to map and access this page.
2165 static unsigned char mincore_page(struct vm_area_struct * vma,
2166 unsigned long pgoff)
2168 unsigned char present = 0;
2169 struct address_space * as = &vma->vm_file->f_dentry->d_inode->i_data;
2170 struct page * page, ** hash = page_hash(as, pgoff);
2172 spin_lock(&pagecache_lock);
2173 page = __find_page_nolock(as, pgoff, *hash);
2174 if ((page) && (Page_Uptodate(page)))
2175 present = 1;
2176 spin_unlock(&pagecache_lock);
2178 return present;
2181 static long mincore_vma(struct vm_area_struct * vma,
2182 unsigned long start, unsigned long end, unsigned char * vec)
2184 long error, i, remaining;
2185 unsigned char * tmp;
2187 error = -ENOMEM;
2188 if (!vma->vm_file)
2189 return error;
2191 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2192 if (end > vma->vm_end)
2193 end = vma->vm_end;
2194 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2196 error = -EAGAIN;
2197 tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
2198 if (!tmp)
2199 return error;
2201 /* (end - start) is # of pages, and also # of bytes in "vec" */
2202 remaining = (end - start);
2204 error = 0;
2205 for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
2206 int j = 0;
2207 long thispiece = (remaining < PAGE_SIZE) ?
2208 remaining : PAGE_SIZE;
2210 while (j < thispiece)
2211 tmp[j++] = mincore_page(vma, start++);
2213 if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
2214 error = -EFAULT;
2215 break;
2219 free_page((unsigned long) tmp);
2220 return error;
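/*
 * Illustrative sketch (not part of this file) of the index arithmetic
 * used by mincore_vma() above: a user virtual address inside a file
 * mapping is turned into a page-cache offset by subtracting vm_start,
 * shifting by PAGE_SHIFT and adding vm_pgoff. All numbers below are
 * made up for the example; a 4K page size is assumed.
 */
#include <stdio.h>

#define EX_PAGE_SHIFT 12UL	/* assumed 4K pages */

int main(void)
{
	unsigned long vm_start = 0x40010000UL;	/* hypothetical mapping start */
	unsigned long vm_pgoff = 3;		/* mapping begins at file page 3 */
	unsigned long addr     = 0x40015000UL;	/* address being queried */

	unsigned long pgoff = ((addr - vm_start) >> EX_PAGE_SHIFT) + vm_pgoff;

	/* 0x5000 bytes past vm_start = 5 pages, plus vm_pgoff 3 -> page 8 */
	printf("page-cache index: %lu\n", pgoff);
	return 0;
}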
2224 * The mincore(2) system call.
2226 * mincore() returns the memory residency status of the pages in the
2227 * current process's address space specified by [addr, addr + len).
2228 * The status is returned in a vector of bytes. The least significant
2229 * bit of each byte is 1 if the referenced page is in memory, otherwise
2230 * it is zero.
2232 * Because the status of a page can change after mincore() checks it
2233 * but before it returns to the application, the returned vector may
2234 * contain stale information. Only locked pages are guaranteed to
2235 * remain in memory.
2237 * return values:
2238 * zero - success
2239 * -EFAULT - vec points to an illegal address
2240 * -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
2241 * or len has a nonpositive value
2242 * -ENOMEM - Addresses in the range [addr, addr + len] are
2243 * invalid for the address space of this process, or
2244 * specify one or more pages which are not currently
2245 * mapped
2246 * -EAGAIN - A kernel resource was temporarily unavailable.
2248 asmlinkage long sys_mincore(unsigned long start, size_t len,
2249 unsigned char * vec)
2251 int index = 0;
2252 unsigned long end;
2253 struct vm_area_struct * vma;
2254 int unmapped_error = 0;
2255 long error = -EINVAL;
2257 down(&current->mm->mmap_sem);
2259 if (start & ~PAGE_CACHE_MASK)
2260 goto out;
2261 len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK;
2262 end = start + len;
2263 if (end < start)
2264 goto out;
2266 error = 0;
2267 if (end == start)
2268 goto out;
2271 * If the interval [start,end) covers some unmapped address
2272 * ranges, just ignore them, but return -ENOMEM at the end.
2274 vma = find_vma(current->mm, start);
2275 for (;;) {
2276 /* Still start < end. */
2277 error = -ENOMEM;
2278 if (!vma)
2279 goto out;
2281 /* Here start < vma->vm_end. */
2282 if (start < vma->vm_start) {
2283 unmapped_error = -ENOMEM;
2284 start = vma->vm_start;
2287 /* Here vma->vm_start <= start < vma->vm_end. */
2288 if (end <= vma->vm_end) {
2289 if (start < end) {
2290 error = mincore_vma(vma, start, end,
2291 &vec[index]);
2292 if (error)
2293 goto out;
2295 error = unmapped_error;
2296 goto out;
2299 /* Here vma->vm_start <= start < vma->vm_end < end. */
2300 error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
2301 if (error)
2302 goto out;
2303 index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
2304 start = vma->vm_end;
2305 vma = vma->vm_next;
2308 out:
2309 up(&current->mm->mmap_sem);
2310 return error;
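/*
 * Illustrative userspace sketch (not part of this file): querying how
 * much of a file mapping is resident with mincore(2). One byte of "vec"
 * is filled per page; bit 0 indicates residency, and the result may be
 * stale by the time it is read, as noted above. The file name is
 * hypothetical.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	struct stat st;
	int fd = open("/var/tmp/example.dat", O_RDONLY);	/* hypothetical file */
	if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0) {
		perror("open/fstat");
		return 1;
	}

	char *map = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	long page = sysconf(_SC_PAGESIZE);
	size_t pages = (st.st_size + page - 1) / page;
	unsigned char *vec = malloc(pages);
	if (!vec)
		return 1;

	if (mincore(map, st.st_size, vec) < 0) {
		perror("mincore");
		return 1;
	}

	size_t resident = 0;
	for (size_t i = 0; i < pages; i++)
		resident += vec[i] & 1;
	printf("%zu of %zu pages resident\n", resident, pages);

	free(vec);
	munmap(map, st.st_size);
	close(fd);
	return 0;
}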
2313 static inline
2314 struct page *__read_cache_page(struct address_space *mapping,
2315 unsigned long index,
2316 int (*filler)(void *,struct page*),
2317 void *data)
2319 struct page **hash = page_hash(mapping, index);
2320 struct page *page, *cached_page = NULL;
2321 int err;
2322 repeat:
2323 page = __find_get_page(mapping, index, hash);
2324 if (!page) {
2325 if (!cached_page) {
2326 cached_page = page_cache_alloc();
2327 if (!cached_page)
2328 return ERR_PTR(-ENOMEM);
2330 page = cached_page;
2331 if (add_to_page_cache_unique(page, mapping, index, hash))
2332 goto repeat;
2333 cached_page = NULL;
2334 err = filler(data, page);
2335 if (err < 0) {
2336 page_cache_release(page);
2337 page = ERR_PTR(err);
2340 if (cached_page)
2341 page_cache_free(cached_page);
2342 return page;
2346 * Read into the page cache. If a page already exists,
2347 * and Page_Uptodate() is not set, try to fill the page.
2349 struct page *read_cache_page(struct address_space *mapping,
2350 unsigned long index,
2351 int (*filler)(void *,struct page*),
2352 void *data)
2354 struct page *page = __read_cache_page(mapping, index, filler, data);
2355 int err;
2357 if (IS_ERR(page) || Page_Uptodate(page))
2358 goto out;
2360 lock_page(page);
2361 if (Page_Uptodate(page)) {
2362 UnlockPage(page);
2363 goto out;
2365 err = filler(data, page);
2366 if (err < 0) {
2367 page_cache_release(page);
2368 page = ERR_PTR(err);
2370 out:
2371 return page;
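/*
 * Illustrative in-kernel sketch (not part of this file) of the
 * read_cache_page() calling convention. The filler receives the page
 * locked and is expected to unlock it once the data is in place (an
 * address_space's readpage routine is the usual choice); the caller
 * checks IS_ERR() and, after waiting, Page_Uptodate(). The helper names
 * below are hypothetical and assume the kernel environment of this file.
 */
#if 0	/* sketch only, not compiled */
static int example_filler(void *data, struct page *page)
{
	/* Hypothetical synchronous fill: copy from "data" into the page. */
	memcpy((char *) page_address(page), data, PAGE_CACHE_SIZE);
	SetPageUptodate(page);
	UnlockPage(page);
	return 0;
}

static struct page *example_lookup(struct address_space *mapping,
				   unsigned long index, void *data)
{
	struct page *page = read_cache_page(mapping, index,
					    example_filler, data);
	if (IS_ERR(page))
		return page;		/* allocation or filler error */
	wait_on_page(page);		/* a real filler may unlock after I/O */
	if (!Page_Uptodate(page)) {
		page_cache_release(page);
		return ERR_PTR(-EIO);
	}
	return page;
}
#endif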
2374 static inline struct page * __grab_cache_page(struct address_space *mapping,
2375 unsigned long index, struct page **cached_page)
2377 struct page *page, **hash = page_hash(mapping, index);
2378 repeat:
2379 page = __find_lock_page(mapping, index, hash);
2380 if (!page) {
2381 if (!*cached_page) {
2382 *cached_page = page_cache_alloc();
2383 if (!*cached_page)
2384 return NULL;
2386 page = *cached_page;
2387 if (add_to_page_cache_unique(page, mapping, index, hash))
2388 goto repeat;
2389 *cached_page = NULL;
2391 return page;
2395 * Returns locked page at given index in given cache, creating it if needed.
2398 struct page *grab_cache_page(struct address_space *mapping, unsigned long index)
2400 struct page *cached_page = NULL;
2401 struct page *page = __grab_cache_page(mapping,index,&cached_page);
2402 if (cached_page)
2403 page_cache_free(cached_page);
2404 return page;
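/*
 * Illustrative in-kernel sketch (not part of this file): grab_cache_page()
 * hands back a locked, referenced page at "index", allocating and inserting
 * one if it was not already in the cache, so a typical caller modifies the
 * page, unlocks it and drops the reference. The modification step and the
 * surrounding context are hypothetical.
 */
#if 0	/* sketch only, not compiled */
	struct page *page = grab_cache_page(mapping, index);
	char *kaddr;

	if (!page)
		return -ENOMEM;
	kaddr = kmap(page);
	/* ... fill or modify the page data through kaddr ... */
	kunmap(page);
	SetPageUptodate(page);
	UnlockPage(page);
	page_cache_release(page);
#endif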
2407 static inline void remove_suid(struct inode *inode)
2409 unsigned int mode;
2411 /* set S_ISGID if S_IXGRP is set, and always set S_ISUID */
2412 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
2414 /* was any of the uid bits set? */
2415 mode &= inode->i_mode;
2416 if (mode && !capable(CAP_FSETID)) {
2417 inode->i_mode &= ~mode;
2418 mark_inode_dirty(inode);
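/*
 * Illustrative sketch (not part of this file) of the bit trick used in
 * remove_suid() above: because S_ISGID is an exact multiple of S_IXGRP,
 * (mode & S_IXGRP) * (S_ISGID / S_IXGRP) yields S_ISGID when the group
 * execute bit is set and 0 otherwise, and S_ISUID is always added to the
 * mask. The sample modes are made up for the example.
 */
#include <stdio.h>
#include <sys/stat.h>

static unsigned int suid_mask(unsigned int i_mode)
{
	unsigned int mode = (i_mode & S_IXGRP) * (S_ISGID / S_IXGRP) | S_ISUID;
	return mode & i_mode;	/* only bits actually set get cleared */
}

int main(void)
{
	printf("%04o\n", suid_mask(S_ISUID | 0755));	/* 4000: suid cleared */
	printf("%04o\n", suid_mask(S_ISGID | 0710));	/* 2000: sgid cleared */
	printf("%04o\n", suid_mask(S_ISGID | 0700));	/* 0000: no group exec */
	return 0;
}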
2423 * Write to a file through the page cache.
2425 * We currently put everything into the page cache prior to writing it.
2426 * This is not a problem when writing full pages. With partial pages,
2427 * however, we first have to read the data into the cache, then
2428 * dirty the page, and finally schedule it for writing. Alternatively, we
2429 * could write-through just the portion of data that would go into that
2430 * page, but that would kill performance for applications that write data
2431 * line by line, and it's prone to race conditions.
2433 * Note that this routine doesn't try to keep track of dirty pages. Each
2434 * file system has to do this all by itself, unfortunately.
2435 * okir@monad.swb.de
2437 ssize_t
2438 generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
2440 struct inode *inode = file->f_dentry->d_inode;
2441 struct address_space *mapping = inode->i_mapping;
2442 unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
2443 loff_t pos;
2444 struct page *page, *cached_page;
2445 unsigned long written;
2446 long status;
2447 int err;
2449 cached_page = NULL;
2451 down(&inode->i_sem);
2453 pos = *ppos;
2454 err = -EINVAL;
2455 if (pos < 0)
2456 goto out;
2458 err = file->f_error;
2459 if (err) {
2460 file->f_error = 0;
2461 goto out;
2464 written = 0;
2466 if (file->f_flags & O_APPEND)
2467 pos = inode->i_size;
2470 * Check whether we've reached the file size limit.
2472 err = -EFBIG;
2473 if (limit != RLIM_INFINITY) {
2474 if (pos >= limit) {
2475 send_sig(SIGXFSZ, current, 0);
2476 goto out;
2478 if (count > limit - pos) {
2479 send_sig(SIGXFSZ, current, 0);
2480 count = limit - pos;
2484 status = 0;
2485 if (count) {
2486 remove_suid(inode);
2487 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
2488 mark_inode_dirty(inode);
2491 while (count) {
2492 unsigned long bytes, index, offset;
2493 char *kaddr;
2496 * Try to find the page in the cache. If it isn't there,
2497 * allocate a free page.
2499 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
2500 index = pos >> PAGE_CACHE_SHIFT;
2501 bytes = PAGE_CACHE_SIZE - offset;
2502 if (bytes > count)
2503 bytes = count;
2505 status = -ENOMEM; /* we'll assign it later anyway */
2506 page = __grab_cache_page(mapping, index, &cached_page);
2507 if (!page)
2508 break;
2510 /* We have exclusive IO access to the page.. */
2511 if (!PageLocked(page)) {
2512 PAGE_BUG(page);
2515 status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
2516 if (status)
2517 goto unlock;
2518 kaddr = page_address(page);
2519 status = copy_from_user(kaddr+offset, buf, bytes);
2520 flush_dcache_page(page);
2521 if (status)
2522 goto fail_write;
2523 status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
2524 if (!status)
2525 status = bytes;
2527 if (status >= 0) {
2528 written += status;
2529 count -= status;
2530 pos += status;
2531 buf += status;
2533 unlock:
2534 /* Mark it unlocked again and drop the page.. */
2535 UnlockPage(page);
2536 page_cache_release(page);
2538 if (status < 0)
2539 break;
2541 *ppos = pos;
2543 if (cached_page)
2544 page_cache_free(cached_page);
2546 err = written ? written : status;
2547 out:
2548 up(&inode->i_sem);
2549 return err;
2550 fail_write:
2551 status = -EFAULT;
2552 ClearPageUptodate(page);
2553 kunmap(page);
2554 goto unlock;
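/*
 * Illustrative sketch (not part of this file) of how generic_file_write()
 * above carves a write into page-sized pieces: each iteration covers at
 * most the remainder of the page containing "pos". PAGE_CACHE_SIZE is
 * assumed to be 4096 here, and the pos/count values are made up.
 */
#include <stdio.h>

#define EX_PAGE_CACHE_SHIFT 12UL
#define EX_PAGE_CACHE_SIZE  (1UL << EX_PAGE_CACHE_SHIFT)

int main(void)
{
	unsigned long pos = 4090;	/* hypothetical file position */
	unsigned long count = 9000;	/* hypothetical write length */

	while (count) {
		unsigned long offset = pos & (EX_PAGE_CACHE_SIZE - 1);
		unsigned long index  = pos >> EX_PAGE_CACHE_SHIFT;
		unsigned long bytes  = EX_PAGE_CACHE_SIZE - offset;

		if (bytes > count)
			bytes = count;

		printf("page %lu: offset %lu, %lu bytes\n", index, offset, bytes);
		pos += bytes;
		count -= bytes;
	}
	return 0;
}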
2557 void __init page_cache_init(unsigned long mempages)
2559 unsigned long htable_size, order;
2561 htable_size = mempages;
2562 htable_size *= sizeof(struct page *);
2563 for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
2566 do {
2567 unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
2569 page_hash_bits = 0;
2570 while((tmp >>= 1UL) != 0UL)
2571 page_hash_bits++;
2573 page_hash_table = (struct page **)
2574 __get_free_pages(GFP_ATOMIC, order);
2575 } while(page_hash_table == NULL && --order > 0);
2577 printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
2578 (1 << page_hash_bits), order, (PAGE_SIZE << order));
2579 if (!page_hash_table)
2580 panic("Failed to allocate page hash table\n");
2581 memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
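/*
 * Illustrative sketch (not part of this file) of the sizing arithmetic in
 * page_cache_init() above: pick the smallest power-of-two allocation that
 * holds one hash bucket pointer per page of memory, then derive
 * page_hash_bits from the number of pointers that fit in it. 4K pages and
 * 8-byte pointers are assumed; the mempages value is made up.
 */
#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;
	unsigned long mempages = 32768;		/* e.g. 128 MB of 4K pages */
	unsigned long htable_size = mempages * sizeof(void *);
	unsigned long order = 0;

	while ((page_size << order) < htable_size)
		order++;

	unsigned long tmp = (page_size << order) / sizeof(void *);
	unsigned int page_hash_bits = 0;
	while ((tmp >>= 1) != 0)
		page_hash_bits++;

	/* With the sample values: order 6, 32768 entries, 262144 bytes. */
	printf("order %lu, %lu hash entries (%lu bytes)\n",
	       order, 1UL << page_hash_bits, page_size << order);
	return 0;
}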