1 /*
2 * linux/mm/filemap.c
4 * Copyright (C) 1994-1999 Linus Torvalds
5 */
7 /*
8 * This file handles the generic file mmap semantics used by
9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem used to do this differently, for example)
12 #include <linux/malloc.h>
13 #include <linux/shm.h>
14 #include <linux/mman.h>
15 #include <linux/locks.h>
16 #include <linux/pagemap.h>
17 #include <linux/swap.h>
18 #include <linux/smp_lock.h>
19 #include <linux/blkdev.h>
20 #include <linux/file.h>
21 #include <linux/swapctl.h>
22 #include <linux/slab.h>
23 #include <linux/init.h>
24 #include <linux/mm.h>
26 #include <asm/pgalloc.h>
27 #include <asm/uaccess.h>
28 #include <asm/mman.h>
30 #include <linux/highmem.h>
33 * Shared mappings implemented 30.11.1994. It's not fully working yet,
34 * though.
36 * Shared mappings now work. 15.8.1995 Bruno.
38 * finished 'unifying' the page and buffer cache and SMP-threaded the
39 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
41 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
44 atomic_t page_cache_size = ATOMIC_INIT(0);
45 unsigned int page_hash_bits;
46 struct page **page_hash_table;
48 spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
50 * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
51 * the pagemap_lru_lock held.
53 spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;
55 #define CLUSTER_PAGES (1 << page_cluster)
56 #define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster)
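/*
 * Editor's sketch (not part of the original file): a hypothetical helper,
 * example_cluster_start(), showing how the cluster macros behave, assuming
 * page_cluster == 4 and 4K pages: CLUSTER_PAGES is then 16 and
 * CLUSTER_OFFSET() rounds a page index down to a 16-page (64k) boundary.
 */
#if 0	/* illustrative only, never compiled */
static unsigned long example_cluster_start(unsigned long index)
{
	/* e.g. index 19 -> 16, index 31 -> 16, index 32 -> 32 */
	return CLUSTER_OFFSET(index);
}
#endif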
58 void __add_page_to_hash_queue(struct page * page, struct page **p)
60 atomic_inc(&page_cache_size);
61 if((page->next_hash = *p) != NULL)
62 (*p)->pprev_hash = &page->next_hash;
63 *p = page;
64 page->pprev_hash = p;
65 if (page->buffers)
66 PAGE_BUG(page);
69 static inline void remove_page_from_hash_queue(struct page * page)
71 if(page->pprev_hash) {
72 if(page->next_hash)
73 page->next_hash->pprev_hash = page->pprev_hash;
74 *page->pprev_hash = page->next_hash;
75 page->pprev_hash = NULL;
77 atomic_dec(&page_cache_size);
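/*
 * Editor's note (not in the original file): page->pprev_hash points at
 * whichever pointer currently references this page -- either the hash
 * bucket slot itself or the previous page's next_hash field -- so the
 * unlink above never has to walk the hash chain.
 */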
80 static inline int sync_page(struct page *page)
82 struct address_space *mapping = page->mapping;
84 if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
85 return mapping->a_ops->sync_page(page);
86 return 0;
90 * Remove a page from the page cache and free it. Caller has to make
91 * sure the page is locked and that nobody else uses it - or that usage
92 * is safe.
94 void __remove_inode_page(struct page *page)
96 remove_page_from_inode_queue(page);
97 remove_page_from_hash_queue(page);
98 page->mapping = NULL;
101 void remove_inode_page(struct page *page)
103 if (!PageLocked(page))
104 PAGE_BUG(page);
106 spin_lock(&pagecache_lock);
107 __remove_inode_page(page);
108 spin_unlock(&pagecache_lock);
112 * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
113 * @inode: the inode whose pages we want to invalidate
115 * This function only removes the unlocked pages; if you want to
116 * remove all the pages of one inode, you must call truncate_inode_pages.
119 void invalidate_inode_pages(struct inode * inode)
121 struct list_head *head, *curr;
122 struct page * page;
124 head = &inode->i_mapping->pages;
126 spin_lock(&pagecache_lock);
127 spin_lock(&pagemap_lru_lock);
128 curr = head->next;
130 while (curr != head) {
131 page = list_entry(curr, struct page, list);
132 curr = curr->next;
134 /* We cannot invalidate a locked page */
135 if (TryLockPage(page))
136 continue;
138 /* Neither can we invalidate something in use.. */
139 if (page_count(page) != 1) {
140 UnlockPage(page);
141 continue;
144 __lru_cache_del(page);
145 __remove_inode_page(page);
146 UnlockPage(page);
147 page_cache_release(page);
150 spin_unlock(&pagemap_lru_lock);
151 spin_unlock(&pagecache_lock);
154 static inline void truncate_partial_page(struct page *page, unsigned partial)
156 memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
158 if (page->buffers)
159 block_flushpage(page, partial);
163 static inline void truncate_complete_page(struct page *page)
165 /* Leave it on the LRU if it gets converted into anonymous buffers */
166 if (!page->buffers || block_flushpage(page, 0))
167 lru_cache_del(page);
170 * We remove the page from the page cache _after_ we have
171 * destroyed all buffer-cache references to it. Otherwise some
172 * other process might think this inode page is not in the
173 * page cache and creates a buffer-cache alias to it causing
174 * all sorts of fun problems ...
176 ClearPageDirty(page);
177 ClearPageUptodate(page);
178 remove_inode_page(page);
179 page_cache_release(page);
183 * truncate_inode_pages - truncate *all* the pages from an offset
184 * @mapping: mapping to truncate
185 * @lstart: offset from which to truncate
187 * Truncate the page cache at a set offset, removing the pages
188 * that are beyond that offset (and zeroing out partial pages).
189 * If any page is locked we wait for it to become unlocked.
191 void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
193 struct list_head *head, *curr;
194 struct page * page;
195 unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
196 unsigned long start;
198 start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
200 repeat:
201 head = &mapping->pages;
202 spin_lock(&pagecache_lock);
203 curr = head->next;
204 while (curr != head) {
205 unsigned long offset;
207 page = list_entry(curr, struct page, list);
208 curr = curr->next;
209 offset = page->index;
211 /* Is this one of the pages to truncate? */
212 if ((offset >= start) || (partial && (offset + 1) == start)) {
213 if (TryLockPage(page)) {
214 page_cache_get(page);
215 spin_unlock(&pagecache_lock);
216 wait_on_page(page);
217 page_cache_release(page);
218 goto repeat;
220 page_cache_get(page);
221 spin_unlock(&pagecache_lock);
223 if (partial && (offset + 1) == start) {
224 truncate_partial_page(page, partial);
225 partial = 0;
226 } else
227 truncate_complete_page(page);
229 UnlockPage(page);
230 page_cache_release(page);
233 * We have done things without the pagecache lock,
234 * so we'll have to repeat the scan.
235 * It's not possible to deadlock here because
236 * we are guaranteed to make progress. (ie. we have
237 * just removed a page)
239 goto repeat;
242 spin_unlock(&pagecache_lock);
245 static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
247 goto inside;
249 for (;;) {
250 page = page->next_hash;
251 inside:
252 if (!page)
253 goto not_found;
254 if (page->mapping != mapping)
255 continue;
256 if (page->index == offset)
257 break;
260 * Touching the page may move it to the active list.
261 * If we end up with too few inactive pages, we wake
262 * up kswapd.
264 age_page_up(page);
265 if (inactive_shortage() > inactive_target / 2 && free_shortage())
266 wakeup_kswapd(0);
267 not_found:
268 return page;
272 * By the time this is called, the page is locked and
273 * we don't have to worry about any races any more.
275 * Start the IO..
277 static int writeout_one_page(struct page *page)
279 struct buffer_head *bh, *head = page->buffers;
281 bh = head;
282 do {
283 if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
284 continue;
286 bh->b_flushtime = jiffies;
287 ll_rw_block(WRITE, 1, &bh);
288 } while ((bh = bh->b_this_page) != head);
289 return 0;
292 static int waitfor_one_page(struct page *page)
294 int error = 0;
295 struct buffer_head *bh, *head = page->buffers;
297 bh = head;
298 do {
299 wait_on_buffer(bh);
300 if (buffer_req(bh) && !buffer_uptodate(bh))
301 error = -EIO;
302 } while ((bh = bh->b_this_page) != head);
303 return error;
306 static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end, int (*fn)(struct page *))
308 struct list_head *head, *curr;
309 struct page *page;
310 int retval = 0;
312 head = &inode->i_mapping->pages;
314 spin_lock(&pagecache_lock);
315 curr = head->next;
316 while (curr != head) {
317 page = list_entry(curr, struct page, list);
318 curr = curr->next;
319 if (!page->buffers)
320 continue;
321 if (page->index >= end)
322 continue;
323 if (page->index < start)
324 continue;
326 page_cache_get(page);
327 spin_unlock(&pagecache_lock);
328 lock_page(page);
330 /* The buffers could have been free'd while we waited for the page lock */
331 if (page->buffers)
332 retval |= fn(page);
334 UnlockPage(page);
335 spin_lock(&pagecache_lock);
336 curr = page->list.next;
337 page_cache_release(page);
339 spin_unlock(&pagecache_lock);
341 return retval;
345 * Two-stage data sync: first start the IO, then go back and
346 * collect the information..
348 int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
350 int retval;
352 retval = do_buffer_fdatasync(inode, start_idx, end_idx, writeout_one_page);
353 retval |= do_buffer_fdatasync(inode, start_idx, end_idx, waitfor_one_page);
354 return retval;
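/*
 * Editor's note (not in the original file): the first pass above queues
 * writes only for buffers that are unlocked, dirty and up-to-date, while
 * the second pass waits on every buffer in range and folds any
 * buffer-level I/O failure into a -EIO return.
 */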
358 * Add a page to the inode page cache.
360 * The caller must have locked the page and
361 * set all the page flags correctly..
363 void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
365 if (!PageLocked(page))
366 BUG();
368 page_cache_get(page);
369 spin_lock(&pagecache_lock);
370 page->index = index;
371 add_page_to_inode_queue(mapping, page);
372 __add_page_to_hash_queue(page, page_hash(mapping, index));
373 lru_cache_add(page);
374 spin_unlock(&pagecache_lock);
378 * This adds a page to the page cache, starting out as locked,
379 * owned by us, but unreferenced, not uptodate and with no errors.
381 static inline void __add_to_page_cache(struct page * page,
382 struct address_space *mapping, unsigned long offset,
383 struct page **hash)
385 unsigned long flags;
387 if (PageLocked(page))
388 BUG();
390 flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced) | (1 << PG_arch_1));
391 page->flags = flags | (1 << PG_locked);
392 page_cache_get(page);
393 page->index = offset;
394 add_page_to_inode_queue(mapping, page);
395 __add_page_to_hash_queue(page, hash);
396 lru_cache_add(page);
399 void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
401 spin_lock(&pagecache_lock);
402 __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
403 spin_unlock(&pagecache_lock);
406 static int add_to_page_cache_unique(struct page * page,
407 struct address_space *mapping, unsigned long offset,
408 struct page **hash)
410 int err;
411 struct page *alias;
413 spin_lock(&pagecache_lock);
414 alias = __find_page_nolock(mapping, offset, *hash);
416 err = 1;
417 if (!alias) {
418 __add_to_page_cache(page,mapping,offset,hash);
419 err = 0;
422 spin_unlock(&pagecache_lock);
423 return err;
427 * This adds the requested page to the page cache if it isn't already there,
428 * and schedules an I/O to read in its contents from disk.
430 static inline int page_cache_read(struct file * file, unsigned long offset)
432 struct inode *inode = file->f_dentry->d_inode;
433 struct address_space *mapping = inode->i_mapping;
434 struct page **hash = page_hash(mapping, offset);
435 struct page *page;
437 spin_lock(&pagecache_lock);
438 page = __find_page_nolock(mapping, offset, *hash);
439 spin_unlock(&pagecache_lock);
440 if (page)
441 return 0;
443 page = page_cache_alloc();
444 if (!page)
445 return -ENOMEM;
447 if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
448 int error = mapping->a_ops->readpage(file, page);
449 page_cache_release(page);
450 return error;
453 * We arrive here in the unlikely event that someone
454 * raced with us and added our page to the cache first.
456 page_cache_free(page);
457 return 0;
461 * Read in an entire cluster at once. A cluster is usually a 64k-
462 * aligned block that includes the page requested in "offset."
464 static int read_cluster_nonblocking(struct file * file, unsigned long offset,
465 unsigned long filesize)
467 unsigned long pages = CLUSTER_PAGES;
469 offset = CLUSTER_OFFSET(offset);
470 while ((pages-- > 0) && (offset < filesize)) {
471 int error = page_cache_read(file, offset);
472 if (error < 0)
473 return error;
474 offset ++;
477 return 0;
481 * Wait for a page to get unlocked.
483 * This must be called with the caller "holding" the page,
484 * ie with increased "page->count" so that the page won't
485 * go away during the wait..
487 void ___wait_on_page(struct page *page)
489 struct task_struct *tsk = current;
490 DECLARE_WAITQUEUE(wait, tsk);
492 add_wait_queue(&page->wait, &wait);
493 do {
494 sync_page(page);
495 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
496 if (!PageLocked(page))
497 break;
498 run_task_queue(&tq_disk);
499 schedule();
500 } while (PageLocked(page));
501 tsk->state = TASK_RUNNING;
502 remove_wait_queue(&page->wait, &wait);
506 * Get a lock on the page, assuming we need to sleep
507 * to get it..
509 static void __lock_page(struct page *page)
511 struct task_struct *tsk = current;
512 DECLARE_WAITQUEUE(wait, tsk);
514 add_wait_queue_exclusive(&page->wait, &wait);
515 for (;;) {
516 sync_page(page);
517 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
518 if (PageLocked(page)) {
519 run_task_queue(&tq_disk);
520 schedule();
521 continue;
523 if (!TryLockPage(page))
524 break;
526 tsk->state = TASK_RUNNING;
527 remove_wait_queue(&page->wait, &wait);
532 * Get an exclusive lock on the page, optimistically
533 * assuming it's not locked..
535 void lock_page(struct page *page)
537 if (TryLockPage(page))
538 __lock_page(page);
542 * a rather lightweight function, finding and getting a reference to a
543 * hashed page atomically, waiting for it if it's locked.
545 static struct page * __find_get_page(struct address_space *mapping,
546 unsigned long offset, struct page **hash)
548 struct page *page;
551 * We scan the hash list read-only. Addition to and removal from
552 * the hash-list needs a held write-lock.
554 spin_lock(&pagecache_lock);
555 page = __find_page_nolock(mapping, offset, *hash);
556 if (page)
557 page_cache_get(page);
558 spin_unlock(&pagecache_lock);
559 return page;
563 * Get the lock to a page atomically.
565 struct page * __find_lock_page (struct address_space *mapping,
566 unsigned long offset, struct page **hash)
568 struct page *page;
571 * We scan the hash list read-only. Addition to and removal from
572 * the hash-list needs a held write-lock.
574 repeat:
575 spin_lock(&pagecache_lock);
576 page = __find_page_nolock(mapping, offset, *hash);
577 if (page) {
578 page_cache_get(page);
579 spin_unlock(&pagecache_lock);
581 lock_page(page);
583 /* Is the page still hashed? Ok, good.. */
584 if (page->mapping)
585 return page;
587 /* Nope: we raced. Release and try again.. */
588 UnlockPage(page);
589 page_cache_release(page);
590 goto repeat;
592 spin_unlock(&pagecache_lock);
593 return NULL;
596 #if 0
597 #define PROFILE_READAHEAD
598 #define DEBUG_READAHEAD
599 #endif
602 * We combine this with read-ahead to deactivate pages when we
603 * think there's sequential IO going on. Note that this is
604 * harmless since we don't actually evict the pages from memory
605 * but just move them to the inactive list.
607 * TODO:
608 * - make the readahead code smarter
609 * - move readahead to the VMA level so we can do the same
610 * trick with mmap()
612 * Rik van Riel, 2000
614 static void drop_behind(struct file * file, unsigned long index)
616 struct inode *inode = file->f_dentry->d_inode;
617 struct address_space *mapping = inode->i_mapping;
618 struct page **hash;
619 struct page *page;
620 unsigned long start;
622 /* Nothing to drop-behind if we're on the first page. */
623 if (!index)
624 return;
626 if (index > file->f_rawin)
627 start = index - file->f_rawin;
628 else
629 start = 0;
632 * Go backwards from index-1 and drop all pages in the
633 * readahead window. Since the readahead window may have
634 * been increased since the last time we were called, we
635 * stop when the page isn't there.
637 spin_lock(&pagecache_lock);
638 while (--index >= start) {
639 hash = page_hash(mapping, index);
640 page = __find_page_nolock(mapping, index, *hash);
641 if (!page)
642 break;
643 deactivate_page(page);
645 spin_unlock(&pagecache_lock);
649 * Read-ahead profiling information
650 * --------------------------------
651 * Every PROFILE_MAXREADCOUNT, the following information is written
652 * to the syslog:
653 * Percentage of asynchronous read-ahead.
654 * Average of read-ahead fields context value.
655 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
656 * to the syslog.
659 #ifdef PROFILE_READAHEAD
661 #define PROFILE_MAXREADCOUNT 1000
663 static unsigned long total_reada;
664 static unsigned long total_async;
665 static unsigned long total_ramax;
666 static unsigned long total_ralen;
667 static unsigned long total_rawin;
669 static void profile_readahead(int async, struct file *filp)
671 unsigned long flags;
673 ++total_reada;
674 if (async)
675 ++total_async;
677 total_ramax += filp->f_ramax;
678 total_ralen += filp->f_ralen;
679 total_rawin += filp->f_rawin;
681 if (total_reada > PROFILE_MAXREADCOUNT) {
682 save_flags(flags);
683 cli();
684 if (!(total_reada > PROFILE_MAXREADCOUNT)) {
685 restore_flags(flags);
686 return;
689 printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
690 total_ramax/total_reada,
691 total_ralen/total_reada,
692 total_rawin/total_reada,
693 (total_async*100)/total_reada);
694 #ifdef DEBUG_READAHEAD
695 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
696 filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
697 #endif
699 total_reada = 0;
700 total_async = 0;
701 total_ramax = 0;
702 total_ralen = 0;
703 total_rawin = 0;
705 restore_flags(flags);
708 #endif /* defined PROFILE_READAHEAD */
711 * Read-ahead context:
712 * -------------------
713 * The read ahead context fields of the "struct file" are the following:
714 * - f_raend : position of the first byte after the last page we tried to
715 * read ahead.
716 * - f_ramax : current read-ahead maximum size.
717 * - f_ralen : length of the current IO read block we tried to read-ahead.
718 * - f_rawin : length of the current read-ahead window.
719 * if last read-ahead was synchronous then
720 * f_rawin = f_ralen
721 * otherwise (was asynchronous)
722 * f_rawin = previous value of f_ralen + f_ralen
724 * Read-ahead limits:
725 * ------------------
726 * MIN_READAHEAD : minimum read-ahead size when read-ahead.
727 * MAX_READAHEAD : maximum read-ahead size when read-ahead.
729 * Synchronous read-ahead benefits:
730 * --------------------------------
731 * Using a reasonable IO transfer length for peripheral devices increases
732 * system performance.
733 * Reasonable means, in this context, not too large but not too small.
734 * The actual maximum value is:
735 * MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
736 * and 32K if defined (4K page size assumed).
738 * Asynchronous read-ahead benefits:
739 * ---------------------------------
740 * Overlapping the next read request with user process execution increases
741 * system performance.
743 * Read-ahead risks:
744 * -----------------
745 * We have to guess which further data are needed by the user process.
746 * If these data are often not really needed, it's bad for system
747 * performance.
748 * However, we know that files are often accessed sequentially by
749 * application programs and it seems possible to have some good
750 * strategy in that guessing.
751 * We only try to read-ahead files that seem to be read sequentially.
753 * Asynchronous read-ahead risks:
754 * ------------------------------
755 * In order to maximize overlapping, we must start some asynchronous read
756 * request from the device, as soon as possible.
757 * We must be very careful about:
758 * - The number of effective pending IO read requests.
759 * ONE seems to be the only reasonable value.
760 * - The total memory pool usage for the file access stream.
761 * This maximum memory usage is implicitly 2 IO read chunks:
762 * 2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
763 * 64k if defined (4K page size assumed).
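/*
 * Editor's note (not in the original file): as implemented below, the
 * read-ahead maximum (f_ramax) roughly doubles after each successful
 * read-ahead, is clamped to get_max_readahead(), and is reset by
 * do_generic_file_read() whenever the access pattern stops looking
 * sequential.
 */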
766 static inline int get_max_readahead(struct inode * inode)
768 if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
769 return MAX_READAHEAD;
770 return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
773 static void generic_file_readahead(int reada_ok,
774 struct file * filp, struct inode * inode,
775 struct page * page)
777 unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
778 unsigned long index = page->index;
779 unsigned long max_ahead, ahead;
780 unsigned long raend;
781 int max_readahead = get_max_readahead(inode);
783 raend = filp->f_raend;
784 max_ahead = 0;
787 * The current page is locked.
788 * If the current position is inside the previous read IO request, do not
789 * try to reread previously read ahead pages.
790 * Otherwise decide whether or not to read ahead some pages synchronously.
791 * If we are not going to read ahead, set the read ahead context for this
792 * page only.
794 if (PageLocked(page)) {
795 if (!filp->f_ralen || index >= raend || index + filp->f_rawin < raend) {
796 raend = index;
797 if (raend < end_index)
798 max_ahead = filp->f_ramax;
799 filp->f_rawin = 0;
800 filp->f_ralen = 1;
801 if (!max_ahead) {
802 filp->f_raend = index + filp->f_ralen;
803 filp->f_rawin += filp->f_ralen;
808 * The current page is not locked.
809 * If we were reading ahead and,
810 * if the current max read ahead size is not zero and,
811 * if the current position is inside the last read-ahead IO request,
812 * it is the moment to try to read ahead asynchronously.
813 * We will later force an unplug of the device in order to force asynchronous read IO.
815 else if (reada_ok && filp->f_ramax && raend >= 1 &&
816 index <= raend && index + filp->f_ralen >= raend) {
818 * Add ONE page to max_ahead in order to try to have about the same IO max size
819 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
820 * Compute the position of the last page we have tried to read in order to
821 * begin to read ahead just at the next page.
823 raend -= 1;
824 if (raend < end_index)
825 max_ahead = filp->f_ramax + 1;
827 if (max_ahead) {
828 filp->f_rawin = filp->f_ralen;
829 filp->f_ralen = 0;
830 reada_ok = 2;
834 * Try to read ahead pages.
835 * We hope that ll_rw_blk() plug/unplug, coalescing, request sorting and the
836 * scheduler will work well enough for us to avoid overly bad actual IO requests.
838 ahead = 0;
839 while (ahead < max_ahead) {
840 ahead ++;
841 if ((raend + ahead) >= end_index)
842 break;
843 if (page_cache_read(filp, raend + ahead) < 0)
844 break;
847 * If we tried to read ahead some pages,
848 * If we tried to read ahead asynchronously,
849 * Try to force unplug of the device in order to start an asynchronous
850 * read IO request.
851 * Update the read-ahead context.
852 * Store the length of the current read-ahead window.
853 * Double the current max read ahead size.
854 * This heuristic avoids doing large IO for files that are not really
855 * accessed sequentially.
857 if (ahead) {
858 if (reada_ok == 2) {
859 run_task_queue(&tq_disk);
862 filp->f_ralen += ahead;
863 filp->f_rawin += filp->f_ralen;
864 filp->f_raend = raend + ahead + 1;
866 filp->f_ramax += filp->f_ramax;
868 if (filp->f_ramax > max_readahead)
869 filp->f_ramax = max_readahead;
872 * Move the pages that have already been passed
873 * to the inactive list.
875 drop_behind(filp, index);
877 #ifdef PROFILE_READAHEAD
878 profile_readahead((reada_ok == 2), filp);
879 #endif
882 return;
887 * This is a generic file read routine, and uses the
888 * inode->i_op->readpage() function for the actual low-level
889 * stuff.
891 * This is really ugly. But the goto's actually try to clarify some
892 * of the logic when it comes to error handling etc.
894 void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
896 struct inode *inode = filp->f_dentry->d_inode;
897 struct address_space *mapping = inode->i_mapping;
898 unsigned long index, offset;
899 struct page *cached_page;
900 int reada_ok;
901 int error;
902 int max_readahead = get_max_readahead(inode);
904 cached_page = NULL;
905 index = *ppos >> PAGE_CACHE_SHIFT;
906 offset = *ppos & ~PAGE_CACHE_MASK;
909 * If the current position is outside the previous read-ahead window,
910 * we reset the current read-ahead context and set read ahead max to zero
911 * (will be set to just needed value later),
912 * otherwise, we assume that the file accesses are sequential enough to
913 * continue read-ahead.
915 if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {
916 reada_ok = 0;
917 filp->f_raend = 0;
918 filp->f_ralen = 0;
919 filp->f_ramax = 0;
920 filp->f_rawin = 0;
921 } else {
922 reada_ok = 1;
925 * Adjust the current value of read-ahead max.
926 * If the read operation stays within the first half of the page, force no readahead.
927 * Otherwise try to increase read ahead max just enough to do the read request.
928 * Then, at least MIN_READAHEAD if read ahead is ok,
929 * and at most MAX_READAHEAD in all cases.
931 if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
932 filp->f_ramax = 0;
933 } else {
934 unsigned long needed;
936 needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;
938 if (filp->f_ramax < needed)
939 filp->f_ramax = needed;
941 if (reada_ok && filp->f_ramax < MIN_READAHEAD)
942 filp->f_ramax = MIN_READAHEAD;
943 if (filp->f_ramax > max_readahead)
944 filp->f_ramax = max_readahead;
947 for (;;) {
948 struct page *page, **hash;
949 unsigned long end_index, nr;
951 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
952 if (index > end_index)
953 break;
954 nr = PAGE_CACHE_SIZE;
955 if (index == end_index) {
956 nr = inode->i_size & ~PAGE_CACHE_MASK;
957 if (nr <= offset)
958 break;
961 nr = nr - offset;
964 * Try to find the data in the page cache..
966 hash = page_hash(mapping, index);
968 spin_lock(&pagecache_lock);
969 page = __find_page_nolock(mapping, index, *hash);
970 if (!page)
971 goto no_cached_page;
972 found_page:
973 page_cache_get(page);
974 spin_unlock(&pagecache_lock);
976 if (!Page_Uptodate(page))
977 goto page_not_up_to_date;
978 generic_file_readahead(reada_ok, filp, inode, page);
979 page_ok:
980 /* If users can be writing to this page using arbitrary
981 * virtual addresses, take care about potential aliasing
982 * before reading the page on the kernel side.
984 if (mapping->i_mmap_shared != NULL)
985 flush_dcache_page(page);
988 * Ok, we have the page, and it's up-to-date, so
989 * now we can copy it to user space...
991 * The actor routine returns how many bytes were actually used..
992 * NOTE! This may not be the same as how much of a user buffer
993 * we filled up (we may be padding etc), so we can only update
994 * "pos" here (the actor routine has to update the user buffer
995 * pointers and the remaining count).
997 nr = actor(desc, page, offset, nr);
998 offset += nr;
999 index += offset >> PAGE_CACHE_SHIFT;
1000 offset &= ~PAGE_CACHE_MASK;
1002 page_cache_release(page);
1003 if (nr && desc->count)
1004 continue;
1005 break;
1008 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
1010 page_not_up_to_date:
1011 generic_file_readahead(reada_ok, filp, inode, page);
1013 if (Page_Uptodate(page))
1014 goto page_ok;
1016 /* Get exclusive access to the page ... */
1017 lock_page(page);
1019 /* Did it get unhashed before we got the lock? */
1020 if (!page->mapping) {
1021 UnlockPage(page);
1022 page_cache_release(page);
1023 continue;
1026 /* Did somebody else fill it already? */
1027 if (Page_Uptodate(page)) {
1028 UnlockPage(page);
1029 goto page_ok;
1032 readpage:
1033 /* ... and start the actual read. The read will unlock the page. */
1034 error = mapping->a_ops->readpage(filp, page);
1036 if (!error) {
1037 if (Page_Uptodate(page))
1038 goto page_ok;
1040 /* Again, try some read-ahead while waiting for the page to finish.. */
1041 generic_file_readahead(reada_ok, filp, inode, page);
1042 wait_on_page(page);
1043 if (Page_Uptodate(page))
1044 goto page_ok;
1045 error = -EIO;
1048 /* UHHUH! A synchronous read error occurred. Report it */
1049 desc->error = error;
1050 page_cache_release(page);
1051 break;
1053 no_cached_page:
1055 * Ok, it wasn't cached, so we need to create a new
1056 * page..
1058 * We get here with the page cache lock held.
1060 if (!cached_page) {
1061 spin_unlock(&pagecache_lock);
1062 cached_page = page_cache_alloc();
1063 if (!cached_page) {
1064 desc->error = -ENOMEM;
1065 break;
1069 * Somebody may have added the page while we
1070 * dropped the page cache lock. Check for that.
1072 spin_lock(&pagecache_lock);
1073 page = __find_page_nolock(mapping, index, *hash);
1074 if (page)
1075 goto found_page;
1079 * Ok, add the new page to the hash-queues...
1081 page = cached_page;
1082 __add_to_page_cache(page, mapping, index, hash);
1083 spin_unlock(&pagecache_lock);
1084 cached_page = NULL;
1086 goto readpage;
1089 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1090 filp->f_reada = 1;
1091 if (cached_page)
1092 page_cache_free(cached_page);
1093 UPDATE_ATIME(inode);
1096 static int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1098 char *kaddr;
1099 unsigned long left, count = desc->count;
1101 if (size > count)
1102 size = count;
1104 kaddr = kmap(page);
1105 left = __copy_to_user(desc->buf, kaddr + offset, size);
1106 kunmap(page);
1108 if (left) {
1109 size -= left;
1110 desc->error = -EFAULT;
1112 desc->count = count - size;
1113 desc->written += size;
1114 desc->buf += size;
1115 return size;
1119 * This is the "read()" routine for all filesystems
1120 * that can use the page cache directly.
1122 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1124 ssize_t retval;
1126 retval = -EFAULT;
1127 if (access_ok(VERIFY_WRITE, buf, count)) {
1128 retval = 0;
1130 if (count) {
1131 read_descriptor_t desc;
1133 desc.written = 0;
1134 desc.count = count;
1135 desc.buf = buf;
1136 desc.error = 0;
1137 do_generic_file_read(filp, ppos, &desc, file_read_actor);
1139 retval = desc.written;
1140 if (!retval)
1141 retval = desc.error;
1144 return retval;
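/*
 * Editor's sketch (not part of the original file): any routine with the
 * read_actor_t signature can consume page-cache data through
 * do_generic_file_read().  The hypothetical actor below merely counts the
 * bytes it is offered, mirroring the bookkeeping done by file_read_actor().
 */
#if 0	/* illustrative only, never compiled */
static int count_actor(read_descriptor_t * desc, struct page *page,
		       unsigned long offset, unsigned long size)
{
	unsigned long count = desc->count;

	if (size > count)
		size = count;
	desc->count = count - size;
	desc->written += size;
	return size;		/* bytes consumed from this page */
}
#endif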
1147 static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
1149 char *kaddr;
1150 ssize_t written;
1151 unsigned long count = desc->count;
1152 struct file *file = (struct file *) desc->buf;
1153 mm_segment_t old_fs;
1155 if (size > count)
1156 size = count;
1157 old_fs = get_fs();
1158 set_fs(KERNEL_DS);
1160 kaddr = kmap(page);
1161 written = file->f_op->write(file, kaddr + offset, size, &file->f_pos);
1162 kunmap(page);
1163 set_fs(old_fs);
1164 if (written < 0) {
1165 desc->error = written;
1166 written = 0;
1168 desc->count = count - written;
1169 desc->written += written;
1170 return written;
1173 asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
1175 ssize_t retval;
1176 struct file * in_file, * out_file;
1177 struct inode * in_inode, * out_inode;
1180 * Get input file, and verify that it is ok..
1182 retval = -EBADF;
1183 in_file = fget(in_fd);
1184 if (!in_file)
1185 goto out;
1186 if (!(in_file->f_mode & FMODE_READ))
1187 goto fput_in;
1188 retval = -EINVAL;
1189 in_inode = in_file->f_dentry->d_inode;
1190 if (!in_inode)
1191 goto fput_in;
1192 if (!in_inode->i_mapping->a_ops->readpage)
1193 goto fput_in;
1194 retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
1195 if (retval)
1196 goto fput_in;
1199 * Get output file, and verify that it is ok..
1201 retval = -EBADF;
1202 out_file = fget(out_fd);
1203 if (!out_file)
1204 goto fput_in;
1205 if (!(out_file->f_mode & FMODE_WRITE))
1206 goto fput_out;
1207 retval = -EINVAL;
1208 if (!out_file->f_op || !out_file->f_op->write)
1209 goto fput_out;
1210 out_inode = out_file->f_dentry->d_inode;
1211 retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
1212 if (retval)
1213 goto fput_out;
1215 retval = 0;
1216 if (count) {
1217 read_descriptor_t desc;
1218 loff_t pos = 0, *ppos;
1220 retval = -EFAULT;
1221 ppos = &in_file->f_pos;
1222 if (offset) {
1223 if (get_user(pos, offset))
1224 goto fput_out;
1225 ppos = &pos;
1228 desc.written = 0;
1229 desc.count = count;
1230 desc.buf = (char *) out_file;
1231 desc.error = 0;
1232 do_generic_file_read(in_file, ppos, &desc, file_send_actor);
1234 retval = desc.written;
1235 if (!retval)
1236 retval = desc.error;
1237 if (offset)
1238 put_user(pos, offset);
1241 fput_out:
1242 fput(out_file);
1243 fput_in:
1244 fput(in_file);
1245 out:
1246 return retval;
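/*
 * Editor's note (not part of this file): illustrative userspace usage of
 * the sendfile(2) system call implemented above.  copy_range() is a
 * hypothetical helper that copies 'count' bytes from in_fd, starting at
 * 'off', into out_fd without a userspace bounce buffer.
 */
#if 0	/* userspace example only */
#include <sys/sendfile.h>

static ssize_t copy_range(int out_fd, int in_fd, off_t off, size_t count)
{
	return sendfile(out_fd, in_fd, &off, count);
}
#endif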
1250 * Read-ahead and flush behind for MADV_SEQUENTIAL areas. Since we are
1251 * sure this is sequential access, we don't need a flexible read-ahead
1252 * window size -- we can always use a large fixed size window.
1254 static void nopage_sequential_readahead(struct vm_area_struct * vma,
1255 unsigned long pgoff, unsigned long filesize)
1257 unsigned long ra_window;
1259 ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
1260 ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);
1262 /* vm_raend is zero if we haven't read ahead in this area yet. */
1263 if (vma->vm_raend == 0)
1264 vma->vm_raend = vma->vm_pgoff + ra_window;
1267 * If we've just faulted the page half-way through our window,
1268 * then schedule reads for the next window, and release the
1269 * pages in the previous window.
1271 if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
1272 unsigned long start = vma->vm_pgoff + vma->vm_raend;
1273 unsigned long end = start + ra_window;
1275 if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
1276 end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;
1277 if (start > end)
1278 return;
1280 while ((start < end) && (start < filesize)) {
1281 if (read_cluster_nonblocking(vma->vm_file,
1282 start, filesize) < 0)
1283 break;
1284 start += CLUSTER_PAGES;
1286 run_task_queue(&tq_disk);
1288 /* if we're far enough past the beginning of this area,
1289 recycle pages that are in the previous window. */
1290 if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
1291 unsigned long window = ra_window << PAGE_SHIFT;
1293 end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
1294 end -= window + window;
1295 filemap_sync(vma, end - window, window, MS_INVALIDATE);
1298 vma->vm_raend += ra_window;
1301 return;
1305 * filemap_nopage() is invoked via the vma operations vector for a
1306 * mapped memory region to read in file data during a page fault.
1308 * The goto's are kind of ugly, but this streamlines the normal case of having
1309 * it in the page cache, and handles the special cases reasonably without
1310 * having a lot of duplicated code.
1312 struct page * filemap_nopage(struct vm_area_struct * area,
1313 unsigned long address, int no_share)
1315 int error;
1316 struct file *file = area->vm_file;
1317 struct inode *inode = file->f_dentry->d_inode;
1318 struct address_space *mapping = inode->i_mapping;
1319 struct page *page, **hash, *old_page;
1320 unsigned long size, pgoff;
1322 pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1324 retry_all:
1326 * An external ptracer can access pages that normally aren't
1327 * accessible..
1329 size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1330 if ((pgoff >= size) && (area->vm_mm == current->mm))
1331 return NULL;
1334 * Do we have something in the page cache already?
1336 hash = page_hash(mapping, pgoff);
1337 retry_find:
1338 page = __find_get_page(mapping, pgoff, hash);
1339 if (!page)
1340 goto no_cached_page;
1343 * Ok, found a page in the page cache, now we need to check
1344 * that it's up-to-date.
1346 if (!Page_Uptodate(page))
1347 goto page_not_uptodate;
1349 success:
1351 * Try read-ahead for sequential areas.
1353 if (VM_SequentialReadHint(area))
1354 nopage_sequential_readahead(area, pgoff, size);
1357 * Found the page and have a reference on it, need to check sharing
1358 * and possibly copy it over to another page..
1360 old_page = page;
1361 if (no_share) {
1362 struct page *new_page = page_cache_alloc();
1364 if (new_page) {
1365 copy_user_highpage(new_page, old_page, address);
1366 flush_page_to_ram(new_page);
1367 } else
1368 new_page = NOPAGE_OOM;
1369 page_cache_release(page);
1370 return new_page;
1373 flush_page_to_ram(old_page);
1374 return old_page;
1376 no_cached_page:
1378 * If the requested offset is within our file, try to read a whole
1379 * cluster of pages at once.
1381 * Otherwise, we're off the end of a privately mapped file,
1382 * so we need to map a zero page.
1384 if ((pgoff < size) && !VM_RandomReadHint(area))
1385 error = read_cluster_nonblocking(file, pgoff, size);
1386 else
1387 error = page_cache_read(file, pgoff);
1390 * The page we want has now been added to the page cache.
1391 * In the unlikely event that someone removed it in the
1392 * meantime, we'll just come back here and read it again.
1394 if (error >= 0)
1395 goto retry_find;
1398 * An error return from page_cache_read can result if the
1399 * system is low on memory, or a problem occurs while trying
1400 * to schedule I/O.
1402 if (error == -ENOMEM)
1403 return NOPAGE_OOM;
1404 return NULL;
1406 page_not_uptodate:
1407 lock_page(page);
1409 /* Did it get unhashed while we waited for it? */
1410 if (!page->mapping) {
1411 UnlockPage(page);
1412 page_cache_release(page);
1413 goto retry_all;
1416 /* Did somebody else get it up-to-date? */
1417 if (Page_Uptodate(page)) {
1418 UnlockPage(page);
1419 goto success;
1422 if (!mapping->a_ops->readpage(file, page)) {
1423 wait_on_page(page);
1424 if (Page_Uptodate(page))
1425 goto success;
1429 * Umm, take care of errors if the page isn't up-to-date.
1430 * Try to re-read it _once_. We do this synchronously,
1431 * because there really aren't any performance issues here
1432 * and we need to check for errors.
1434 lock_page(page);
1436 /* Somebody truncated the page on us? */
1437 if (!page->mapping) {
1438 UnlockPage(page);
1439 page_cache_release(page);
1440 goto retry_all;
1443 /* Somebody else successfully read it in? */
1444 if (Page_Uptodate(page)) {
1445 UnlockPage(page);
1446 goto success;
1448 ClearPageError(page);
1449 if (!mapping->a_ops->readpage(file, page)) {
1450 wait_on_page(page);
1451 if (Page_Uptodate(page))
1452 goto success;
1456 * Things didn't work out. Return zero to tell the
1457 * mm layer so, possibly freeing the page cache page first.
1459 page_cache_release(page);
1460 return NULL;
1464 * If a task terminates while we're swapping the page, the vma
1465 * and file could be released: try_to_swap_out has done a get_file.
1466 * vma/file is guaranteed to exist in the unmap/sync cases because
1467 * mmap_sem is held.
1469 * The "mapping" test takes care of somebody having truncated the
1470 * page and thus made this write-page a no-op..
1472 static int filemap_write_page(struct page * page, int wait)
1474 struct address_space * mapping = page->mapping;
1475 int error = 0;
1477 if (mapping && mapping->a_ops->writepage) {
1478 ClearPageDirty(page);
1479 error = mapping->a_ops->writepage(page);
1481 return error;
1486 * The page cache takes care of races between somebody
1487 * trying to swap something out and swap something in
1488 * at the same time..
1490 extern void wakeup_bdflush(int);
1491 int filemap_swapout(struct page * page, struct file *file)
1493 SetPageDirty(page);
1494 return 0;
1497 /* Called with mm->page_table_lock held to protect against other
1498 * threads/the swapper from ripping pte's out from under us.
1500 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
1501 unsigned long address, unsigned int flags)
1503 pte_t pte;
1504 struct page *page;
1505 int error;
1507 pte = *ptep;
1509 if (!pte_present(pte))
1510 goto out;
1511 if (!ptep_test_and_clear_dirty(ptep))
1512 goto out;
1514 flush_page_to_ram(pte_page(pte));
1515 flush_cache_page(vma, address);
1516 flush_tlb_page(vma, address);
1517 page = pte_page(pte);
1518 page_cache_get(page);
1519 spin_unlock(&vma->vm_mm->page_table_lock);
1521 lock_page(page);
1522 error = filemap_write_page(page, 1);
1523 page_cache_free(page);
1525 spin_lock(&vma->vm_mm->page_table_lock);
1526 return error;
1528 out:
1529 return 0;
1532 static inline int filemap_sync_pte_range(pmd_t * pmd,
1533 unsigned long address, unsigned long size,
1534 struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
1536 pte_t * pte;
1537 unsigned long end;
1538 int error;
1540 if (pmd_none(*pmd))
1541 return 0;
1542 if (pmd_bad(*pmd)) {
1543 pmd_ERROR(*pmd);
1544 pmd_clear(pmd);
1545 return 0;
1547 pte = pte_offset(pmd, address);
1548 offset += address & PMD_MASK;
1549 address &= ~PMD_MASK;
1550 end = address + size;
1551 if (end > PMD_SIZE)
1552 end = PMD_SIZE;
1553 error = 0;
1554 do {
1555 error |= filemap_sync_pte(pte, vma, address + offset, flags);
1556 address += PAGE_SIZE;
1557 pte++;
1558 } while (address && (address < end));
1559 return error;
1562 static inline int filemap_sync_pmd_range(pgd_t * pgd,
1563 unsigned long address, unsigned long size,
1564 struct vm_area_struct *vma, unsigned int flags)
1566 pmd_t * pmd;
1567 unsigned long offset, end;
1568 int error;
1570 if (pgd_none(*pgd))
1571 return 0;
1572 if (pgd_bad(*pgd)) {
1573 pgd_ERROR(*pgd);
1574 pgd_clear(pgd);
1575 return 0;
1577 pmd = pmd_offset(pgd, address);
1578 offset = address & PGDIR_MASK;
1579 address &= ~PGDIR_MASK;
1580 end = address + size;
1581 if (end > PGDIR_SIZE)
1582 end = PGDIR_SIZE;
1583 error = 0;
1584 do {
1585 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
1586 address = (address + PMD_SIZE) & PMD_MASK;
1587 pmd++;
1588 } while (address && (address < end));
1589 return error;
1592 int filemap_sync(struct vm_area_struct * vma, unsigned long address,
1593 size_t size, unsigned int flags)
1595 pgd_t * dir;
1596 unsigned long end = address + size;
1597 int error = 0;
1599 /* Acquire the lock early; it may be possible to avoid dropping
1600 * and reacquiring it repeatedly.
1602 spin_lock(&vma->vm_mm->page_table_lock);
1604 dir = pgd_offset(vma->vm_mm, address);
1605 flush_cache_range(vma->vm_mm, end - size, end);
1606 if (address >= end)
1607 BUG();
1608 do {
1609 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
1610 address = (address + PGDIR_SIZE) & PGDIR_MASK;
1611 dir++;
1612 } while (address && (address < end));
1613 flush_tlb_range(vma->vm_mm, end - size, end);
1615 spin_unlock(&vma->vm_mm->page_table_lock);
1617 return error;
1621 * Shared mappings need to be able to do the right thing at
1622 * close/unmap/sync. They will also use the private file as
1623 * backing-store for swapping..
1625 static struct vm_operations_struct file_shared_mmap = {
1626 sync: filemap_sync,
1627 nopage: filemap_nopage,
1628 swapout: filemap_swapout,
1632 * Private mappings just need to be able to load in the map.
1634 * (This is actually used for shared mappings as well, if we
1635 * know they can't ever get write permissions..)
1637 static struct vm_operations_struct file_private_mmap = {
1638 nopage: filemap_nopage,
1641 /* This is used for a general mmap of a disk file */
1643 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1645 struct vm_operations_struct * ops;
1646 struct inode *inode = file->f_dentry->d_inode;
1648 ops = &file_private_mmap;
1649 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1650 if (!inode->i_mapping->a_ops->writepage)
1651 return -EINVAL;
1652 ops = &file_shared_mmap;
1654 if (!inode->i_sb || !S_ISREG(inode->i_mode))
1655 return -EACCES;
1656 if (!inode->i_mapping->a_ops->readpage)
1657 return -ENOEXEC;
1658 UPDATE_ATIME(inode);
1659 vma->vm_ops = ops;
1660 return 0;
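/*
 * Editor's sketch (not part of the original file): a filesystem whose
 * address_space operations provide readpage (and writepage, for shared
 * writable mappings) can typically plug generic_file_mmap() straight into
 * its file_operations.  The struct name below is hypothetical.
 */
#if 0	/* illustrative only, never compiled */
static struct file_operations example_file_ops = {
	read:	generic_file_read,
	mmap:	generic_file_mmap,
};
#endif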
1664 * The msync() system call.
1667 static int msync_interval(struct vm_area_struct * vma,
1668 unsigned long start, unsigned long end, int flags)
1670 if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
1671 int error;
1672 error = vma->vm_ops->sync(vma, start, end-start, flags);
1673 if (!error && (flags & MS_SYNC)) {
1674 struct file * file = vma->vm_file;
1675 if (file && file->f_op && file->f_op->fsync) {
1676 down(&file->f_dentry->d_inode->i_sem);
1677 error = file->f_op->fsync(file, file->f_dentry, 1);
1678 up(&file->f_dentry->d_inode->i_sem);
1681 return error;
1683 return 0;
1686 asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
1688 unsigned long end;
1689 struct vm_area_struct * vma;
1690 int unmapped_error, error = -EINVAL;
1692 down(&current->mm->mmap_sem);
1693 if (start & ~PAGE_MASK)
1694 goto out;
1695 len = (len + ~PAGE_MASK) & PAGE_MASK;
1696 end = start + len;
1697 if (end < start)
1698 goto out;
1699 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1700 goto out;
1701 error = 0;
1702 if (end == start)
1703 goto out;
1705 * If the interval [start,end) covers some unmapped address ranges,
1706 * just ignore them, but return -EFAULT at the end.
1708 vma = find_vma(current->mm, start);
1709 unmapped_error = 0;
1710 for (;;) {
1711 /* Still start < end. */
1712 error = -EFAULT;
1713 if (!vma)
1714 goto out;
1715 /* Here start < vma->vm_end. */
1716 if (start < vma->vm_start) {
1717 unmapped_error = -EFAULT;
1718 start = vma->vm_start;
1720 /* Here vma->vm_start <= start < vma->vm_end. */
1721 if (end <= vma->vm_end) {
1722 if (start < end) {
1723 error = msync_interval(vma, start, end, flags);
1724 if (error)
1725 goto out;
1727 error = unmapped_error;
1728 goto out;
1730 /* Here vma->vm_start <= start < vma->vm_end < end. */
1731 error = msync_interval(vma, start, vma->vm_end, flags);
1732 if (error)
1733 goto out;
1734 start = vma->vm_end;
1735 vma = vma->vm_next;
1737 out:
1738 up(&current->mm->mmap_sem);
1739 return error;
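/*
 * Editor's note (not part of this file): illustrative userspace counterpart
 * of the msync(2) system call above.  flush_mapping() is a hypothetical
 * helper that pushes dirty pages of a shared file mapping to disk and waits.
 */
#if 0	/* userspace example only */
#include <sys/mman.h>

static int flush_mapping(void *addr, size_t len)
{
	return msync(addr, len, MS_SYNC);
}
#endif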
1742 static inline void setup_read_behavior(struct vm_area_struct * vma,
1743 int behavior)
1745 VM_ClearReadHint(vma);
1746 switch(behavior) {
1747 case MADV_SEQUENTIAL:
1748 vma->vm_flags |= VM_SEQ_READ;
1749 break;
1750 case MADV_RANDOM:
1751 vma->vm_flags |= VM_RAND_READ;
1752 break;
1753 default:
1754 break;
1756 return;
1759 static long madvise_fixup_start(struct vm_area_struct * vma,
1760 unsigned long end, int behavior)
1762 struct vm_area_struct * n;
1764 n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1765 if (!n)
1766 return -EAGAIN;
1767 *n = *vma;
1768 n->vm_end = end;
1769 setup_read_behavior(n, behavior);
1770 n->vm_raend = 0;
1771 get_file(n->vm_file);
1772 if (n->vm_ops && n->vm_ops->open)
1773 n->vm_ops->open(n);
1774 lock_vma_mappings(vma);
1775 spin_lock(&vma->vm_mm->page_table_lock);
1776 vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
1777 vma->vm_start = end;
1778 __insert_vm_struct(current->mm, n);
1779 spin_unlock(&vma->vm_mm->page_table_lock);
1780 unlock_vma_mappings(vma);
1781 return 0;
1784 static long madvise_fixup_end(struct vm_area_struct * vma,
1785 unsigned long start, int behavior)
1787 struct vm_area_struct * n;
1789 n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1790 if (!n)
1791 return -EAGAIN;
1792 *n = *vma;
1793 n->vm_start = start;
1794 n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
1795 setup_read_behavior(n, behavior);
1796 n->vm_raend = 0;
1797 get_file(n->vm_file);
1798 if (n->vm_ops && n->vm_ops->open)
1799 n->vm_ops->open(n);
1800 lock_vma_mappings(vma);
1801 spin_lock(&vma->vm_mm->page_table_lock);
1802 vma->vm_end = start;
1803 __insert_vm_struct(current->mm, n);
1804 spin_unlock(&vma->vm_mm->page_table_lock);
1805 unlock_vma_mappings(vma);
1806 return 0;
1809 static long madvise_fixup_middle(struct vm_area_struct * vma,
1810 unsigned long start, unsigned long end, int behavior)
1812 struct vm_area_struct * left, * right;
1814 left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1815 if (!left)
1816 return -EAGAIN;
1817 right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1818 if (!right) {
1819 kmem_cache_free(vm_area_cachep, left);
1820 return -EAGAIN;
1822 *left = *vma;
1823 *right = *vma;
1824 left->vm_end = start;
1825 right->vm_start = end;
1826 right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
1827 left->vm_raend = 0;
1828 right->vm_raend = 0;
1829 atomic_add(2, &vma->vm_file->f_count);
1831 if (vma->vm_ops && vma->vm_ops->open) {
1832 vma->vm_ops->open(left);
1833 vma->vm_ops->open(right);
1835 lock_vma_mappings(vma);
1836 spin_lock(&vma->vm_mm->page_table_lock);
1837 vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
1838 vma->vm_start = start;
1839 vma->vm_end = end;
1840 setup_read_behavior(vma, behavior);
1841 vma->vm_raend = 0;
1842 __insert_vm_struct(current->mm, left);
1843 __insert_vm_struct(current->mm, right);
1844 spin_unlock(&vma->vm_mm->page_table_lock);
1845 unlock_vma_mappings(vma);
1846 return 0;
1850 * We can potentially split a vm area into separate
1851 * areas, each area with its own behavior.
1853 static long madvise_behavior(struct vm_area_struct * vma,
1854 unsigned long start, unsigned long end, int behavior)
1856 int error = 0;
1858 /* This caps the number of vma's this process can own */
1859 if (vma->vm_mm->map_count > MAX_MAP_COUNT)
1860 return -ENOMEM;
1862 if (start == vma->vm_start) {
1863 if (end == vma->vm_end) {
1864 setup_read_behavior(vma, behavior);
1865 vma->vm_raend = 0;
1866 } else
1867 error = madvise_fixup_start(vma, end, behavior);
1868 } else {
1869 if (end == vma->vm_end)
1870 error = madvise_fixup_end(vma, start, behavior);
1871 else
1872 error = madvise_fixup_middle(vma, start, end, behavior);
1875 return error;
1879 * Schedule all required I/O operations, then run the disk queue
1880 * to make sure they are started. Do not wait for completion.
1882 static long madvise_willneed(struct vm_area_struct * vma,
1883 unsigned long start, unsigned long end)
1885 long error = -EBADF;
1886 struct file * file;
1887 unsigned long size, rlim_rss;
1889 /* Doesn't work if there's no mapped file. */
1890 if (!vma->vm_file)
1891 return error;
1892 file = vma->vm_file;
1893 size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
1894 PAGE_CACHE_SHIFT;
1896 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1897 if (end > vma->vm_end)
1898 end = vma->vm_end;
1899 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1901 /* Make sure this doesn't exceed the process's max rss. */
1902 error = -EIO;
1903 rlim_rss = current->rlim ? current->rlim[RLIMIT_RSS].rlim_cur :
1904 LONG_MAX; /* default: see resource.h */
1905 if ((vma->vm_mm->rss + (end - start)) > rlim_rss)
1906 return error;
1908 /* round to cluster boundaries if this isn't a "random" area. */
1909 if (!VM_RandomReadHint(vma)) {
1910 start = CLUSTER_OFFSET(start);
1911 end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);
1913 while ((start < end) && (start < size)) {
1914 error = read_cluster_nonblocking(file, start, size);
1915 start += CLUSTER_PAGES;
1916 if (error < 0)
1917 break;
1919 } else {
1920 while ((start < end) && (start < size)) {
1921 error = page_cache_read(file, start);
1922 start++;
1923 if (error < 0)
1924 break;
1928 /* Don't wait for someone else to push these requests. */
1929 run_task_queue(&tq_disk);
1931 return error;
1935 * Application no longer needs these pages. If the pages are dirty,
1936 * it's OK to just throw them away. The app will be more careful about
1937 * data it wants to keep. Be sure to free swap resources too. The
1938 * zap_page_range call sets things up for refill_inactive to actually free
1939 * these pages later if no one else has touched them in the meantime,
1940 * although we could add these pages to a global reuse list for
1941 * refill_inactive to pick up before reclaiming other pages.
1943 * NB: This interface discards data rather than pushes it out to swap,
1944 * as some implementations do. This has performance implications for
1945 * applications like large transactional databases which want to discard
1946 * pages in anonymous maps after committing to backing store the data
1947 * that was kept in them. There is no reason to write this data out to
1948 * the swap area if the application is discarding it.
1950 * An interface that causes the system to free clean pages and flush
1951 * dirty pages is already available as msync(MS_INVALIDATE).
1953 static long madvise_dontneed(struct vm_area_struct * vma,
1954 unsigned long start, unsigned long end)
1956 if (vma->vm_flags & VM_LOCKED)
1957 return -EINVAL;
1959 flush_cache_range(vma->vm_mm, start, end);
1960 zap_page_range(vma->vm_mm, start, end - start);
1961 flush_tlb_range(vma->vm_mm, start, end);
1962 return 0;
1965 static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
1966 unsigned long end, int behavior)
1968 long error = -EBADF;
1970 switch (behavior) {
1971 case MADV_NORMAL:
1972 case MADV_SEQUENTIAL:
1973 case MADV_RANDOM:
1974 error = madvise_behavior(vma, start, end, behavior);
1975 break;
1977 case MADV_WILLNEED:
1978 error = madvise_willneed(vma, start, end);
1979 break;
1981 case MADV_DONTNEED:
1982 error = madvise_dontneed(vma, start, end);
1983 break;
1985 default:
1986 error = -EINVAL;
1987 break;
1990 return error;
1994 * The madvise(2) system call.
1996 * Applications can use madvise() to advise the kernel how it should
1997 * handle paging I/O in this VM area. The idea is to help the kernel
1998 * use appropriate read-ahead and caching techniques. The information
1999 * provided is advisory only, and can be safely disregarded by the
2000 * kernel without affecting the correct operation of the application.
2002 * behavior values:
2003 * MADV_NORMAL - the default behavior is to read clusters. This
2004 * results in some read-ahead and read-behind.
2005 * MADV_RANDOM - the system should read the minimum amount of data
2006 * on any access, since it is unlikely that the appli-
2007 * cation will need more than what it asks for.
2008 * MADV_SEQUENTIAL - pages in the given range will probably be accessed
2009 * once, so they can be aggressively read ahead, and
2010 * can be freed soon after they are accessed.
2011 * MADV_WILLNEED - the application is notifying the system to read
2012 * some pages ahead.
2013 * MADV_DONTNEED - the application is finished with the given range,
2014 * so the kernel can free resources associated with it.
2016 * return values:
2017 * zero - success
2018 * -EINVAL - start + len < 0, start is not page-aligned,
2019 * "behavior" is not a valid value, or application
2020 * is attempting to release locked or shared pages.
2021 * -ENOMEM - addresses in the specified range are not currently
2022 * mapped, or are outside the AS of the process.
2023 * -EIO - an I/O error occurred while paging in data.
2024 * -EBADF - map exists, but area maps something that isn't a file.
2025 * -EAGAIN - a kernel resource was temporarily unavailable.
2027 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
2029 unsigned long end;
2030 struct vm_area_struct * vma;
2031 int unmapped_error = 0;
2032 int error = -EINVAL;
2034 down(&current->mm->mmap_sem);
2036 if (start & ~PAGE_MASK)
2037 goto out;
2038 len = (len + ~PAGE_MASK) & PAGE_MASK;
2039 end = start + len;
2040 if (end < start)
2041 goto out;
2043 error = 0;
2044 if (end == start)
2045 goto out;
2048 * If the interval [start,end) covers some unmapped address
2049 * ranges, just ignore them, but return -ENOMEM at the end.
2051 vma = find_vma(current->mm, start);
2052 for (;;) {
2053 /* Still start < end. */
2054 error = -ENOMEM;
2055 if (!vma)
2056 goto out;
2058 /* Here start < vma->vm_end. */
2059 if (start < vma->vm_start) {
2060 unmapped_error = -ENOMEM;
2061 start = vma->vm_start;
2064 /* Here vma->vm_start <= start < vma->vm_end. */
2065 if (end <= vma->vm_end) {
2066 if (start < end) {
2067 error = madvise_vma(vma, start, end,
2068 behavior);
2069 if (error)
2070 goto out;
2072 error = unmapped_error;
2073 goto out;
2076 /* Here vma->vm_start <= start < vma->vm_end < end. */
2077 error = madvise_vma(vma, start, vma->vm_end, behavior);
2078 if (error)
2079 goto out;
2080 start = vma->vm_end;
2081 vma = vma->vm_next;
2084 out:
2085 up(&current->mm->mmap_sem);
2086 return error;
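/*
 * Editor's note (not part of this file): illustrative userspace usage of
 * the madvise(2) system call above.  Hinting MADV_SEQUENTIAL before
 * streaming through a mapping lets the kernel use the aggressive fixed-size
 * read-ahead in nopage_sequential_readahead().  hint_sequential() is a
 * hypothetical helper.
 */
#if 0	/* userspace example only */
#include <sys/mman.h>

static int hint_sequential(void *addr, size_t len)
{
	return madvise(addr, len, MADV_SEQUENTIAL);
}
#endif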
2090 * Later we can get more picky about what "in core" means precisely.
2091 * For now, simply check to see if the page is in the page cache,
2092 * and is up to date; i.e. that no page-in operation would be required
2093 * at this time if an application were to map and access this page.
2095 static unsigned char mincore_page(struct vm_area_struct * vma,
2096 unsigned long pgoff)
2098 unsigned char present = 0;
2099 struct address_space * as = &vma->vm_file->f_dentry->d_inode->i_data;
2100 struct page * page, ** hash = page_hash(as, pgoff);
2102 spin_lock(&pagecache_lock);
2103 page = __find_page_nolock(as, pgoff, *hash);
2104 if ((page) && (Page_Uptodate(page)))
2105 present = 1;
2106 spin_unlock(&pagecache_lock);
2108 return present;
2111 static long mincore_vma(struct vm_area_struct * vma,
2112 unsigned long start, unsigned long end, unsigned char * vec)
2114 long error, i, remaining;
2115 unsigned char * tmp;
2117 error = -ENOMEM;
2118 if (!vma->vm_file)
2119 return error;
2121 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2122 if (end > vma->vm_end)
2123 end = vma->vm_end;
2124 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2126 error = -EAGAIN;
2127 tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
2128 if (!tmp)
2129 return error;
2131 /* (end - start) is # of pages, and also # of bytes in "vec" */
2132 remaining = (end - start);
2134 error = 0;
2135 for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
2136 int j = 0;
2137 long thispiece = (remaining < PAGE_SIZE) ?
2138 remaining : PAGE_SIZE;
2140 while (j < thispiece)
2141 tmp[j++] = mincore_page(vma, start++);
2143 if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
2144 error = -EFAULT;
2145 break;
2149 free_page((unsigned long) tmp);
2150 return error;
2154 * The mincore(2) system call.
2156 * mincore() returns the memory residency status of the pages in the
2157 * current process's address space specified by [addr, addr + len).
2158 * The status is returned in a vector of bytes. The least significant
2159 * bit of each byte is 1 if the referenced page is in memory, otherwise
2160 * it is zero.
2162 * Because the status of a page can change after mincore() checks it
2163 * but before it returns to the application, the returned vector may
2164 * contain stale information. Only locked pages are guaranteed to
2165 * remain in memory.
2167 * return values:
2168 * zero - success
2169 * -EFAULT - vec points to an illegal address
2170 * -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
2171 * or len has a nonpositive value
2172 * -ENOMEM - Addresses in the range [addr, addr + len) are
2173 * invalid for the address space of this process, or
2174 * specify one or more pages which are not currently
2175 * mapped
2176 * -EAGAIN - A kernel resource was temporarily unavailable.
2178 asmlinkage long sys_mincore(unsigned long start, size_t len,
2179 unsigned char * vec)
2181 int index = 0;
2182 unsigned long end;
2183 struct vm_area_struct * vma;
2184 int unmapped_error = 0;
2185 long error = -EINVAL;
2187 down(&current->mm->mmap_sem);
2189 if (start & ~PAGE_CACHE_MASK)
2190 goto out;
2191 len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK;
2192 end = start + len;
2193 if (end < start)
2194 goto out;
2196 error = 0;
2197 if (end == start)
2198 goto out;
2201 * If the interval [start,end) covers some unmapped address
2202 * ranges, just ignore them, but return -ENOMEM at the end.
2204 vma = find_vma(current->mm, start);
2205 for (;;) {
2206 /* Still start < end. */
2207 error = -ENOMEM;
2208 if (!vma)
2209 goto out;
2211 /* Here start < vma->vm_end. */
2212 if (start < vma->vm_start) {
2213 unmapped_error = -ENOMEM;
2214 start = vma->vm_start;
2217 /* Here vma->vm_start <= start < vma->vm_end. */
2218 if (end <= vma->vm_end) {
2219 if (start < end) {
2220 error = mincore_vma(vma, start, end,
2221 &vec[index]);
2222 if (error)
2223 goto out;
2225 error = unmapped_error;
2226 goto out;
2229 /* Here vma->vm_start <= start < vma->vm_end < end. */
2230 error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
2231 if (error)
2232 goto out;
2233 index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
2234 start = vma->vm_end;
2235 vma = vma->vm_next;
2238 out:
2239 up(&current->mm->mmap_sem);
2240 return error;
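/*
 * Sketch of how the residency vector documented above is consumed from
 * userspace (illustrative only, not part of this file): "addr" and
 * "len" are assumed to describe an existing, page-aligned mapping.
 */
#include <sys/mman.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>

static void example_count_resident(void *addr, size_t len)
{
        long pagesize = getpagesize();
        size_t pages = (len + pagesize - 1) / pagesize;
        unsigned char *vec = malloc(pages);
        size_t i, resident = 0;

        if (vec && mincore(addr, len, vec) == 0) {
                for (i = 0; i < pages; i++)
                        if (vec[i] & 1) /* LSB set: page is in core */
                                resident++;
                printf("%lu of %lu pages resident\n",
                       (unsigned long) resident, (unsigned long) pages);
        }
        free(vec);
}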
2243 static inline
2244 struct page *__read_cache_page(struct address_space *mapping,
2245 unsigned long index,
2246 int (*filler)(void *,struct page*),
2247 void *data)
2249 struct page **hash = page_hash(mapping, index);
2250 struct page *page, *cached_page = NULL;
2251 int err;
2252 repeat:
2253 page = __find_get_page(mapping, index, hash);
2254 if (!page) {
2255 if (!cached_page) {
2256 cached_page = page_cache_alloc();
2257 if (!cached_page)
2258 return ERR_PTR(-ENOMEM);
2260 page = cached_page;
2261 if (add_to_page_cache_unique(page, mapping, index, hash))
2262 goto repeat;
2263 cached_page = NULL;
2264 err = filler(data, page);
2265 if (err < 0) {
2266 page_cache_release(page);
2267 page = ERR_PTR(err);
2270 if (cached_page)
2271 page_cache_free(cached_page);
2272 return page;
2276 * Read into the page cache. If a page already exists,
2277 * and Page_Uptodate() is not set, try to fill the page.
2279 struct page *read_cache_page(struct address_space *mapping,
2280 unsigned long index,
2281 int (*filler)(void *,struct page*),
2282 void *data)
2284 struct page *page;
2285 int err;
2287 retry:
2288 page = __read_cache_page(mapping, index, filler, data);
2289 if (IS_ERR(page) || Page_Uptodate(page))
2290 goto out;
2292 lock_page(page);
2293 if (!page->mapping) {
2294 UnlockPage(page);
2295 page_cache_release(page);
2296 goto retry;
2298 if (Page_Uptodate(page)) {
2299 UnlockPage(page);
2300 goto out;
2302 err = filler(data, page);
2303 if (err < 0) {
2304 page_cache_release(page);
2305 page = ERR_PTR(err);
2307 out:
2308 return page;
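/*
 * A minimal sketch of how a filesystem might use read_cache_page(),
 * passing its readpage routine in as the filler.  The "example_"
 * helpers are hypothetical and assume a readpage implementation that
 * tolerates a NULL struct file; real callers pass whatever context
 * their filler needs through the opaque data pointer.
 */
static int example_filler(void *data, struct page *page)
{
        struct address_space *mapping = data;

        return mapping->a_ops->readpage(NULL, page);
}

static struct page *example_get_page(struct address_space *mapping,
                                     unsigned long index)
{
        struct page *page;

        page = read_cache_page(mapping, index, example_filler, mapping);
        if (IS_ERR(page))
                return page;
        wait_on_page(page);     /* the filler may have started async I/O */
        if (!Page_Uptodate(page)) {
                page_cache_release(page);
                return ERR_PTR(-EIO);
        }
        return page;
}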
2311 static inline struct page * __grab_cache_page(struct address_space *mapping,
2312 unsigned long index, struct page **cached_page)
2314 struct page *page, **hash = page_hash(mapping, index);
2315 repeat:
2316 page = __find_lock_page(mapping, index, hash);
2317 if (!page) {
2318 if (!*cached_page) {
2319 *cached_page = page_cache_alloc();
2320 if (!*cached_page)
2321 return NULL;
2323 page = *cached_page;
2324 if (add_to_page_cache_unique(page, mapping, index, hash))
2325 goto repeat;
2326 *cached_page = NULL;
2328 return page;
2332 * Returns locked page at given index in given cache, creating it if needed.
2335 struct page *grab_cache_page(struct address_space *mapping, unsigned long index)
2337 struct page *cached_page = NULL;
2338 struct page *page = __grab_cache_page(mapping,index,&cached_page);
2339 if (cached_page)
2340 page_cache_free(cached_page);
2341 return page;
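/*
 * Typical calling pattern for grab_cache_page(): the page comes back
 * locked and with an extra reference, so the caller fills or modifies
 * it and then drops both.  A minimal sketch; what goes in the middle
 * is entirely up to the caller.
 */
static int example_touch_page(struct address_space *mapping,
                              unsigned long index)
{
        struct page *page = grab_cache_page(mapping, index);

        if (!page)
                return -ENOMEM;
        /* ... operate on the locked page here ... */
        UnlockPage(page);
        page_cache_release(page);
        return 0;
}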
2344 static inline void remove_suid(struct inode *inode)
2346 unsigned int mode;
2348 /* set S_ISGID if S_IXGRP is set, and always set S_ISUID */
2349 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
2351 /* were any of those set-id bits actually set on the inode? */
2352 mode &= inode->i_mode;
2353 if (mode && !capable(CAP_FSETID)) {
2354 inode->i_mode &= ~mode;
2355 mark_inode_dirty(inode);
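/*
 * Worked example of the mask construction above, using the octal
 * values from <linux/stat.h>: S_ISUID is 04000, S_ISGID is 02000 and
 * S_IXGRP is 00010, so S_ISGID/S_IXGRP is 0200.  When group-execute
 * is set, (i_mode & S_IXGRP) is 00010 and the product is 02000, i.e.
 * S_ISGID; otherwise the product is 0.  OR-ing in S_ISUID yields the
 * bits to drop on write, and the subsequent "mode &= inode->i_mode"
 * keeps only those actually set, so nothing is cleared (and the inode
 * is not dirtied) when neither bit was set to begin with.
 */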
2360 * Write to a file through the page cache.
2362 * We currently put everything into the page cache prior to writing it.
2363 * This is not a problem when writing full pages. With partial pages,
2364 * however, we first have to read the data into the cache, then
2365 * dirty the page, and finally schedule it for writing. Alternatively, we
2366 * could write-through just the portion of data that would go into that
2367 * page, but that would kill performance for applications that write data
2368 * line by line, and it's prone to race conditions.
2370 * Note that this routine doesn't try to keep track of dirty pages. Each
2371 * file system has to do this all by itself, unfortunately.
2372 * okir@monad.swb.de
2374 ssize_t
2375 generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
2377 struct inode *inode = file->f_dentry->d_inode;
2378 struct address_space *mapping = inode->i_mapping;
2379 unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
2380 loff_t pos;
2381 struct page *page, *cached_page;
2382 unsigned long written;
2383 long status;
2384 int err;
2386 cached_page = NULL;
2388 down(&inode->i_sem);
2390 pos = *ppos;
2391 err = -EINVAL;
2392 if (pos < 0)
2393 goto out;
2395 err = file->f_error;
2396 if (err) {
2397 file->f_error = 0;
2398 goto out;
2401 written = 0;
2403 if (file->f_flags & O_APPEND)
2404 pos = inode->i_size;
2407 * Check whether we've reached the file size limit.
2409 err = -EFBIG;
2410 if (limit != RLIM_INFINITY) {
2411 if (pos >= limit) {
2412 send_sig(SIGXFSZ, current, 0);
2413 goto out;
2415 if (count > limit - pos) {
2416 send_sig(SIGXFSZ, current, 0);
2417 count = limit - pos;
2421 status = 0;
2422 if (count) {
2423 remove_suid(inode);
2424 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
2425 mark_inode_dirty_sync(inode);
2428 while (count) {
2429 unsigned long bytes, index, offset;
2430 char *kaddr;
2433 * Try to find the page in the cache. If it isn't there,
2434 * allocate a free page.
2436 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
2437 index = pos >> PAGE_CACHE_SHIFT;
2438 bytes = PAGE_CACHE_SIZE - offset;
2439 if (bytes > count)
2440 bytes = count;
2442 status = -ENOMEM; /* we'll assign it later anyway */
2443 page = __grab_cache_page(mapping, index, &cached_page);
2444 if (!page)
2445 break;
2447 /* We have exclusive IO access to the page.. */
2448 if (!PageLocked(page)) {
2449 PAGE_BUG(page);
2452 status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
2453 if (status)
2454 goto unlock;
2455 kaddr = page_address(page);
2456 status = copy_from_user(kaddr+offset, buf, bytes);
2457 flush_dcache_page(page);
2458 if (status)
2459 goto fail_write;
2460 status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
2461 if (!status)
2462 status = bytes;
2464 if (status >= 0) {
2465 written += status;
2466 count -= status;
2467 pos += status;
2468 buf += status;
2470 unlock:
2471 /* Mark it unlocked again and drop the page.. */
2472 UnlockPage(page);
2473 deactivate_page(page);
2474 page_cache_release(page);
2476 if (status < 0)
2477 break;
2479 *ppos = pos;
2481 if (cached_page)
2482 page_cache_free(cached_page);
2484 /* For now, when the user asks for O_SYNC, we'll actually
2485 * provide O_DSYNC. */
2486 if ((status >= 0) && (file->f_flags & O_SYNC))
2487 status = generic_osync_inode(inode, 1); /* 1 means datasync */
2489 err = written ? written : status;
2490 out:
2492 up(&inode->i_sem);
2493 return err;
2494 fail_write:
2495 status = -EFAULT;
2496 ClearPageUptodate(page);
2497 kunmap(page);
2498 goto unlock;
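/*
 * Abbreviated sketch of how a block-based filesystem typically plugs
 * into the write path above: generic_file_write() runs the page-cache
 * loop and calls back through the address_space operations for the
 * actual block work.  The "example_" names are hypothetical stand-ins
 * for a filesystem's own routines (only declared here, not defined);
 * block_prepare_write(), block_sync_page() and generic_commit_write()
 * live in fs/buffer.c.
 */
static int example_readpage(struct file *file, struct page *page);
static int example_get_block(struct inode *inode, long block,
                             struct buffer_head *bh_result, int create);

static int example_prepare_write(struct file *file, struct page *page,
                                 unsigned from, unsigned to)
{
        return block_prepare_write(page, from, to, example_get_block);
}

static struct address_space_operations example_aops = {
        readpage:       example_readpage,
        sync_page:      block_sync_page,
        prepare_write:  example_prepare_write,
        commit_write:   generic_commit_write,
};

static struct file_operations example_file_ops = {
        read:           generic_file_read,
        write:          generic_file_write,     /* the routine above */
        mmap:           generic_file_mmap,
};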
2501 void __init page_cache_init(unsigned long mempages)
2503 unsigned long htable_size, order;
2505 htable_size = mempages;
2506 htable_size *= sizeof(struct page *);
2507 for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
2510 do {
2511 unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
2513 page_hash_bits = 0;
2514 while((tmp >>= 1UL) != 0UL)
2515 page_hash_bits++;
2517 page_hash_table = (struct page **)
2518 __get_free_pages(GFP_ATOMIC, order);
2519 } while(page_hash_table == NULL && --order > 0);
2521 printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
2522 (1 << page_hash_bits), order, (PAGE_SIZE << order));
2523 if (!page_hash_table)
2524 panic("Failed to allocate page hash table\n");
2525 memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));