/*
 *	linux/mm/filemap.c
 *
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/malloc.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/locks.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/swapctl.h>
#include <linux/slab.h>
#include <linux/init.h>

#include <asm/pgtable.h>
#include <asm/uaccess.h>
/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 */

atomic_t page_cache_size = ATOMIC_INIT(0);
unsigned int page_hash_bits;
struct page **page_hash_table;

spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
void __add_page_to_hash_queue(struct page * page, struct page **p)
{
	atomic_inc(&page_cache_size);
	if((page->next_hash = *p) != NULL)
		(*p)->pprev_hash = &page->next_hash;
	*p = page;
	page->pprev_hash = p;
	if (page->buffers)
		PAGE_BUG(page);
}
56 static void remove_page_from_hash_queue(struct page * page)
58 if(page->pprev_hash) {
59 if(page->next_hash)
60 page->next_hash->pprev_hash = page->pprev_hash;
61 *page->pprev_hash = page->next_hash;
62 page->pprev_hash = NULL;
64 atomic_dec(&page_cache_size);
67 static void remove_page_from_inode_queue(struct page * page)
69 struct inode * inode = page->inode;
70 struct page *prev, *next;
72 inode->i_nrpages--;
73 next = page->next;
74 prev = page->prev;
75 if (inode->i_pages == page)
76 inode->i_pages = next;
77 if (next)
78 next->prev = prev;
79 if (prev)
80 prev->next = next;
81 page->next = NULL;
82 page->prev = NULL;
/*
 * Remove a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
 * is safe.
 */
90 void remove_inode_page(struct page *page)
92 if (!PageLocked(page))
93 PAGE_BUG(page);
95 spin_lock(&pagecache_lock);
96 remove_page_from_inode_queue(page);
97 remove_page_from_hash_queue(page);
98 page->inode = NULL;
99 spin_unlock(&pagecache_lock);
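
/*
 * Illustrative sketch (editor's addition, not in the original source):
 * the expected calling pattern for remove_inode_page(), as used by the
 * truncation path below.  The caller pins and locks the page first, so
 * that the hash and inode lists can be updated safely:
 *
 *	get_page(page);			// pin it across the blocking lock
 *	lock_page(page);		// PageLocked() must hold here
 *	remove_inode_page(page);	// drops it from hash + inode lists
 *	UnlockPage(page);
 *	page_cache_release(page);	// release our pin
 *	page_cache_release(page);	// release the cache's own reference
 */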
102 void invalidate_inode_pages(struct inode * inode)
104 struct page ** p;
105 struct page * page;
107 repeat:
108 spin_lock(&pagecache_lock);
109 p = &inode->i_pages;
110 while ((page = *p) != NULL) {
111 get_page(page);
112 if (TryLockPage(page)) {
113 spin_unlock(&pagecache_lock);
114 wait_on_page(page);
115 page_cache_release(page);
116 goto repeat;
118 if (page_count(page) != 2)
			printk("hm, busy page invalidated? (not necessarily a bug)\n");
121 remove_page_from_inode_queue(page);
122 remove_page_from_hash_queue(page);
123 page->inode = NULL;
124 UnlockPage(page);
125 page_cache_release(page);
126 page_cache_release(page);
129 spin_unlock(&pagecache_lock);
/*
 * Truncate the page cache at a set offset, removing the pages
 * that are beyond that offset (and zeroing out partial pages).
 */
135 void truncate_inode_pages(struct inode * inode, unsigned long start)
137 struct page ** p;
138 struct page * page;
139 int partial = 0;
141 repeat:
142 spin_lock(&pagecache_lock);
143 p = &inode->i_pages;
144 while ((page = *p) != NULL) {
145 unsigned long offset = page->offset;
147 /* page wholly truncated - free it */
148 if (offset >= start) {
149 get_page(page);
150 spin_unlock(&pagecache_lock);
152 lock_page(page);
154 if (inode->i_op->flushpage)
155 inode->i_op->flushpage(inode, page, 0);
158 * We remove the page from the page cache
159 * _after_ we have destroyed all buffer-cache
160 * references to it. Otherwise some other process
161 * might think this inode page is not in the
162 * page cache and creates a buffer-cache alias
163 * to it causing all sorts of fun problems ...
165 remove_inode_page(page);
167 UnlockPage(page);
168 page_cache_release(page);
169 page_cache_release(page);
172 * We have done things without the pagecache lock,
173 * so we'll have to repeat the scan.
174 * It's not possible to deadlock here because
175 * we are guaranteed to make progress. (ie. we have
176 * just removed a page)
178 goto repeat;
180 p = &page->next;
182 * there is only one partial page possible.
184 if (partial)
185 continue;
187 offset = start - offset;
188 /* partial truncate, clear end of page */
189 if (offset < PAGE_CACHE_SIZE) {
190 unsigned long address;
191 get_page(page);
192 spin_unlock(&pagecache_lock);
194 lock_page(page);
195 partial = 1;
197 address = page_address(page);
198 memset((void *) (offset + address), 0, PAGE_CACHE_SIZE - offset);
199 flush_page_to_ram(address);
201 if (inode->i_op->flushpage)
202 inode->i_op->flushpage(inode, page, offset);
204 * we have dropped the spinlock so we have to
205 * restart.
207 UnlockPage(page);
208 page_cache_release(page);
209 goto repeat;
212 spin_unlock(&pagecache_lock);
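
/*
 * Worked example (editor's addition): with 4K pages, truncating an inode
 * to start = 5000 leaves the page at page->offset = 4096 as the single
 * partial page.  The code above computes offset = start - page->offset =
 * 904 and then zeroes the tail of that page:
 *
 *	memset((void *) (address + 904), 0, PAGE_CACHE_SIZE - 904);
 *
 * so bytes 904..4095 of it read back as zeroes.  Pages whose offset is
 * >= start (8192, 12288, ...) are removed from the cache entirely, and
 * pages below the partial one (offset 0) are untouched.
 */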
215 extern atomic_t too_many_dirty_buffers;
217 int shrink_mmap(int priority, int gfp_mask)
219 static unsigned long clock = 0;
220 unsigned long limit = num_physpages << 1;
221 struct page * page;
222 int count, users;
224 count = limit >> priority;
226 page = mem_map + clock;
227 do {
228 int referenced;
230 /* This works even in the presence of PageSkip because
231 * the first two entries at the beginning of a hole will
232 * be marked, not just the first.
234 page++;
235 clock++;
236 if (clock >= max_mapnr) {
237 clock = 0;
238 page = mem_map;
240 if (PageSkip(page)) {
241 /* next_hash is overloaded for PageSkip */
242 page = page->next_hash;
243 clock = page - mem_map;
246 referenced = test_and_clear_bit(PG_referenced, &page->flags);
248 if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
249 continue;
251 count--;
254 * Some common cases that we just short-circuit without
255 * getting the locks - we need to re-check this once we
256 * have the lock, but that's fine.
258 users = page_count(page);
259 if (!users)
260 continue;
261 if (!page->buffers) {
262 if (!page->inode)
263 continue;
264 if (users > 1)
265 continue;
269 * ok, now the page looks interesting. Re-check things
270 * and keep the lock.
272 spin_lock(&pagecache_lock);
273 if (!page->inode && !page->buffers) {
274 spin_unlock(&pagecache_lock);
275 continue;
277 if (!page_count(page)) {
278 spin_unlock(&pagecache_lock);
279 BUG();
280 continue;
282 get_page(page);
283 if (TryLockPage(page)) {
284 spin_unlock(&pagecache_lock);
285 goto put_continue;
	/*
	 * We keep the pagecache_lock locked and unlock it in
	 * each branch, so that the page->inode case doesn't
	 * have to re-grab it. Here comes the 'real' logic
	 * to free memory:
	 */
295 /* Is it a buffer page? */
296 if (page->buffers) {
297 spin_unlock(&pagecache_lock);
298 if (!try_to_free_buffers(page))
299 goto unlock_continue;
300 /* page was locked, inode can't go away under us */
301 if (!page->inode)
303 atomic_sub(PAGE_CACHE_SIZE, &buffermem);
304 goto made_progress;
306 spin_lock(&pagecache_lock);
310 * We can't free pages unless there's just one user
311 * (count == 2 because we added one ourselves above).
313 if (page_count(page) != 2)
314 goto spin_unlock_continue;
317 * Is it a page swap page? If so, we want to
318 * drop it if it is no longer used, even if it
319 * were to be marked referenced..
321 if (PageSwapCache(page)) {
322 spin_unlock(&pagecache_lock);
323 if (referenced && swap_count(page->offset) != 2)
324 goto unlock_continue;
325 __delete_from_swap_cache(page);
326 page_cache_release(page);
327 goto made_progress;
330 /* is it a page-cache page? */
331 if (!referenced && page->inode && !pgcache_under_min()) {
332 remove_page_from_inode_queue(page);
333 remove_page_from_hash_queue(page);
334 page->inode = NULL;
335 spin_unlock(&pagecache_lock);
337 page_cache_release(page);
338 goto made_progress;
340 spin_unlock_continue:
341 spin_unlock(&pagecache_lock);
342 unlock_continue:
343 UnlockPage(page);
344 put_continue:
345 put_page(page);
346 } while (count > 0);
347 return 0;
348 made_progress:
349 UnlockPage(page);
350 put_page(page);
351 return 1;
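
/*
 * Worked example (editor's addition): the scan budget above is
 * count = (num_physpages << 1) >> priority.  On a 64MB machine with 4K
 * pages (num_physpages = 16384), a priority of 6 gives
 *
 *	count = 32768 >> 6 = 512
 *
 * candidate pages examined per call, while priority 0 (most urgent)
 * lets the clock hand sweep the whole of mem_map twice.
 */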
354 static inline struct page * __find_page_nolock(struct inode * inode, unsigned long offset, struct page *page)
356 goto inside;
358 for (;;) {
359 page = page->next_hash;
360 inside:
361 if (!page)
362 goto not_found;
363 if (page->inode != inode)
364 continue;
365 if (page->offset == offset)
366 break;
368 set_bit(PG_referenced, &page->flags);
369 not_found:
370 return page;
374 * By the time this is called, the page is locked and
375 * we don't have to worry about any races any more.
377 * Start the IO..
379 static int writeout_one_page(struct page *page)
381 struct buffer_head *bh, *head = page->buffers;
383 bh = head;
384 do {
385 if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
386 continue;
388 bh->b_flushtime = 0;
389 ll_rw_block(WRITE, 1, &bh);
390 } while ((bh = bh->b_this_page) != head);
391 return 0;
394 static int waitfor_one_page(struct page *page)
396 int error = 0;
397 struct buffer_head *bh, *head = page->buffers;
399 bh = head;
400 do {
401 wait_on_buffer(bh);
402 if (buffer_req(bh) && !buffer_uptodate(bh))
403 error = -EIO;
404 } while ((bh = bh->b_this_page) != head);
405 return error;
408 static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end, int (*fn)(struct page *))
410 struct page *next;
411 int retval = 0;
413 start &= PAGE_MASK;
415 spin_lock(&pagecache_lock);
416 next = inode->i_pages;
417 while (next) {
418 struct page *page = next;
419 next = page->next;
420 if (!page->buffers)
421 continue;
422 if (page->offset >= end)
423 continue;
424 if (page->offset < start)
425 continue;
427 get_page(page);
428 spin_unlock(&pagecache_lock);
429 lock_page(page);
431 /* The buffers could have been free'd while we waited for the page lock */
432 if (page->buffers)
433 retval |= fn(page);
435 UnlockPage(page);
436 spin_lock(&pagecache_lock);
437 next = page->next;
438 page_cache_release(page);
440 spin_unlock(&pagecache_lock);
442 return retval;
/*
 * Two-stage data sync: first start the IO, then go back and
 * collect the information..
 */
449 int generic_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end)
451 int retval;
453 retval = do_buffer_fdatasync(inode, start, end, writeout_one_page);
454 retval |= do_buffer_fdatasync(inode, start, end, waitfor_one_page);
455 return retval;
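
/*
 * Usage sketch (editor's addition, hypothetical caller): a filesystem's
 * fsync path could flush a whole file's dirty buffer-cache pages with
 *
 *	err = generic_buffer_fdatasync(inode, 0, inode->i_size);
 *
 * The first pass (writeout_one_page) starts IO on every dirty, up-to-date
 * buffer in the range; the second pass (waitfor_one_page) waits for that
 * IO to complete and collects any -EIO result.
 */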
459 * This adds a page to the page cache, starting out as locked,
460 * owned by us, referenced, but not uptodate and with no errors.
462 static inline void __add_to_page_cache(struct page * page,
463 struct inode * inode, unsigned long offset,
464 struct page **hash)
466 unsigned long flags;
468 flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error));
469 page->flags = flags | ((1 << PG_locked) | (1 << PG_referenced));
470 page->owner = current; /* REMOVEME */
471 get_page(page);
472 page->offset = offset;
473 add_page_to_inode_queue(inode, page);
474 __add_page_to_hash_queue(page, hash);
477 void add_to_page_cache(struct page * page, struct inode * inode, unsigned long offset)
479 spin_lock(&pagecache_lock);
480 __add_to_page_cache(page, inode, offset, page_hash(inode, offset));
481 spin_unlock(&pagecache_lock);
484 int add_to_page_cache_unique(struct page * page,
485 struct inode * inode, unsigned long offset,
486 struct page **hash)
488 int err;
489 struct page *alias;
491 spin_lock(&pagecache_lock);
492 alias = __find_page_nolock(inode, offset, *hash);
494 err = 1;
495 if (!alias) {
496 __add_to_page_cache(page,inode,offset,hash);
497 err = 0;
500 spin_unlock(&pagecache_lock);
501 return err;
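
/*
 * Illustrative sketch (editor's addition): the "check, then insert" pattern
 * that add_to_page_cache_unique() enables.  Callers below (the read path and
 * generic_file_write) allocate a page first and retry if somebody else won
 * the race:
 *
 *	page_cache = page_cache_alloc();
 *	...
 *	page = page_cache_entry(page_cache);
 *	if (add_to_page_cache_unique(page, inode, offset, hash))
 *		goto repeat_find;	// lost the race - look the page up again
 *	// we now own the new, locked page and can start IO on it
 */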
505 * Try to read ahead in the file. "page_cache" is a potentially free page
506 * that we could use for the cache (if it is 0 we can try to create one,
507 * this is all overlapped with the IO on the previous page finishing anyway)
509 static unsigned long try_to_read_ahead(struct file * file,
510 unsigned long offset, unsigned long page_cache)
512 struct inode *inode = file->f_dentry->d_inode;
513 struct page * page;
514 struct page ** hash;
516 offset &= PAGE_CACHE_MASK;
517 switch (page_cache) {
518 case 0:
519 page_cache = page_cache_alloc();
520 if (!page_cache)
521 break;
522 default:
523 if (offset >= inode->i_size)
524 break;
525 hash = page_hash(inode, offset);
526 page = page_cache_entry(page_cache);
527 if (!add_to_page_cache_unique(page, inode, offset, hash)) {
529 * We do not have to check the return value here
530 * because it's a readahead.
532 inode->i_op->readpage(file, page);
533 page_cache = 0;
534 page_cache_release(page);
537 return page_cache;
541 * Wait for a page to get unlocked.
543 * This must be called with the caller "holding" the page,
544 * ie with increased "page->count" so that the page won't
545 * go away during the wait..
547 void ___wait_on_page(struct page *page)
549 struct task_struct *tsk = current;
550 DECLARE_WAITQUEUE(wait, tsk);
552 add_wait_queue(&page->wait, &wait);
553 do {
554 tsk->state = TASK_UNINTERRUPTIBLE;
555 run_task_queue(&tq_disk);
556 if (!PageLocked(page))
557 break;
558 schedule();
559 } while (PageLocked(page));
560 tsk->state = TASK_RUNNING;
561 remove_wait_queue(&page->wait, &wait);
565 * Get an exclusive lock on the page..
567 void lock_page(struct page *page)
569 if (TryLockPage(page)) {
570 struct task_struct *tsk = current;
571 DECLARE_WAITQUEUE(wait, current);
573 run_task_queue(&tq_disk);
574 add_wait_queue(&page->wait, &wait);
575 tsk->state = TASK_UNINTERRUPTIBLE;
577 while (TryLockPage(page)) {
578 run_task_queue(&tq_disk);
579 schedule();
580 tsk->state = TASK_UNINTERRUPTIBLE;
583 remove_wait_queue(&page->wait, &wait);
584 tsk->state = TASK_RUNNING;
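
/*
 * Illustrative sketch (editor's addition): the canonical locking pattern
 * around page IO in this file.  Whoever changes page state takes the
 * exclusive lock; code that only needs the IO to have finished uses
 * wait_on_page():
 *
 *	lock_page(page);		// sleeps until we own PG_locked
 *	...modify / start IO on the page...
 *	UnlockPage(page);		// wakes anyone sleeping in page->wait
 *
 *	wait_on_page(page);		// just waits for PG_locked to clear
 */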
590 * a rather lightweight function, finding and getting a reference to a
591 * hashed page atomically, waiting for it if it's locked.
593 struct page * __find_get_page (struct inode * inode,
594 unsigned long offset, struct page **hash)
596 struct page *page;
599 * We scan the hash list read-only. Addition to and removal from
600 * the hash-list needs a held write-lock.
602 repeat:
603 spin_lock(&pagecache_lock);
604 page = __find_page_nolock(inode, offset, *hash);
605 if (page)
606 get_page(page);
607 spin_unlock(&pagecache_lock);
609 /* Found the page, sleep if locked. */
610 if (page && PageLocked(page)) {
611 struct task_struct *tsk = current;
612 DECLARE_WAITQUEUE(wait, tsk);
614 add_wait_queue(&page->wait, &wait);
615 tsk->state = TASK_UNINTERRUPTIBLE;
617 run_task_queue(&tq_disk);
618 if (PageLocked(page))
619 schedule();
620 tsk->state = TASK_RUNNING;
621 remove_wait_queue(&page->wait, &wait);
624 * The page might have been unhashed meanwhile. It's
625 * not freed though because we hold a reference to it.
626 * If this is the case then it will be freed _here_,
627 * and we recheck the hash anyway.
629 page_cache_release(page);
630 goto repeat;
633 * It's not locked so we can return the page and we hold
634 * a reference to it.
636 return page;
640 * Get the lock to a page atomically.
642 struct page * __find_lock_page (struct inode * inode,
643 unsigned long offset, struct page **hash)
645 struct page *page;
648 * We scan the hash list read-only. Addition to and removal from
649 * the hash-list needs a held write-lock.
651 repeat:
652 spin_lock(&pagecache_lock);
653 page = __find_page_nolock(inode, offset, *hash);
654 if (page)
655 get_page(page);
656 spin_unlock(&pagecache_lock);
658 /* Found the page, sleep if locked. */
659 if (page && TryLockPage(page)) {
660 struct task_struct *tsk = current;
661 DECLARE_WAITQUEUE(wait, tsk);
663 add_wait_queue(&page->wait, &wait);
664 tsk->state = TASK_UNINTERRUPTIBLE;
666 run_task_queue(&tq_disk);
667 if (PageLocked(page))
668 schedule();
669 tsk->state = TASK_RUNNING;
670 remove_wait_queue(&page->wait, &wait);
673 * The page might have been unhashed meanwhile. It's
674 * not freed though because we hold a reference to it.
675 * If this is the case then it will be freed _here_,
676 * and we recheck the hash anyway.
678 page_cache_release(page);
679 goto repeat;
682 * It's not locked so we can return the page and we hold
683 * a reference to it.
685 return page;
688 #if 0
689 #define PROFILE_READAHEAD
690 #define DEBUG_READAHEAD
691 #endif
/*
 * Read-ahead profiling information
 * --------------------------------
 * Every PROFILE_MAXREADCOUNT, the following information is written
 * to the syslog:
 *   Percentage of asynchronous read-ahead.
 *   Average of read-ahead fields context value.
 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
 * to the syslog.
 */
704 #ifdef PROFILE_READAHEAD
706 #define PROFILE_MAXREADCOUNT 1000
708 static unsigned long total_reada;
709 static unsigned long total_async;
710 static unsigned long total_ramax;
711 static unsigned long total_ralen;
712 static unsigned long total_rawin;
714 static void profile_readahead(int async, struct file *filp)
716 unsigned long flags;
718 ++total_reada;
719 if (async)
720 ++total_async;
722 total_ramax += filp->f_ramax;
723 total_ralen += filp->f_ralen;
724 total_rawin += filp->f_rawin;
726 if (total_reada > PROFILE_MAXREADCOUNT) {
727 save_flags(flags);
728 cli();
729 if (!(total_reada > PROFILE_MAXREADCOUNT)) {
730 restore_flags(flags);
731 return;
734 printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
735 total_ramax/total_reada,
736 total_ralen/total_reada,
737 total_rawin/total_reada,
738 (total_async*100)/total_reada);
739 #ifdef DEBUG_READAHEAD
740 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
741 filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
742 #endif
744 total_reada = 0;
745 total_async = 0;
746 total_ramax = 0;
747 total_ralen = 0;
748 total_rawin = 0;
750 restore_flags(flags);
753 #endif /* defined PROFILE_READAHEAD */
/*
 * Read-ahead context:
 * -------------------
 * The read-ahead context fields of the "struct file" are the following:
 * - f_raend : position of the first byte after the last page we tried to
 *	       read ahead.
 * - f_ramax : current read-ahead maximum size.
 * - f_ralen : length of the current IO read block we tried to read ahead.
 * - f_rawin : length of the current read-ahead window.
 *		if last read-ahead was synchronous then
 *			f_rawin = f_ralen
 *		otherwise (was asynchronous)
 *			f_rawin = previous value of f_ralen + f_ralen
 *
 * Read-ahead limits:
 * ------------------
 * MIN_READAHEAD : minimum read-ahead size when reading ahead.
 * MAX_READAHEAD : maximum read-ahead size when reading ahead.
 *
 * Synchronous read-ahead benefits:
 * --------------------------------
 * Using a reasonable IO transfer length from peripheral devices increases
 * system performance.
 * Reasonable means, in this context, not too large but not too small.
 * The actual maximum value is:
 *	MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
 *	and 32K if defined (4K page size assumed).
 *
 * Asynchronous read-ahead benefits:
 * ---------------------------------
 * Overlapping the next read request with user process execution increases
 * system performance.
 *
 * Read-ahead risks:
 * -----------------
 * We have to guess which further data are needed by the user process.
 * If these data are often not really needed, it's bad for system
 * performance.
 * However, we know that files are often accessed sequentially by
 * application programs and it seems that it is possible to have some good
 * strategy in that guessing.
 * We only try to read ahead files that seem to be read sequentially.
 *
 * Asynchronous read-ahead risks:
 * ------------------------------
 * In order to maximize overlapping, we must start some asynchronous read
 * requests from the device as soon as possible.
 * We must be very careful about:
 * - The number of effective pending IO read requests.
 *   ONE seems to be the only reasonable value.
 * - The total memory pool usage for the file access stream.
 *   This maximum memory usage is implicitly 2 IO read chunks:
 *	2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
 *	64k if defined (4K page size assumed).
 */
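
/*
 * Worked example (editor's addition): with 4K pages, suppose a file is read
 * sequentially and the first, synchronous read-ahead pulls in 16K starting
 * at offset 0.  Then
 *
 *	f_ralen = 16K, f_rawin = 16K, f_raend = 16K
 *
 * When the reader later crosses into that window, an asynchronous
 * read-ahead of (say) 32K is started at 16K, giving
 *
 *	f_ralen = 32K, f_rawin = 16K + 32K = 48K, f_raend = 48K
 *
 * i.e. f_rawin always spans the last two read-ahead chunks, which is what
 * the "position is inside the previous read-ahead window" tests below use
 * to decide whether the access pattern still looks sequential.
 */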
811 static inline int get_max_readahead(struct inode * inode)
813 if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
814 return MAX_READAHEAD;
815 return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
818 static inline unsigned long generic_file_readahead(int reada_ok,
819 struct file * filp, struct inode * inode,
820 unsigned long ppos, struct page * page, unsigned long page_cache)
822 unsigned long max_ahead, ahead;
823 unsigned long raend;
824 int max_readahead = get_max_readahead(inode);
826 raend = filp->f_raend & PAGE_CACHE_MASK;
827 max_ahead = 0;
	/*
	 * The current page is locked.
	 * If the current position is inside the previous read IO request, do not
	 * try to reread previously read-ahead pages.
	 * Otherwise decide whether or not to read ahead some pages synchronously.
	 * If we are not going to read ahead, set the read-ahead context for this
	 * page only.
	 */
837 if (PageLocked(page)) {
838 if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
839 raend = ppos;
840 if (raend < inode->i_size)
841 max_ahead = filp->f_ramax;
842 filp->f_rawin = 0;
843 filp->f_ralen = PAGE_CACHE_SIZE;
844 if (!max_ahead) {
845 filp->f_raend = ppos + filp->f_ralen;
846 filp->f_rawin += filp->f_ralen;
	/*
	 * The current page is not locked.
	 * If we were reading ahead, and
	 * the current max read-ahead size is not zero, and
	 * the current position is inside the last read-ahead IO request,
	 * then this is the moment to try to read ahead asynchronously.
	 * We will later force an unplug of the device in order to force
	 * asynchronous read IO.
	 */
858 else if (reada_ok && filp->f_ramax && raend >= PAGE_CACHE_SIZE &&
859 ppos <= raend && ppos + filp->f_ralen >= raend) {
861 * Add ONE page to max_ahead in order to try to have about the same IO max size
862 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
863 * Compute the position of the last page we have tried to read in order to
864 * begin to read ahead just at the next page.
866 raend -= PAGE_CACHE_SIZE;
867 if (raend < inode->i_size)
868 max_ahead = filp->f_ramax + PAGE_CACHE_SIZE;
870 if (max_ahead) {
871 filp->f_rawin = filp->f_ralen;
872 filp->f_ralen = 0;
873 reada_ok = 2;
	/*
	 * Try to read ahead pages.
	 * We hope that ll_rw_blk() plug/unplug, coalescence, request sorting
	 * and the scheduler will work well enough for us to avoid issuing
	 * badly formed actual IO requests.
	 */
881 ahead = 0;
882 while (ahead < max_ahead) {
883 ahead += PAGE_CACHE_SIZE;
884 page_cache = try_to_read_ahead(filp, raend + ahead,
885 page_cache);
	/*
	 * If we tried to read ahead some pages:
	 * - if we tried to read ahead asynchronously, force an unplug of the
	 *   device in order to start the asynchronous read IO request;
	 * - update the read-ahead context;
	 * - store the length of the current read-ahead window;
	 * - double the current max read-ahead size.
	 * This heuristic avoids doing large IO for files that are not really
	 * accessed sequentially.
	 */
898 if (ahead) {
899 if (reada_ok == 2) {
900 run_task_queue(&tq_disk);
903 filp->f_ralen += ahead;
904 filp->f_rawin += filp->f_ralen;
905 filp->f_raend = raend + ahead + PAGE_CACHE_SIZE;
907 filp->f_ramax += filp->f_ramax;
909 if (filp->f_ramax > max_readahead)
910 filp->f_ramax = max_readahead;
912 #ifdef PROFILE_READAHEAD
913 profile_readahead((reada_ok == 2), filp);
914 #endif
917 return page_cache;
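
/*
 * Worked example (editor's addition): after each successful read-ahead the
 * code above doubles f_ramax and clamps it to the per-device maximum, so a
 * sequential reader's window grows geometrically, e.g. (4K pages)
 *
 *	8K -> 16K -> 32K -> ... -> max_readahead
 *
 * while a reader that stops looking sequential has its read-ahead context
 * reset in do_generic_file_read() and starts again from a small window.
 */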
921 * "descriptor" for what we're up to with a read.
922 * This allows us to use the same read code yet
923 * have multiple different users of the data that
924 * we read from a file.
926 * The simplest case just copies the data to user
927 * mode.
929 typedef struct {
930 size_t written;
931 size_t count;
932 char * buf;
933 int error;
934 } read_descriptor_t;
936 typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long);
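
/*
 * Illustrative sketch (editor's addition, hypothetical actor): besides the
 * two actors defined below (file_read_actor for read(2), file_send_actor for
 * sendfile(2)), any consumer of file data can be expressed as a read_actor_t.
 * For example, an actor that merely measures how much readable data there is
 * might look like:
 *
 *	static int count_actor(read_descriptor_t *desc, const char *area,
 *			       unsigned long size)
 *	{
 *		if (size > desc->count)
 *			size = desc->count;
 *		desc->count -= size;
 *		desc->written += size;
 *		return size;	// "consumed" this many bytes
 *	}
 *
 * do_generic_file_read() keeps going while the actor returns non-zero and
 * desc->count is non-zero, and reports errors through desc->error.
 */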
/*
 * This is a generic file read routine, and uses the
 * inode->i_op->readpage() function for the actual low-level
 * stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 */
946 static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
948 struct dentry *dentry = filp->f_dentry;
949 struct inode *inode = dentry->d_inode;
950 size_t pos, pgpos, page_cache;
951 int reada_ok;
952 int error;
953 int max_readahead = get_max_readahead(inode);
955 page_cache = 0;
957 pos = *ppos;
958 pgpos = pos & PAGE_CACHE_MASK;
960 * If the current position is outside the previous read-ahead window,
961 * we reset the current read-ahead context and set read ahead max to zero
962 * (will be set to just needed value later),
963 * otherwise, we assume that the file accesses are sequential enough to
964 * continue read-ahead.
966 if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) {
967 reada_ok = 0;
968 filp->f_raend = 0;
969 filp->f_ralen = 0;
970 filp->f_ramax = 0;
971 filp->f_rawin = 0;
972 } else {
973 reada_ok = 1;
	/*
	 * Adjust the current value of read-ahead max.
	 * If the read operation stays within the first half page, force no readahead.
	 * Otherwise try to increase read-ahead max just enough to do the read request.
	 * Then, at least MIN_READAHEAD if read-ahead is ok,
	 * and at most MAX_READAHEAD in all cases.
	 */
982 if (pos + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
983 filp->f_ramax = 0;
984 } else {
985 unsigned long needed;
987 needed = ((pos + desc->count) & PAGE_CACHE_MASK) - pgpos;
989 if (filp->f_ramax < needed)
990 filp->f_ramax = needed;
992 if (reada_ok && filp->f_ramax < MIN_READAHEAD)
993 filp->f_ramax = MIN_READAHEAD;
994 if (filp->f_ramax > max_readahead)
995 filp->f_ramax = max_readahead;
998 for (;;) {
999 struct page *page, **hash;
1001 if (pos >= inode->i_size)
1002 break;
1005 * Try to find the data in the page cache..
1007 hash = page_hash(inode, pos & PAGE_CACHE_MASK);
1009 spin_lock(&pagecache_lock);
1010 page = __find_page_nolock(inode, pos & PAGE_CACHE_MASK, *hash);
1011 if (!page)
1012 goto no_cached_page;
1013 found_page:
1014 get_page(page);
1015 spin_unlock(&pagecache_lock);
1017 if (!Page_Uptodate(page))
1018 goto page_not_up_to_date;
1019 page_ok:
1021 * Ok, we have the page, and it's up-to-date, so
1022 * now we can copy it to user space...
1025 unsigned long offset, nr;
1027 offset = pos & ~PAGE_CACHE_MASK;
1028 nr = PAGE_CACHE_SIZE - offset;
1029 if (nr > inode->i_size - pos)
1030 nr = inode->i_size - pos;
1033 * The actor routine returns how many bytes were actually used..
1034 * NOTE! This may not be the same as how much of a user buffer
1035 * we filled up (we may be padding etc), so we can only update
1036 * "pos" here (the actor routine has to update the user buffer
1037 * pointers and the remaining count).
1039 nr = actor(desc, (const char *) (page_address(page) + offset), nr);
1040 pos += nr;
1041 page_cache_release(page);
1042 if (nr && desc->count)
1043 continue;
1044 break;
1048 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
1050 page_not_up_to_date:
1051 page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
1053 if (Page_Uptodate(page))
1054 goto page_ok;
1056 /* Get exclusive access to the page ... */
1057 lock_page(page);
1058 if (Page_Uptodate(page)) {
1059 UnlockPage(page);
1060 goto page_ok;
1063 readpage:
1064 /* ... and start the actual read. The read will unlock the page. */
1065 error = inode->i_op->readpage(filp, page);
1067 if (!error) {
1068 if (Page_Uptodate(page))
1069 goto page_ok;
1071 /* Again, try some read-ahead while waiting for the page to finish.. */
1072 page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
1073 wait_on_page(page);
1074 if (Page_Uptodate(page))
1075 goto page_ok;
1076 error = -EIO;
1079 /* UHHUH! A synchronous read error occurred. Report it */
1080 desc->error = error;
1081 page_cache_release(page);
1082 break;
1084 no_cached_page:
1086 * Ok, it wasn't cached, so we need to create a new
1087 * page..
1089 * We get here with the page cache lock held.
1091 if (!page_cache) {
1092 spin_unlock(&pagecache_lock);
1093 page_cache = page_cache_alloc();
1094 if (!page_cache) {
1095 desc->error = -ENOMEM;
1096 break;
1100 * Somebody may have added the page while we
1101 * dropped the page cache lock. Check for that.
1103 spin_lock(&pagecache_lock);
1104 page = __find_page_nolock(inode, pos & PAGE_CACHE_MASK, *hash);
1105 if (page)
1106 goto found_page;
1110 * Ok, add the new page to the hash-queues...
1112 page = page_cache_entry(page_cache);
1113 __add_to_page_cache(page, inode, pos & PAGE_CACHE_MASK, hash);
1114 spin_unlock(&pagecache_lock);
1116 page_cache = 0;
1117 goto readpage;
1120 *ppos = pos;
1121 filp->f_reada = 1;
1122 if (page_cache)
1123 page_cache_free(page_cache);
1124 UPDATE_ATIME(inode);
1127 static int file_read_actor(read_descriptor_t * desc, const char *area, unsigned long size)
1129 unsigned long left;
1130 unsigned long count = desc->count;
1132 if (size > count)
1133 size = count;
1134 left = __copy_to_user(desc->buf, area, size);
1135 if (left) {
1136 size -= left;
1137 desc->error = -EFAULT;
1139 desc->count = count - size;
1140 desc->written += size;
1141 desc->buf += size;
1142 return size;
1146 * This is the "read()" routine for all filesystems
1147 * that can use the page cache directly.
1149 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1151 ssize_t retval;
1153 retval = -EFAULT;
1154 if (access_ok(VERIFY_WRITE, buf, count)) {
1155 retval = 0;
1156 if (count) {
1157 read_descriptor_t desc;
1159 desc.written = 0;
1160 desc.count = count;
1161 desc.buf = buf;
1162 desc.error = 0;
1163 do_generic_file_read(filp, ppos, &desc, file_read_actor);
1165 retval = desc.written;
1166 if (!retval)
1167 retval = desc.error;
1170 return retval;
1173 static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned long size)
1175 ssize_t written;
1176 unsigned long count = desc->count;
1177 struct file *file = (struct file *) desc->buf;
1178 mm_segment_t old_fs;
1180 if (size > count)
1181 size = count;
1182 old_fs = get_fs();
1183 set_fs(KERNEL_DS);
1184 written = file->f_op->write(file, area, size, &file->f_pos);
1185 set_fs(old_fs);
1186 if (written < 0) {
1187 desc->error = written;
1188 written = 0;
1190 desc->count = count - written;
1191 desc->written += written;
1192 return written;
1195 asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
1197 ssize_t retval;
1198 struct file * in_file, * out_file;
1199 struct inode * in_inode, * out_inode;
1202 * Get input file, and verify that it is ok..
1204 retval = -EBADF;
1205 in_file = fget(in_fd);
1206 if (!in_file)
1207 goto out;
1208 if (!(in_file->f_mode & FMODE_READ))
1209 goto fput_in;
1210 retval = -EINVAL;
1211 in_inode = in_file->f_dentry->d_inode;
1212 if (!in_inode)
1213 goto fput_in;
1214 if (!in_inode->i_op || !in_inode->i_op->readpage)
1215 goto fput_in;
1216 retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
1217 if (retval)
1218 goto fput_in;
1221 * Get output file, and verify that it is ok..
1223 retval = -EBADF;
1224 out_file = fget(out_fd);
1225 if (!out_file)
1226 goto fput_in;
1227 if (!(out_file->f_mode & FMODE_WRITE))
1228 goto fput_out;
1229 retval = -EINVAL;
1230 if (!out_file->f_op || !out_file->f_op->write)
1231 goto fput_out;
1232 out_inode = out_file->f_dentry->d_inode;
1233 if (!out_inode)
1234 goto fput_out;
1235 retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
1236 if (retval)
1237 goto fput_out;
1239 retval = 0;
1240 if (count) {
1241 read_descriptor_t desc;
1242 loff_t pos = 0, *ppos;
1244 retval = -EFAULT;
1245 ppos = &in_file->f_pos;
1246 if (offset) {
1247 if (get_user(pos, offset))
1248 goto fput_out;
1249 ppos = &pos;
1252 desc.written = 0;
1253 desc.count = count;
1254 desc.buf = (char *) out_file;
1255 desc.error = 0;
1256 do_generic_file_read(in_file, ppos, &desc, file_send_actor);
1258 retval = desc.written;
1259 if (!retval)
1260 retval = desc.error;
1261 if (offset)
1262 put_user(pos, offset);
1265 fput_out:
1266 fput(out_file);
1267 fput_in:
1268 fput(in_file);
1269 out:
1270 return retval;
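
/*
 * Usage sketch (editor's addition, user-space caller): sys_sendfile() reads
 * from in_fd through the page cache and feeds the data to out_fd's write
 * method, so a file-to-file (or file-to-socket) copy can be written as
 *
 *	off_t offset = 0;
 *	ssize_t n = sendfile(out_fd, in_fd, &offset, count);
 *
 * If "offset" is non-NULL the read position is taken from (and written back
 * to) that variable and in_fd's own file position is left untouched;
 * otherwise in_fd->f_pos is used and advanced.
 */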
/*
 * Semantics for shared and private memory areas are different past the end
 * of the file. A shared mapping past the last page of the file is an error
 * and results in a SIGBUS, while a private mapping just maps in a zero page.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 *
 * WSH 06/04/97: fixed a memory leak and moved the allocation of new_page
 * ahead of the wait if we're sure to need it.
 */
1285 static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
1287 struct file * file = area->vm_file;
1288 struct dentry * dentry = file->f_dentry;
1289 struct inode * inode = dentry->d_inode;
1290 unsigned long offset, reada, i;
1291 struct page * page, **hash;
1292 unsigned long old_page, new_page;
1293 int error;
1295 new_page = 0;
1296 offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
1297 if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
1298 goto no_page;
1301 * Do we have something in the page cache already?
1303 hash = page_hash(inode, offset);
1304 retry_find:
1305 page = __find_get_page(inode, offset, hash);
1306 if (!page)
1307 goto no_cached_page;
1309 found_page:
1311 * Ok, found a page in the page cache, now we need to check
1312 * that it's up-to-date. First check whether we'll need an
1313 * extra page -- better to overlap the allocation with the I/O.
1315 if (no_share && !new_page) {
1316 new_page = page_cache_alloc();
1317 if (!new_page)
1318 goto failure;
1321 if (!Page_Uptodate(page)) {
1322 lock_page(page);
1323 if (!Page_Uptodate(page))
1324 goto page_not_uptodate;
1325 UnlockPage(page);
1328 success:
1330 * Found the page and have a reference on it, need to check sharing
1331 * and possibly copy it over to another page..
1333 old_page = page_address(page);
1334 if (!no_share) {
1336 * Ok, we can share the cached page directly.. Get rid
1337 * of any potential extra pages.
1339 if (new_page)
1340 page_cache_free(new_page);
1342 flush_page_to_ram(old_page);
1343 return old_page;
1347 * No sharing ... copy to the new page.
1349 copy_page(new_page, old_page);
1350 flush_page_to_ram(new_page);
1351 page_cache_release(page);
1352 return new_page;
1354 no_cached_page:
1356 * Try to read in an entire cluster at once.
1358 reada = offset;
1359 reada >>= PAGE_CACHE_SHIFT + page_cluster;
1360 reada <<= PAGE_CACHE_SHIFT + page_cluster;
1362 for (i = 1 << page_cluster; i > 0; --i, reada += PAGE_CACHE_SIZE)
1363 new_page = try_to_read_ahead(file, reada, new_page);
1365 if (!new_page)
1366 new_page = page_cache_alloc();
1367 if (!new_page)
1368 goto no_page;
1371 * During getting the above page we might have slept,
1372 * so we need to re-check the situation with the page
1373 * cache.. The page we just got may be useful if we
1374 * can't share, so don't get rid of it here.
1376 page = __find_get_page(inode, offset, hash);
1377 if (page)
1378 goto found_page;
1381 * Now, create a new page-cache page from the page we got
1383 page = page_cache_entry(new_page);
1384 if (add_to_page_cache_unique(page, inode, offset, hash))
1385 goto retry_find;
1388 * Now it's ours and locked, we can do initial IO to it:
1390 new_page = 0;
1392 page_not_uptodate:
1393 error = inode->i_op->readpage(file, page);
1395 if (!error) {
1396 wait_on_page(page);
1397 if (PageError(page))
1398 goto page_read_error;
1399 goto success;
1402 page_read_error:
1404 * Umm, take care of errors if the page isn't up-to-date.
1405 * Try to re-read it _once_. We do this synchronously,
1406 * because there really aren't any performance issues here
1407 * and we need to check for errors.
1409 if (!PageLocked(page))
1410 PAGE_BUG(page);
1411 ClearPageError(page);
1412 error = inode->i_op->readpage(file, page);
1413 if (error)
1414 goto failure;
1415 wait_on_page(page);
1416 if (Page_Uptodate(page))
1417 goto success;
1420 * Things didn't work out. Return zero to tell the
1421 * mm layer so, possibly freeing the page cache page first.
1423 failure:
1424 page_cache_release(page);
1425 if (new_page)
1426 page_cache_free(new_page);
1427 no_page:
1428 return 0;
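
/*
 * Worked example (editor's addition): the "no_cached_page" path above rounds
 * the faulting offset down to a cluster boundary and reads in the whole
 * cluster, i.e. (1 << page_cluster) pages.  Assuming 4K pages and
 * page_cluster = 4, a fault at file offset 0x13000 reads ahead the 64K
 * cluster covering 0x10000-0x1ffff, overlapping the IO for neighbouring
 * pages with the fault that is being serviced.
 */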
1432 * Tries to write a shared mapped page to its backing store. May return -EIO
1433 * if the disk is full.
1435 static inline int do_write_page(struct inode * inode, struct file * file,
1436 const char * page_addr, unsigned long offset)
1438 int retval;
1439 unsigned long size;
1440 int (*writepage) (struct file *, struct page *);
1441 struct page * page;
1443 size = offset + PAGE_SIZE;
1444 /* refuse to extend file size.. */
1445 if (S_ISREG(inode->i_mode)) {
1446 if (size > inode->i_size)
1447 size = inode->i_size;
1448 /* Ho humm.. We should have tested for this earlier */
1449 if (size < offset)
1450 return -EIO;
1452 size -= offset;
1453 retval = -EIO;
1454 writepage = inode->i_op->writepage;
1455 page = mem_map + MAP_NR(page_addr);
1456 lock_page(page);
1458 retval = writepage(file, page);
1460 UnlockPage(page);
1461 return retval;
1464 static int filemap_write_page(struct vm_area_struct * vma,
1465 unsigned long offset,
1466 unsigned long page,
1467 int wait)
1469 int result;
1470 struct file * file;
1471 struct dentry * dentry;
1472 struct inode * inode;
1474 file = vma->vm_file;
1475 dentry = file->f_dentry;
1476 inode = dentry->d_inode;
	/*
	 * If a task terminates while we're swapping the page, the vma and
	 * file could be released ... increment the count to be safe.
	 */
1482 get_file(file);
1483 result = do_write_page(inode, file, (const char *) page, offset);
1484 fput(file);
1485 return result;
1490 * The page cache takes care of races between somebody
1491 * trying to swap something out and swap something in
1492 * at the same time..
1494 extern void wakeup_bdflush(int);
1495 int filemap_swapout(struct vm_area_struct * vma, struct page * page)
1497 int retval = filemap_write_page(vma, page->offset, page_address(page), 0);
1498 wakeup_bdflush(0);
1499 return retval;
1502 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
1503 unsigned long address, unsigned int flags)
1505 pte_t pte = *ptep;
1506 unsigned long pageaddr;
1507 struct page *page;
1508 int error;
1510 if (!(flags & MS_INVALIDATE)) {
1511 if (!pte_present(pte))
1512 return 0;
1513 if (!pte_dirty(pte))
1514 return 0;
1515 flush_page_to_ram(pte_page(pte));
1516 flush_cache_page(vma, address);
1517 set_pte(ptep, pte_mkclean(pte));
1518 flush_tlb_page(vma, address);
1519 pageaddr = pte_page(pte);
1520 page = page_cache_entry(pageaddr);
1521 get_page(page);
1522 } else {
1523 if (pte_none(pte))
1524 return 0;
1525 flush_cache_page(vma, address);
1526 pte_clear(ptep);
1527 flush_tlb_page(vma, address);
1528 if (!pte_present(pte)) {
1529 swap_free(pte_val(pte));
1530 return 0;
1532 pageaddr = pte_page(pte);
1533 if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
1534 page_cache_free(pageaddr);
1535 return 0;
1538 error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, pageaddr, 1);
1539 page_cache_free(pageaddr);
1540 return error;
1543 static inline int filemap_sync_pte_range(pmd_t * pmd,
1544 unsigned long address, unsigned long size,
1545 struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
1547 pte_t * pte;
1548 unsigned long end;
1549 int error;
1551 if (pmd_none(*pmd))
1552 return 0;
1553 if (pmd_bad(*pmd)) {
1554 printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
1555 pmd_clear(pmd);
1556 return 0;
1558 pte = pte_offset(pmd, address);
1559 offset += address & PMD_MASK;
1560 address &= ~PMD_MASK;
1561 end = address + size;
1562 if (end > PMD_SIZE)
1563 end = PMD_SIZE;
1564 error = 0;
1565 do {
1566 error |= filemap_sync_pte(pte, vma, address + offset, flags);
1567 address += PAGE_SIZE;
1568 pte++;
1569 } while (address < end);
1570 return error;
1573 static inline int filemap_sync_pmd_range(pgd_t * pgd,
1574 unsigned long address, unsigned long size,
1575 struct vm_area_struct *vma, unsigned int flags)
1577 pmd_t * pmd;
1578 unsigned long offset, end;
1579 int error;
1581 if (pgd_none(*pgd))
1582 return 0;
1583 if (pgd_bad(*pgd)) {
1584 printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
1585 pgd_clear(pgd);
1586 return 0;
1588 pmd = pmd_offset(pgd, address);
1589 offset = address & PGDIR_MASK;
1590 address &= ~PGDIR_MASK;
1591 end = address + size;
1592 if (end > PGDIR_SIZE)
1593 end = PGDIR_SIZE;
1594 error = 0;
1595 do {
1596 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
1597 address = (address + PMD_SIZE) & PMD_MASK;
1598 pmd++;
1599 } while (address < end);
1600 return error;
1603 static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
1604 size_t size, unsigned int flags)
1606 pgd_t * dir;
1607 unsigned long end = address + size;
1608 int error = 0;
1610 dir = pgd_offset(vma->vm_mm, address);
1611 flush_cache_range(vma->vm_mm, end - size, end);
1612 while (address < end) {
1613 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
1614 address = (address + PGDIR_SIZE) & PGDIR_MASK;
1615 dir++;
1617 flush_tlb_range(vma->vm_mm, end - size, end);
1618 return error;
1622 * This handles (potentially partial) area unmaps..
1624 static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
1626 filemap_sync(vma, start, len, MS_ASYNC);
1630 * Shared mappings need to be able to do the right thing at
1631 * close/unmap/sync. They will also use the private file as
1632 * backing-store for swapping..
1634 static struct vm_operations_struct file_shared_mmap = {
1635 NULL, /* no special open */
1636 NULL, /* no special close */
1637 filemap_unmap, /* unmap - we need to sync the pages */
1638 NULL, /* no special protect */
1639 filemap_sync, /* sync */
1640 NULL, /* advise */
1641 filemap_nopage, /* nopage */
1642 NULL, /* wppage */
1643 filemap_swapout /* swapout */
1647 * Private mappings just need to be able to load in the map.
1649 * (This is actually used for shared mappings as well, if we
1650 * know they can't ever get write permissions..)
1652 static struct vm_operations_struct file_private_mmap = {
1653 NULL, /* open */
1654 NULL, /* close */
1655 NULL, /* unmap */
1656 NULL, /* protect */
1657 NULL, /* sync */
1658 NULL, /* advise */
1659 filemap_nopage, /* nopage */
1660 NULL, /* wppage */
1661 NULL /* swapout */
1664 /* This is used for a general mmap of a disk file */
1666 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1668 struct vm_operations_struct * ops;
1669 struct inode *inode = file->f_dentry->d_inode;
1671 ops = &file_private_mmap;
1672 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1673 if (!inode->i_op || !inode->i_op->writepage)
1674 return -EINVAL;
1675 ops = &file_shared_mmap;
1677 if (!inode->i_sb || !S_ISREG(inode->i_mode))
1678 return -EACCES;
1679 if (!inode->i_op || !inode->i_op->readpage)
1680 return -ENOEXEC;
1681 UPDATE_ATIME(inode);
1682 vma->vm_ops = ops;
1683 return 0;
1688 * The msync() system call.
1691 static int msync_interval(struct vm_area_struct * vma,
1692 unsigned long start, unsigned long end, int flags)
1694 if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
1695 int error;
1696 error = vma->vm_ops->sync(vma, start, end-start, flags);
1697 if (!error && (flags & MS_SYNC)) {
1698 struct file * file = vma->vm_file;
1699 if (file) {
1700 struct dentry * dentry = file->f_dentry;
1701 error = file_fsync(file, dentry);
1704 return error;
1706 return 0;
1709 asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
1711 unsigned long end;
1712 struct vm_area_struct * vma;
1713 int unmapped_error, error = -EINVAL;
1715 down(&current->mm->mmap_sem);
1716 lock_kernel();
1717 if (start & ~PAGE_MASK)
1718 goto out;
1719 len = (len + ~PAGE_MASK) & PAGE_MASK;
1720 end = start + len;
1721 if (end < start)
1722 goto out;
1723 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1724 goto out;
1725 error = 0;
1726 if (end == start)
1727 goto out;
1729 * If the interval [start,end) covers some unmapped address ranges,
1730 * just ignore them, but return -EFAULT at the end.
1732 vma = find_vma(current->mm, start);
1733 unmapped_error = 0;
1734 for (;;) {
1735 /* Still start < end. */
1736 error = -EFAULT;
1737 if (!vma)
1738 goto out;
1739 /* Here start < vma->vm_end. */
1740 if (start < vma->vm_start) {
1741 unmapped_error = -EFAULT;
1742 start = vma->vm_start;
1744 /* Here vma->vm_start <= start < vma->vm_end. */
1745 if (end <= vma->vm_end) {
1746 if (start < end) {
1747 error = msync_interval(vma, start, end, flags);
1748 if (error)
1749 goto out;
1751 error = unmapped_error;
1752 goto out;
1754 /* Here vma->vm_start <= start < vma->vm_end < end. */
1755 error = msync_interval(vma, start, vma->vm_end, flags);
1756 if (error)
1757 goto out;
1758 start = vma->vm_end;
1759 vma = vma->vm_next;
1761 out:
1762 unlock_kernel();
1763 up(&current->mm->mmap_sem);
1764 return error;
/*
 * Write to a file through the page cache. This is mainly for the
 * benefit of NFS and possibly other network-based file systems.
 *
 * We currently put everything into the page cache prior to writing it.
 * This is not a problem when writing full pages. With partial pages,
 * however, we first have to read the data into the cache, then
 * dirty the page, and finally schedule it for writing. Alternatively, we
 * could write-through just the portion of data that would go into that
 * page, but that would kill performance for applications that write data
 * line by line, and it's prone to race conditions.
 *
 * Note that this routine doesn't try to keep track of dirty pages. Each
 * file system has to do this all by itself, unfortunately.
 *					okir@monad.swb.de
 */
1783 ssize_t
1784 generic_file_write(struct file *file, const char *buf,
1785 size_t count, loff_t *ppos,
1786 writepage_t write_one_page)
1788 struct dentry *dentry = file->f_dentry;
1789 struct inode *inode = dentry->d_inode;
1790 unsigned long pos = *ppos;
1791 unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
1792 struct page *page, **hash;
1793 unsigned long page_cache = 0;
1794 unsigned long written;
1795 long status;
1796 int err;
1798 err = file->f_error;
1799 if (err) {
1800 file->f_error = 0;
1801 goto out;
1804 written = 0;
1806 if (file->f_flags & O_APPEND)
1807 pos = inode->i_size;
1810 * Check whether we've reached the file size limit.
1812 err = -EFBIG;
1813 if (pos >= limit) {
1814 send_sig(SIGXFSZ, current, 0);
1815 goto out;
1818 status = 0;
1820 * Check whether to truncate the write,
1821 * and send the signal if we do.
1823 if (count > limit - pos) {
1824 send_sig(SIGXFSZ, current, 0);
1825 count = limit - pos;
1828 while (count) {
1829 unsigned long bytes, pgpos, offset;
1831 * Try to find the page in the cache. If it isn't there,
1832 * allocate a free page.
1834 offset = (pos & ~PAGE_CACHE_MASK);
1835 pgpos = pos & PAGE_CACHE_MASK;
1836 bytes = PAGE_CACHE_SIZE - offset;
1837 if (bytes > count)
1838 bytes = count;
1840 hash = page_hash(inode, pgpos);
1841 repeat_find:
1842 page = __find_lock_page(inode, pgpos, hash);
1843 if (!page) {
1844 if (!page_cache) {
1845 page_cache = page_cache_alloc();
1846 if (page_cache)
1847 goto repeat_find;
1848 status = -ENOMEM;
1849 break;
1851 page = page_cache_entry(page_cache);
1852 if (add_to_page_cache_unique(page,inode,pgpos,hash))
1853 goto repeat_find;
1855 page_cache = 0;
1858 /* We have exclusive IO access to the page.. */
1859 if (!PageLocked(page)) {
1860 PAGE_BUG(page);
1861 } else {
1862 if (page->owner != current) {
1863 PAGE_BUG(page);
1867 status = write_one_page(file, page, offset, bytes, buf);
1869 /* Mark it unlocked again and drop the page.. */
1870 UnlockPage(page);
1871 page_cache_release(page);
1873 if (status < 0)
1874 break;
1876 written += status;
1877 count -= status;
1878 pos += status;
1879 buf += status;
1881 *ppos = pos;
1882 if (pos > inode->i_size)
1883 inode->i_size = pos;
1885 if (page_cache)
1886 page_cache_free(page_cache);
1888 err = written ? written : status;
1889 out:
1890 return err;
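
/*
 * Illustrative note (editor's addition): the write_one_page callback supplied
 * by the filesystem is invoked as
 *
 *	status = write_one_page(file, page, offset, bytes, buf);
 *
 * with the page locked and owned by this writer, "offset"/"bytes" confined to
 * that single page, and "buf" pointing at the user data.  It is expected to
 * return the number of bytes written into the page or a negative errno;
 * generic_file_write() accumulates positive returns and stops on the first
 * negative status.
 */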
1894 * Support routines for directory caching using the page cache.
1898 * Unlock and free a page.
1900 void put_cached_page(unsigned long addr)
1902 struct page * page = page_cache_entry(addr);
1904 UnlockPage(page);
1905 if (page_count(page) != 2)
1906 panic("put_cached_page: page count=%d\n",
1907 page_count(page));
1908 page_cache_release(page);
1911 void __init page_cache_init(unsigned long memory_size)
1913 unsigned long htable_size, order;
1915 htable_size = memory_size >> PAGE_SHIFT;
1916 htable_size *= sizeof(struct page *);
1917 for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
1920 do {
1921 unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
1923 page_hash_bits = 0;
1924 while((tmp >>= 1UL) != 0UL)
1925 page_hash_bits++;
1927 page_hash_table = (struct page **)
1928 __get_free_pages(GFP_ATOMIC, order);
1929 } while(page_hash_table == NULL && --order > 0);
1931 printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
1932 (1 << page_hash_bits), order, (PAGE_SIZE << order));
1933 if (!page_hash_table)
1934 panic("Failed to allocate page hash table\n");
1935 memset(page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
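
/*
 * Worked example (editor's addition): on a machine with 64MB of memory and
 * 4K pages, memory_size >> PAGE_SHIFT = 16384 potential pages, so the loop
 * above asks for 16384 * sizeof(struct page *) = 64K of hash table (with
 * 4-byte pointers), i.e. order 4 (16 pages).  That yields 16384 hash buckets
 * and page_hash_bits = 14, roughly one bucket per physical page, falling
 * back to smaller orders only if the atomic allocation fails.
 */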