1 /*
2 * linux/mm/filemap.c
4 * Copyright (C) 1994-1999 Linus Torvalds
5 */
7 /*
8 * This file handles the generic file mmap semantics used by
9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem used to do this differently, for example)
12 #include <linux/malloc.h>
13 #include <linux/shm.h>
14 #include <linux/mman.h>
15 #include <linux/locks.h>
16 #include <linux/pagemap.h>
17 #include <linux/swap.h>
18 #include <linux/smp_lock.h>
19 #include <linux/blkdev.h>
20 #include <linux/file.h>
21 #include <linux/swapctl.h>
22 #include <linux/slab.h>
23 #include <linux/init.h>
25 #include <asm/pgtable.h>
26 #include <asm/uaccess.h>
29 * Shared mappings implemented 30.11.1994. It's not fully working yet,
30 * though.
32 * Shared mappings now work. 15.8.1995 Bruno.
34 * finished 'unifying' the page and buffer cache and SMP-threaded the
35 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
38 atomic_t page_cache_size = ATOMIC_INIT(0);
39 unsigned int page_hash_bits;
40 struct page **page_hash_table;
42 spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
45 void __add_page_to_hash_queue(struct page * page, struct page **p)
47 atomic_inc(&page_cache_size);
48 if((page->next_hash = *p) != NULL)
49 (*p)->pprev_hash = &page->next_hash;
50 *p = page;
51 page->pprev_hash = p;
52 if (page->buffers)
53 PAGE_BUG(page);
56 static void remove_page_from_hash_queue(struct page * page)
58 if(page->pprev_hash) {
59 if(page->next_hash)
60 page->next_hash->pprev_hash = page->pprev_hash;
61 *page->pprev_hash = page->next_hash;
62 page->pprev_hash = NULL;
64 atomic_dec(&page_cache_size);
67 static void remove_page_from_inode_queue(struct page * page)
69 struct inode * inode = page->inode;
70 struct page *prev, *next;
72 inode->i_nrpages--;
73 next = page->next;
74 prev = page->prev;
75 if (inode->i_pages == page)
76 inode->i_pages = next;
77 if (next)
78 next->prev = prev;
79 if (prev)
80 prev->next = next;
81 page->next = NULL;
82 page->prev = NULL;
86 * Remove a page from the page cache and free it. Caller has to make
87 * sure the page is locked and that nobody else uses it - or that usage
88 * is safe.
90 void remove_inode_page(struct page *page)
92 if (!PageLocked(page))
93 PAGE_BUG(page);
95 spin_lock(&pagecache_lock);
96 remove_page_from_inode_queue(page);
97 remove_page_from_hash_queue(page);
98 page->inode = NULL;
99 spin_unlock(&pagecache_lock);
102 void invalidate_inode_pages(struct inode * inode)
104 struct page ** p;
105 struct page * page;
107 repeat:
108 spin_lock(&pagecache_lock);
109 p = &inode->i_pages;
110 while ((page = *p) != NULL) {
111 get_page(page);
112 if (TryLockPage(page)) {
113 spin_unlock(&pagecache_lock);
114 wait_on_page(page);
115 page_cache_release(page);
116 goto repeat;
118 if (page_count(page) != 2)
119 printk("hm, busy page invalidated? (not necesserily a bug)\n");
121 remove_page_from_inode_queue(page);
122 remove_page_from_hash_queue(page);
123 page->inode = NULL;
124 UnlockPage(page);
125 page_cache_release(page);
126 page_cache_release(page);
129 spin_unlock(&pagecache_lock);
132 * Truncate the page cache at a set offset, removing the pages
133 * that are beyond that offset (and zeroing out partial pages).
135 void truncate_inode_pages(struct inode * inode, unsigned long start)
137 struct page ** p;
138 struct page * page;
139 int partial = 0;
141 repeat:
142 spin_lock(&pagecache_lock);
143 p = &inode->i_pages;
144 while ((page = *p) != NULL) {
145 unsigned long offset = page->offset;
147 /* page wholly truncated - free it */
148 if (offset >= start) {
149 get_page(page);
150 spin_unlock(&pagecache_lock);
152 lock_page(page);
154 if (inode->i_op->flushpage)
155 inode->i_op->flushpage(inode, page, 0);
158 * We remove the page from the page cache
159 * _after_ we have destroyed all buffer-cache
160 * references to it. Otherwise some other process
161 * might think this inode page is not in the
162 * page cache and creates a buffer-cache alias
163 * to it causing all sorts of fun problems ...
165 remove_inode_page(page);
167 UnlockPage(page);
168 page_cache_release(page);
169 page_cache_release(page);
172 * We have done things without the pagecache lock,
173 * so we'll have to repeat the scan.
174 * It's not possible to deadlock here because
175 * we are guaranteed to make progress. (ie. we have
176 * just removed a page)
178 goto repeat;
180 p = &page->next;
182 * there is only one partial page possible.
184 if (partial)
185 continue;
187 offset = start - offset;
188 /* partial truncate, clear end of page */
189 if (offset < PAGE_CACHE_SIZE) {
190 unsigned long address;
191 get_page(page);
192 spin_unlock(&pagecache_lock);
194 lock_page(page);
195 partial = 1;
197 address = page_address(page);
198 memset((void *) (offset + address), 0, PAGE_CACHE_SIZE - offset);
199 flush_page_to_ram(address);
201 if (inode->i_op->flushpage)
202 inode->i_op->flushpage(inode, page, offset);
204 * we have dropped the spinlock so we have to
205 * restart.
207 UnlockPage(page);
208 page_cache_release(page);
209 goto repeat;
212 spin_unlock(&pagecache_lock);
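/*
 * Typical call-site sketch (hypothetical, illustration only): a truncate
 * path shrinks i_size first and then drops the now-stale cached pages,
 * relying on the partial-page zeroing above for the tail, e.g.
 *
 *	inode->i_size = newsize;
 *	truncate_inode_pages(inode, newsize);
 */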
215 extern atomic_t too_many_dirty_buffers;
217 int shrink_mmap(int priority, int gfp_mask)
219 static unsigned long clock = 0;
220 unsigned long limit = num_physpages << 1;
221 struct page * page;
222 int count, users;
224 count = limit >> priority;
226 page = mem_map + clock;
227 do {
228 int referenced;
230 /* This works even in the presence of PageSkip because
231 * the first two entries at the beginning of a hole will
232 * be marked, not just the first.
234 page++;
235 clock++;
236 if (clock >= max_mapnr) {
237 clock = 0;
238 page = mem_map;
240 if (PageSkip(page)) {
241 /* next_hash is overloaded for PageSkip */
242 page = page->next_hash;
243 clock = page - mem_map;
246 referenced = test_and_clear_bit(PG_referenced, &page->flags);
248 if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
249 continue;
251 count--;
254 * Some common cases that we just short-circuit without
255 * getting the locks - we need to re-check this once we
256 * have the lock, but that's fine.
258 users = page_count(page);
259 if (!users)
260 continue;
261 if (!page->buffers) {
262 if (!page->inode)
263 continue;
264 if (users > 1)
265 continue;
269 * ok, now the page looks interesting. Re-check things
270 * and keep the lock.
272 spin_lock(&pagecache_lock);
273 if (!page->inode && !page->buffers) {
274 spin_unlock(&pagecache_lock);
275 continue;
277 if (!page_count(page)) {
278 spin_unlock(&pagecache_lock);
279 BUG();
280 continue;
282 get_page(page);
283 if (TryLockPage(page)) {
284 spin_unlock(&pagecache_lock);
285 goto put_continue;
289 * we keep pagecache_lock locked and unlock it in
 290  * each branch, so that the page->inode case doesn't
291 * have to re-grab it. Here comes the 'real' logic
292 * to free memory:
295 /* Is it a buffer page? */
296 if (page->buffers) {
297 int mem = page->inode ? 0 : PAGE_CACHE_SIZE;
298 spin_unlock(&pagecache_lock);
299 if (!try_to_free_buffers(page))
300 goto unlock_continue;
301 atomic_sub(mem, &buffermem);
302 spin_lock(&pagecache_lock);
306 * We can't free pages unless there's just one user
307 * (count == 2 because we added one ourselves above).
309 if (page_count(page) != 2)
310 goto spin_unlock_continue;
 313  * Is it a swap-cache page? If so, we want to
314 * drop it if it is no longer used, even if it
315 * were to be marked referenced..
317 if (PageSwapCache(page)) {
318 spin_unlock(&pagecache_lock);
319 if (referenced && swap_count(page->offset) != 2)
320 goto unlock_continue;
321 __delete_from_swap_cache(page);
322 page_cache_release(page);
323 goto made_progress;
326 /* is it a page-cache page? */
327 if (!referenced && page->inode && !pgcache_under_min()) {
328 remove_page_from_inode_queue(page);
329 remove_page_from_hash_queue(page);
330 page->inode = NULL;
331 spin_unlock(&pagecache_lock);
333 page_cache_release(page);
334 goto made_progress;
336 spin_unlock_continue:
337 spin_unlock(&pagecache_lock);
338 unlock_continue:
339 UnlockPage(page);
340 put_continue:
341 put_page(page);
342 } while (count > 0);
343 return 0;
344 made_progress:
345 UnlockPage(page);
346 put_page(page);
347 return 1;
350 static inline struct page * __find_page_nolock(struct inode * inode, unsigned long offset, struct page *page)
352 goto inside;
354 for (;;) {
355 page = page->next_hash;
356 inside:
357 if (!page)
358 goto not_found;
359 if (page->inode != inode)
360 continue;
361 if (page->offset == offset)
362 break;
364 set_bit(PG_referenced, &page->flags);
365 not_found:
366 return page;
370 * By the time this is called, the page is locked and
371 * we don't have to worry about any races any more.
373 * Start the IO..
375 static int writeout_one_page(struct page *page)
377 struct buffer_head *bh, *head = page->buffers;
379 bh = head;
380 do {
381 if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
382 continue;
384 bh->b_flushtime = 0;
385 ll_rw_block(WRITE, 1, &bh);
386 } while ((bh = bh->b_this_page) != head);
387 return 0;
390 static int waitfor_one_page(struct page *page)
392 int error = 0;
393 struct buffer_head *bh, *head = page->buffers;
395 bh = head;
396 do {
397 wait_on_buffer(bh);
398 if (buffer_req(bh) && !buffer_uptodate(bh))
399 error = -EIO;
400 } while ((bh = bh->b_this_page) != head);
401 return error;
404 static int do_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end, int (*fn)(struct page *))
406 struct page *next;
407 int retval = 0;
409 start &= PAGE_MASK;
411 spin_lock(&pagecache_lock);
412 next = inode->i_pages;
413 while (next) {
414 struct page *page = next;
415 next = page->next;
416 if (!page->buffers)
417 continue;
418 if (page->offset >= end)
419 continue;
420 if (page->offset < start)
421 continue;
423 get_page(page);
424 spin_unlock(&pagecache_lock);
425 lock_page(page);
427 /* The buffers could have been free'd while we waited for the page lock */
428 if (page->buffers)
429 retval |= fn(page);
431 UnlockPage(page);
432 spin_lock(&pagecache_lock);
433 next = page->next;
434 page_cache_release(page);
436 spin_unlock(&pagecache_lock);
438 return retval;
442 * Two-stage data sync: first start the IO, then go back and
443 * collect the information..
445 int generic_buffer_fdatasync(struct inode *inode, unsigned long start, unsigned long end)
447 int retval;
449 retval = do_buffer_fdatasync(inode, start, end, writeout_one_page);
450 retval |= do_buffer_fdatasync(inode, start, end, waitfor_one_page);
451 return retval;
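/*
 * Hypothetical usage sketch (not taken from any particular filesystem):
 * a filesystem's fdatasync path could flush a byte range of an inode's
 * cached buffers with a call like the one below.
 */
#if 0
static int example_fdatasync_range(struct inode *inode,
	unsigned long start, unsigned long end)
{
	/* start the writes, then wait for them and collect any IO errors */
	return generic_buffer_fdatasync(inode, start, end);
}
#endif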
455 * This adds a page to the page cache, starting out as locked,
456 * owned by us, referenced, but not uptodate and with no errors.
458 static inline void __add_to_page_cache(struct page * page,
459 struct inode * inode, unsigned long offset,
460 struct page **hash)
462 unsigned long flags;
464 flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error));
465 page->flags = flags | ((1 << PG_locked) | (1 << PG_referenced));
466 page->owner = current; /* REMOVEME */
467 get_page(page);
468 page->offset = offset;
469 add_page_to_inode_queue(inode, page);
470 __add_page_to_hash_queue(page, hash);
473 void add_to_page_cache(struct page * page, struct inode * inode, unsigned long offset)
475 spin_lock(&pagecache_lock);
476 __add_to_page_cache(page, inode, offset, page_hash(inode, offset));
477 spin_unlock(&pagecache_lock);
480 int add_to_page_cache_unique(struct page * page,
481 struct inode * inode, unsigned long offset,
482 struct page **hash)
484 int err;
485 struct page *alias;
487 spin_lock(&pagecache_lock);
488 alias = __find_page_nolock(inode, offset, *hash);
490 err = 1;
491 if (!alias) {
492 __add_to_page_cache(page,inode,offset,hash);
493 err = 0;
496 spin_unlock(&pagecache_lock);
497 return err;
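/*
 * Minimal sketch of the intended calling pattern (hypothetical helper,
 * illustration only): allocate a page, try to insert it uniquely, and
 * fall back to whatever page somebody else inserted meanwhile.
 * try_to_read_ahead() below follows the same pattern.
 */
#if 0
static struct page * example_grab_cache_page(struct inode *inode,
	unsigned long offset)
{
	struct page **hash = page_hash(inode, offset);
	unsigned long page_cache = page_cache_alloc();

	if (!page_cache)
		return NULL;
	if (add_to_page_cache_unique(page_cache_entry(page_cache),
				     inode, offset, hash)) {
		/* somebody else added a page at this offset first */
		page_cache_free(page_cache);
		return __find_get_page(inode, offset, hash);
	}
	/* we own the new page: it is locked and referenced */
	return page_cache_entry(page_cache);
}
#endif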
501 * Try to read ahead in the file. "page_cache" is a potentially free page
502 * that we could use for the cache (if it is 0 we can try to create one,
503 * this is all overlapped with the IO on the previous page finishing anyway)
505 static unsigned long try_to_read_ahead(struct file * file,
506 unsigned long offset, unsigned long page_cache)
508 struct inode *inode = file->f_dentry->d_inode;
509 struct page * page;
510 struct page ** hash;
512 offset &= PAGE_CACHE_MASK;
513 switch (page_cache) {
514 case 0:
515 page_cache = page_cache_alloc();
516 if (!page_cache)
517 break;
518 default:
519 if (offset >= inode->i_size)
520 break;
521 hash = page_hash(inode, offset);
522 page = page_cache_entry(page_cache);
523 if (!add_to_page_cache_unique(page, inode, offset, hash)) {
525 * We do not have to check the return value here
526 * because it's a readahead.
528 inode->i_op->readpage(file, page);
529 page_cache = 0;
530 page_cache_release(page);
533 return page_cache;
537 * Wait for a page to get unlocked.
539 * This must be called with the caller "holding" the page,
540 * ie with increased "page->count" so that the page won't
541 * go away during the wait..
543 void ___wait_on_page(struct page *page)
545 struct task_struct *tsk = current;
546 DECLARE_WAITQUEUE(wait, tsk);
548 add_wait_queue(&page->wait, &wait);
549 do {
550 tsk->state = TASK_UNINTERRUPTIBLE;
551 run_task_queue(&tq_disk);
552 if (!PageLocked(page))
553 break;
554 schedule();
555 } while (PageLocked(page));
556 tsk->state = TASK_RUNNING;
557 remove_wait_queue(&page->wait, &wait);
561 * Get an exclusive lock on the page..
563 void lock_page(struct page *page)
565 if (TryLockPage(page)) {
566 struct task_struct *tsk = current;
567 DECLARE_WAITQUEUE(wait, current);
569 run_task_queue(&tq_disk);
570 add_wait_queue(&page->wait, &wait);
571 tsk->state = TASK_UNINTERRUPTIBLE;
573 while (TryLockPage(page)) {
574 run_task_queue(&tq_disk);
575 schedule();
576 tsk->state = TASK_UNINTERRUPTIBLE;
579 remove_wait_queue(&page->wait, &wait);
580 tsk->state = TASK_RUNNING;
586 * a rather lightweight function, finding and getting a reference to a
587 * hashed page atomically, waiting for it if it's locked.
589 struct page * __find_get_page (struct inode * inode,
590 unsigned long offset, struct page **hash)
592 struct page *page;
595 * We scan the hash list read-only. Addition to and removal from
596 * the hash-list needs a held write-lock.
598 repeat:
599 spin_lock(&pagecache_lock);
600 page = __find_page_nolock(inode, offset, *hash);
601 if (page)
602 get_page(page);
603 spin_unlock(&pagecache_lock);
605 /* Found the page, sleep if locked. */
606 if (page && PageLocked(page)) {
607 struct task_struct *tsk = current;
608 DECLARE_WAITQUEUE(wait, tsk);
610 add_wait_queue(&page->wait, &wait);
611 tsk->state = TASK_UNINTERRUPTIBLE;
613 run_task_queue(&tq_disk);
614 if (PageLocked(page))
615 schedule();
616 tsk->state = TASK_RUNNING;
617 remove_wait_queue(&page->wait, &wait);
620 * The page might have been unhashed meanwhile. It's
621 * not freed though because we hold a reference to it.
622 * If this is the case then it will be freed _here_,
623 * and we recheck the hash anyway.
625 page_cache_release(page);
626 goto repeat;
629 * It's not locked so we can return the page and we hold
630 * a reference to it.
632 return page;
636 * Get the lock to a page atomically.
638 struct page * __find_lock_page (struct inode * inode,
639 unsigned long offset, struct page **hash)
641 struct page *page;
644 * We scan the hash list read-only. Addition to and removal from
645 * the hash-list needs a held write-lock.
647 repeat:
648 spin_lock(&pagecache_lock);
649 page = __find_page_nolock(inode, offset, *hash);
650 if (page)
651 get_page(page);
652 spin_unlock(&pagecache_lock);
654 /* Found the page, sleep if locked. */
655 if (page && TryLockPage(page)) {
656 struct task_struct *tsk = current;
657 DECLARE_WAITQUEUE(wait, tsk);
659 add_wait_queue(&page->wait, &wait);
660 tsk->state = TASK_UNINTERRUPTIBLE;
662 run_task_queue(&tq_disk);
663 if (PageLocked(page))
664 schedule();
665 tsk->state = TASK_RUNNING;
666 remove_wait_queue(&page->wait, &wait);
669 * The page might have been unhashed meanwhile. It's
670 * not freed though because we hold a reference to it.
671 * If this is the case then it will be freed _here_,
672 * and we recheck the hash anyway.
674 page_cache_release(page);
675 goto repeat;
678 * It's not locked so we can return the page and we hold
679 * a reference to it.
681 return page;
684 #if 0
685 #define PROFILE_READAHEAD
686 #define DEBUG_READAHEAD
687 #endif
690 * Read-ahead profiling information
691 * --------------------------------
692 * Every PROFILE_MAXREADCOUNT, the following information is written
693 * to the syslog:
694 * Percentage of asynchronous read-ahead.
 695  *  Average values of the read-ahead context fields.
696 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
697 * to the syslog.
700 #ifdef PROFILE_READAHEAD
702 #define PROFILE_MAXREADCOUNT 1000
704 static unsigned long total_reada;
705 static unsigned long total_async;
706 static unsigned long total_ramax;
707 static unsigned long total_ralen;
708 static unsigned long total_rawin;
710 static void profile_readahead(int async, struct file *filp)
712 unsigned long flags;
714 ++total_reada;
715 if (async)
716 ++total_async;
718 total_ramax += filp->f_ramax;
719 total_ralen += filp->f_ralen;
720 total_rawin += filp->f_rawin;
722 if (total_reada > PROFILE_MAXREADCOUNT) {
723 save_flags(flags);
724 cli();
725 if (!(total_reada > PROFILE_MAXREADCOUNT)) {
726 restore_flags(flags);
727 return;
730 printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
731 total_ramax/total_reada,
732 total_ralen/total_reada,
733 total_rawin/total_reada,
734 (total_async*100)/total_reada);
735 #ifdef DEBUG_READAHEAD
736 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
737 filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
738 #endif
740 total_reada = 0;
741 total_async = 0;
742 total_ramax = 0;
743 total_ralen = 0;
744 total_rawin = 0;
746 restore_flags(flags);
749 #endif /* defined PROFILE_READAHEAD */
752 * Read-ahead context:
753 * -------------------
754 * The read ahead context fields of the "struct file" are the following:
755 * - f_raend : position of the first byte after the last page we tried to
756 * read ahead.
757 * - f_ramax : current read-ahead maximum size.
758 * - f_ralen : length of the current IO read block we tried to read-ahead.
759 * - f_rawin : length of the current read-ahead window.
760 * if last read-ahead was synchronous then
761 * f_rawin = f_ralen
762 * otherwise (was asynchronous)
763 * f_rawin = previous value of f_ralen + f_ralen
765 * Read-ahead limits:
766 * ------------------
 767  * MIN_READAHEAD   : minimum read-ahead size when reading ahead.
 768  * MAX_READAHEAD   : maximum read-ahead size when reading ahead.
770 * Synchronous read-ahead benefits:
771 * --------------------------------
 772  * Using a reasonable IO transfer length from peripheral devices increases
 773  * system performance.
774 * Reasonable means, in this context, not too large but not too small.
775 * The actual maximum value is:
 776  * MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined,
 777  * and 32k if it is defined (4K page size assumed).
779 * Asynchronous read-ahead benefits:
780 * ---------------------------------
 781  * Overlapping the next read request with user process execution increases
 782  * system performance.
784 * Read-ahead risks:
785 * -----------------
 786  * We have to guess which data the user process will need next.
 787  * If that data is often not really needed, it's bad for system
 788  * performance.
 789  * However, we know that application programs often access files
 790  * sequentially, so it seems possible to have a good strategy for
 791  * that guessing.
 792  * We only try to read ahead in files that seem to be read sequentially.
794 * Asynchronous read-ahead risks:
795 * ------------------------------
796 * In order to maximize overlapping, we must start some asynchronous read
797 * request from the device, as soon as possible.
798 * We must be very careful about:
799 * - The number of effective pending IO read requests.
800 * ONE seems to be the only reasonable value.
801 * - The total memory pool usage for the file access stream.
802 * This maximum memory usage is implicitly 2 IO read chunks:
803 * 2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
804 * 64k if defined (4K page size assumed).
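/*
 * Worked example of the bookkeeping described above (4K page size assumed):
 * a first read at offset 0 with f_ramax = 8k and the current page still
 * locked reads ahead the two following pages, leaving f_ralen = 12k,
 * f_rawin = 12k, f_raend = 12k and f_ramax doubled to 16k.  A later
 * asynchronous read-ahead then sets f_rawin to the previous f_ralen plus
 * the new f_ralen, as stated in the field description above.
 */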
807 static inline int get_max_readahead(struct inode * inode)
809 if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
810 return MAX_READAHEAD;
811 return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
814 static inline unsigned long generic_file_readahead(int reada_ok,
815 struct file * filp, struct inode * inode,
816 unsigned long ppos, struct page * page, unsigned long page_cache)
818 unsigned long max_ahead, ahead;
819 unsigned long raend;
820 int max_readahead = get_max_readahead(inode);
822 raend = filp->f_raend & PAGE_CACHE_MASK;
823 max_ahead = 0;
826 * The current page is locked.
827 * If the current position is inside the previous read IO request, do not
828 * try to reread previously read ahead pages.
 829  * Otherwise decide whether or not to read ahead some pages synchronously.
830 * If we are not going to read ahead, set the read ahead context for this
831 * page only.
833 if (PageLocked(page)) {
834 if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
835 raend = ppos;
836 if (raend < inode->i_size)
837 max_ahead = filp->f_ramax;
838 filp->f_rawin = 0;
839 filp->f_ralen = PAGE_CACHE_SIZE;
840 if (!max_ahead) {
841 filp->f_raend = ppos + filp->f_ralen;
842 filp->f_rawin += filp->f_ralen;
847 * The current page is not locked.
 848  * If we were reading ahead, and
 849  * the current max read-ahead size is not zero, and
 850  * the current position is inside the last read-ahead IO request,
 851  * then it is the moment to try to read ahead asynchronously.
 852  * We will later force an unplug of the device in order to force asynchronous read IO.
854 else if (reada_ok && filp->f_ramax && raend >= PAGE_CACHE_SIZE &&
855 ppos <= raend && ppos + filp->f_ralen >= raend) {
857 * Add ONE page to max_ahead in order to try to have about the same IO max size
858 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
859 * Compute the position of the last page we have tried to read in order to
860 * begin to read ahead just at the next page.
862 raend -= PAGE_CACHE_SIZE;
863 if (raend < inode->i_size)
864 max_ahead = filp->f_ramax + PAGE_CACHE_SIZE;
866 if (max_ahead) {
867 filp->f_rawin = filp->f_ralen;
868 filp->f_ralen = 0;
869 reada_ok = 2;
873 * Try to read ahead pages.
 874  * We hope that ll_rw_blk() plug/unplug, request coalescing and sorting, and
 875  * the scheduler will do well enough for us to avoid overly bad actual IO requests.
877 ahead = 0;
878 while (ahead < max_ahead) {
879 ahead += PAGE_CACHE_SIZE;
880 page_cache = try_to_read_ahead(filp, raend + ahead,
881 page_cache);
 884  * If we tried to read ahead some pages,
 885  * and if we tried to read ahead asynchronously,
 886  * try to force an unplug of the device in order to start an asynchronous
 887  * read IO request.
 888  * Update the read-ahead context:
 889  * store the length of the current read-ahead window and
 890  * double the current max read-ahead size.
 891  * That heuristic avoids doing large IO for files that are not really
 892  * accessed sequentially.
894 if (ahead) {
895 if (reada_ok == 2) {
896 run_task_queue(&tq_disk);
899 filp->f_ralen += ahead;
900 filp->f_rawin += filp->f_ralen;
901 filp->f_raend = raend + ahead + PAGE_CACHE_SIZE;
903 filp->f_ramax += filp->f_ramax;
905 if (filp->f_ramax > max_readahead)
906 filp->f_ramax = max_readahead;
908 #ifdef PROFILE_READAHEAD
909 profile_readahead((reada_ok == 2), filp);
910 #endif
913 return page_cache;
917 * "descriptor" for what we're up to with a read.
918 * This allows us to use the same read code yet
919 * have multiple different users of the data that
920 * we read from a file.
922 * The simplest case just copies the data to user
923 * mode.
925 typedef struct {
926 size_t written;
927 size_t count;
928 char * buf;
929 int error;
930 } read_descriptor_t;
932 typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long);
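/*
 * Hypothetical example of a read actor (illustration only): consume the
 * data without copying it anywhere, e.g. to implement a cache-warming or
 * "skip bytes" operation on top of do_generic_file_read().
 */
#if 0
static int example_discard_actor(read_descriptor_t * desc, const char *area,
	unsigned long size)
{
	if (size > desc->count)
		size = desc->count;
	/* account for the bytes as used, but don't copy them anywhere */
	desc->count -= size;
	desc->written += size;
	return size;
}
#endif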
935 * This is a generic file read routine, and uses the
936 * inode->i_op->readpage() function for the actual low-level
937 * stuff.
939 * This is really ugly. But the goto's actually try to clarify some
940 * of the logic when it comes to error handling etc.
942 static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
944 struct dentry *dentry = filp->f_dentry;
945 struct inode *inode = dentry->d_inode;
946 size_t pos, pgpos, page_cache;
947 int reada_ok;
948 int error;
949 int max_readahead = get_max_readahead(inode);
951 page_cache = 0;
953 pos = *ppos;
954 pgpos = pos & PAGE_CACHE_MASK;
956 * If the current position is outside the previous read-ahead window,
957 * we reset the current read-ahead context and set read ahead max to zero
 958  * (it will be set to just the needed value later),
959 * otherwise, we assume that the file accesses are sequential enough to
960 * continue read-ahead.
962 if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) {
963 reada_ok = 0;
964 filp->f_raend = 0;
965 filp->f_ralen = 0;
966 filp->f_ramax = 0;
967 filp->f_rawin = 0;
968 } else {
969 reada_ok = 1;
972 * Adjust the current value of read-ahead max.
 973  * If the read operation stays within the first half page, force no readahead.
 974  * Otherwise try to increase the read-ahead max just enough to satisfy the read request.
 975  * Then use at least MIN_READAHEAD if read-ahead is ok,
 976  * and at most MAX_READAHEAD in all cases.
978 if (pos + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
979 filp->f_ramax = 0;
980 } else {
981 unsigned long needed;
983 needed = ((pos + desc->count) & PAGE_CACHE_MASK) - pgpos;
985 if (filp->f_ramax < needed)
986 filp->f_ramax = needed;
988 if (reada_ok && filp->f_ramax < MIN_READAHEAD)
989 filp->f_ramax = MIN_READAHEAD;
990 if (filp->f_ramax > max_readahead)
991 filp->f_ramax = max_readahead;
994 for (;;) {
995 struct page *page, **hash;
997 if (pos >= inode->i_size)
998 break;
1001 * Try to find the data in the page cache..
1003 hash = page_hash(inode, pos & PAGE_CACHE_MASK);
1005 spin_lock(&pagecache_lock);
1006 page = __find_page_nolock(inode, pos & PAGE_CACHE_MASK, *hash);
1007 if (!page)
1008 goto no_cached_page;
1009 found_page:
1010 get_page(page);
1011 spin_unlock(&pagecache_lock);
1013 if (!Page_Uptodate(page))
1014 goto page_not_up_to_date;
1015 page_ok:
1017 * Ok, we have the page, and it's up-to-date, so
1018 * now we can copy it to user space...
1021 unsigned long offset, nr;
1023 offset = pos & ~PAGE_CACHE_MASK;
1024 nr = PAGE_CACHE_SIZE - offset;
1025 if (nr > inode->i_size - pos)
1026 nr = inode->i_size - pos;
1029 * The actor routine returns how many bytes were actually used..
1030 * NOTE! This may not be the same as how much of a user buffer
1031 * we filled up (we may be padding etc), so we can only update
1032 * "pos" here (the actor routine has to update the user buffer
1033 * pointers and the remaining count).
1035 nr = actor(desc, (const char *) (page_address(page) + offset), nr);
1036 pos += nr;
1037 page_cache_release(page);
1038 if (nr && desc->count)
1039 continue;
1040 break;
1044 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
1046 page_not_up_to_date:
1047 page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
1049 if (Page_Uptodate(page))
1050 goto page_ok;
1052 /* Get exclusive access to the page ... */
1053 lock_page(page);
1054 if (Page_Uptodate(page)) {
1055 UnlockPage(page);
1056 goto page_ok;
1059 readpage:
1060 /* ... and start the actual read. The read will unlock the page. */
1061 error = inode->i_op->readpage(filp, page);
1063 if (!error) {
1064 if (Page_Uptodate(page))
1065 goto page_ok;
1067 /* Again, try some read-ahead while waiting for the page to finish.. */
1068 page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
1069 wait_on_page(page);
1070 if (Page_Uptodate(page))
1071 goto page_ok;
1072 error = -EIO;
1075 /* UHHUH! A synchronous read error occurred. Report it */
1076 desc->error = error;
1077 page_cache_release(page);
1078 break;
1080 no_cached_page:
1082 * Ok, it wasn't cached, so we need to create a new
1083 * page..
1085 * We get here with the page cache lock held.
1087 if (!page_cache) {
1088 spin_unlock(&pagecache_lock);
1089 page_cache = page_cache_alloc();
1090 if (!page_cache) {
1091 desc->error = -ENOMEM;
1092 break;
1096 * Somebody may have added the page while we
1097 * dropped the page cache lock. Check for that.
1099 spin_lock(&pagecache_lock);
1100 page = __find_page_nolock(inode, pos & PAGE_CACHE_MASK, *hash);
1101 if (page)
1102 goto found_page;
1106 * Ok, add the new page to the hash-queues...
1108 page = page_cache_entry(page_cache);
1109 __add_to_page_cache(page, inode, pos & PAGE_CACHE_MASK, hash);
1110 spin_unlock(&pagecache_lock);
1112 page_cache = 0;
1113 goto readpage;
1116 *ppos = pos;
1117 filp->f_reada = 1;
1118 if (page_cache)
1119 page_cache_free(page_cache);
1120 UPDATE_ATIME(inode);
1123 static int file_read_actor(read_descriptor_t * desc, const char *area, unsigned long size)
1125 unsigned long left;
1126 unsigned long count = desc->count;
1128 if (size > count)
1129 size = count;
1130 left = __copy_to_user(desc->buf, area, size);
1131 if (left) {
1132 size -= left;
1133 desc->error = -EFAULT;
1135 desc->count = count - size;
1136 desc->written += size;
1137 desc->buf += size;
1138 return size;
1142 * This is the "read()" routine for all filesystems
1143 * that can use the page cache directly.
1145 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1147 ssize_t retval;
1149 retval = -EFAULT;
1150 if (access_ok(VERIFY_WRITE, buf, count)) {
1151 retval = 0;
1152 if (count) {
1153 read_descriptor_t desc;
1155 desc.written = 0;
1156 desc.count = count;
1157 desc.buf = buf;
1158 desc.error = 0;
1159 do_generic_file_read(filp, ppos, &desc, file_read_actor);
1161 retval = desc.written;
1162 if (!retval)
1163 retval = desc.error;
1166 return retval;
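/*
 * Minimal sketch of how a filesystem uses this (hypothetical wrapper,
 * illustration only): its file_operations read method can simply point at
 * generic_file_read, or wrap it if extra work is needed.
 */
#if 0
static ssize_t example_fs_read(struct file * filp, char * buf,
	size_t count, loff_t *ppos)
{
	/* all the real work - page cache lookup, readpage, read-ahead -
	   is done by the generic code */
	return generic_file_read(filp, buf, count, ppos);
}
#endif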
1169 static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned long size)
1171 ssize_t written;
1172 unsigned long count = desc->count;
1173 struct file *file = (struct file *) desc->buf;
1174 mm_segment_t old_fs;
1176 if (size > count)
1177 size = count;
1178 old_fs = get_fs();
1179 set_fs(KERNEL_DS);
1180 written = file->f_op->write(file, area, size, &file->f_pos);
1181 set_fs(old_fs);
1182 if (written < 0) {
1183 desc->error = written;
1184 written = 0;
1186 desc->count = count - written;
1187 desc->written += written;
1188 return written;
1191 asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
1193 ssize_t retval;
1194 struct file * in_file, * out_file;
1195 struct inode * in_inode, * out_inode;
1198 * Get input file, and verify that it is ok..
1200 retval = -EBADF;
1201 in_file = fget(in_fd);
1202 if (!in_file)
1203 goto out;
1204 if (!(in_file->f_mode & FMODE_READ))
1205 goto fput_in;
1206 retval = -EINVAL;
1207 in_inode = in_file->f_dentry->d_inode;
1208 if (!in_inode)
1209 goto fput_in;
1210 if (!in_inode->i_op || !in_inode->i_op->readpage)
1211 goto fput_in;
1212 retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
1213 if (retval)
1214 goto fput_in;
1217 * Get output file, and verify that it is ok..
1219 retval = -EBADF;
1220 out_file = fget(out_fd);
1221 if (!out_file)
1222 goto fput_in;
1223 if (!(out_file->f_mode & FMODE_WRITE))
1224 goto fput_out;
1225 retval = -EINVAL;
1226 if (!out_file->f_op || !out_file->f_op->write)
1227 goto fput_out;
1228 out_inode = out_file->f_dentry->d_inode;
1229 if (!out_inode)
1230 goto fput_out;
1231 retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
1232 if (retval)
1233 goto fput_out;
1235 retval = 0;
1236 if (count) {
1237 read_descriptor_t desc;
1238 loff_t pos = 0, *ppos;
1240 retval = -EFAULT;
1241 ppos = &in_file->f_pos;
1242 if (offset) {
1243 if (get_user(pos, offset))
1244 goto fput_out;
1245 ppos = &pos;
1248 desc.written = 0;
1249 desc.count = count;
1250 desc.buf = (char *) out_file;
1251 desc.error = 0;
1252 do_generic_file_read(in_file, ppos, &desc, file_send_actor);
1254 retval = desc.written;
1255 if (!retval)
1256 retval = desc.error;
1257 if (offset)
1258 put_user(pos, offset);
1261 fput_out:
1262 fput(out_file);
1263 fput_in:
1264 fput(in_file);
1265 out:
1266 return retval;
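/*
 * From user space this is used roughly as follows (hypothetical snippet,
 * illustration only):
 *
 *	off_t offset = 0;
 *	ssize_t sent = sendfile(sock_fd, file_fd, &offset, file_size);
 *
 * On success "sent" is the number of bytes written to out_fd and "offset"
 * has been advanced past the bytes read from in_fd; when the offset
 * pointer is NULL, in_fd's own file position is used and updated instead.
 */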
1270 * Semantics for shared and private memory areas are different past the end
1271 * of the file. A shared mapping past the last page of the file is an error
1272 * and results in a SIGBUS, while a private mapping just maps in a zero page.
1274 * The goto's are kind of ugly, but this streamlines the normal case of having
1275 * it in the page cache, and handles the special cases reasonably without
1276 * having a lot of duplicated code.
1278 * WSH 06/04/97: fixed a memory leak and moved the allocation of new_page
1279 * ahead of the wait if we're sure to need it.
1281 static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
1283 struct file * file = area->vm_file;
1284 struct dentry * dentry = file->f_dentry;
1285 struct inode * inode = dentry->d_inode;
1286 unsigned long offset, reada, i;
1287 struct page * page, **hash;
1288 unsigned long old_page, new_page;
1289 int error;
1291 new_page = 0;
1292 offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
1293 if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
1294 goto no_page;
1297 * Do we have something in the page cache already?
1299 hash = page_hash(inode, offset);
1300 retry_find:
1301 page = __find_get_page(inode, offset, hash);
1302 if (!page)
1303 goto no_cached_page;
1305 found_page:
1307 * Ok, found a page in the page cache, now we need to check
1308 * that it's up-to-date. First check whether we'll need an
1309 * extra page -- better to overlap the allocation with the I/O.
1311 if (no_share && !new_page) {
1312 new_page = page_cache_alloc();
1313 if (!new_page)
1314 goto failure;
1317 if (!Page_Uptodate(page)) {
1318 lock_page(page);
1319 if (!Page_Uptodate(page))
1320 goto page_not_uptodate;
1321 UnlockPage(page);
1324 success:
1326 * Found the page and have a reference on it, need to check sharing
1327 * and possibly copy it over to another page..
1329 old_page = page_address(page);
1330 if (!no_share) {
1332 * Ok, we can share the cached page directly.. Get rid
1333 * of any potential extra pages.
1335 if (new_page)
1336 page_cache_free(new_page);
1338 flush_page_to_ram(old_page);
1339 return old_page;
1343 * No sharing ... copy to the new page.
1345 copy_page(new_page, old_page);
1346 flush_page_to_ram(new_page);
1347 page_cache_release(page);
1348 return new_page;
1350 no_cached_page:
1352 * Try to read in an entire cluster at once.
1354 reada = offset;
1355 reada >>= PAGE_CACHE_SHIFT + page_cluster;
1356 reada <<= PAGE_CACHE_SHIFT + page_cluster;
1358 for (i = 1 << page_cluster; i > 0; --i, reada += PAGE_CACHE_SIZE)
1359 new_page = try_to_read_ahead(file, reada, new_page);
1361 if (!new_page)
1362 new_page = page_cache_alloc();
1363 if (!new_page)
1364 goto no_page;
1367  * While getting the above page we might have slept,
1368 * so we need to re-check the situation with the page
1369 * cache.. The page we just got may be useful if we
1370 * can't share, so don't get rid of it here.
1372 page = __find_get_page(inode, offset, hash);
1373 if (page)
1374 goto found_page;
1377 * Now, create a new page-cache page from the page we got
1379 page = page_cache_entry(new_page);
1380 if (add_to_page_cache_unique(page, inode, offset, hash))
1381 goto retry_find;
1384 * Now it's ours and locked, we can do initial IO to it:
1386 new_page = 0;
1388 page_not_uptodate:
1389 error = inode->i_op->readpage(file, page);
1391 if (!error) {
1392 wait_on_page(page);
1393 if (PageError(page))
1394 goto page_read_error;
1395 goto success;
1398 page_read_error:
1400 * Umm, take care of errors if the page isn't up-to-date.
1401 * Try to re-read it _once_. We do this synchronously,
1402 * because there really aren't any performance issues here
1403 * and we need to check for errors.
1405 if (!PageLocked(page))
1406 PAGE_BUG(page);
1407 ClearPageError(page);
1408 error = inode->i_op->readpage(file, page);
1409 if (error)
1410 goto failure;
1411 wait_on_page(page);
1412 if (Page_Uptodate(page))
1413 goto success;
1416 * Things didn't work out. Return zero to tell the
1417 * mm layer so, possibly freeing the page cache page first.
1419 failure:
1420 page_cache_release(page);
1421 if (new_page)
1422 page_cache_free(new_page);
1423 no_page:
1424 return 0;
1428 * Tries to write a shared mapped page to its backing store. May return -EIO
1429 * if the disk is full.
1431 static inline int do_write_page(struct inode * inode, struct file * file,
1432 const char * page_addr, unsigned long offset)
1434 int retval;
1435 unsigned long size;
1436 int (*writepage) (struct file *, struct page *);
1437 struct page * page;
1439 size = offset + PAGE_SIZE;
1440 /* refuse to extend file size.. */
1441 if (S_ISREG(inode->i_mode)) {
1442 if (size > inode->i_size)
1443 size = inode->i_size;
1444 /* Ho humm.. We should have tested for this earlier */
1445 if (size < offset)
1446 return -EIO;
1448 size -= offset;
1449 retval = -EIO;
1450 writepage = inode->i_op->writepage;
1451 page = mem_map + MAP_NR(page_addr);
1452 lock_page(page);
1454 retval = writepage(file, page);
1456 UnlockPage(page);
1457 return retval;
1460 static int filemap_write_page(struct vm_area_struct * vma,
1461 unsigned long offset,
1462 unsigned long page,
1463 int wait)
1465 int result;
1466 struct file * file;
1467 struct dentry * dentry;
1468 struct inode * inode;
1470 file = vma->vm_file;
1471 dentry = file->f_dentry;
1472 inode = dentry->d_inode;
1475 * If a task terminates while we're swapping the page, the vma and
1476  * file could be released ... increment the count to be safe.
1478 get_file(file);
1479 result = do_write_page(inode, file, (const char *) page, offset);
1480 fput(file);
1481 return result;
1486 * The page cache takes care of races between somebody
1487 * trying to swap something out and swap something in
1488 * at the same time..
1490 extern void wakeup_bdflush(int);
1491 int filemap_swapout(struct vm_area_struct * vma, struct page * page)
1493 int retval = filemap_write_page(vma, page->offset, page_address(page), 0);
1494 wakeup_bdflush(0);
1495 return retval;
1498 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
1499 unsigned long address, unsigned int flags)
1501 pte_t pte = *ptep;
1502 unsigned long pageaddr;
1503 struct page *page;
1504 int error;
1506 if (!(flags & MS_INVALIDATE)) {
1507 if (!pte_present(pte))
1508 return 0;
1509 if (!pte_dirty(pte))
1510 return 0;
1511 flush_page_to_ram(pte_page(pte));
1512 flush_cache_page(vma, address);
1513 set_pte(ptep, pte_mkclean(pte));
1514 flush_tlb_page(vma, address);
1515 pageaddr = pte_page(pte);
1516 page = page_cache_entry(pageaddr);
1517 get_page(page);
1518 } else {
1519 if (pte_none(pte))
1520 return 0;
1521 flush_cache_page(vma, address);
1522 pte_clear(ptep);
1523 flush_tlb_page(vma, address);
1524 if (!pte_present(pte)) {
1525 swap_free(pte_val(pte));
1526 return 0;
1528 pageaddr = pte_page(pte);
1529 if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
1530 page_cache_free(pageaddr);
1531 return 0;
1534 error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, pageaddr, 1);
1535 page_cache_free(pageaddr);
1536 return error;
1539 static inline int filemap_sync_pte_range(pmd_t * pmd,
1540 unsigned long address, unsigned long size,
1541 struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
1543 pte_t * pte;
1544 unsigned long end;
1545 int error;
1547 if (pmd_none(*pmd))
1548 return 0;
1549 if (pmd_bad(*pmd)) {
1550 printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
1551 pmd_clear(pmd);
1552 return 0;
1554 pte = pte_offset(pmd, address);
1555 offset += address & PMD_MASK;
1556 address &= ~PMD_MASK;
1557 end = address + size;
1558 if (end > PMD_SIZE)
1559 end = PMD_SIZE;
1560 error = 0;
1561 do {
1562 error |= filemap_sync_pte(pte, vma, address + offset, flags);
1563 address += PAGE_SIZE;
1564 pte++;
1565 } while (address < end);
1566 return error;
1569 static inline int filemap_sync_pmd_range(pgd_t * pgd,
1570 unsigned long address, unsigned long size,
1571 struct vm_area_struct *vma, unsigned int flags)
1573 pmd_t * pmd;
1574 unsigned long offset, end;
1575 int error;
1577 if (pgd_none(*pgd))
1578 return 0;
1579 if (pgd_bad(*pgd)) {
1580 printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
1581 pgd_clear(pgd);
1582 return 0;
1584 pmd = pmd_offset(pgd, address);
1585 offset = address & PGDIR_MASK;
1586 address &= ~PGDIR_MASK;
1587 end = address + size;
1588 if (end > PGDIR_SIZE)
1589 end = PGDIR_SIZE;
1590 error = 0;
1591 do {
1592 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
1593 address = (address + PMD_SIZE) & PMD_MASK;
1594 pmd++;
1595 } while (address < end);
1596 return error;
1599 static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
1600 size_t size, unsigned int flags)
1602 pgd_t * dir;
1603 unsigned long end = address + size;
1604 int error = 0;
1606 dir = pgd_offset(vma->vm_mm, address);
1607 flush_cache_range(vma->vm_mm, end - size, end);
1608 while (address < end) {
1609 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
1610 address = (address + PGDIR_SIZE) & PGDIR_MASK;
1611 dir++;
1613 flush_tlb_range(vma->vm_mm, end - size, end);
1614 return error;
1618 * This handles (potentially partial) area unmaps..
1620 static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
1622 filemap_sync(vma, start, len, MS_ASYNC);
1626 * Shared mappings need to be able to do the right thing at
1627 * close/unmap/sync. They will also use the private file as
1628 * backing-store for swapping..
1630 static struct vm_operations_struct file_shared_mmap = {
1631 NULL, /* no special open */
1632 NULL, /* no special close */
1633 filemap_unmap, /* unmap - we need to sync the pages */
1634 NULL, /* no special protect */
1635 filemap_sync, /* sync */
1636 NULL, /* advise */
1637 filemap_nopage, /* nopage */
1638 NULL, /* wppage */
1639 filemap_swapout /* swapout */
1643 * Private mappings just need to be able to load in the map.
1645 * (This is actually used for shared mappings as well, if we
1646 * know they can't ever get write permissions..)
1648 static struct vm_operations_struct file_private_mmap = {
1649 NULL, /* open */
1650 NULL, /* close */
1651 NULL, /* unmap */
1652 NULL, /* protect */
1653 NULL, /* sync */
1654 NULL, /* advise */
1655 filemap_nopage, /* nopage */
1656 NULL, /* wppage */
1657 NULL /* swapout */
1660 /* This is used for a general mmap of a disk file */
1662 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1664 struct vm_operations_struct * ops;
1665 struct inode *inode = file->f_dentry->d_inode;
1667 ops = &file_private_mmap;
1668 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1669 if (!inode->i_op || !inode->i_op->writepage)
1670 return -EINVAL;
1671 ops = &file_shared_mmap;
1673 if (!inode->i_sb || !S_ISREG(inode->i_mode))
1674 return -EACCES;
1675 if (!inode->i_op || !inode->i_op->readpage)
1676 return -ENOEXEC;
1677 UPDATE_ATIME(inode);
1678 vma->vm_ops = ops;
1679 return 0;
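/*
 * Hypothetical usage sketch (illustration only): a filesystem that supplies
 * readpage (and writepage for shared writable mappings) can point its
 * file_operations mmap method at generic_file_mmap, or wrap it like this.
 */
#if 0
static int example_fs_mmap(struct file * file, struct vm_area_struct * vma)
{
	/* generic_file_mmap() picks file_shared_mmap or file_private_mmap
	   depending on VM_SHARED/VM_MAYWRITE and the inode operations */
	return generic_file_mmap(file, vma);
}
#endif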
1684 * The msync() system call.
1687 static int msync_interval(struct vm_area_struct * vma,
1688 unsigned long start, unsigned long end, int flags)
1690 if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
1691 int error;
1692 error = vma->vm_ops->sync(vma, start, end-start, flags);
1693 if (!error && (flags & MS_SYNC)) {
1694 struct file * file = vma->vm_file;
1695 if (file) {
1696 struct dentry * dentry = file->f_dentry;
1697 error = file_fsync(file, dentry);
1700 return error;
1702 return 0;
1705 asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
1707 unsigned long end;
1708 struct vm_area_struct * vma;
1709 int unmapped_error, error = -EINVAL;
1711 down(&current->mm->mmap_sem);
1712 lock_kernel();
1713 if (start & ~PAGE_MASK)
1714 goto out;
1715 len = (len + ~PAGE_MASK) & PAGE_MASK;
1716 end = start + len;
1717 if (end < start)
1718 goto out;
1719 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1720 goto out;
1721 error = 0;
1722 if (end == start)
1723 goto out;
1725 * If the interval [start,end) covers some unmapped address ranges,
1726 * just ignore them, but return -EFAULT at the end.
1728 vma = find_vma(current->mm, start);
1729 unmapped_error = 0;
1730 for (;;) {
1731 /* Still start < end. */
1732 error = -EFAULT;
1733 if (!vma)
1734 goto out;
1735 /* Here start < vma->vm_end. */
1736 if (start < vma->vm_start) {
1737 unmapped_error = -EFAULT;
1738 start = vma->vm_start;
1740 /* Here vma->vm_start <= start < vma->vm_end. */
1741 if (end <= vma->vm_end) {
1742 if (start < end) {
1743 error = msync_interval(vma, start, end, flags);
1744 if (error)
1745 goto out;
1747 error = unmapped_error;
1748 goto out;
1750 /* Here vma->vm_start <= start < vma->vm_end < end. */
1751 error = msync_interval(vma, start, vma->vm_end, flags);
1752 if (error)
1753 goto out;
1754 start = vma->vm_end;
1755 vma = vma->vm_next;
1757 out:
1758 unlock_kernel();
1759 up(&current->mm->mmap_sem);
1760 return error;
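/*
 * From user space (hypothetical snippet, illustration only):
 *
 *	msync(addr, length, MS_SYNC);
 *
 * flushes the dirty pages of a shared file mapping in [addr, addr+length)
 * back to the file and waits for the writes via file_fsync; MS_ASYNC only
 * starts the writes, and MS_INVALIDATE additionally drops the mapped
 * copies by clearing the page table entries.
 */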
1764 * Write to a file through the page cache. This is mainly for the
1765 * benefit of NFS and possibly other network-based file systems.
1767 * We currently put everything into the page cache prior to writing it.
1768 * This is not a problem when writing full pages. With partial pages,
1769 * however, we first have to read the data into the cache, then
1770 * dirty the page, and finally schedule it for writing. Alternatively, we
1771 * could write-through just the portion of data that would go into that
1772 * page, but that would kill performance for applications that write data
1773 * line by line, and it's prone to race conditions.
1775 * Note that this routine doesn't try to keep track of dirty pages. Each
1776 * file system has to do this all by itself, unfortunately.
1777 * okir@monad.swb.de
1779 ssize_t
1780 generic_file_write(struct file *file, const char *buf,
1781 size_t count, loff_t *ppos,
1782 writepage_t write_one_page)
1784 struct dentry *dentry = file->f_dentry;
1785 struct inode *inode = dentry->d_inode;
1786 unsigned long pos = *ppos;
1787 unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
1788 struct page *page, **hash;
1789 unsigned long page_cache = 0;
1790 unsigned long written;
1791 long status;
1792 int err;
1794 err = file->f_error;
1795 if (err) {
1796 file->f_error = 0;
1797 goto out;
1800 written = 0;
1802 if (file->f_flags & O_APPEND)
1803 pos = inode->i_size;
1806 * Check whether we've reached the file size limit.
1808 err = -EFBIG;
1809 if (pos >= limit) {
1810 send_sig(SIGXFSZ, current, 0);
1811 goto out;
1814 status = 0;
1816 * Check whether to truncate the write,
1817 * and send the signal if we do.
1819 if (count > limit - pos) {
1820 send_sig(SIGXFSZ, current, 0);
1821 count = limit - pos;
1824 while (count) {
1825 unsigned long bytes, pgpos, offset;
1827 * Try to find the page in the cache. If it isn't there,
1828 * allocate a free page.
1830 offset = (pos & ~PAGE_CACHE_MASK);
1831 pgpos = pos & PAGE_CACHE_MASK;
1832 bytes = PAGE_CACHE_SIZE - offset;
1833 if (bytes > count)
1834 bytes = count;
1836 hash = page_hash(inode, pgpos);
1837 repeat_find:
1838 page = __find_lock_page(inode, pgpos, hash);
1839 if (!page) {
1840 if (!page_cache) {
1841 page_cache = page_cache_alloc();
1842 if (page_cache)
1843 goto repeat_find;
1844 status = -ENOMEM;
1845 break;
1847 page = page_cache_entry(page_cache);
1848 if (add_to_page_cache_unique(page,inode,pgpos,hash))
1849 goto repeat_find;
1851 page_cache = 0;
1854 /* We have exclusive IO access to the page.. */
1855 if (!PageLocked(page)) {
1856 PAGE_BUG(page);
1857 } else {
1858 if (page->owner != current) {
1859 PAGE_BUG(page);
1863 status = write_one_page(file, page, offset, bytes, buf);
1865 /* Mark it unlocked again and drop the page.. */
1866 UnlockPage(page);
1867 page_cache_release(page);
1869 if (status < 0)
1870 break;
1872 written += status;
1873 count -= status;
1874 pos += status;
1875 buf += status;
1877 *ppos = pos;
1878 if (pos > inode->i_size)
1879 inode->i_size = pos;
1881 if (page_cache)
1882 page_cache_free(page_cache);
1884 err = written ? written : status;
1885 out:
1886 return err;
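/*
 * Minimal sketch of how a filesystem uses this (hypothetical helper names,
 * illustration only): it supplies a writepage_t callback that copies the
 * user data into the locked page and pushes it to its backing store, and
 * forwards its write() method to generic_file_write.
 */
#if 0
static ssize_t example_fs_write(struct file * file, const char * buf,
	size_t count, loff_t * ppos)
{
	/* example_fs_write_one_page(file, page, offset, bytes, buf) would
	   copy "bytes" bytes from buf into the page at "offset", start the
	   IO, and return the number of bytes written or an error */
	return generic_file_write(file, buf, count, ppos,
				  example_fs_write_one_page);
}
#endif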
1890 * Support routines for directory caching using the page cache.
1894 * Unlock and free a page.
1896 void put_cached_page(unsigned long addr)
1898 struct page * page = page_cache_entry(addr);
1900 UnlockPage(page);
1901 if (page_count(page) != 2)
1902 panic("put_cached_page: page count=%d\n",
1903 page_count(page));
1904 page_cache_release(page);
1907 void __init page_cache_init(unsigned long memory_size)
1909 unsigned long htable_size, order;
1911 htable_size = memory_size >> PAGE_SHIFT;
1912 htable_size *= sizeof(struct page *);
1913 for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
1916 do {
1917 unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
1919 page_hash_bits = 0;
1920 while((tmp >>= 1UL) != 0UL)
1921 page_hash_bits++;
1923 page_hash_table = (struct page **)
1924 __get_free_pages(GFP_ATOMIC, order);
1925 } while(page_hash_table == NULL && --order > 0);
1927 printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
1928 (1 << page_hash_bits), order, (PAGE_SIZE << order));
1929 if (!page_hash_table)
1930 panic("Failed to allocate page hash table\n");
1931 memset(page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
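/*
 * Sizing example for the hash table above (32-bit pointers, 4K pages
 * assumed): with 64MB of memory, htable_size is 16384 * 4 = 64KB, so
 * order becomes 4, the allocation holds 16384 entries and page_hash_bits
 * ends up as 14.
 */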