1 /*
2 * linux/mm/filemap.c
4 * Copyright (C) 1994, 1995 Linus Torvalds
5 */
7 /*
8 * This file handles the generic file mmap semantics used by
9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem used to do this differently, for example)
12 #include <linux/malloc.h>
13 #include <linux/shm.h>
14 #include <linux/mman.h>
15 #include <linux/locks.h>
16 #include <linux/pagemap.h>
17 #include <linux/swap.h>
18 #include <linux/smp_lock.h>
19 #include <linux/blkdev.h>
20 #include <linux/file.h>
21 #include <linux/swapctl.h>
23 #include <asm/pgtable.h>
24 #include <asm/uaccess.h>
27 * Shared mappings implemented 30.11.1994. It's not fully working yet,
28 * though.
30 * Shared mappings now work. 15.8.1995 Bruno.
33 unsigned long page_cache_size = 0;
34 struct page * page_hash_table[PAGE_HASH_SIZE];
37 * Simple routines for both non-shared and shared mappings.
40 #define release_page(page) __free_page((page))
43 * Invalidate the pages of an inode, removing all pages that aren't
44 * locked down (those are sure to be up-to-date anyway, so we shouldn't
45 * invalidate them).
47 void invalidate_inode_pages(struct inode * inode)
49 struct page ** p;
50 struct page * page;
52 p = &inode->i_pages;
53 while ((page = *p) != NULL) {
54 if (PageLocked(page)) {
55 p = &page->next;
56 continue;
58 inode->i_nrpages--;
59 if ((*p = page->next) != NULL)
60 (*p)->prev = page->prev;
61 page->next = NULL;
62 page->prev = NULL;
63 remove_page_from_hash_queue(page);
64 page->inode = NULL;
65 __free_page(page);
66 continue;
71 * Truncate the page cache at a set offset, removing the pages
72 * that are beyond that offset (and zeroing out partial pages).
74 void truncate_inode_pages(struct inode * inode, unsigned long start)
76 struct page ** p;
77 struct page * page;
79 repeat:
80 p = &inode->i_pages;
81 while ((page = *p) != NULL) {
82 unsigned long offset = page->offset;
84 /* page wholly truncated - free it */
85 if (offset >= start) {
86 if (PageLocked(page)) {
87 wait_on_page(page);
88 goto repeat;
90 inode->i_nrpages--;
91 if ((*p = page->next) != NULL)
92 (*p)->prev = page->prev;
93 page->next = NULL;
94 page->prev = NULL;
95 remove_page_from_hash_queue(page);
96 page->inode = NULL;
97 __free_page(page);
98 continue;
100 p = &page->next;
101 offset = start - offset;
102 /* partial truncate, clear end of page */
103 if (offset < PAGE_SIZE) {
104 unsigned long address = page_address(page);
105 memset((void *) (offset + address), 0, PAGE_SIZE - offset);
106 flush_page_to_ram(address);
112 * Remove a page from the page cache and free it.
114 void remove_inode_page(struct page *page)
116 remove_page_from_hash_queue(page);
117 remove_page_from_inode_queue(page);
118 __free_page(page);
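/*
 * Try to reclaim one page for the allocator: sweep the mem_map "clock"
 * hand over up to (num_physpages << 1) >> priority entries and free the
 * first single-user, unreferenced page found in the swap cache, the
 * buffer cache or the page cache.  Returns 1 if a page was reclaimed,
 * 0 otherwise.
 */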
121 int shrink_mmap(int priority, int gfp_mask)
123 static unsigned long clock = 0;
124 unsigned long limit = num_physpages;
125 struct page * page;
126 int count;
128 count = (limit << 1) >> priority;
130 page = mem_map + clock;
131 do {
132 int referenced;
134 /* This works even in the presence of PageSkip because
135 * the first two entries at the beginning of a hole will
136 * be marked, not just the first.
138 page++;
139 clock++;
140 if (clock >= max_mapnr) {
141 clock = 0;
142 page = mem_map;
144 if (PageSkip(page)) {
145 /* next_hash is overloaded for PageSkip */
146 page = page->next_hash;
147 clock = page - mem_map;
150 count--;
151 referenced = test_and_clear_bit(PG_referenced, &page->flags);
153 if (PageLocked(page))
154 continue;
156 if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
157 continue;
159 /* We can't free pages unless there's just one user */
160 if (atomic_read(&page->count) != 1)
161 continue;
164 * Is it a swap-cache page? If so, we want to
165 * drop it if it is no longer used, even if it
166 * has been marked referenced..
168 if (PageSwapCache(page)) {
169 if (referenced && swap_count(page->offset) != 1)
170 continue;
171 delete_from_swap_cache(page);
172 return 1;
175 if (referenced)
176 continue;
178 /* Is it a buffer page? */
179 if (page->buffers) {
180 if (buffer_under_min())
181 continue;
182 if (!try_to_free_buffers(page))
183 continue;
184 return 1;
187 /* is it a page-cache page? */
188 if (page->inode) {
189 if (pgcache_under_min())
190 continue;
191 remove_inode_page(page);
192 return 1;
195 } while (count > 0);
196 return 0;
200 * Update a page-cache copy when we're doing a "write()" system call,
201 * so that the page cache stays coherent with the newly written data.
203 void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count)
205 unsigned long offset, len;
207 offset = (pos & ~PAGE_MASK);
208 pos = pos & PAGE_MASK;
209 len = PAGE_SIZE - offset;
210 do {
211 struct page * page;
213 if (len > count)
214 len = count;
215 page = find_page(inode, pos);
216 if (page) {
217 wait_on_page(page);
218 memcpy((void *) (offset + page_address(page)), buf, len);
219 release_page(page);
221 count -= len;
222 buf += len;
223 len = PAGE_SIZE;
224 offset = 0;
225 pos += PAGE_SIZE;
226 } while (count);
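/*
 * Add a freshly allocated page to the page cache for the given inode
 * and offset: bump its use count, clear the uptodate/error bits, mark
 * it referenced and link it into the inode and hash queues.
 */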
229 static inline void add_to_page_cache(struct page * page,
230 struct inode * inode, unsigned long offset,
231 struct page **hash)
233 atomic_inc(&page->count);
234 page->flags = (page->flags & ~((1 << PG_uptodate) | (1 << PG_error))) | (1 << PG_referenced);
235 page->offset = offset;
236 add_page_to_inode_queue(inode, page);
237 __add_page_to_hash_queue(page, hash);
241 * Try to read ahead in the file. "page_cache" is a potentially free page
242 * that we could use for the cache (if it is 0 we can try to create one,
243 * this is all overlapped with the IO on the previous page finishing anyway)
245 static unsigned long try_to_read_ahead(struct file * file,
246 unsigned long offset, unsigned long page_cache)
248 struct inode *inode = file->f_dentry->d_inode;
249 struct page * page;
250 struct page ** hash;
252 offset &= PAGE_MASK;
253 switch (page_cache) {
254 case 0:
255 page_cache = __get_free_page(GFP_USER);
256 if (!page_cache)
257 break;
258 default:
259 if (offset >= inode->i_size)
260 break;
261 hash = page_hash(inode, offset);
262 page = __find_page(inode, offset, *hash);
263 if (!page) {
265 * Ok, add the new page to the hash-queues...
267 page = mem_map + MAP_NR(page_cache);
268 add_to_page_cache(page, inode, offset, hash);
269 inode->i_op->readpage(file, page);
270 page_cache = 0;
272 release_page(page);
274 return page_cache;
278 * Wait for IO to complete on a locked page.
280 * This must be called with the caller "holding" the page,
281 * ie with increased "page->count" so that the page won't
282 * go away during the wait..
284 void __wait_on_page(struct page *page)
286 struct task_struct *tsk = current;
287 struct wait_queue wait;
289 wait.task = tsk;
290 add_wait_queue(&page->wait, &wait);
291 repeat:
292 tsk->state = TASK_UNINTERRUPTIBLE;
293 run_task_queue(&tq_disk);
294 if (PageLocked(page)) {
295 schedule();
296 goto repeat;
298 tsk->state = TASK_RUNNING;
299 remove_wait_queue(&page->wait, &wait);
302 #if 0
303 #define PROFILE_READAHEAD
304 #define DEBUG_READAHEAD
305 #endif
308 * Read-ahead profiling information
309 * --------------------------------
310 * Every PROFILE_MAXREADCOUNT reads, the following information is written
311 * to the syslog:
312 * Percentage of asynchronous read-ahead.
313 * Average values of the read-ahead context fields.
314 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
315 * to the syslog.
318 #ifdef PROFILE_READAHEAD
320 #define PROFILE_MAXREADCOUNT 1000
322 static unsigned long total_reada;
323 static unsigned long total_async;
324 static unsigned long total_ramax;
325 static unsigned long total_ralen;
326 static unsigned long total_rawin;
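/*
 * Accumulate read-ahead statistics and dump the averages to the syslog
 * once every PROFILE_MAXREADCOUNT profiled reads.
 */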
328 static void profile_readahead(int async, struct file *filp)
330 unsigned long flags;
332 ++total_reada;
333 if (async)
334 ++total_async;
336 total_ramax += filp->f_ramax;
337 total_ralen += filp->f_ralen;
338 total_rawin += filp->f_rawin;
340 if (total_reada > PROFILE_MAXREADCOUNT) {
341 save_flags(flags);
342 cli();
343 if (!(total_reada > PROFILE_MAXREADCOUNT)) {
344 restore_flags(flags);
345 return;
348 printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
349 total_ramax/total_reada,
350 total_ralen/total_reada,
351 total_rawin/total_reada,
352 (total_async*100)/total_reada);
353 #ifdef DEBUG_READAHEAD
354 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
355 filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
356 #endif
358 total_reada = 0;
359 total_async = 0;
360 total_ramax = 0;
361 total_ralen = 0;
362 total_rawin = 0;
364 restore_flags(flags);
367 #endif /* defined PROFILE_READAHEAD */
370 * Read-ahead context:
371 * -------------------
372 * The read ahead context fields of the "struct file" are the following:
373 * - f_raend : position of the first byte after the last page we tried to
374 * read ahead.
375 * - f_ramax : current read-ahead maximum size.
376 * - f_ralen : length of the current IO read block we tried to read-ahead.
377 * - f_rawin : length of the current read-ahead window.
378 * if last read-ahead was synchronous then
379 * f_rawin = f_ralen
380 * otherwise (was asynchronous)
381 * f_rawin = previous value of f_ralen + f_ralen
383 * Read-ahead limits:
384 * ------------------
385 * MIN_READAHEAD : minimum read-ahead size when reading ahead.
386 * MAX_READAHEAD : maximum read-ahead size when reading ahead.
388 * Synchronous read-ahead benefits:
389 * --------------------------------
390 * Using a reasonable IO transfer length from peripheral devices increases
391 * system performance.
392 * Reasonable means, in this context, not too large but not too small.
393 * The actual maximum value is:
394 * MAX_READAHEAD + PAGE_SIZE = 76k if CONFIG_READA_SMALL is undefined
395 * and 32k if defined (4K page size assumed).
397 * Asynchronous read-ahead benefits:
398 * ---------------------------------
399 * Overlapping the next read request with user process execution increases
400 * system performance.
402 * Read-ahead risks:
403 * -----------------
404 * We have to guess which further data are needed by the user process.
405 * If these data are often not really needed, it's bad for system
406 * performance.
407 * However, we know that files are often accessed sequentially by
408 * application programs, so it seems possible to have a reasonably good
409 * strategy for that guessing.
410 * We only try to read ahead files that seem to be read sequentially.
412 * Asynchronous read-ahead risks:
413 * ------------------------------
414 * In order to maximize overlapping, we must start an asynchronous read
415 * request on the device as soon as possible.
416 * We must be very careful about:
417 * - The number of effective pending IO read requests.
418 * ONE seems to be the only reasonable value.
419 * - The total memory pool usage for the file access stream.
420 * This maximum memory usage is implicitly 2 IO read chunks:
421 * 2*(MAX_READAHEAD + PAGE_SIZE) = 152k if CONFIG_READA_SMALL is undefined,
422 * 64k if defined (4K page size assumed).
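/*
 * Return the maximum read-ahead size for this inode's device: the
 * per-device value from max_readahead[][] if one is registered,
 * otherwise the MAX_READAHEAD default.
 */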
425 static inline int get_max_readahead(struct inode * inode)
427 if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
428 return MAX_READAHEAD;
429 return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
432 static inline unsigned long generic_file_readahead(int reada_ok,
433 struct file * filp, struct inode * inode,
434 unsigned long ppos, struct page * page, unsigned long page_cache)
436 unsigned long max_ahead, ahead;
437 unsigned long raend;
438 int max_readahead = get_max_readahead(inode);
440 raend = filp->f_raend & PAGE_MASK;
441 max_ahead = 0;
444 * The current page is locked.
445 * If the current position is inside the previous read IO request, do not
446 * try to reread previously read ahead pages.
447 * Otherwise decide whether or not to read ahead some pages synchronously.
448 * If we are not going to read ahead, set the read ahead context for this
449 * page only.
451 if (PageLocked(page)) {
452 if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
453 raend = ppos;
454 if (raend < inode->i_size)
455 max_ahead = filp->f_ramax;
456 filp->f_rawin = 0;
457 filp->f_ralen = PAGE_SIZE;
458 if (!max_ahead) {
459 filp->f_raend = ppos + filp->f_ralen;
460 filp->f_rawin += filp->f_ralen;
465 * The current page is not locked.
466 * If we were reading ahead, and the current max read-ahead size is not
467 * zero, and the current position is inside the last read-ahead IO
468 * request, this is the moment to try to read ahead asynchronously.
469 * We will later force an unplug of the device in order to start the
470 * asynchronous read IO.
472 else if (reada_ok && filp->f_ramax && raend >= PAGE_SIZE &&
473 ppos <= raend && ppos + filp->f_ralen >= raend) {
475 * Add ONE page to max_ahead in order to try to have about the same IO max size
476 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_SIZE.
477 * Compute the position of the last page we have tried to read in order to
478 * begin to read ahead just at the next page.
480 raend -= PAGE_SIZE;
481 if (raend < inode->i_size)
482 max_ahead = filp->f_ramax + PAGE_SIZE;
484 if (max_ahead) {
485 filp->f_rawin = filp->f_ralen;
486 filp->f_ralen = 0;
487 reada_ok = 2;
491 * Try to read ahead pages.
492 * We hope that ll_rw_blk() plug/unplug, coalescence, request sorting and the
493 * scheduler will do a good enough job for us to avoid really bad IO requests.
495 ahead = 0;
496 while (ahead < max_ahead) {
497 ahead += PAGE_SIZE;
498 page_cache = try_to_read_ahead(filp, raend + ahead,
499 page_cache);
502 * If we tried to read ahead some pages, and
503 * if we tried to read ahead asynchronously,
504 * try to force an unplug of the device in order to start the asynchronous
505 * read IO request.
506 * Update the read-ahead context:
507 * store the length of the current read-ahead window and
508 * double the current max read-ahead size.
509 * This heuristic avoids doing large IO for files that are not really
510 * accessed sequentially.
512 if (ahead) {
513 if (reada_ok == 2) {
514 run_task_queue(&tq_disk);
517 filp->f_ralen += ahead;
518 filp->f_rawin += filp->f_ralen;
519 filp->f_raend = raend + ahead + PAGE_SIZE;
521 filp->f_ramax += filp->f_ramax;
523 if (filp->f_ramax > max_readahead)
524 filp->f_ramax = max_readahead;
526 #ifdef PROFILE_READAHEAD
527 profile_readahead((reada_ok == 2), filp);
528 #endif
531 return page_cache;
535 * "descriptor" for what we're up to with a read.
536 * This allows us to use the same read code yet
537 * have multiple different users of the data that
538 * we read from a file.
540 * The simplest case just copies the data to user
541 * mode.
543 typedef struct {
544 size_t written;
545 size_t count;
546 char * buf;
547 int error;
548 } read_descriptor_t;
550 typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long);
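/*
 * Illustrative sketch only (not part of the original file, and the name
 * "null_read_actor" is made up): a minimal actor that merely accounts
 * for the bytes it is handed without copying them anywhere.  It shows
 * the contract an actor must honour: consume at most desc->count bytes,
 * update the descriptor, and return how many bytes were actually used.
 * file_read_actor() and file_send_actor() below are the real users.
 */
#if 0
static int null_read_actor(read_descriptor_t * desc, const char *area, unsigned long size)
{
	unsigned long count = desc->count;

	/* never consume more than the descriptor still asks for */
	if (size > count)
		size = count;
	desc->count = count - size;
	desc->written += size;
	return size;
}
#endif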
553 * This is a generic file read routine, and uses the
554 * inode->i_op->readpage() function for the actual low-level
555 * stuff.
557 * This is really ugly. But the goto's actually try to clarify some
558 * of the logic when it comes to error handling etc.
560 static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
562 struct dentry *dentry = filp->f_dentry;
563 struct inode *inode = dentry->d_inode;
564 size_t pos, pgpos, page_cache;
565 int reada_ok;
566 int max_readahead = get_max_readahead(inode);
568 page_cache = 0;
570 pos = *ppos;
571 pgpos = pos & PAGE_MASK;
573 * If the current position is outside the previous read-ahead window,
574 * we reset the current read-ahead context and set read ahead max to zero
575 * (it will be set to just the needed value later),
576 * otherwise, we assume that the file accesses are sequential enough to
577 * continue read-ahead.
579 if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) {
580 reada_ok = 0;
581 filp->f_raend = 0;
582 filp->f_ralen = 0;
583 filp->f_ramax = 0;
584 filp->f_rawin = 0;
585 } else {
586 reada_ok = 1;
589 * Adjust the current value of read-ahead max.
590 * If the read operation stays within the first half page, force no readahead.
591 * Otherwise try to increase the read-ahead max just enough to do the read request.
592 * Then, use at least MIN_READAHEAD if read-ahead is ok,
593 * and at most max_readahead in all cases.
595 if (pos + desc->count <= (PAGE_SIZE >> 1)) {
596 filp->f_ramax = 0;
597 } else {
598 unsigned long needed;
600 needed = ((pos + desc->count) & PAGE_MASK) - pgpos;
602 if (filp->f_ramax < needed)
603 filp->f_ramax = needed;
605 if (reada_ok && filp->f_ramax < MIN_READAHEAD)
606 filp->f_ramax = MIN_READAHEAD;
607 if (filp->f_ramax > max_readahead)
608 filp->f_ramax = max_readahead;
611 for (;;) {
612 struct page *page, **hash;
614 if (pos >= inode->i_size)
615 break;
618 * Try to find the data in the page cache..
620 hash = page_hash(inode, pos & PAGE_MASK);
621 page = __find_page(inode, pos & PAGE_MASK, *hash);
622 if (!page)
623 goto no_cached_page;
625 found_page:
627 * Try to read ahead only if the current page is filled or being filled.
628 * Otherwise, if we were reading ahead, decrease max read ahead size to
629 * the minimum value.
630 * In this context, that seems to happen only on a read error or if
631 * the page has been rewritten.
633 if (PageUptodate(page) || PageLocked(page))
634 page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_MASK, page, page_cache);
635 else if (reada_ok && filp->f_ramax > MIN_READAHEAD)
636 filp->f_ramax = MIN_READAHEAD;
638 wait_on_page(page);
640 if (!PageUptodate(page))
641 goto page_read_error;
643 success:
645 * Ok, we have the page, it's up-to-date and ok,
646 * so now we can finally copy it to user space...
649 unsigned long offset, nr;
651 offset = pos & ~PAGE_MASK;
652 nr = PAGE_SIZE - offset;
653 if (nr > inode->i_size - pos)
654 nr = inode->i_size - pos;
657 * The actor routine returns how many bytes were actually used..
658 * NOTE! This may not be the same as how much of a user buffer
659 * we filled up (we may be padding etc), so we can only update
660 * "pos" here (the actor routine has to update the user buffer
661 * pointers and the remaining count).
663 nr = actor(desc, (const char *) (page_address(page) + offset), nr);
664 pos += nr;
665 release_page(page);
666 if (nr && desc->count)
667 continue;
668 break;
671 no_cached_page:
673 * Ok, it wasn't cached, so we need to create a new
674 * page..
676 if (!page_cache) {
677 page_cache = __get_free_page(GFP_USER);
679 * That could have slept, so go around to the
680 * very beginning..
682 if (page_cache)
683 continue;
684 desc->error = -ENOMEM;
685 break;
689 * Ok, add the new page to the hash-queues...
691 page = mem_map + MAP_NR(page_cache);
692 page_cache = 0;
693 add_to_page_cache(page, inode, pos & PAGE_MASK, hash);
696 * Error handling is tricky. If we get a read error,
697 * the cached page stays in the cache (but uptodate=0),
698 * and the next process that accesses it will try to
699 * re-read it. This is needed for NFS etc, where the
700 * identity of the reader can decide if we can read the
701 * page or not..
704 * We have to read the page.
705 * If we were reading ahead, we had previously tried to read this page.
706 * That means the page has probably been removed from the cache before
707 * the application process needed it, or has been rewritten.
708 * Decrease max readahead size to the minimum value in that situation.
710 if (reada_ok && filp->f_ramax > MIN_READAHEAD)
711 filp->f_ramax = MIN_READAHEAD;
714 int error = inode->i_op->readpage(filp, page);
715 if (!error)
716 goto found_page;
717 desc->error = error;
718 release_page(page);
719 break;
722 page_read_error:
724 * We found the page, but it wasn't up-to-date.
725 * Try to re-read it _once_. We do this synchronously,
726 * because this happens only if there were errors.
729 int error = inode->i_op->readpage(filp, page);
730 if (!error) {
731 wait_on_page(page);
732 if (PageUptodate(page) && !PageError(page))
733 goto success;
734 error = -EIO; /* Some unspecified error occurred.. */
736 desc->error = error;
737 release_page(page);
738 break;
742 *ppos = pos;
743 filp->f_reada = 1;
744 if (page_cache)
745 free_page(page_cache);
746 UPDATE_ATIME(inode);
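/*
 * Read actor for generic_file_read(): copy the data into the user
 * buffer held in the descriptor, flagging -EFAULT if part of the copy
 * fails.
 */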
749 static int file_read_actor(read_descriptor_t * desc, const char *area, unsigned long size)
751 unsigned long left;
752 unsigned long count = desc->count;
754 if (size > count)
755 size = count;
756 left = __copy_to_user(desc->buf, area, size);
757 if (left) {
758 size -= left;
759 desc->error = -EFAULT;
761 desc->count = count - size;
762 desc->written += size;
763 desc->buf += size;
764 return size;
768 * This is the "read()" routine for all filesystems
769 * that can use the page cache directly.
771 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
773 ssize_t retval;
775 retval = -EFAULT;
776 if (access_ok(VERIFY_WRITE, buf, count)) {
777 retval = 0;
778 if (count) {
779 read_descriptor_t desc;
781 desc.written = 0;
782 desc.count = count;
783 desc.buf = buf;
784 desc.error = 0;
785 do_generic_file_read(filp, ppos, &desc, file_read_actor);
787 retval = desc.written;
788 if (!retval)
789 retval = desc.error;
792 return retval;
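/*
 * Read actor for sys_sendfile(): the descriptor's "buf" field actually
 * holds the output struct file, to which the data is written under
 * KERNEL_DS with the output inode's semaphore held.
 */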
795 static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned long size)
797 ssize_t written;
798 unsigned long count = desc->count;
799 struct file *file = (struct file *) desc->buf;
800 struct inode *inode = file->f_dentry->d_inode;
801 mm_segment_t old_fs;
803 if (size > count)
804 size = count;
805 down(&inode->i_sem);
806 old_fs = get_fs();
807 set_fs(KERNEL_DS);
808 written = file->f_op->write(file, area, size, &file->f_pos);
809 set_fs(old_fs);
810 up(&inode->i_sem);
811 if (written < 0) {
812 desc->error = written;
813 written = 0;
815 desc->count = count - written;
816 desc->written += written;
817 return written;
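/*
 * sys_sendfile(out_fd, in_fd, offset, count): copy up to "count" bytes
 * from the page cache of in_fd to out_fd without bouncing the data
 * through user space.  If "offset" is non-NULL it supplies the starting
 * position and receives the updated one; otherwise in_fd's own file
 * position is used and advanced.
 */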
820 asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
822 ssize_t retval;
823 struct file * in_file, * out_file;
824 struct inode * in_inode, * out_inode;
826 lock_kernel();
829 * Get input file, and verify that it is ok..
831 retval = -EBADF;
832 in_file = fget(in_fd);
833 if (!in_file)
834 goto out;
835 if (!(in_file->f_mode & FMODE_READ))
836 goto fput_in;
837 retval = -EINVAL;
838 in_inode = in_file->f_dentry->d_inode;
839 if (!in_inode)
840 goto fput_in;
841 if (!in_inode->i_op || !in_inode->i_op->readpage)
842 goto fput_in;
843 retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
844 if (retval)
845 goto fput_in;
848 * Get output file, and verify that it is ok..
850 retval = -EBADF;
851 out_file = fget(out_fd);
852 if (!out_file)
853 goto fput_in;
854 if (!(out_file->f_mode & FMODE_WRITE))
855 goto fput_out;
856 retval = -EINVAL;
857 if (!out_file->f_op || !out_file->f_op->write)
858 goto fput_out;
859 out_inode = out_file->f_dentry->d_inode;
860 if (!out_inode)
861 goto fput_out;
862 retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
863 if (retval)
864 goto fput_out;
866 retval = 0;
867 if (count) {
868 read_descriptor_t desc;
869 loff_t pos = 0, *ppos;
871 retval = -EFAULT;
872 ppos = &in_file->f_pos;
873 if (offset) {
874 if (get_user(pos, offset))
875 goto fput_out;
876 ppos = &pos;
879 desc.written = 0;
880 desc.count = count;
881 desc.buf = (char *) out_file;
882 desc.error = 0;
883 do_generic_file_read(in_file, ppos, &desc, file_send_actor);
885 retval = desc.written;
886 if (!retval)
887 retval = desc.error;
888 if (offset)
889 put_user(pos, offset);
893 fput_out:
894 fput(out_file);
895 fput_in:
896 fput(in_file);
897 out:
898 unlock_kernel();
899 return retval;
903 * Semantics for shared and private memory areas are different past the end
904 * of the file. A shared mapping past the last page of the file is an error
905 * and results in a SIGBUS, while a private mapping just maps in a zero page.
907 * The goto's are kind of ugly, but this streamlines the normal case of having
908 * it in the page cache, and handles the special cases reasonably without
909 * having a lot of duplicated code.
911 * WSH 06/04/97: fixed a memory leak and moved the allocation of new_page
912 * ahead of the wait if we're sure to need it.
914 static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
916 struct file * file = area->vm_file;
917 struct dentry * dentry = file->f_dentry;
918 struct inode * inode = dentry->d_inode;
919 unsigned long offset, reada, i;
920 struct page * page, **hash;
921 unsigned long old_page, new_page;
923 new_page = 0;
924 offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
925 if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
926 goto no_page;
929 * Do we have something in the page cache already?
931 hash = page_hash(inode, offset);
932 page = __find_page(inode, offset, *hash);
933 if (!page)
934 goto no_cached_page;
936 found_page:
938 * Ok, found a page in the page cache, now we need to check
939 * that it's up-to-date. First check whether we'll need an
940 * extra page -- better to overlap the allocation with the I/O.
942 if (no_share && !new_page) {
943 new_page = __get_free_page(GFP_USER);
944 if (!new_page)
945 goto failure;
948 if (PageLocked(page))
949 goto page_locked_wait;
950 if (!PageUptodate(page))
951 goto page_read_error;
953 success:
955 * Found the page, need to check sharing and possibly
956 * copy it over to another page..
958 old_page = page_address(page);
959 if (!no_share) {
961 * Ok, we can share the cached page directly.. Get rid
962 * of any potential extra pages.
964 if (new_page)
965 free_page(new_page);
967 flush_page_to_ram(old_page);
968 return old_page;
972 * No sharing ... copy to the new page.
974 copy_page(new_page, old_page);
975 flush_page_to_ram(new_page);
976 release_page(page);
977 return new_page;
979 no_cached_page:
981 * Try to read in an entire cluster at once.
983 reada = offset;
984 reada >>= PAGE_SHIFT + page_cluster;
985 reada <<= PAGE_SHIFT + page_cluster;
987 for (i = 1 << page_cluster; i > 0; --i, reada += PAGE_SIZE)
988 new_page = try_to_read_ahead(file, reada, new_page);
990 if (!new_page)
991 new_page = __get_free_page(GFP_USER);
992 if (!new_page)
993 goto no_page;
996 * During getting the above page we might have slept,
997 * so we need to re-check the situation with the page
998 * cache.. The page we just got may be useful if we
999 * can't share, so don't get rid of it here.
1001 page = find_page(inode, offset);
1002 if (page)
1003 goto found_page;
1006 * Now, create a new page-cache page from the page we got
1008 page = mem_map + MAP_NR(new_page);
1009 new_page = 0;
1010 add_to_page_cache(page, inode, offset, hash);
1012 if (inode->i_op->readpage(file, page) != 0)
1013 goto failure;
1015 goto found_page;
1017 page_locked_wait:
1018 __wait_on_page(page);
1019 if (PageUptodate(page))
1020 goto success;
1022 page_read_error:
1024 * Umm, take care of errors if the page isn't up-to-date.
1025 * Try to re-read it _once_. We do this synchronously,
1026 * because there really aren't any performance issues here
1027 * and we need to check for errors.
1029 if (inode->i_op->readpage(file, page) != 0)
1030 goto failure;
1031 wait_on_page(page);
1032 if (PageError(page))
1033 goto failure;
1034 if (PageUptodate(page))
1035 goto success;
1038 * Things didn't work out. Return zero to tell the
1039 * mm layer so, possibly freeing the page cache page first.
1041 failure:
1042 release_page(page);
1043 if (new_page)
1044 free_page(new_page);
1045 no_page:
1046 return 0;
1050 * Tries to write a shared mapped page to its backing store. May return -EIO
1051 * if the disk is full.
1053 static inline int do_write_page(struct inode * inode, struct file * file,
1054 const char * page, unsigned long offset)
1056 int retval;
1057 unsigned long size;
1058 loff_t loff = offset;
1059 mm_segment_t old_fs;
1061 size = offset + PAGE_SIZE;
1062 /* refuse to extend file size.. */
1063 if (S_ISREG(inode->i_mode)) {
1064 if (size > inode->i_size)
1065 size = inode->i_size;
1066 /* Ho humm.. We should have tested for this earlier */
1067 if (size < offset)
1068 return -EIO;
1070 size -= offset;
1071 old_fs = get_fs();
1072 set_fs(KERNEL_DS);
1073 retval = -EIO;
1074 if (size == file->f_op->write(file, (const char *) page, size, &loff))
1075 retval = 0;
1076 set_fs(old_fs);
1077 return retval;
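/*
 * Write one page of a shared mapping back to the file, taking an extra
 * reference on the file and holding the inode semaphore across the
 * write so the file can't be released under us.
 */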
1080 static int filemap_write_page(struct vm_area_struct * vma,
1081 unsigned long offset,
1082 unsigned long page)
1084 int result;
1085 struct file * file;
1086 struct dentry * dentry;
1087 struct inode * inode;
1089 file = vma->vm_file;
1090 dentry = file->f_dentry;
1091 inode = dentry->d_inode;
1092 if (!file->f_op->write)
1093 return -EIO;
1096 * If a task terminates while we're swapping the page, the vma
1097 * and file could be released ... increment the count to be safe.
1099 file->f_count++;
1100 down(&inode->i_sem);
1101 result = do_write_page(inode, file, (const char *) page, offset);
1102 up(&inode->i_sem);
1103 fput(file);
1104 return result;
1109 * The page cache takes care of races between somebody
1110 * trying to swap something out and swap something in
1111 * at the same time..
1113 int filemap_swapout(struct vm_area_struct * vma, struct page * page)
1115 return filemap_write_page(vma, page->offset, page_address(page));
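/*
 * Sync or invalidate a single pte of a shared mapping.  For a plain
 * sync, dirty present ptes are cleaned and their pages written back.
 * For MS_INVALIDATE the mapping is dropped as well, and the page is
 * written back only if it was dirty and more than a pure invalidate
 * was requested.
 */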
1118 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
1119 unsigned long address, unsigned int flags)
1121 pte_t pte = *ptep;
1122 unsigned long page;
1123 int error;
1125 if (!(flags & MS_INVALIDATE)) {
1126 if (!pte_present(pte))
1127 return 0;
1128 if (!pte_dirty(pte))
1129 return 0;
1130 flush_page_to_ram(pte_page(pte));
1131 flush_cache_page(vma, address);
1132 set_pte(ptep, pte_mkclean(pte));
1133 flush_tlb_page(vma, address);
1134 page = pte_page(pte);
1135 atomic_inc(&mem_map[MAP_NR(page)].count);
1136 } else {
1137 if (pte_none(pte))
1138 return 0;
1139 flush_cache_page(vma, address);
1140 pte_clear(ptep);
1141 flush_tlb_page(vma, address);
1142 if (!pte_present(pte)) {
1143 swap_free(pte_val(pte));
1144 return 0;
1146 page = pte_page(pte);
1147 if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
1148 free_page(page);
1149 return 0;
1152 error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
1153 free_page(page);
1154 return error;
1157 static inline int filemap_sync_pte_range(pmd_t * pmd,
1158 unsigned long address, unsigned long size,
1159 struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
1161 pte_t * pte;
1162 unsigned long end;
1163 int error;
1165 if (pmd_none(*pmd))
1166 return 0;
1167 if (pmd_bad(*pmd)) {
1168 printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
1169 pmd_clear(pmd);
1170 return 0;
1172 pte = pte_offset(pmd, address);
1173 offset += address & PMD_MASK;
1174 address &= ~PMD_MASK;
1175 end = address + size;
1176 if (end > PMD_SIZE)
1177 end = PMD_SIZE;
1178 error = 0;
1179 do {
1180 error |= filemap_sync_pte(pte, vma, address + offset, flags);
1181 address += PAGE_SIZE;
1182 pte++;
1183 } while (address < end);
1184 return error;
1187 static inline int filemap_sync_pmd_range(pgd_t * pgd,
1188 unsigned long address, unsigned long size,
1189 struct vm_area_struct *vma, unsigned int flags)
1191 pmd_t * pmd;
1192 unsigned long offset, end;
1193 int error;
1195 if (pgd_none(*pgd))
1196 return 0;
1197 if (pgd_bad(*pgd)) {
1198 printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
1199 pgd_clear(pgd);
1200 return 0;
1202 pmd = pmd_offset(pgd, address);
1203 offset = address & PGDIR_MASK;
1204 address &= ~PGDIR_MASK;
1205 end = address + size;
1206 if (end > PGDIR_SIZE)
1207 end = PGDIR_SIZE;
1208 error = 0;
1209 do {
1210 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
1211 address = (address + PMD_SIZE) & PMD_MASK;
1212 pmd++;
1213 } while (address < end);
1214 return error;
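/*
 * Walk the page tables covering [address, address+size) of a shared
 * mapping and sync or invalidate every pte in the range, flushing the
 * caches and the TLB around the walk.
 */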
1217 static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
1218 size_t size, unsigned int flags)
1220 pgd_t * dir;
1221 unsigned long end = address + size;
1222 int error = 0;
1224 dir = pgd_offset(vma->vm_mm, address);
1225 flush_cache_range(vma->vm_mm, end - size, end);
1226 while (address < end) {
1227 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
1228 address = (address + PGDIR_SIZE) & PGDIR_MASK;
1229 dir++;
1231 flush_tlb_range(vma->vm_mm, end - size, end);
1232 return error;
1236 * This handles (potentially partial) area unmaps..
1238 static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
1240 filemap_sync(vma, start, len, MS_ASYNC);
1244 * Shared mappings need to be able to do the right thing at
1245 * close/unmap/sync. They will also use the private file as
1246 * backing-store for swapping..
1248 static struct vm_operations_struct file_shared_mmap = {
1249 NULL, /* no special open */
1250 NULL, /* no special close */
1251 filemap_unmap, /* unmap - we need to sync the pages */
1252 NULL, /* no special protect */
1253 filemap_sync, /* sync */
1254 NULL, /* advise */
1255 filemap_nopage, /* nopage */
1256 NULL, /* wppage */
1257 filemap_swapout, /* swapout */
1258 NULL, /* swapin */
1262 * Private mappings just need to be able to load in the map.
1264 * (This is actually used for shared mappings as well, if we
1265 * know they can't ever get write permissions..)
1267 static struct vm_operations_struct file_private_mmap = {
1268 NULL, /* open */
1269 NULL, /* close */
1270 NULL, /* unmap */
1271 NULL, /* protect */
1272 NULL, /* sync */
1273 NULL, /* advise */
1274 filemap_nopage, /* nopage */
1275 NULL, /* wppage */
1276 NULL, /* swapout */
1277 NULL, /* swapin */
1280 /* This is used for a general mmap of a disk file */
1282 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1284 struct vm_operations_struct * ops;
1285 struct inode *inode = file->f_dentry->d_inode;
1287 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1288 ops = &file_shared_mmap;
1289 /* share_page() can only guarantee proper page sharing if
1290 * the offsets are all page aligned. */
1291 if (vma->vm_offset & (PAGE_SIZE - 1))
1292 return -EINVAL;
1293 } else {
1294 ops = &file_private_mmap;
1295 if (vma->vm_offset & (inode->i_sb->s_blocksize - 1))
1296 return -EINVAL;
1298 if (!inode->i_sb || !S_ISREG(inode->i_mode))
1299 return -EACCES;
1300 if (!inode->i_op || !inode->i_op->readpage)
1301 return -ENOEXEC;
1302 UPDATE_ATIME(inode);
1303 vma->vm_file = file;
1304 file->f_count++;
1305 vma->vm_ops = ops;
1306 return 0;
1311 * The msync() system call.
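/*
 * Sync a single vma interval; for MS_SYNC also fsync the backing file
 * so the data really is on disk before we return.
 */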
1314 static int msync_interval(struct vm_area_struct * vma,
1315 unsigned long start, unsigned long end, int flags)
1317 if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
1318 int error;
1319 error = vma->vm_ops->sync(vma, start, end-start, flags);
1320 if (!error && (flags & MS_SYNC)) {
1321 struct file * file = vma->vm_file;
1322 if (file) {
1323 struct dentry * dentry = file->f_dentry;
1324 struct inode * inode = dentry->d_inode;
1325 down(&inode->i_sem);
1326 error = file_fsync(file, dentry);
1327 up(&inode->i_sem);
1330 return error;
1332 return 0;
1335 asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
1337 unsigned long end;
1338 struct vm_area_struct * vma;
1339 int unmapped_error, error = -EINVAL;
1341 down(&current->mm->mmap_sem);
1342 lock_kernel();
1343 if (start & ~PAGE_MASK)
1344 goto out;
1345 len = (len + ~PAGE_MASK) & PAGE_MASK;
1346 end = start + len;
1347 if (end < start)
1348 goto out;
1349 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1350 goto out;
1351 error = 0;
1352 if (end == start)
1353 goto out;
1355 * If the interval [start,end) covers some unmapped address ranges,
1356 * just ignore them, but return -EFAULT at the end.
1358 vma = find_vma(current->mm, start);
1359 unmapped_error = 0;
1360 for (;;) {
1361 /* Still start < end. */
1362 error = -EFAULT;
1363 if (!vma)
1364 goto out;
1365 /* Here start < vma->vm_end. */
1366 if (start < vma->vm_start) {
1367 unmapped_error = -EFAULT;
1368 start = vma->vm_start;
1370 /* Here vma->vm_start <= start < vma->vm_end. */
1371 if (end <= vma->vm_end) {
1372 if (start < end) {
1373 error = msync_interval(vma, start, end, flags);
1374 if (error)
1375 goto out;
1377 error = unmapped_error;
1378 goto out;
1380 /* Here vma->vm_start <= start < vma->vm_end < end. */
1381 error = msync_interval(vma, start, vma->vm_end, flags);
1382 if (error)
1383 goto out;
1384 start = vma->vm_end;
1385 vma = vma->vm_next;
1387 out:
1388 unlock_kernel();
1389 up(&current->mm->mmap_sem);
1390 return error;
1394 * Write to a file through the page cache. This is mainly for the
1395 * benefit of NFS and possibly other network-based file systems.
1397 * We currently put everything into the page cache prior to writing it.
1398 * This is not a problem when writing full pages. With partial pages,
1399 * however, we first have to read the data into the cache, then
1400 * dirty the page, and finally schedule it for writing. Alternatively, we
1401 * could write-through just the portion of data that would go into that
1402 * page, but that would kill performance for applications that write data
1403 * line by line, and it's prone to race conditions.
1405 * Note that this routine doesn't try to keep track of dirty pages. Each
1406 * file system has to do this all by itself, unfortunately.
1407 * okir@monad.swb.de
1409 ssize_t
1410 generic_file_write(struct file *file, const char *buf,
1411 size_t count, loff_t *ppos)
1413 struct dentry *dentry = file->f_dentry;
1414 struct inode *inode = dentry->d_inode;
1415 unsigned long pos = *ppos;
1416 unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
1417 struct page *page, **hash;
1418 unsigned long page_cache = 0;
1419 unsigned long written;
1420 long status, sync;
1422 if (!inode->i_op || !inode->i_op->updatepage)
1423 return -EIO;
1425 sync = file->f_flags & O_SYNC;
1426 written = 0;
1428 if (file->f_flags & O_APPEND)
1429 pos = inode->i_size;
1432 * Check whether we've reached the file size limit.
1434 status = -EFBIG;
1435 if (pos >= limit) {
1436 send_sig(SIGXFSZ, current, 0);
1437 goto out;
1440 status = 0;
1442 * Check whether to truncate the write,
1443 * and send the signal if we do.
1445 if (count > limit - pos) {
1446 send_sig(SIGXFSZ, current, 0);
1447 count = limit - pos;
1450 while (count) {
1451 unsigned long bytes, pgpos, offset;
1453 * Try to find the page in the cache. If it isn't there,
1454 * allocate a free page.
1456 offset = (pos & ~PAGE_MASK);
1457 pgpos = pos & PAGE_MASK;
1458 bytes = PAGE_SIZE - offset;
1459 if (bytes > count)
1460 bytes = count;
1462 hash = page_hash(inode, pgpos);
1463 page = __find_page(inode, pgpos, *hash);
1464 if (!page) {
1465 if (!page_cache) {
1466 page_cache = __get_free_page(GFP_USER);
1467 if (page_cache)
1468 continue;
1469 status = -ENOMEM;
1470 break;
1472 page = mem_map + MAP_NR(page_cache);
1473 add_to_page_cache(page, inode, pgpos, hash);
1474 page_cache = 0;
1477 /* Get exclusive IO access to the page.. */
1478 wait_on_page(page);
1479 set_bit(PG_locked, &page->flags);
1482 * Do the real work.. If the writer ends up delaying the write,
1483 * the writer needs to increment the page use counts until he
1484 * is done with the page.
1486 bytes -= copy_from_user((u8*)page_address(page) + offset, buf, bytes);
1487 status = -EFAULT;
1488 if (bytes)
1489 status = inode->i_op->updatepage(file, page, offset, bytes, sync);
1491 /* Mark it unlocked again and drop the page.. */
1492 clear_bit(PG_locked, &page->flags);
1493 wake_up(&page->wait);
1494 __free_page(page);
1496 if (status < 0)
1497 break;
1499 written += status;
1500 count -= status;
1501 pos += status;
1502 buf += status;
1504 *ppos = pos;
1505 if (pos > inode->i_size)
1506 inode->i_size = pos;
1508 if (page_cache)
1509 free_page(page_cache);
1510 out:
1511 return written ? written : status;
1515 * Support routines for directory caching using the page cache.
1519 * Finds the page at the specified offset, installing a new page
1520 * if requested. The count is incremented and the page is locked.
1522 * Note: we don't have to worry about races here, as the caller
1523 * is holding the inode semaphore.
1525 unsigned long get_cached_page(struct inode * inode, unsigned long offset,
1526 int new)
1528 struct page * page;
1529 struct page ** hash;
1530 unsigned long page_cache = 0;
1532 hash = page_hash(inode, offset);
1533 page = __find_page(inode, offset, *hash);
1534 if (!page) {
1535 if (!new)
1536 goto out;
1537 page_cache = get_free_page(GFP_USER);
1538 if (!page_cache)
1539 goto out;
1540 page = mem_map + MAP_NR(page_cache);
1541 add_to_page_cache(page, inode, offset, hash);
1543 if (atomic_read(&page->count) != 2)
1544 printk(KERN_ERR "get_cached_page: page count=%d\n",
1545 atomic_read(&page->count));
1546 if (test_bit(PG_locked, &page->flags))
1547 printk(KERN_ERR "get_cached_page: page already locked!\n");
1548 set_bit(PG_locked, &page->flags);
1549 page_cache = page_address(page);
1551 out:
1552 return page_cache;
1556 * Unlock and free a page.
1558 void put_cached_page(unsigned long addr)
1560 struct page * page = mem_map + MAP_NR(addr);
1562 if (!test_bit(PG_locked, &page->flags))
1563 printk("put_cached_page: page not locked!\n");
1564 if (atomic_read(&page->count) != 2)
1565 printk("put_cached_page: page count=%d\n",
1566 atomic_read(&page->count));
1567 clear_bit(PG_locked, &page->flags);
1568 wake_up(&page->wait);
1569 __free_page(page);