1 /*
2 * linux/mm/filemap.c
4 * Copyright (C) 1994-1999 Linus Torvalds
5 */
7 /*
8 * This file handles the generic file mmap semantics used by
9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem used to do this differently, for example)
12 #include <linux/malloc.h>
13 #include <linux/shm.h>
14 #include <linux/mman.h>
15 #include <linux/locks.h>
16 #include <linux/pagemap.h>
17 #include <linux/swap.h>
18 #include <linux/smp_lock.h>
19 #include <linux/blkdev.h>
20 #include <linux/file.h>
21 #include <linux/swapctl.h>
22 #include <linux/slab.h>
24 #include <asm/pgtable.h>
25 #include <asm/uaccess.h>
28 * Shared mappings implemented 30.11.1994. It's not fully working yet,
29 * though.
31 * Shared mappings now work. 15.8.1995 Bruno.
33 * finished 'unifying' the page and buffer cache and SMP-threaded the
34 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
37 atomic_t page_cache_size = ATOMIC_INIT(0);
38 struct page * page_hash_table[PAGE_HASH_SIZE];
40 /*
41 * Define a request structure for outstanding page write requests
42 * to the background page io daemon
45 struct pio_request
47 struct pio_request * next;
48 struct file * file;
49 unsigned long offset;
50 unsigned long page;
52 static struct pio_request *pio_first = NULL, **pio_last = &pio_first;
53 static kmem_cache_t *pio_request_cache;
54 static DECLARE_WAIT_QUEUE_HEAD(pio_wait);
56 spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
59 static inline void
60 make_pio_request(struct file *, unsigned long, unsigned long);
62 void __add_page_to_hash_queue(struct page * page, struct page **p){
63 atomic_inc(&page_cache_size);
64 if((page->next_hash = *p) != NULL)
65 (*p)->pprev_hash = &page->next_hash;
66 *p = page;
67 page->pprev_hash = p;
68 if (page->buffers)
69 PAGE_BUG(page);
72 static void remove_page_from_hash_queue(struct page * page)
74 if(page->pprev_hash) {
75 if(page->next_hash)
76 page->next_hash->pprev_hash = page->pprev_hash;
77 *page->pprev_hash = page->next_hash;
78 page->pprev_hash = NULL;
80 atomic_dec(&page_cache_size);
83 void invalidate_inode_pages(struct inode * inode)
85 struct page ** p;
86 struct page * page;
88 repeat:
89 spin_lock(&pagecache_lock);
90 p = &inode->i_pages;
91 while ((page = *p) != NULL) {
92 get_page(page);
93 if (TryLockPage(page)) {
94 spin_unlock(&pagecache_lock);
95 wait_on_page(page);
96 page_cache_release(page);
97 goto repeat;
99 if (page_count(page) != 2)
100 printk("hm, busy page invalidated? (not necessarily a bug)\n");
101 inode->i_nrpages--;
102 if ((*p = page->next) != NULL)
103 (*p)->prev = page->prev;
104 page->next = NULL;
105 page->prev = NULL;
106 remove_page_from_hash_queue(page);
107 page->inode = NULL;
108 UnlockPage(page);
109 page_cache_release(page);
110 page_cache_release(page);
113 spin_unlock(&pagecache_lock);
116 * Truncate the page cache at a set offset, removing the pages
117 * that are beyond that offset (and zeroing out partial pages).
119 void truncate_inode_pages(struct inode * inode, unsigned long start)
121 struct page ** p;
122 struct page * page;
123 int partial = 0;
125 repeat:
126 spin_lock(&pagecache_lock);
127 p = &inode->i_pages;
128 while ((page = *p) != NULL) {
129 unsigned long offset = page->offset;
131 /* page wholly truncated - free it */
132 if (offset >= start) {
133 get_page(page);
134 if (TryLockPage(page)) {
135 spin_unlock(&pagecache_lock);
136 wait_on_page(page);
137 page_cache_release(page);
138 goto repeat;
140 spin_unlock(&pagecache_lock);
142 if (inode->i_op->flushpage)
143 inode->i_op->flushpage(inode, page, 0);
146 * We remove the page from the page cache
147 * _after_ we have destroyed all buffer-cache
148 * references to it. Otherwise some other process
149 * might think this inode page is not in the
150 * page cache and create a buffer-cache alias
151 * to it, causing all sorts of fun problems ...
153 spin_lock(&pagecache_lock);
154 inode->i_nrpages--;
155 if ((*p = page->next) != NULL)
156 (*p)->prev = page->prev;
157 page->next = NULL;
158 page->prev = NULL;
159 remove_page_from_hash_queue(page);
160 page->inode = NULL;
161 spin_unlock(&pagecache_lock);
163 UnlockPage(page);
164 page_cache_release(page);
165 page_cache_release(page);
168 * We have done things without the pagecache lock,
169 * so we'll have to repeat the scan.
170 * It's not possible to deadlock here because
171 * we are guaranteed to make progress. (ie. we have
172 * just removed a page)
174 goto repeat;
176 p = &page->next;
178 * there is only one partial page possible.
180 if (partial)
181 continue;
183 offset = start - offset;
184 /* partial truncate, clear end of page */
185 if (offset < PAGE_CACHE_SIZE) {
186 unsigned long address;
187 get_page(page);
188 if (TryLockPage(page)) {
189 spin_unlock(&pagecache_lock);
190 wait_on_page(page);
191 page_cache_release(page);
192 goto repeat;
195 * It's worth dropping the write lock only at
196 * this point. We are holding the page lock
197 * so nobody can do anything bad to us.
199 spin_unlock(&pagecache_lock);
200 partial = 1;
202 address = page_address(page);
203 memset((void *) (offset + address), 0, PAGE_CACHE_SIZE - offset);
204 flush_page_to_ram(address);
206 if (inode->i_op->flushpage)
207 inode->i_op->flushpage(inode, page, offset);
209 * we have dropped the spinlock so we have to
210 * restart.
212 UnlockPage(page);
213 page_cache_release(page);
214 goto repeat;
217 spin_unlock(&pagecache_lock);
221 * Remove a page from the page cache and free it. Caller has to make
222 * sure the page is locked and that nobody else uses it - or that usage
223 * is safe.
225 void remove_inode_page(struct page *page)
227 if (!PageLocked(page))
228 PAGE_BUG(page);
230 spin_lock(&pagecache_lock);
231 remove_page_from_inode_queue(page);
232 remove_page_from_hash_queue(page);
233 page->inode = NULL;
234 spin_unlock(&pagecache_lock);
237 int shrink_mmap(int priority, int gfp_mask)
239 static unsigned long clock = 0;
240 unsigned long limit = num_physpages;
241 struct page * page;
242 int count, err;
244 count = limit >> priority;
246 page = mem_map + clock;
247 do {
248 int referenced;
250 /* This works even in the presence of PageSkip because
251 * the first two entries at the beginning of a hole will
252 * be marked, not just the first.
254 page++;
255 clock++;
256 if (clock >= max_mapnr) {
257 clock = 0;
258 page = mem_map;
260 if (PageSkip(page)) {
261 /* next_hash is overloaded for PageSkip */
262 page = page->next_hash;
263 clock = page - mem_map;
266 referenced = test_and_clear_bit(PG_referenced, &page->flags);
268 if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
269 continue;
271 if (PageLocked(page))
272 continue;
274 /* Is it a buffer page? */
275 if (page->buffers) {
276 if (buffer_under_min())
277 continue;
279 if (TryLockPage(page))
280 continue;
281 err = try_to_free_buffers(page);
282 UnlockPage(page);
284 if (!err)
285 continue;
286 goto out;
289 /* We can't free pages unless there's just one user */
290 if (page_count(page) != 1)
291 continue;
293 count--;
296 * Is it a swap cache page? If so, we want to
297 * drop it if it is no longer used, even if it
298 * were to be marked referenced..
300 if (PageSwapCache(page)) {
301 if (referenced && swap_count(page->offset) != 1)
302 continue;
303 delete_from_swap_cache(page);
304 err = 1;
305 goto out;
308 if (referenced)
309 continue;
311 /* is it a page-cache page? */
312 spin_lock(&pagecache_lock);
313 if (page->inode) {
314 if (pgcache_under_min())
315 goto unlock_continue;
316 if (TryLockPage(page))
317 goto unlock_continue;
319 if (page_count(page) == 1) {
320 remove_page_from_inode_queue(page);
321 remove_page_from_hash_queue(page);
322 page->inode = NULL;
324 spin_unlock(&pagecache_lock);
326 UnlockPage(page);
327 page_cache_release(page);
328 err = 1;
329 goto out;
330 unlock_continue:
331 spin_unlock(&pagecache_lock);
332 continue;
334 spin_unlock(&pagecache_lock);
335 } while (count > 0);
336 err = 0;
337 out:
338 return err;
341 static inline struct page * __find_page_nolock(struct inode * inode, unsigned long offset, struct page *page)
343 goto inside;
345 for (;;) {
346 page = page->next_hash;
347 inside:
348 if (!page)
349 goto not_found;
350 if (page->inode != inode)
351 continue;
352 if (page->offset == offset)
353 break;
355 not_found:
356 return page;
360 * This adds a page to the page cache, starting out as locked,
361 * owned by us, referenced, but not uptodate and with no errors.
363 static inline void __add_to_page_cache(struct page * page,
364 struct inode * inode, unsigned long offset,
365 struct page **hash)
367 unsigned long flags;
369 flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error));
370 page->flags = flags | ((1 << PG_locked) | (1 << PG_referenced));
371 page->owner = (int)current; /* REMOVEME */
372 get_page(page);
373 page->offset = offset;
374 add_page_to_inode_queue(inode, page);
375 __add_page_to_hash_queue(page, hash);
378 int add_to_page_cache_unique(struct page * page,
379 struct inode * inode, unsigned long offset,
380 struct page **hash)
382 int err;
383 struct page *alias;
385 spin_lock(&pagecache_lock);
386 alias = __find_page_nolock(inode, offset, *hash);
388 err = 1;
389 if (!alias) {
390 __add_to_page_cache(page,inode,offset,hash);
391 err = 0;
394 spin_unlock(&pagecache_lock);
395 return err;
399 * Try to read ahead in the file. "page_cache" is a potentially free page
400 * that we could use for the cache (if it is 0 we can try to create one,
401 * this is all overlapped with the IO on the previous page finishing anyway)
403 static unsigned long try_to_read_ahead(struct file * file,
404 unsigned long offset, unsigned long page_cache)
406 struct inode *inode = file->f_dentry->d_inode;
407 struct page * page;
408 struct page ** hash;
410 offset &= PAGE_CACHE_MASK;
411 switch (page_cache) {
412 case 0:
413 page_cache = page_cache_alloc();
414 if (!page_cache)
415 break;
416 default:
417 if (offset >= inode->i_size)
418 break;
419 hash = page_hash(inode, offset);
420 page = page_cache_entry(page_cache);
421 if (!add_to_page_cache_unique(page, inode, offset, hash)) {
423 * We do not have to check the return value here
424 * because it's a readahead.
426 lock_kernel();
427 inode->i_op->readpage(file, page);
428 unlock_kernel();
429 page_cache = 0;
430 page_cache_release(page);
433 return page_cache;
437 * Wait for a page to get unlocked.
439 * This must be called with the caller "holding" the page,
440 * ie with increased "page->count" so that the page won't
441 * go away during the wait..
443 void ___wait_on_page(struct page *page)
445 struct task_struct *tsk = current;
446 DECLARE_WAITQUEUE(wait, tsk);
448 add_wait_queue(&page->wait, &wait);
449 repeat:
450 tsk->state = TASK_UNINTERRUPTIBLE;
451 run_task_queue(&tq_disk);
452 if (PageLocked(page)) {
453 int left;
454 left = schedule_timeout(HZ*20);
455 if (!left)
456 PAGE_BUG(page);
457 goto repeat;
459 tsk->state = TASK_RUNNING;
460 remove_wait_queue(&page->wait, &wait);
464 * Get an exclusive lock on the page..
466 void lock_page(struct page *page)
468 if (TryLockPage(page)) {
469 struct task_struct *tsk = current;
470 DECLARE_WAITQUEUE(wait, current);
472 run_task_queue(&tq_disk);
473 add_wait_queue(&page->wait, &wait);
474 tsk->state = TASK_UNINTERRUPTIBLE;
476 while (TryLockPage(page)) {
477 run_task_queue(&tq_disk);
478 schedule();
479 tsk->state = TASK_UNINTERRUPTIBLE;
482 remove_wait_queue(&page->wait, &wait);
483 tsk->state = TASK_RUNNING;
489 * a rather lightweight function, finding and getting a reference to a
490 * hashed page atomically, waiting for it if it's locked.
492 struct page * __find_get_page (struct inode * inode,
493 unsigned long offset, struct page *page)
497 * We scan the hash list read-only. Addition to and removal from
498 * the hash-list needs a held write-lock.
500 repeat:
501 spin_lock(&pagecache_lock);
502 page = __find_page_nolock(inode, offset, page);
503 if (page)
504 get_page(page);
505 spin_unlock(&pagecache_lock);
507 /* Found the page, sleep if locked. */
508 if (page && PageLocked(page)) {
509 struct task_struct *tsk = current;
510 DECLARE_WAITQUEUE(wait, tsk);
512 add_wait_queue(&page->wait, &wait);
513 tsk->state = TASK_UNINTERRUPTIBLE;
515 run_task_queue(&tq_disk);
516 if (PageLocked(page))
517 schedule();
518 tsk->state = TASK_RUNNING;
519 remove_wait_queue(&page->wait, &wait);
522 * The page might have been unhashed meanwhile. It's
523 * not freed though because we hold a reference to it.
524 * If this is the case then it will be freed _here_,
525 * and we recheck the hash anyway.
527 page_cache_release(page);
528 goto repeat;
531 * It's not locked so we can return the page and we hold
532 * a reference to it.
534 return page;
538 * Get the lock to a page atomically.
540 struct page * __find_lock_page (struct inode * inode,
541 unsigned long offset, struct page *page)
543 int locked;
547 * We scan the hash list read-only. Addition to and removal from
548 * the hash-list needs a held write-lock.
550 repeat:
551 spin_lock(&pagecache_lock);
552 page = __find_page_nolock(inode, offset, page);
553 locked = 0;
554 if (page) {
555 get_page(page);
556 if (TryLockPage(page))
557 locked = 1;
559 spin_unlock(&pagecache_lock);
561 /* Found the page, sleep if locked. */
562 if (page && locked) {
563 struct task_struct *tsk = current;
564 DECLARE_WAITQUEUE(wait, tsk);
566 add_wait_queue(&page->wait, &wait);
567 tsk->state = TASK_UNINTERRUPTIBLE;
569 run_task_queue(&tq_disk);
570 if (PageLocked(page))
571 schedule();
572 tsk->state = TASK_RUNNING;
573 remove_wait_queue(&page->wait, &wait);
576 * The page might have been unhashed meanwhile. It's
577 * not freed though because we hold a reference to it.
578 * If this is the case then it will be freed _here_,
579 * and we recheck the hash anyway.
581 page_cache_release(page);
582 goto repeat;
585 * It's not locked so we can return the page and we hold
586 * a reference to it.
588 return page;
591 #if 0
592 #define PROFILE_READAHEAD
593 #define DEBUG_READAHEAD
594 #endif
597 * Read-ahead profiling information
598 * --------------------------------
599 * Every PROFILE_MAXREADCOUNT reads, the following information is written
600 * to the syslog:
601 * Percentage of asynchronous read-ahead.
602 * Average values of the read-ahead context fields.
603 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
604 * to the syslog.
607 #ifdef PROFILE_READAHEAD
609 #define PROFILE_MAXREADCOUNT 1000
611 static unsigned long total_reada;
612 static unsigned long total_async;
613 static unsigned long total_ramax;
614 static unsigned long total_ralen;
615 static unsigned long total_rawin;
617 static void profile_readahead(int async, struct file *filp)
619 unsigned long flags;
621 ++total_reada;
622 if (async)
623 ++total_async;
625 total_ramax += filp->f_ramax;
626 total_ralen += filp->f_ralen;
627 total_rawin += filp->f_rawin;
629 if (total_reada > PROFILE_MAXREADCOUNT) {
630 save_flags(flags);
631 cli();
632 if (!(total_reada > PROFILE_MAXREADCOUNT)) {
633 restore_flags(flags);
634 return;
637 printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
638 total_ramax/total_reada,
639 total_ralen/total_reada,
640 total_rawin/total_reada,
641 (total_async*100)/total_reada);
642 #ifdef DEBUG_READAHEAD
643 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
644 filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
645 #endif
647 total_reada = 0;
648 total_async = 0;
649 total_ramax = 0;
650 total_ralen = 0;
651 total_rawin = 0;
653 restore_flags(flags);
656 #endif /* defined PROFILE_READAHEAD */
659 * Read-ahead context:
660 * -------------------
661 * The read ahead context fields of the "struct file" are the following:
662 * - f_raend : position of the first byte after the last page we tried to
663 * read ahead.
664 * - f_ramax : current read-ahead maximum size.
665 * - f_ralen : length of the current IO read block we tried to read-ahead.
666 * - f_rawin : length of the current read-ahead window.
667 * if last read-ahead was synchronous then
668 * f_rawin = f_ralen
669 * otherwise (was asynchronous)
670 * f_rawin = previous value of f_ralen + f_ralen
672 * Read-ahead limits:
673 * ------------------
674 * MIN_READAHEAD : minimum read-ahead size when reading ahead.
675 * MAX_READAHEAD : maximum read-ahead size when reading ahead.
677 * Synchronous read-ahead benefits:
678 * --------------------------------
679 * Using a reasonable IO transfer length from peripheral devices increases system
680 * performance.
681 * Reasonable means, in this context, not too large but not too small.
682 * The actual maximum value is:
683 * MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
684 * and 32k if it is defined (4K page size assumed).
686 * Asynchronous read-ahead benefits:
687 * ---------------------------------
688 * Overlapping the next read request with user process execution increases system
689 * performance.
691 * Read-ahead risks:
692 * -----------------
693 * We have to guess which data the user process will need next.
694 * If that data is often not really needed, it hurts system
695 * performance.
696 * However, we know that files are often accessed sequentially by
697 * application programs, so it seems possible to have a reasonably good
698 * guessing strategy.
699 * We only try to read ahead files that seem to be read sequentially.
701 * Asynchronous read-ahead risks:
702 * ------------------------------
703 * In order to maximize overlapping, we must start an asynchronous read
704 * request on the device as soon as possible.
705 * We must be very careful about:
706 * - The number of effective pending IO read requests.
707 * ONE seems to be the only reasonable value.
708 * - The total memory pool usage for the file access stream.
709 * This maximum memory usage is implicitly 2 IO read chunks:
710 * 2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
711 * 64k if defined (4K page size assumed).
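/*
 * Illustrative sketch only (guarded by #if 0, never compiled): a rough
 * picture of how the read-ahead window fields described above relate to
 * each other after one read-ahead pass of "ralen" bytes.  The helper name
 * and the "async" flag are hypothetical and not part of any kernel
 * interface; the real bookkeeping lives in generic_file_readahead() below.
 */
#if 0
static void readahead_window_example(struct file *filp, int async,
				     unsigned long ralen)
{
	/* Asynchronous read-ahead widens the window over the previous
	 * chunk as well; synchronous read-ahead starts a new window. */
	filp->f_rawin = async ? filp->f_ralen + ralen : ralen;
	filp->f_ralen = ralen;
	/* f_raend is the first byte after the last page read ahead. */
	filp->f_raend += ralen;
	/* The max read-ahead size doubles after a successful pass,
	 * clamped to MAX_READAHEAD. */
	filp->f_ramax += filp->f_ramax;
	if (filp->f_ramax > MAX_READAHEAD)
		filp->f_ramax = MAX_READAHEAD;
}
#endif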
714 static inline int get_max_readahead(struct inode * inode)
716 if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
717 return MAX_READAHEAD;
718 return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
721 static inline unsigned long generic_file_readahead(int reada_ok,
722 struct file * filp, struct inode * inode,
723 unsigned long ppos, struct page * page, unsigned long page_cache)
725 unsigned long max_ahead, ahead;
726 unsigned long raend;
727 int max_readahead = get_max_readahead(inode);
729 raend = filp->f_raend & PAGE_CACHE_MASK;
730 max_ahead = 0;
733 * The current page is locked.
734 * If the current position is inside the previous read IO request, do not
735 * try to reread previously read ahead pages.
736 * Otherwise, decide whether or not to read ahead some pages synchronously.
737 * If we are not going to read ahead, set the read ahead context for this
738 * page only.
740 if (PageLocked(page)) {
741 if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
742 raend = ppos;
743 if (raend < inode->i_size)
744 max_ahead = filp->f_ramax;
745 filp->f_rawin = 0;
746 filp->f_ralen = PAGE_CACHE_SIZE;
747 if (!max_ahead) {
748 filp->f_raend = ppos + filp->f_ralen;
749 filp->f_rawin += filp->f_ralen;
754 * The current page is not locked.
755 * If we were reading ahead, and
756 * if the current max read-ahead size is not zero, and
757 * if the current position is inside the last read-ahead IO request,
758 * then this is the moment to try to read ahead asynchronously.
759 * We will later force an unplug of the device in order to start the asynchronous read IO.
761 else if (reada_ok && filp->f_ramax && raend >= PAGE_CACHE_SIZE &&
762 ppos <= raend && ppos + filp->f_ralen >= raend) {
764 * Add ONE page to max_ahead in order to try to have about the same IO max size
765 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
766 * Compute the position of the last page we have tried to read in order to
767 * begin to read ahead just at the next page.
769 raend -= PAGE_CACHE_SIZE;
770 if (raend < inode->i_size)
771 max_ahead = filp->f_ramax + PAGE_CACHE_SIZE;
773 if (max_ahead) {
774 filp->f_rawin = filp->f_ralen;
775 filp->f_ralen = 0;
776 reada_ok = 2;
780 * Try to read ahead pages.
781 * We hope that ll_rw_blk() plug/unplug, request coalescing and sorting, and the
782 * scheduler will work well enough to keep the actual IO requests from being too bad.
784 ahead = 0;
785 while (ahead < max_ahead) {
786 ahead += PAGE_CACHE_SIZE;
787 page_cache = try_to_read_ahead(filp, raend + ahead,
788 page_cache);
791 * If we tried to read ahead some pages:
792 * if we tried to read ahead asynchronously,
793 * try to force an unplug of the device in order to start the asynchronous
794 * read IO request.
795 * Then update the read-ahead context:
796 * store the length of the current read-ahead window
797 * and double the current max read-ahead size.
798 * That heuristic avoids doing large IO for files that are not really
799 * accessed sequentially.
801 if (ahead) {
802 if (reada_ok == 2) {
803 run_task_queue(&tq_disk);
806 filp->f_ralen += ahead;
807 filp->f_rawin += filp->f_ralen;
808 filp->f_raend = raend + ahead + PAGE_CACHE_SIZE;
810 filp->f_ramax += filp->f_ramax;
812 if (filp->f_ramax > max_readahead)
813 filp->f_ramax = max_readahead;
815 #ifdef PROFILE_READAHEAD
816 profile_readahead((reada_ok == 2), filp);
817 #endif
820 return page_cache;
824 * "descriptor" for what we're up to with a read.
825 * This allows us to use the same read code yet
826 * have multiple different users of the data that
827 * we read from a file.
829 * The simplest case just copies the data to user
830 * mode.
832 typedef struct {
833 size_t written;
834 size_t count;
835 char * buf;
836 int error;
837 } read_descriptor_t;
839 typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long);
842 * This is a generic file read routine, and uses the
843 * inode->i_op->readpage() function for the actual low-level
844 * stuff.
846 * This is really ugly. But the goto's actually try to clarify some
847 * of the logic when it comes to error handling etc.
849 static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
851 struct dentry *dentry = filp->f_dentry;
852 struct inode *inode = dentry->d_inode;
853 size_t pos, pgpos, page_cache;
854 int reada_ok;
855 int error;
856 int max_readahead = get_max_readahead(inode);
858 page_cache = 0;
860 pos = *ppos;
861 pgpos = pos & PAGE_CACHE_MASK;
863 * If the current position is outside the previous read-ahead window,
864 * we reset the current read-ahead context and set read ahead max to zero
865 * (it will be set to just the needed value later);
866 * otherwise, we assume that the file accesses are sequential enough to
867 * continue read-ahead.
869 if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) {
870 reada_ok = 0;
871 filp->f_raend = 0;
872 filp->f_ralen = 0;
873 filp->f_ramax = 0;
874 filp->f_rawin = 0;
875 } else {
876 reada_ok = 1;
879 * Adjust the current value of read-ahead max.
880 * If the read operation stays within the first half page, force no readahead.
881 * Otherwise, try to increase the read-ahead max just enough to satisfy the read request.
882 * Then clamp it to at least MIN_READAHEAD if read-ahead is ok,
883 * and to at most MAX_READAHEAD in all cases.
885 if (pos + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
886 filp->f_ramax = 0;
887 } else {
888 unsigned long needed;
890 needed = ((pos + desc->count) & PAGE_CACHE_MASK) - pgpos;
892 if (filp->f_ramax < needed)
893 filp->f_ramax = needed;
895 if (reada_ok && filp->f_ramax < MIN_READAHEAD)
896 filp->f_ramax = MIN_READAHEAD;
897 if (filp->f_ramax > max_readahead)
898 filp->f_ramax = max_readahead;
901 for (;;) {
902 struct page *page, **hash;
904 if (pos >= inode->i_size)
905 break;
908 * Try to find the data in the page cache..
910 hash = page_hash(inode, pos & PAGE_CACHE_MASK);
912 spin_lock(&pagecache_lock);
913 page = __find_page_nolock(inode, pos & PAGE_CACHE_MASK, *hash);
914 if (!page)
915 goto no_cached_page;
916 found_page:
917 get_page(page);
918 spin_unlock(&pagecache_lock);
920 if (!Page_Uptodate(page))
921 goto page_not_up_to_date;
922 page_ok:
924 * Ok, we have the page, and it's up-to-date, so
925 * now we can copy it to user space...
928 unsigned long offset, nr;
930 offset = pos & ~PAGE_CACHE_MASK;
931 nr = PAGE_CACHE_SIZE - offset;
932 if (nr > inode->i_size - pos)
933 nr = inode->i_size - pos;
936 * The actor routine returns how many bytes were actually used..
937 * NOTE! This may not be the same as how much of a user buffer
938 * we filled up (we may be padding etc), so we can only update
939 * "pos" here (the actor routine has to update the user buffer
940 * pointers and the remaining count).
942 nr = actor(desc, (const char *) (page_address(page) + offset), nr);
943 pos += nr;
944 page_cache_release(page);
945 if (nr && desc->count)
946 continue;
947 break;
951 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
953 page_not_up_to_date:
954 page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
956 if (Page_Uptodate(page))
957 goto page_ok;
959 /* Get exclusive access to the page ... */
960 lock_page(page);
961 if (Page_Uptodate(page)) {
962 UnlockPage(page);
963 goto page_ok;
966 read_page:
967 /* ... and start the actual read. The read will unlock the page. */
968 lock_kernel();
969 error = inode->i_op->readpage(filp, page);
970 unlock_kernel();
972 if (!error) {
973 if (Page_Uptodate(page))
974 goto page_ok;
976 /* Again, try some read-ahead while waiting for the page to finish.. */
977 page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
978 wait_on_page(page);
979 if (Page_Uptodate(page))
980 goto page_ok;
981 error = -EIO;
984 /* UHHUH! A synchronous read error occurred. Report it */
985 desc->error = error;
986 page_cache_release(page);
987 break;
989 no_cached_page:
991 * Ok, it wasn't cached, so we need to create a new
992 * page..
994 * We get here with the page cache lock held.
996 if (!page_cache) {
997 spin_unlock(&pagecache_lock);
998 page_cache = page_cache_alloc();
999 if (!page_cache) {
1000 desc->error = -ENOMEM;
1001 break;
1005 * Somebody may have added the page while we
1006 * dropped the page cache lock. Check for that.
1008 spin_lock(&pagecache_lock);
1009 page = __find_page_nolock(inode, pos & PAGE_CACHE_MASK, *hash);
1010 if (page)
1011 goto found_page;
1015 * Ok, add the new page to the hash-queues...
1017 page = page_cache_entry(page_cache);
1018 __add_to_page_cache(page, inode, pos & PAGE_CACHE_MASK, hash);
1019 spin_unlock(&pagecache_lock);
1021 page_cache = 0;
1022 goto read_page;
1025 *ppos = pos;
1026 filp->f_reada = 1;
1027 if (page_cache)
1028 page_cache_free(page_cache);
1029 UPDATE_ATIME(inode);
1032 static int file_read_actor(read_descriptor_t * desc, const char *area, unsigned long size)
1034 unsigned long left;
1035 unsigned long count = desc->count;
1037 if (size > count)
1038 size = count;
1039 left = __copy_to_user(desc->buf, area, size);
1040 if (left) {
1041 size -= left;
1042 desc->error = -EFAULT;
1044 desc->count = count - size;
1045 desc->written += size;
1046 desc->buf += size;
1047 return size;
1051 * This is the "read()" routine for all filesystems
1052 * that can use the page cache directly.
1054 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1056 ssize_t retval;
1058 unlock_kernel();
1059 retval = -EFAULT;
1060 if (access_ok(VERIFY_WRITE, buf, count)) {
1061 retval = 0;
1062 if (count) {
1063 read_descriptor_t desc;
1065 desc.written = 0;
1066 desc.count = count;
1067 desc.buf = buf;
1068 desc.error = 0;
1069 do_generic_file_read(filp, ppos, &desc, file_read_actor);
1071 retval = desc.written;
1072 if (!retval)
1073 retval = desc.error;
1076 lock_kernel();
1077 return retval;
1080 static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned long size)
1082 ssize_t written;
1083 unsigned long count = desc->count;
1084 struct file *file = (struct file *) desc->buf;
1085 mm_segment_t old_fs;
1087 if (size > count)
1088 size = count;
1089 old_fs = get_fs();
1090 set_fs(KERNEL_DS);
1091 written = file->f_op->write(file, area, size, &file->f_pos);
1092 set_fs(old_fs);
1093 if (written < 0) {
1094 desc->error = written;
1095 written = 0;
1097 desc->count = count - written;
1098 desc->written += written;
1099 return written;
1102 asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
1104 ssize_t retval;
1105 struct file * in_file, * out_file;
1106 struct inode * in_inode, * out_inode;
1108 lock_kernel();
1111 * Get input file, and verify that it is ok..
1113 retval = -EBADF;
1114 in_file = fget(in_fd);
1115 if (!in_file)
1116 goto out;
1117 if (!(in_file->f_mode & FMODE_READ))
1118 goto fput_in;
1119 retval = -EINVAL;
1120 in_inode = in_file->f_dentry->d_inode;
1121 if (!in_inode)
1122 goto fput_in;
1123 if (!in_inode->i_op || !in_inode->i_op->readpage)
1124 goto fput_in;
1125 retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
1126 if (retval)
1127 goto fput_in;
1130 * Get output file, and verify that it is ok..
1132 retval = -EBADF;
1133 out_file = fget(out_fd);
1134 if (!out_file)
1135 goto fput_in;
1136 if (!(out_file->f_mode & FMODE_WRITE))
1137 goto fput_out;
1138 retval = -EINVAL;
1139 if (!out_file->f_op || !out_file->f_op->write)
1140 goto fput_out;
1141 out_inode = out_file->f_dentry->d_inode;
1142 if (!out_inode)
1143 goto fput_out;
1144 retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
1145 if (retval)
1146 goto fput_out;
1148 retval = 0;
1149 if (count) {
1150 read_descriptor_t desc;
1151 loff_t pos = 0, *ppos;
1153 retval = -EFAULT;
1154 ppos = &in_file->f_pos;
1155 if (offset) {
1156 if (get_user(pos, offset))
1157 goto fput_out;
1158 ppos = &pos;
1161 desc.written = 0;
1162 desc.count = count;
1163 desc.buf = (char *) out_file;
1164 desc.error = 0;
1165 do_generic_file_read(in_file, ppos, &desc, file_send_actor);
1167 retval = desc.written;
1168 if (!retval)
1169 retval = desc.error;
1170 if (offset)
1171 put_user(pos, offset);
1175 fput_out:
1176 fput(out_file);
1177 fput_in:
1178 fput(in_file);
1179 out:
1180 unlock_kernel();
1181 return retval;
1185 * Semantics for shared and private memory areas are different past the end
1186 * of the file. A shared mapping past the last page of the file is an error
1187 * and results in a SIGBUS, while a private mapping just maps in a zero page.
1189 * The goto's are kind of ugly, but this streamlines the normal case of having
1190 * it in the page cache, and handles the special cases reasonably without
1191 * having a lot of duplicated code.
1193 * WSH 06/04/97: fixed a memory leak and moved the allocation of new_page
1194 * ahead of the wait if we're sure to need it.
1196 static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
1198 struct file * file = area->vm_file;
1199 struct dentry * dentry = file->f_dentry;
1200 struct inode * inode = dentry->d_inode;
1201 unsigned long offset, reada, i;
1202 struct page * page, **hash;
1203 unsigned long old_page, new_page;
1204 int error;
1206 new_page = 0;
1207 offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
1208 if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
1209 goto no_page_nolock;
1210 unlock_kernel();
1213 * Do we have something in the page cache already?
1215 hash = page_hash(inode, offset);
1216 retry_find:
1217 page = __find_get_page(inode, offset, *hash);
1218 if (!page)
1219 goto no_cached_page;
1221 found_page:
1223 * Ok, found a page in the page cache, now we need to check
1224 * that it's up-to-date. First check whether we'll need an
1225 * extra page -- better to overlap the allocation with the I/O.
1227 if (no_share && !new_page) {
1228 new_page = page_cache_alloc();
1229 if (!new_page)
1230 goto failure;
1233 wait_on_page(page);
1235 if (!Page_Uptodate(page))
1236 PAGE_BUG(page);
1238 success:
1240 * Found the page and have a reference on it, need to check sharing
1241 * and possibly copy it over to another page..
1243 old_page = page_address(page);
1244 if (!no_share) {
1246 * Ok, we can share the cached page directly.. Get rid
1247 * of any potential extra pages.
1249 if (new_page)
1250 page_cache_free(new_page);
1252 flush_page_to_ram(old_page);
1253 lock_kernel();
1254 return old_page;
1258 * No sharing ... copy to the new page.
1260 copy_page(new_page, old_page);
1261 flush_page_to_ram(new_page);
1262 page_cache_release(page);
1263 lock_kernel();
1264 return new_page;
1266 no_cached_page:
1268 * Try to read in an entire cluster at once.
1270 reada = offset;
1271 reada >>= PAGE_CACHE_SHIFT + page_cluster;
1272 reada <<= PAGE_CACHE_SHIFT + page_cluster;
1274 for (i = 1 << page_cluster; i > 0; --i, reada += PAGE_CACHE_SIZE)
1275 new_page = try_to_read_ahead(file, reada, new_page);
1277 if (!new_page)
1278 new_page = page_cache_alloc();
1279 if (!new_page)
1280 goto no_page;
1283 * While getting the above page we might have slept,
1284 * so we need to re-check the situation with the page
1285 * cache.. The page we just got may be useful if we
1286 * can't share, so don't get rid of it here.
1288 page = __find_get_page(inode, offset, *hash);
1289 if (page)
1290 goto found_page;
1293 * Now, create a new page-cache page from the page we got
1295 page = page_cache_entry(new_page);
1296 if (add_to_page_cache_unique(page, inode, offset, hash))
1297 goto retry_find;
1300 * Now it's ours and locked, we can do initial IO to it:
1302 new_page = 0;
1304 lock_kernel();
1305 error = inode->i_op->readpage(file, page);
1306 unlock_kernel();
1308 if (!error) {
1309 wait_on_page(page);
1310 if (PageError(page))
1311 goto page_read_error;
1312 goto success;
1315 page_read_error:
1317 * Umm, take care of errors if the page isn't up-to-date.
1318 * Try to re-read it _once_. We do this synchronously,
1319 * because there really aren't any performance issues here
1320 * and we need to check for errors.
1322 if (!PageLocked(page))
1323 PAGE_BUG(page);
1324 ClearPageError(page);
1325 lock_kernel();
1326 error = inode->i_op->readpage(file, page);
1327 unlock_kernel();
1328 if (error)
1329 goto failure;
1330 wait_on_page(page);
1331 if (Page_Uptodate(page))
1332 goto success;
1335 * Things didn't work out. Return zero to tell the
1336 * mm layer so, possibly freeing the page cache page first.
1338 failure:
1339 page_cache_release(page);
1340 if (new_page)
1341 page_cache_free(new_page);
1342 no_page:
1343 lock_kernel();
1344 no_page_nolock:
1345 return 0;
1349 * Tries to write a shared mapped page to its backing store. May return -EIO
1350 * if the disk is full.
1352 static inline int do_write_page(struct inode * inode, struct file * file,
1353 const char * page_addr, unsigned long offset)
1355 int retval;
1356 unsigned long size;
1357 loff_t loff = offset;
1358 int (*writepage) (struct file *, struct page *);
1359 struct page * page;
1361 size = offset + PAGE_SIZE;
1362 /* refuse to extend file size.. */
1363 if (S_ISREG(inode->i_mode)) {
1364 if (size > inode->i_size)
1365 size = inode->i_size;
1366 /* Ho humm.. We should have tested for this earlier */
1367 if (size < offset)
1368 return -EIO;
1370 size -= offset;
1371 retval = -EIO;
1372 writepage = inode->i_op->writepage;
1373 page = mem_map + MAP_NR(page_addr);
1374 lock_page(page);
1376 if (writepage) {
1377 retval = writepage(file, page);
1378 } else {
1379 mm_segment_t old_fs = get_fs();
1380 set_fs(KERNEL_DS);
1381 if (size == file->f_op->write(file, page_addr, size, &loff))
1382 retval = 0;
1383 set_fs(old_fs);
1385 UnlockPage(page);
1386 return retval;
1389 static int filemap_write_page(struct vm_area_struct * vma,
1390 unsigned long offset,
1391 unsigned long page,
1392 int wait)
1394 int result;
1395 struct file * file;
1396 struct dentry * dentry;
1397 struct inode * inode;
1399 file = vma->vm_file;
1400 dentry = file->f_dentry;
1401 inode = dentry->d_inode;
1402 if (!file->f_op->write)
1403 return -EIO;
1406 * If a task terminates while we're swapping the page, the vma
1407 * and file could be released ... increment the count to be safe.
1409 file->f_count++;
1412 * If this is a swapping operation rather than msync(), then
1413 * leave the actual IO, and the restoration of the file count,
1414 * to the kpiod thread. Just queue the request for now.
1416 if (!wait) {
1417 make_pio_request(file, offset, page);
1418 return 0;
1421 result = do_write_page(inode, file, (const char *) page, offset);
1422 fput(file);
1423 return result;
1428 * The page cache takes care of races between somebody
1429 * trying to swap something out and swap something in
1430 * at the same time..
1432 int filemap_swapout(struct vm_area_struct * vma, struct page * page)
1434 return filemap_write_page(vma, page->offset, page_address(page), 0);
1437 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
1438 unsigned long address, unsigned int flags)
1440 pte_t pte = *ptep;
1441 unsigned long pageaddr;
1442 struct page *page;
1443 int error;
1445 if (!(flags & MS_INVALIDATE)) {
1446 if (!pte_present(pte))
1447 return 0;
1448 if (!pte_dirty(pte))
1449 return 0;
1450 flush_page_to_ram(pte_page(pte));
1451 flush_cache_page(vma, address);
1452 set_pte(ptep, pte_mkclean(pte));
1453 flush_tlb_page(vma, address);
1454 pageaddr = pte_page(pte);
1455 page = page_cache_entry(pageaddr);
1456 get_page(page);
1457 } else {
1458 if (pte_none(pte))
1459 return 0;
1460 flush_cache_page(vma, address);
1461 pte_clear(ptep);
1462 flush_tlb_page(vma, address);
1463 if (!pte_present(pte)) {
1464 swap_free(pte_val(pte));
1465 return 0;
1467 pageaddr = pte_page(pte);
1468 if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
1469 page_cache_free(pageaddr);
1470 return 0;
1473 error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, pageaddr, 1);
1474 page_cache_free(pageaddr);
1475 return error;
1478 static inline int filemap_sync_pte_range(pmd_t * pmd,
1479 unsigned long address, unsigned long size,
1480 struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
1482 pte_t * pte;
1483 unsigned long end;
1484 int error;
1486 if (pmd_none(*pmd))
1487 return 0;
1488 if (pmd_bad(*pmd)) {
1489 printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
1490 pmd_clear(pmd);
1491 return 0;
1493 pte = pte_offset(pmd, address);
1494 offset += address & PMD_MASK;
1495 address &= ~PMD_MASK;
1496 end = address + size;
1497 if (end > PMD_SIZE)
1498 end = PMD_SIZE;
1499 error = 0;
1500 do {
1501 error |= filemap_sync_pte(pte, vma, address + offset, flags);
1502 address += PAGE_SIZE;
1503 pte++;
1504 } while (address < end);
1505 return error;
1508 static inline int filemap_sync_pmd_range(pgd_t * pgd,
1509 unsigned long address, unsigned long size,
1510 struct vm_area_struct *vma, unsigned int flags)
1512 pmd_t * pmd;
1513 unsigned long offset, end;
1514 int error;
1516 if (pgd_none(*pgd))
1517 return 0;
1518 if (pgd_bad(*pgd)) {
1519 printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
1520 pgd_clear(pgd);
1521 return 0;
1523 pmd = pmd_offset(pgd, address);
1524 offset = address & PGDIR_MASK;
1525 address &= ~PGDIR_MASK;
1526 end = address + size;
1527 if (end > PGDIR_SIZE)
1528 end = PGDIR_SIZE;
1529 error = 0;
1530 do {
1531 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
1532 address = (address + PMD_SIZE) & PMD_MASK;
1533 pmd++;
1534 } while (address < end);
1535 return error;
1538 static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
1539 size_t size, unsigned int flags)
1541 pgd_t * dir;
1542 unsigned long end = address + size;
1543 int error = 0;
1545 dir = pgd_offset(vma->vm_mm, address);
1546 flush_cache_range(vma->vm_mm, end - size, end);
1547 while (address < end) {
1548 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
1549 address = (address + PGDIR_SIZE) & PGDIR_MASK;
1550 dir++;
1552 flush_tlb_range(vma->vm_mm, end - size, end);
1553 return error;
1557 * This handles (potentially partial) area unmaps..
1559 static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
1561 filemap_sync(vma, start, len, MS_ASYNC);
1565 * Shared mappings need to be able to do the right thing at
1566 * close/unmap/sync. They will also use the private file as
1567 * backing-store for swapping..
1569 static struct vm_operations_struct file_shared_mmap = {
1570 NULL, /* no special open */
1571 NULL, /* no special close */
1572 filemap_unmap, /* unmap - we need to sync the pages */
1573 NULL, /* no special protect */
1574 filemap_sync, /* sync */
1575 NULL, /* advise */
1576 filemap_nopage, /* nopage */
1577 NULL, /* wppage */
1578 filemap_swapout, /* swapout */
1579 NULL, /* swapin */
1583 * Private mappings just need to be able to load in the map.
1585 * (This is actually used for shared mappings as well, if we
1586 * know they can't ever get write permissions..)
1588 static struct vm_operations_struct file_private_mmap = {
1589 NULL, /* open */
1590 NULL, /* close */
1591 NULL, /* unmap */
1592 NULL, /* protect */
1593 NULL, /* sync */
1594 NULL, /* advise */
1595 filemap_nopage, /* nopage */
1596 NULL, /* wppage */
1597 NULL, /* swapout */
1598 NULL, /* swapin */
1601 /* This is used for a general mmap of a disk file */
1603 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1605 struct vm_operations_struct * ops;
1606 struct inode *inode = file->f_dentry->d_inode;
1608 ops = &file_private_mmap;
1609 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
1610 ops = &file_shared_mmap;
1611 if (!inode->i_sb || !S_ISREG(inode->i_mode))
1612 return -EACCES;
1613 if (!inode->i_op || !inode->i_op->readpage)
1614 return -ENOEXEC;
1615 UPDATE_ATIME(inode);
1616 vma->vm_ops = ops;
1617 return 0;
1622 * The msync() system call.
1625 static int msync_interval(struct vm_area_struct * vma,
1626 unsigned long start, unsigned long end, int flags)
1628 if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
1629 int error;
1630 error = vma->vm_ops->sync(vma, start, end-start, flags);
1631 if (!error && (flags & MS_SYNC)) {
1632 struct file * file = vma->vm_file;
1633 if (file) {
1634 struct dentry * dentry = file->f_dentry;
1635 error = file_fsync(file, dentry);
1638 return error;
1640 return 0;
1643 asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
1645 unsigned long end;
1646 struct vm_area_struct * vma;
1647 int unmapped_error, error = -EINVAL;
1649 down(&current->mm->mmap_sem);
1650 lock_kernel();
1651 if (start & ~PAGE_MASK)
1652 goto out;
1653 len = (len + ~PAGE_MASK) & PAGE_MASK;
1654 end = start + len;
1655 if (end < start)
1656 goto out;
1657 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1658 goto out;
1659 error = 0;
1660 if (end == start)
1661 goto out;
1663 * If the interval [start,end) covers some unmapped address ranges,
1664 * just ignore them, but return -EFAULT at the end.
1666 vma = find_vma(current->mm, start);
1667 unmapped_error = 0;
1668 for (;;) {
1669 /* Still start < end. */
1670 error = -EFAULT;
1671 if (!vma)
1672 goto out;
1673 /* Here start < vma->vm_end. */
1674 if (start < vma->vm_start) {
1675 unmapped_error = -EFAULT;
1676 start = vma->vm_start;
1678 /* Here vma->vm_start <= start < vma->vm_end. */
1679 if (end <= vma->vm_end) {
1680 if (start < end) {
1681 error = msync_interval(vma, start, end, flags);
1682 if (error)
1683 goto out;
1685 error = unmapped_error;
1686 goto out;
1688 /* Here vma->vm_start <= start < vma->vm_end < end. */
1689 error = msync_interval(vma, start, vma->vm_end, flags);
1690 if (error)
1691 goto out;
1692 start = vma->vm_end;
1693 vma = vma->vm_next;
1695 out:
1696 unlock_kernel();
1697 up(&current->mm->mmap_sem);
1698 return error;
1702 * Write to a file through the page cache. This is mainly for the
1703 * benefit of NFS and possibly other network-based file systems.
1705 * We currently put everything into the page cache prior to writing it.
1706 * This is not a problem when writing full pages. With partial pages,
1707 * however, we first have to read the data into the cache, then
1708 * dirty the page, and finally schedule it for writing. Alternatively, we
1709 * could write-through just the portion of data that would go into that
1710 * page, but that would kill performance for applications that write data
1711 * line by line, and it's prone to race conditions.
1713 * Note that this routine doesn't try to keep track of dirty pages. Each
1714 * file system has to do this all by itself, unfortunately.
1715 * okir@monad.swb.de
1717 ssize_t
1718 generic_file_write(struct file *file, const char *buf,
1719 size_t count, loff_t *ppos,
1720 writepage_t write_one_page)
1722 struct dentry *dentry = file->f_dentry;
1723 struct inode *inode = dentry->d_inode;
1724 unsigned long pos = *ppos;
1725 unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
1726 struct page *page, **hash;
1727 unsigned long page_cache = 0;
1728 unsigned long written;
1729 long status;
1730 int err;
1732 err = file->f_error;
1733 if (err) {
1734 file->f_error = 0;
1735 goto out;
1738 written = 0;
1740 if (file->f_flags & O_APPEND)
1741 pos = inode->i_size;
1744 * Check whether we've reached the file size limit.
1746 err = -EFBIG;
1747 if (pos >= limit) {
1748 send_sig(SIGXFSZ, current, 0);
1749 goto out;
1752 status = 0;
1754 * Check whether to truncate the write,
1755 * and send the signal if we do.
1757 if (count > limit - pos) {
1758 send_sig(SIGXFSZ, current, 0);
1759 count = limit - pos;
1762 unlock_kernel();
1764 while (count) {
1765 unsigned long bytes, pgpos, offset;
1767 * Try to find the page in the cache. If it isn't there,
1768 * allocate a free page.
1770 offset = (pos & ~PAGE_CACHE_MASK);
1771 pgpos = pos & PAGE_CACHE_MASK;
1772 bytes = PAGE_CACHE_SIZE - offset;
1773 if (bytes > count)
1774 bytes = count;
1776 hash = page_hash(inode, pgpos);
1777 repeat_find:
1778 page = __find_lock_page(inode, pgpos, *hash);
1779 if (!page) {
1780 if (!page_cache) {
1781 page_cache = page_cache_alloc();
1782 if (page_cache)
1783 goto repeat_find;
1784 status = -ENOMEM;
1785 break;
1787 page = page_cache_entry(page_cache);
1788 if (add_to_page_cache_unique(page,inode,pgpos,hash))
1789 goto repeat_find;
1791 page_cache = 0;
1794 /* We have exclusive IO access to the page.. */
1795 if (!PageLocked(page)) {
1796 PAGE_BUG(page);
1797 } else {
1798 if (page->owner != (int)current) {
1799 PAGE_BUG(page);
1803 status = write_one_page(file, page, offset, bytes, buf);
1805 /* Mark it unlocked again and drop the page.. */
1806 UnlockPage(page);
1807 page_cache_release(page);
1809 if (status < 0)
1810 break;
1812 written += status;
1813 count -= status;
1814 pos += status;
1815 buf += status;
1817 *ppos = pos;
1818 if (pos > inode->i_size)
1819 inode->i_size = pos;
1821 if (page_cache)
1822 page_cache_free(page_cache);
1824 err = written ? written : status;
1825 lock_kernel();
1826 out:
1827 return err;
1831 * Support routines for directory caching using the page cache.
1835 * Unlock and free a page.
1837 void put_cached_page(unsigned long addr)
1839 struct page * page = page_cache_entry(addr);
1841 UnlockPage(page);
1842 if (page_count(page) != 2)
1843 panic("put_cached_page: page count=%d\n",
1844 page_count(page));
1845 page_cache_release(page);
1849 /* Add request for page IO to the queue */
1851 static inline void put_pio_request(struct pio_request *p)
1853 *pio_last = p;
1854 p->next = NULL;
1855 pio_last = &p->next;
1858 /* Take the first page IO request off the queue */
1860 static inline struct pio_request * get_pio_request(void)
1862 struct pio_request * p = pio_first;
1863 pio_first = p->next;
1864 if (!pio_first)
1865 pio_last = &pio_first;
1866 return p;
1869 /* Make a new page IO request and queue it to the kpiod thread */
1871 static inline void make_pio_request(struct file *file,
1872 unsigned long offset,
1873 unsigned long pageaddr)
1875 struct pio_request *p;
1876 struct page *page;
1878 page = page_cache_entry(pageaddr);
1879 get_page(page);
1882 * We need to allocate without causing any recursive IO in the
1883 * current thread's context. We might currently be swapping out
1884 * as a result of an allocation made while holding a critical
1885 * filesystem lock. To avoid deadlock, we *MUST* not reenter
1886 * the filesystem in this thread.
1888 * We can wait for kswapd to free memory, or we can try to free
1889 * pages without actually performing further IO, without fear of
1890 * deadlock. --sct
1893 while ((p = kmem_cache_alloc(pio_request_cache, GFP_BUFFER)) == NULL) {
1894 if (try_to_free_pages(__GFP_WAIT))
1895 continue;
1896 current->state = TASK_INTERRUPTIBLE;
1897 schedule_timeout(HZ/10);
1900 p->file = file;
1901 p->offset = offset;
1902 p->page = pageaddr;
1904 put_pio_request(p);
1905 wake_up(&pio_wait);
1910 * This is the only thread which is allowed to write out filemap pages
1911 * while swapping.
1913 * To avoid deadlock, it is important that we never reenter this thread.
1914 * Although recursive memory allocations within this thread may result
1915 * in more page swapping, that swapping will always be done by queuing
1916 * another IO request to the same thread: we will never actually start
1917 * that IO request until we have finished with the current one, and so
1918 * we will not deadlock.
1921 int kpiod(void * unused)
1923 struct task_struct *tsk = current;
1924 DECLARE_WAITQUEUE(wait, tsk);
1925 struct inode * inode;
1926 struct dentry * dentry;
1927 struct pio_request * p;
1929 tsk->session = 1;
1930 tsk->pgrp = 1;
1931 strcpy(tsk->comm, "kpiod");
1932 sigfillset(&tsk->blocked);
1934 * Mark this task as a memory allocator - we don't want to get caught
1935 * up in the regular mm freeing frenzy if we have to allocate memory
1936 * in order to write stuff out.
1938 tsk->flags |= PF_MEMALLOC;
1940 lock_kernel();
1942 pio_request_cache = kmem_cache_create("pio_request",
1943 sizeof(struct pio_request),
1944 0, SLAB_HWCACHE_ALIGN,
1945 NULL, NULL);
1946 if (!pio_request_cache)
1947 panic ("Could not create pio_request slab cache");
1949 while (1) {
1950 tsk->state = TASK_INTERRUPTIBLE;
1951 add_wait_queue(&pio_wait, &wait);
1952 if (!pio_first)
1953 schedule();
1954 remove_wait_queue(&pio_wait, &wait);
1955 tsk->state = TASK_RUNNING;
1957 while (pio_first) {
1958 p = get_pio_request();
1959 dentry = p->file->f_dentry;
1960 inode = dentry->d_inode;
1962 do_write_page(inode, p->file,
1963 (const char *) p->page, p->offset);
1964 fput(p->file);
1965 page_cache_free(p->page);
1966 kmem_cache_free(pio_request_cache, p);