/*
 *	linux/mm/filemap.c
 *
 * Copyright (C) 1994, 1995  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/malloc.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/locks.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/swapctl.h>
#include <linux/slab.h>

#include <asm/pgtable.h>
#include <asm/uaccess.h>
/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 */

unsigned long page_cache_size = 0;
struct page * page_hash_table[PAGE_HASH_SIZE];

/*
 * Define a request structure for outstanding page write requests
 * to the background page io daemon
 */

struct pio_request
{
	struct pio_request *	next;
	struct file *		file;
	unsigned long		offset;
	unsigned long		page;
};
static struct pio_request *pio_first = NULL, **pio_last = &pio_first;
static kmem_cache_t *pio_request_cache;
static struct wait_queue *pio_wait = NULL;

static inline void
make_pio_request(struct file *, unsigned long, unsigned long);
/*
 * Invalidate the pages of an inode, removing all pages that aren't
 * locked down (those are sure to be up-to-date anyway, so we shouldn't
 * invalidate them).
 */
void invalidate_inode_pages(struct inode * inode)
{
	struct page ** p;
	struct page * page;

	p = &inode->i_pages;
	while ((page = *p) != NULL) {
		if (PageLocked(page)) {
			p = &page->next;
			continue;
		}
		inode->i_nrpages--;
		if ((*p = page->next) != NULL)
			(*p)->prev = page->prev;
		page->next = NULL;
		page->prev = NULL;
		remove_page_from_hash_queue(page);
		page->inode = NULL;
		page_cache_release(page);
		continue;
	}
}
/*
 * Truncate the page cache at a set offset, removing the pages
 * that are beyond that offset (and zeroing out partial pages).
 */
void truncate_inode_pages(struct inode * inode, unsigned long start)
{
	struct page ** p;
	struct page * page;

repeat:
	p = &inode->i_pages;
	while ((page = *p) != NULL) {
		unsigned long offset = page->offset;

		/* page wholly truncated - free it */
		if (offset >= start) {
			if (PageLocked(page)) {
				wait_on_page(page);
				goto repeat;
			}
			inode->i_nrpages--;
			if ((*p = page->next) != NULL)
				(*p)->prev = page->prev;
			page->next = NULL;
			page->prev = NULL;
			remove_page_from_hash_queue(page);
			page->inode = NULL;
			page_cache_release(page);
			continue;
		}
		p = &page->next;
		offset = start - offset;
		/* partial truncate, clear end of page */
		if (offset < PAGE_CACHE_SIZE) {
			unsigned long address = page_address(page);
			memset((void *) (offset + address), 0, PAGE_CACHE_SIZE - offset);
			flush_page_to_ram(address);
		}
	}
}
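
/*
 * Worked example for the partial-truncate path above (illustrative numbers,
 * assuming PAGE_CACHE_SIZE == 4096): truncating to start == 6144 leaves the
 * page at offset 4096 in place, but "offset = start - offset" gives 2048,
 * so bytes 2048..4095 of that page are zeroed.  A page at offset 8192 is
 * wholly beyond the new size and is unhashed and released instead.
 */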
/*
 * Remove a page from the page cache and free it.
 */
void remove_inode_page(struct page *page)
{
	remove_page_from_hash_queue(page);
	remove_page_from_inode_queue(page);
	page_cache_release(page);
}
int shrink_mmap(int priority, int gfp_mask)
{
	static unsigned long clock = 0;
	unsigned long limit = num_physpages;
	struct page * page;
	int count;

	count = limit >> priority;

	page = mem_map + clock;
	do {
		int referenced;

		/* This works even in the presence of PageSkip because
		 * the first two entries at the beginning of a hole will
		 * be marked, not just the first.
		 */
		page++;
		clock++;
		if (clock >= max_mapnr) {
			clock = 0;
			page = mem_map;
		}
		if (PageSkip(page)) {
			/* next_hash is overloaded for PageSkip */
			page = page->next_hash;
			clock = page - mem_map;
		}

		referenced = test_and_clear_bit(PG_referenced, &page->flags);

		if (PageLocked(page))
			continue;

		if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
			continue;

		/* We can't free pages unless there's just one user */
		if (atomic_read(&page->count) != 1)
			continue;

		count--;

		/*
		 * Is it a page swap page? If so, we want to
		 * drop it if it is no longer used, even if it
		 * were to be marked referenced..
		 */
		if (PageSwapCache(page)) {
			if (referenced && swap_count(page->offset) != 1)
				continue;
			delete_from_swap_cache(page);
			return 1;
		}

		if (referenced)
			continue;

		/* Is it a buffer page? */
		if (page->buffers) {
			if (buffer_under_min())
				continue;
			if (!try_to_free_buffers(page))
				continue;
			return 1;
		}

		/* is it a page-cache page? */
		if (page->inode) {
			if (pgcache_under_min())
				continue;
			remove_inode_page(page);
			return 1;
		}

	} while (count > 0);
	return 0;
}
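
/*
 * Note on the scan budget above (illustrative numbers only): since
 * "count = limit >> priority", a polite caller with num_physpages == 32768
 * and priority 6 examines at most 512 pages per call, while priority 0 is
 * allowed to sweep the clock over the whole of mem_map before giving up.
 */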
/*
 * Update a page cache copy, when we're doing a "write()" system call
 * See also "update_vm_cache()".
 */
void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count)
{
	unsigned long offset, len;

	offset = (pos & ~PAGE_CACHE_MASK);
	pos = pos & PAGE_CACHE_MASK;
	len = PAGE_CACHE_SIZE - offset;
	do {
		struct page * page;

		if (len > count)
			len = count;
		page = find_page(inode, pos);
		if (page) {
			wait_on_page(page);
			memcpy((void *) (offset + page_address(page)), buf, len);
			page_cache_release(page);
		}
		count -= len;
		buf += len;
		len = PAGE_CACHE_SIZE;
		offset = 0;
		pos += PAGE_CACHE_SIZE;
	} while (count);
}
static inline void add_to_page_cache(struct page * page,
	struct inode * inode, unsigned long offset,
	struct page **hash)
{
	atomic_inc(&page->count);
	page->flags = (page->flags & ~((1 << PG_uptodate) | (1 << PG_error))) | (1 << PG_referenced);
	page->offset = offset;
	add_page_to_inode_queue(inode, page);
	__add_page_to_hash_queue(page, hash);
}
/*
 * Try to read ahead in the file. "page_cache" is a potentially free page
 * that we could use for the cache (if it is 0 we can try to create one,
 * this is all overlapped with the IO on the previous page finishing anyway)
 */
static unsigned long try_to_read_ahead(struct file * file,
				unsigned long offset, unsigned long page_cache)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct page * page;
	struct page ** hash;

	offset &= PAGE_CACHE_MASK;
	switch (page_cache) {
	case 0:
		page_cache = page_cache_alloc();
		if (!page_cache)
			break;
	default:
		if (offset >= inode->i_size)
			break;
		hash = page_hash(inode, offset);
		page = __find_page(inode, offset, *hash);
		if (!page) {
			/*
			 * Ok, add the new page to the hash-queues...
			 */
			page = page_cache_entry(page_cache);
			add_to_page_cache(page, inode, offset, hash);
			inode->i_op->readpage(file, page);
			page_cache = 0;
		}
		page_cache_release(page);
	}
	return page_cache;
}
/*
 * Wait for IO to complete on a locked page.
 *
 * This must be called with the caller "holding" the page,
 * ie with increased "page->count" so that the page won't
 * go away during the wait..
 */
void __wait_on_page(struct page *page)
{
	struct task_struct *tsk = current;
	struct wait_queue wait;

	wait.task = tsk;
	add_wait_queue(&page->wait, &wait);
repeat:
	tsk->state = TASK_UNINTERRUPTIBLE;
	run_task_queue(&tq_disk);
	if (PageLocked(page)) {
		schedule();
		goto repeat;
	}
	tsk->state = TASK_RUNNING;
	remove_wait_queue(&page->wait, &wait);
}
#if 0
#define PROFILE_READAHEAD
#define DEBUG_READAHEAD
#endif

/*
 * Read-ahead profiling information
 * --------------------------------
 * Every PROFILE_MAXREADCOUNT, the following information is written
 * to the syslog:
 *   Percentage of asynchronous read-ahead.
 *   Average of read-ahead fields context value.
 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
 * to the syslog.
 */
#ifdef PROFILE_READAHEAD

#define PROFILE_MAXREADCOUNT 1000

static unsigned long total_reada;
static unsigned long total_async;
static unsigned long total_ramax;
static unsigned long total_ralen;
static unsigned long total_rawin;

static void profile_readahead(int async, struct file *filp)
{
	unsigned long flags;

	++total_reada;
	if (async)
		++total_async;

	total_ramax	+= filp->f_ramax;
	total_ralen	+= filp->f_ralen;
	total_rawin	+= filp->f_rawin;

	if (total_reada > PROFILE_MAXREADCOUNT) {
		save_flags(flags);
		cli();
		if (!(total_reada > PROFILE_MAXREADCOUNT)) {
			restore_flags(flags);
			return;
		}

		printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
			total_ramax/total_reada,
			total_ralen/total_reada,
			total_rawin/total_reada,
			(total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
		printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
			filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
#endif

		total_reada	= 0;
		total_async	= 0;
		total_ramax	= 0;
		total_ralen	= 0;
		total_rawin	= 0;

		restore_flags(flags);
	}
}
#endif  /* defined PROFILE_READAHEAD */
/*
 * Read-ahead context:
 * -------------------
 * The read-ahead context fields of the "struct file" are the following:
 * - f_raend : position of the first byte after the last page we tried to
 *	       read ahead.
 * - f_ramax : current read-ahead maximum size.
 * - f_ralen : length of the current IO read block we tried to read-ahead.
 * - f_rawin : length of the current read-ahead window.
 *		if last read-ahead was synchronous then
 *			f_rawin = f_ralen
 *		otherwise (was asynchronous)
 *			f_rawin = previous value of f_ralen + f_ralen
 *
 * Read-ahead limits:
 * ------------------
 * MIN_READAHEAD   : minimum read-ahead size when reading ahead.
 * MAX_READAHEAD   : maximum read-ahead size when reading ahead.
 *
 * Synchronous read-ahead benefits:
 * --------------------------------
 * Using a reasonable IO transfer length from peripheral devices increases
 * system performance.
 * Reasonable means, in this context, not too large but not too small.
 * The actual maximum value is:
 *	MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
 *	and 32K if defined (4K page size assumed).
 *
 * Asynchronous read-ahead benefits:
 * ---------------------------------
 * Overlapping the next read request with user process execution increases
 * system performance.
 *
 * Read-ahead risks:
 * -----------------
 * We have to guess which further data are needed by the user process.
 * If these data are often not really needed, it's bad for system
 * performance.
 * However, we know that files are often accessed sequentially by
 * application programs and it seems that it is possible to have some good
 * strategy in that guessing.
 * We only try to read ahead files that seem to be read sequentially.
 *
 * Asynchronous read-ahead risks:
 * ------------------------------
 * In order to maximize overlapping, we must start some asynchronous read
 * request from the device, as soon as possible.
 * We must be very careful about:
 * - The number of effective pending IO read requests.
 *   ONE seems to be the only reasonable value.
 * - The total memory pool usage for the file access stream.
 *   This maximum memory usage is implicitly 2 IO read chunks:
 *   2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is
 *   undefined, 64k if defined (4K page size assumed).
 */
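
/*
 * Illustrative example of the bookkeeping above (4K page cache size, numbers
 * made up): after a synchronous read-ahead of 16K, f_ralen = f_rawin = 16K
 * and f_raend sits just past that block.  If the next read-ahead is
 * asynchronous with f_ralen = 32K, then f_rawin = 16K + 32K = 48K, i.e. the
 * window covers both the block being consumed and the block in flight, and
 * do_generic_file_read() treats any access falling inside that window
 * (pgpos <= f_raend and pgpos + f_rawin >= f_raend) as sequential.
 */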
static inline int get_max_readahead(struct inode * inode)
{
	if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
		return MAX_READAHEAD;
	return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
}
static inline unsigned long generic_file_readahead(int reada_ok,
	struct file * filp, struct inode * inode,
	unsigned long ppos, struct page * page, unsigned long page_cache)
{
	unsigned long max_ahead, ahead;
	unsigned long raend;
	int max_readahead = get_max_readahead(inode);

	raend = filp->f_raend & PAGE_CACHE_MASK;
	max_ahead = 0;

/*
 * The current page is locked.
 * If the current position is inside the previous read IO request, do not
 * try to reread previously read ahead pages.
 * Otherwise decide whether or not to read ahead some pages synchronously.
 * If we are not going to read ahead, set the read ahead context for this
 * page only.
 */
	if (PageLocked(page)) {
		if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
			raend = ppos;
			if (raend < inode->i_size)
				max_ahead = filp->f_ramax;
			filp->f_rawin = 0;
			filp->f_ralen = PAGE_CACHE_SIZE;
			if (!max_ahead) {
				filp->f_raend  = ppos + filp->f_ralen;
				filp->f_rawin += filp->f_ralen;
			}
		}
	}
/*
 * The current page is not locked.
 * If we were reading ahead and,
 * if the current max read ahead size is not zero and,
 * if the current position is inside the last read-ahead IO request,
 *   it is the moment to try to read ahead asynchronously.
 * We will later force an unplug of the device in order to force asynchronous
 * read IO.
 */
	else if (reada_ok && filp->f_ramax && raend >= PAGE_CACHE_SIZE &&
		 ppos <= raend && ppos + filp->f_ralen >= raend) {
/*
 * Add ONE page to max_ahead in order to try to have about the same IO max size
 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
 * Compute the position of the last page we have tried to read in order to
 * begin to read ahead just at the next page.
 */
		raend -= PAGE_CACHE_SIZE;
		if (raend < inode->i_size)
			max_ahead = filp->f_ramax + PAGE_CACHE_SIZE;

		if (max_ahead) {
			filp->f_rawin = filp->f_ralen;
			filp->f_ralen = 0;
			reada_ok      = 2;
		}
	}
/*
 * Try to read ahead pages.
 * We hope that ll_rw_blk() plug/unplug, coalescence, request sorting and the
 * scheduler will work well enough for us to avoid issuing too many bad actual
 * IO requests.
 */
	ahead = 0;
	while (ahead < max_ahead) {
		ahead += PAGE_CACHE_SIZE;
		page_cache = try_to_read_ahead(filp, raend + ahead,
						page_cache);
	}
/*
 * If we tried to read ahead some pages,
 * If we tried to read ahead asynchronously,
 *   Try to force an unplug of the device in order to start an asynchronous
 *   read IO request.
 * Update the read-ahead context.
 * Store the length of the current read-ahead window.
 * Double the current max read ahead size.
 *   This heuristic avoids doing large IO for files that are not really
 *   accessed sequentially.
 */
	if (ahead) {
		if (reada_ok == 2) {
			run_task_queue(&tq_disk);
		}

		filp->f_ralen += ahead;
		filp->f_rawin += filp->f_ralen;
		filp->f_raend = raend + ahead + PAGE_CACHE_SIZE;

		filp->f_ramax += filp->f_ramax;

		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;

#ifdef PROFILE_READAHEAD
		profile_readahead((reada_ok == 2), filp);
#endif
	}

	return page_cache;
}
/*
 * "descriptor" for what we're up to with a read.
 * This allows us to use the same read code yet
 * have multiple different users of the data that
 * we read from a file.
 *
 * The simplest case just copies the data to user
 * mode.
 */
typedef struct {
	size_t written;
	size_t count;
	char * buf;
	int error;
} read_descriptor_t;

typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long);
/*
 * This is a generic file read routine, and uses the
 * inode->i_op->readpage() function for the actual low-level
 * stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 */
static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
{
	struct dentry *dentry = filp->f_dentry;
	struct inode *inode = dentry->d_inode;
	size_t pos, pgpos, page_cache;
	int reada_ok;
	int max_readahead = get_max_readahead(inode);

	page_cache = 0;

	pos = *ppos;
	pgpos = pos & PAGE_CACHE_MASK;
/*
 * If the current position is outside the previous read-ahead window,
 * we reset the current read-ahead context and set read ahead max to zero
 * (will be set to just needed value later),
 * otherwise, we assume that the file accesses are sequential enough to
 * continue read-ahead.
 */
	if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) {
		reada_ok = 0;
		filp->f_raend = 0;
		filp->f_ralen = 0;
		filp->f_ramax = 0;
		filp->f_rawin = 0;
	} else {
		reada_ok = 1;
	}
/*
 * Adjust the current value of read-ahead max.
 * If the read operation stays within the first half page, force no readahead.
 * Otherwise try to increase read ahead max just enough to do the read request.
 * Then, at least MIN_READAHEAD if read ahead is ok,
 * and at most MAX_READAHEAD in all cases.
 */
	if (pos + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
		filp->f_ramax = 0;
	} else {
		unsigned long needed;

		needed = ((pos + desc->count) & PAGE_CACHE_MASK) - pgpos;

		if (filp->f_ramax < needed)
			filp->f_ramax = needed;

		if (reada_ok && filp->f_ramax < MIN_READAHEAD)
				filp->f_ramax = MIN_READAHEAD;
		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;
	}

	for (;;) {
		struct page *page, **hash;

		if (pos >= inode->i_size)
			break;

		/*
		 * Try to find the data in the page cache..
		 */
		hash = page_hash(inode, pos & PAGE_CACHE_MASK);
		page = __find_page(inode, pos & PAGE_CACHE_MASK, *hash);
		if (!page)
			goto no_cached_page;

found_page:
/*
 * Try to read ahead only if the current page is filled or being filled.
 * Otherwise, if we were reading ahead, decrease max read ahead size to
 * the minimum value.
 * In this context, that seems to happen only on some read error or if
 * the page has been rewritten.
 */
		if (PageUptodate(page) || PageLocked(page))
			page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
		else if (reada_ok && filp->f_ramax > MIN_READAHEAD)
				filp->f_ramax = MIN_READAHEAD;

		wait_on_page(page);

		if (!PageUptodate(page))
			goto page_read_error;

success:
		/*
		 * Ok, we have the page, it's up-to-date and ok,
		 * so now we can finally copy it to user space...
		 */
	{
		unsigned long offset, nr;

		offset = pos & ~PAGE_CACHE_MASK;
		nr = PAGE_CACHE_SIZE - offset;
		if (nr > inode->i_size - pos)
			nr = inode->i_size - pos;

		/*
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		nr = actor(desc, (const char *) (page_address(page) + offset), nr);
		pos += nr;
		page_cache_release(page);
		if (nr && desc->count)
			continue;
		break;
	}

no_cached_page:
		/*
		 * Ok, it wasn't cached, so we need to create a new
		 * page..
		 */
		if (!page_cache) {
			page_cache = page_cache_alloc();
			/*
			 * That could have slept, so go around to the
			 * very beginning..
			 */
			if (page_cache)
				continue;
			desc->error = -ENOMEM;
			break;
		}

		/*
		 * Ok, add the new page to the hash-queues...
		 */
		page = page_cache_entry(page_cache);
		page_cache = 0;
		add_to_page_cache(page, inode, pos & PAGE_CACHE_MASK, hash);

		/*
		 * Error handling is tricky. If we get a read error,
		 * the cached page stays in the cache (but uptodate=0),
		 * and the next process that accesses it will try to
		 * re-read it. This is needed for NFS etc, where the
		 * identity of the reader can decide if we can read the
		 * page or not..
		 */
/*
 * We have to read the page.
 * If we were reading ahead, we had previously tried to read this page.
 * That means that the page has probably been removed from the cache before
 * the application process needs it, or has been rewritten.
 * Decrease max readahead size to the minimum value in that situation.
 */
		if (reada_ok && filp->f_ramax > MIN_READAHEAD)
			filp->f_ramax = MIN_READAHEAD;

		{
			int error = inode->i_op->readpage(filp, page);
			if (!error)
				goto found_page;
			desc->error = error;
			page_cache_release(page);
			break;
		}

page_read_error:
		/*
		 * We found the page, but it wasn't up-to-date.
		 * Try to re-read it _once_. We do this synchronously,
		 * because this happens only if there were errors.
		 */
		{
			int error = inode->i_op->readpage(filp, page);
			if (!error) {
				wait_on_page(page);
				if (PageUptodate(page) && !PageError(page))
					goto success;
				error = -EIO; /* Some unspecified error occurred.. */
			}
			desc->error = error;
			page_cache_release(page);
			break;
		}
	}

	*ppos = pos;
	filp->f_reada = 1;
	if (page_cache)
		page_cache_free(page_cache);
	UPDATE_ATIME(inode);
}
static int file_read_actor(read_descriptor_t * desc, const char *area, unsigned long size)
{
	unsigned long left;
	unsigned long count = desc->count;

	if (size > count)
		size = count;
	left = __copy_to_user(desc->buf, area, size);
	if (left) {
		size -= left;
		desc->error = -EFAULT;
	}
	desc->count = count - size;
	desc->written += size;
	desc->buf += size;
	return size;
}
/*
 * This is the "read()" routine for all filesystems
 * that can use the page cache directly.
 */
ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
{
	ssize_t retval;

	retval = -EFAULT;
	if (access_ok(VERIFY_WRITE, buf, count)) {
		retval = 0;
		if (count) {
			read_descriptor_t desc;

			desc.written = 0;
			desc.count = count;
			desc.buf = buf;
			desc.error = 0;
			do_generic_file_read(filp, ppos, &desc, file_read_actor);

			retval = desc.written;
			if (!retval)
				retval = desc.error;
		}
	}
	return retval;
}
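
#if 0
/*
 * Illustrative sketch only (kept out of the build, not part of this file):
 * a filesystem that provides an i_op->readpage() routine can typically point
 * its file operations straight at the generic page-cache routines above.
 * "example_fops" and "example_file_write" are hypothetical names; the GNU
 * named-initializer syntax is used here to avoid spelling out every field.
 */
static struct file_operations example_fops = {
	read:	generic_file_read,	/* read() through the page cache */
	mmap:	generic_file_mmap,	/* defined later in this file */
	write:	example_file_write,	/* the filesystem supplies its own write */
};
#endif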
static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned long size)
{
	ssize_t written;
	unsigned long count = desc->count;
	struct file *file = (struct file *) desc->buf;
	struct inode *inode = file->f_dentry->d_inode;
	mm_segment_t old_fs;

	if (size > count)
		size = count;
	down(&inode->i_sem);
	old_fs = get_fs();
	set_fs(KERNEL_DS);
	written = file->f_op->write(file, area, size, &file->f_pos);
	set_fs(old_fs);
	up(&inode->i_sem);
	if (written < 0) {
		desc->error = written;
		written = 0;
	}
	desc->count = count - written;
	desc->written += written;
	return written;
}
asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
{
	ssize_t retval;
	struct file * in_file, * out_file;
	struct inode * in_inode, * out_inode;

	lock_kernel();

	/*
	 * Get input file, and verify that it is ok..
	 */
	retval = -EBADF;
	in_file = fget(in_fd);
	if (!in_file)
		goto out;
	if (!(in_file->f_mode & FMODE_READ))
		goto fput_in;
	retval = -EINVAL;
	in_inode = in_file->f_dentry->d_inode;
	if (!in_inode)
		goto fput_in;
	if (!in_inode->i_op || !in_inode->i_op->readpage)
		goto fput_in;
	retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
	if (retval)
		goto fput_in;

	/*
	 * Get output file, and verify that it is ok..
	 */
	retval = -EBADF;
	out_file = fget(out_fd);
	if (!out_file)
		goto fput_in;
	if (!(out_file->f_mode & FMODE_WRITE))
		goto fput_out;
	retval = -EINVAL;
	if (!out_file->f_op || !out_file->f_op->write)
		goto fput_out;
	out_inode = out_file->f_dentry->d_inode;
	if (!out_inode)
		goto fput_out;
	retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
	if (retval)
		goto fput_out;

	retval = 0;
	if (count) {
		read_descriptor_t desc;
		loff_t pos = 0, *ppos;

		retval = -EFAULT;
		ppos = &in_file->f_pos;
		if (offset) {
			if (get_user(pos, offset))
				goto fput_out;
			ppos = &pos;
		}

		desc.written = 0;
		desc.count = count;
		desc.buf = (char *) out_file;
		desc.error = 0;
		do_generic_file_read(in_file, ppos, &desc, file_send_actor);

		retval = desc.written;
		if (!retval)
			retval = desc.error;
		if (offset)
			put_user(pos, offset);
	}

fput_out:
	fput(out_file);
fput_in:
	fput(in_file);
out:
	unlock_kernel();
	return retval;
}
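
#if 0
/*
 * Illustrative user-space sketch (not kernel code, kept out of the build):
 * copying a whole file to a socket through sys_sendfile() above.  Assumes a
 * libc wrapper with the same calling convention as the system call, i.e.
 * sendfile(out_fd, in_fd, &offset, count), where *offset is advanced past
 * the bytes that were read from in_fd.
 */
#include <sys/sendfile.h>
#include <sys/stat.h>

static int send_whole_file(int sock_fd, int file_fd)
{
	struct stat st;
	off_t offset = 0;

	if (fstat(file_fd, &st) < 0)
		return -1;
	while (offset < st.st_size) {
		ssize_t sent = sendfile(sock_fd, file_fd, &offset,
					st.st_size - offset);
		if (sent <= 0)
			return -1;	/* error, or the file shrank */
	}
	return 0;
}
#endif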
/*
 * Semantics for shared and private memory areas are different past the end
 * of the file. A shared mapping past the last page of the file is an error
 * and results in a SIGBUS, while a private mapping just maps in a zero page.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 *
 * WSH 06/04/97: fixed a memory leak and moved the allocation of new_page
 * ahead of the wait if we're sure to need it.
 */
static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
{
	struct file * file = area->vm_file;
	struct dentry * dentry = file->f_dentry;
	struct inode * inode = dentry->d_inode;
	unsigned long offset, reada, i;
	struct page * page, **hash;
	unsigned long old_page, new_page;

	new_page = 0;
	offset = (address - area->vm_start + area->vm_offset) & PAGE_MASK;
	if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
		goto no_page;

	/*
	 * Do we have something in the page cache already?
	 */
	hash = page_hash(inode, offset);
	page = __find_page(inode, offset, *hash);
	if (!page)
		goto no_cached_page;

found_page:
	/*
	 * Ok, found a page in the page cache, now we need to check
	 * that it's up-to-date.  First check whether we'll need an
	 * extra page -- better to overlap the allocation with the I/O.
	 */
	if (no_share && !new_page) {
		new_page = page_cache_alloc();
		if (!new_page)
			goto failure;
	}

	if (PageLocked(page))
		goto page_locked_wait;
	if (!PageUptodate(page))
		goto page_read_error;

success:
	/*
	 * Found the page, need to check sharing and possibly
	 * copy it over to another page..
	 */
	old_page = page_address(page);
	if (!no_share) {
		/*
		 * Ok, we can share the cached page directly.. Get rid
		 * of any potential extra pages.
		 */
		if (new_page)
			page_cache_free(new_page);

		flush_page_to_ram(old_page);
		return old_page;
	}

	/*
	 * No sharing ... copy to the new page.
	 */
	copy_page(new_page, old_page);
	flush_page_to_ram(new_page);
	page_cache_release(page);
	return new_page;

no_cached_page:
	/*
	 * Try to read in an entire cluster at once.
	 */
	reada   = offset;
	reada >>= PAGE_CACHE_SHIFT + page_cluster;
	reada <<= PAGE_CACHE_SHIFT + page_cluster;

	for (i = 1 << page_cluster; i > 0; --i, reada += PAGE_CACHE_SIZE)
		new_page = try_to_read_ahead(file, reada, new_page);

	if (!new_page)
		new_page = page_cache_alloc();
	if (!new_page)
		goto no_page;

	/*
	 * During getting the above page we might have slept,
	 * so we need to re-check the situation with the page
	 * cache.. The page we just got may be useful if we
	 * can't share, so don't get rid of it here.
	 */
	page = find_page(inode, offset);
	if (page)
		goto found_page;

	/*
	 * Now, create a new page-cache page from the page we got
	 */
	page = page_cache_entry(new_page);
	new_page = 0;
	add_to_page_cache(page, inode, offset, hash);

	if (inode->i_op->readpage(file, page) != 0)
		goto failure;

	goto found_page;

page_locked_wait:
	__wait_on_page(page);
	if (PageUptodate(page))
		goto success;

page_read_error:
	/*
	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	 */
	if (inode->i_op->readpage(file, page) != 0)
		goto failure;
	wait_on_page(page);
	if (PageError(page))
		goto failure;
	if (PageUptodate(page))
		goto success;

	/*
	 * Things didn't work out. Return zero to tell the
	 * mm layer so, possibly freeing the page cache page first.
	 */
failure:
	page_cache_release(page);
	if (new_page)
		page_cache_free(new_page);
no_page:
	return 0;
}
/*
 * Tries to write a shared mapped page to its backing store. May return -EIO
 * if the disk is full.
 */
static inline int do_write_page(struct inode * inode, struct file * file,
	const char * page, unsigned long offset)
{
	int retval;
	unsigned long size;
	loff_t loff = offset;
	mm_segment_t old_fs;

	size = offset + PAGE_SIZE;
	/* refuse to extend file size.. */
	if (S_ISREG(inode->i_mode)) {
		if (size > inode->i_size)
			size = inode->i_size;
		/* Ho humm.. We should have tested for this earlier */
		if (size < offset)
			return -EIO;
	}
	size -= offset;
	old_fs = get_fs();
	set_fs(KERNEL_DS);
	retval = -EIO;
	if (size == file->f_op->write(file, (const char *) page, size, &loff))
		retval = 0;
	set_fs(old_fs);
	return retval;
}
static int filemap_write_page(struct vm_area_struct * vma,
	unsigned long offset,
	unsigned long page,
	int wait)
{
	int result;
	struct file * file;
	struct dentry * dentry;
	struct inode * inode;

	file = vma->vm_file;
	dentry = file->f_dentry;
	inode = dentry->d_inode;
	if (!file->f_op->write)
		return -EIO;

	/*
	 * If a task terminates while we're swapping the page, the vma and
	 * file could be released ... increment the count to be safe.
	 */
	file->f_count++;

	/*
	 * If this is a swapping operation rather than msync(), then
	 * leave the actual IO, and the restoration of the file count,
	 * to the kpiod thread.  Just queue the request for now.
	 */
	if (!wait) {
		make_pio_request(file, offset, page);
		return 0;
	}

	down(&inode->i_sem);
	result = do_write_page(inode, file, (const char *) page, offset);
	up(&inode->i_sem);
	fput(file);
	return result;
}
/*
 * The page cache takes care of races between somebody
 * trying to swap something out and swap something in
 * at the same time..
 */
int filemap_swapout(struct vm_area_struct * vma, struct page * page)
{
	return filemap_write_page(vma, page->offset, page_address(page), 0);
}
static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
	unsigned long address, unsigned int flags)
{
	pte_t pte = *ptep;
	unsigned long page;
	int error;

	if (!(flags & MS_INVALIDATE)) {
		if (!pte_present(pte))
			return 0;
		if (!pte_dirty(pte))
			return 0;
		flush_page_to_ram(pte_page(pte));
		flush_cache_page(vma, address);
		set_pte(ptep, pte_mkclean(pte));
		flush_tlb_page(vma, address);
		page = pte_page(pte);
		atomic_inc(&page_cache_entry(page)->count);
	} else {
		if (pte_none(pte))
			return 0;
		flush_cache_page(vma, address);
		pte_clear(ptep);
		flush_tlb_page(vma, address);
		if (!pte_present(pte)) {
			swap_free(pte_val(pte));
			return 0;
		}
		page = pte_page(pte);
		if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
			page_cache_free(page);
			return 0;
		}
	}
	error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page, 1);
	page_cache_free(page);
	return error;
}
static inline int filemap_sync_pte_range(pmd_t * pmd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
{
	pte_t * pte;
	unsigned long end;
	int error;

	if (pmd_none(*pmd))
		return 0;
	if (pmd_bad(*pmd)) {
		printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
		pmd_clear(pmd);
		return 0;
	}
	pte = pte_offset(pmd, address);
	offset += address & PMD_MASK;
	address &= ~PMD_MASK;
	end = address + size;
	if (end > PMD_SIZE)
		end = PMD_SIZE;
	error = 0;
	do {
		error |= filemap_sync_pte(pte, vma, address + offset, flags);
		address += PAGE_SIZE;
		pte++;
	} while (address < end);
	return error;
}
static inline int filemap_sync_pmd_range(pgd_t * pgd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned int flags)
{
	pmd_t * pmd;
	unsigned long offset, end;
	int error;

	if (pgd_none(*pgd))
		return 0;
	if (pgd_bad(*pgd)) {
		printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
		pgd_clear(pgd);
		return 0;
	}
	pmd = pmd_offset(pgd, address);
	offset = address & PGDIR_MASK;
	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		end = PGDIR_SIZE;
	error = 0;
	do {
		error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return error;
}
static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
	size_t size, unsigned int flags)
{
	pgd_t * dir;
	unsigned long end = address + size;
	int error = 0;

	dir = pgd_offset(vma->vm_mm, address);
	flush_cache_range(vma->vm_mm, end - size, end);
	while (address < end) {
		error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		dir++;
	}
	flush_tlb_range(vma->vm_mm, end - size, end);
	return error;
}
/*
 * This handles (potentially partial) area unmaps..
 */
static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
{
	filemap_sync(vma, start, len, MS_ASYNC);
}

/*
 * Shared mappings need to be able to do the right thing at
 * close/unmap/sync. They will also use the private file as
 * backing-store for swapping..
 */
static struct vm_operations_struct file_shared_mmap = {
	NULL,			/* no special open */
	NULL,			/* no special close */
	filemap_unmap,		/* unmap - we need to sync the pages */
	NULL,			/* no special protect */
	filemap_sync,		/* sync */
	NULL,			/* advise */
	filemap_nopage,		/* nopage */
	NULL,			/* wppage */
	filemap_swapout,	/* swapout */
	NULL,			/* swapin */
};
/*
 * Private mappings just need to be able to load in the map.
 *
 * (This is actually used for shared mappings as well, if we
 * know they can't ever get write permissions..)
 */
static struct vm_operations_struct file_private_mmap = {
	NULL,			/* open */
	NULL,			/* close */
	NULL,			/* unmap */
	NULL,			/* protect */
	NULL,			/* sync */
	NULL,			/* advise */
	filemap_nopage,		/* nopage */
	NULL,			/* wppage */
	NULL,			/* swapout */
	NULL,			/* swapin */
};
/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	struct vm_operations_struct * ops;
	struct inode *inode = file->f_dentry->d_inode;

	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
		ops = &file_shared_mmap;
		/* share_page() can only guarantee proper page sharing if
		 * the offsets are all page aligned. */
		if (vma->vm_offset & (PAGE_SIZE - 1))
			return -EINVAL;
	} else {
		ops = &file_private_mmap;
		if (inode->i_op && inode->i_op->bmap &&
		    (vma->vm_offset & (inode->i_sb->s_blocksize - 1)))
			return -EINVAL;
	}
	if (!inode->i_sb || !S_ISREG(inode->i_mode))
		return -EACCES;
	if (!inode->i_op || !inode->i_op->readpage)
		return -ENOEXEC;
	UPDATE_ATIME(inode);
	vma->vm_ops = ops;
	return 0;
}
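
#if 0
/*
 * Illustrative user-space sketch (kept out of the build) of the semantics
 * described above filemap_nopage(): with this kernel, touching a MAP_SHARED
 * mapping beyond the last page of the file raises SIGBUS, while the same
 * access through a MAP_PRIVATE mapping just reads back a zero page.  Sizes
 * are assumed page aligned with map_size > file_size; error checking omitted.
 */
#include <sys/mman.h>

static void demo_nopage_semantics(int fd, size_t file_size, size_t map_size)
{
	char *shared  = mmap(0, map_size, PROT_READ, MAP_SHARED, fd, 0);
	char *private = mmap(0, map_size, PROT_READ, MAP_PRIVATE, fd, 0);

	(void) shared[0];		/* inside the file: both work */
	(void) private[0];

	(void) private[file_size];	/* past EOF: reads 0 from a zero page */
	/* (void) shared[file_size]; */	/* past EOF: would raise SIGBUS */
}
#endif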
/*
 * The msync() system call.
 */

static int msync_interval(struct vm_area_struct * vma,
	unsigned long start, unsigned long end, int flags)
{
	if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
		int error;
		error = vma->vm_ops->sync(vma, start, end-start, flags);
		if (!error && (flags & MS_SYNC)) {
			struct file * file = vma->vm_file;
			if (file) {
				struct dentry * dentry = file->f_dentry;
				struct inode * inode = dentry->d_inode;
				down(&inode->i_sem);
				error = file_fsync(file, dentry);
				up(&inode->i_sem);
			}
		}
		return error;
	}
	return 0;
}
asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
{
	unsigned long end;
	struct vm_area_struct * vma;
	int unmapped_error, error = -EINVAL;

	down(&current->mm->mmap_sem);
	lock_kernel();
	if (start & ~PAGE_MASK)
		goto out;
	len = (len + ~PAGE_MASK) & PAGE_MASK;
	end = start + len;
	if (end < start)
		goto out;
	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
		goto out;
	error = 0;
	if (end == start)
		goto out;
	/*
	 * If the interval [start,end) covers some unmapped address ranges,
	 * just ignore them, but return -EFAULT at the end.
	 */
	vma = find_vma(current->mm, start);
	unmapped_error = 0;
	for (;;) {
		/* Still start < end. */
		error = -EFAULT;
		if (!vma)
			goto out;
		/* Here start < vma->vm_end. */
		if (start < vma->vm_start) {
			unmapped_error = -EFAULT;
			start = vma->vm_start;
		}
		/* Here vma->vm_start <= start < vma->vm_end. */
		if (end <= vma->vm_end) {
			if (start < end) {
				error = msync_interval(vma, start, end, flags);
				if (error)
					goto out;
			}
			error = unmapped_error;
			goto out;
		}
		/* Here vma->vm_start <= start < vma->vm_end < end. */
		error = msync_interval(vma, start, vma->vm_end, flags);
		if (error)
			goto out;
		start = vma->vm_end;
		vma = vma->vm_next;
	}
out:
	unlock_kernel();
	up(&current->mm->mmap_sem);
	return error;
}
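
#if 0
/*
 * Illustrative user-space sketch (kept out of the build): pushing changes
 * made through a shared mapping back to the file.  MS_SYNC ends up in
 * file_fsync() via msync_interval() above, MS_ASYNC only schedules the
 * write-out, and MS_INVALIDATE is handled in filemap_sync_pte().  The start
 * address must be page aligned, exactly as sys_msync() checks.
 */
#include <sys/mman.h>
#include <string.h>

static int update_and_flush(char *map, size_t maplen, const char *text)
{
	size_t len = strlen(text) + 1;

	if (len > maplen)
		return -1;
	memcpy(map, text, len);
	return msync(map, maplen, MS_SYNC);
}
#endif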
/*
 * Write to a file through the page cache. This is mainly for the
 * benefit of NFS and possibly other network-based file systems.
 *
 * We currently put everything into the page cache prior to writing it.
 * This is not a problem when writing full pages. With partial pages,
 * however, we first have to read the data into the cache, then
 * dirty the page, and finally schedule it for writing. Alternatively, we
 * could write-through just the portion of data that would go into that
 * page, but that would kill performance for applications that write data
 * line by line, and it's prone to race conditions.
 *
 * Note that this routine doesn't try to keep track of dirty pages. Each
 * file system has to do this all by itself, unfortunately.
 *							okir@monad.swb.de
 */
ssize_t
generic_file_write(struct file *file, const char *buf,
		   size_t count, loff_t *ppos)
{
	struct dentry *dentry = file->f_dentry;
	struct inode *inode = dentry->d_inode;
	unsigned long pos = *ppos;
	unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
	struct page *page, **hash;
	unsigned long page_cache = 0;
	unsigned long written;
	long status, sync;

	if (!inode->i_op || !inode->i_op->updatepage)
		return -EIO;

	if (file->f_error) {
		int error = file->f_error;
		file->f_error = 0;
		return error;
	}

	sync    = file->f_flags & O_SYNC;
	written = 0;

	if (file->f_flags & O_APPEND)
		pos = inode->i_size;

	/*
	 * Check whether we've reached the file size limit.
	 */
	status = -EFBIG;
	if (pos >= limit) {
		send_sig(SIGXFSZ, current, 0);
		goto out;
	}

	status = 0;
	/*
	 * Check whether to truncate the write,
	 * and send the signal if we do.
	 */
	if (count > limit - pos) {
		send_sig(SIGXFSZ, current, 0);
		count = limit - pos;
	}

	while (count) {
		unsigned long bytes, pgpos, offset;
		/*
		 * Try to find the page in the cache. If it isn't there,
		 * allocate a free page.
		 */
		offset = (pos & ~PAGE_CACHE_MASK);
		pgpos = pos & PAGE_CACHE_MASK;
		bytes = PAGE_CACHE_SIZE - offset;
		if (bytes > count)
			bytes = count;

		hash = page_hash(inode, pgpos);
		page = __find_page(inode, pgpos, *hash);
		if (!page) {
			if (!page_cache) {
				page_cache = page_cache_alloc();
				if (page_cache)
					continue;
				status = -ENOMEM;
				break;
			}
			page = page_cache_entry(page_cache);
			add_to_page_cache(page, inode, pgpos, hash);
			page_cache = 0;
		}

		/* Get exclusive IO access to the page.. */
		wait_on_page(page);
		set_bit(PG_locked, &page->flags);

		/*
		 * Do the real work.. If the writer ends up delaying the write,
		 * the writer needs to increment the page use counts until he
		 * is done with the page.
		 */
		bytes -= copy_from_user((u8*)page_address(page) + offset, buf, bytes);
		status = -EFAULT;
		if (bytes)
			status = inode->i_op->updatepage(file, page, offset, bytes, sync);

		/* Mark it unlocked again and drop the page.. */
		clear_bit(PG_locked, &page->flags);
		wake_up(&page->wait);
		page_cache_release(page);

		if (status < 0)
			break;

		written += status;
		count -= status;
		pos += status;
		buf += status;
	}
	*ppos = pos;
	if (pos > inode->i_size)
		inode->i_size = pos;

	if (page_cache)
		page_cache_free(page_cache);
out:
	return written ? written : status;
}
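
/*
 * Worked example for the loop above (illustrative, assuming PAGE_CACHE_SIZE
 * == 4096): a write of 10000 bytes at pos 5000 is split into three passes:
 * offset 904 / bytes 3192 into the page at 4096, then offset 0 / bytes 4096
 * into the page at 8192, then offset 0 / bytes 2712 into the page at 12288,
 * for 10000 bytes in total.
 */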
/*
 * Support routines for directory caching using the page cache.
 */

/*
 * Finds the page at the specified offset, installing a new page
 * if requested.  The count is incremented and the page is locked.
 *
 * Note: we don't have to worry about races here, as the caller
 * is holding the inode semaphore.
 */
unsigned long get_cached_page(struct inode * inode, unsigned long offset,
				int new)
{
	struct page * page;
	struct page ** hash;
	unsigned long page_cache = 0;

	hash = page_hash(inode, offset);
	page = __find_page(inode, offset, *hash);
	if (!page) {
		if (!new)
			goto out;
		page_cache = page_cache_alloc();
		if (!page_cache)
			goto out;
		clear_page(page_cache);
		page = page_cache_entry(page_cache);
		add_to_page_cache(page, inode, offset, hash);
	}
	if (atomic_read(&page->count) != 2)
		printk(KERN_ERR "get_cached_page: page count=%d\n",
			atomic_read(&page->count));
	if (test_bit(PG_locked, &page->flags))
		printk(KERN_ERR "get_cached_page: page already locked!\n");
	set_bit(PG_locked, &page->flags);
	page_cache = page_address(page);

out:
	return page_cache;
}
/*
 * Unlock and free a page.
 */
void put_cached_page(unsigned long addr)
{
	struct page * page = page_cache_entry(addr);

	if (!test_bit(PG_locked, &page->flags))
		printk("put_cached_page: page not locked!\n");
	if (atomic_read(&page->count) != 2)
		printk("put_cached_page: page count=%d\n",
			atomic_read(&page->count));
	clear_bit(PG_locked, &page->flags);
	wake_up(&page->wait);
	page_cache_release(page);
}
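
#if 0
/*
 * Illustrative sketch only (kept out of the build, hypothetical helper name):
 * the intended pairing of get_cached_page()/put_cached_page() for a
 * filesystem that keeps directory blocks in the page cache.  The caller must
 * hold inode->i_sem, as noted above get_cached_page().
 */
static int example_touch_dir_block(struct inode * dir, unsigned long offset)
{
	unsigned long block = get_cached_page(dir, offset & PAGE_CACHE_MASK, 1);

	if (!block)
		return -ENOMEM;
	/* ... read or update directory entries stored at "block" ... */
	put_cached_page(block);
	return 0;
}
#endif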
/* Add request for page IO to the queue */

static inline void put_pio_request(struct pio_request *p)
{
	*pio_last = p;
	p->next = NULL;
	pio_last = &p->next;
}

/* Take the first page IO request off the queue */

static inline struct pio_request * get_pio_request(void)
{
	struct pio_request * p = pio_first;
	pio_first = p->next;
	if (!pio_first)
		pio_last = &pio_first;
	return p;
}
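
/*
 * Illustration of the queue bookkeeping above: pio_last always points at the
 * location holding the final "next" link.  Empty queue: pio_first == NULL,
 * pio_last == &pio_first.  After queuing A: pio_first == A, pio_last ==
 * &A->next.  After queuing B: A->next == B, pio_last == &B->next.  This lets
 * put_pio_request() append in O(1) without walking the list.
 */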
/* Make a new page IO request and queue it to the kpiod thread */

static inline void make_pio_request(struct file *file,
				    unsigned long offset,
				    unsigned long page)
{
	struct pio_request *p;

	atomic_inc(&page_cache_entry(page)->count);

	/*
	 * We need to allocate without causing any recursive IO in the
	 * current thread's context.  We might currently be swapping out
	 * as a result of an allocation made while holding a critical
	 * filesystem lock.  To avoid deadlock, we *MUST* not reenter
	 * the filesystem in this thread.
	 *
	 * We can wait for kswapd to free memory, or we can try to free
	 * pages without actually performing further IO, without fear of
	 * deadlock.  --sct
	 */

	while ((p = kmem_cache_alloc(pio_request_cache, GFP_BUFFER)) == NULL) {
		if (try_to_free_pages(__GFP_WAIT))
			continue;
		current->state = TASK_INTERRUPTIBLE;
		schedule_timeout(HZ/10);
	}

	p->file   = file;
	p->offset = offset;
	p->page   = page;

	put_pio_request(p);
	wake_up(&pio_wait);
}
/*
 * This is the only thread which is allowed to write out filemap pages
 * while swapping.
 *
 * To avoid deadlock, it is important that we never reenter this thread.
 * Although recursive memory allocations within this thread may result
 * in more page swapping, that swapping will always be done by queuing
 * another IO request to the same thread: we will never actually start
 * that IO request until we have finished with the current one, and so
 * we will not deadlock.
 */

int kpiod(void * unused)
{
	struct task_struct *tsk = current;
	struct wait_queue wait = { tsk, };
	struct inode * inode;
	struct dentry * dentry;
	struct pio_request * p;

	tsk->session = 1;
	tsk->pgrp = 1;
	strcpy(tsk->comm, "kpiod");
	sigfillset(&tsk->blocked);
	init_waitqueue(&pio_wait);
	/*
	 * Mark this task as a memory allocator - we don't want to get caught
	 * up in the regular mm freeing frenzy if we have to allocate memory
	 * in order to write stuff out.
	 */
	tsk->flags |= PF_MEMALLOC;

	lock_kernel();

	pio_request_cache = kmem_cache_create("pio_request",
					      sizeof(struct pio_request),
					      0, SLAB_HWCACHE_ALIGN,
					      NULL, NULL);
	if (!pio_request_cache)
		panic ("Could not create pio_request slab cache");

	while (1) {
		tsk->state = TASK_INTERRUPTIBLE;
		add_wait_queue(&pio_wait, &wait);
		if (!pio_first)
			schedule();
		remove_wait_queue(&pio_wait, &wait);
		tsk->state = TASK_RUNNING;

		while (pio_first) {
			p = get_pio_request();
			dentry = p->file->f_dentry;
			inode = dentry->d_inode;

			down(&inode->i_sem);
			do_write_page(inode, p->file,
				      (const char *) p->page, p->offset);
			up(&inode->i_sem);
			fput(p->file);
			page_cache_free(p->page);
			kmem_cache_free(pio_request_cache, p);