/*
 * "splice": joining two ropes together by interweaving their strands.
 *
 * This is the "extended pipe" functionality, where a pipe is used as
 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
 * buffer that you can use to transfer data from one end to the other.
 *
 * The traditional unix read/write is extended with a "splice()" operation
 * that transfers data buffers to or from a pipe buffer.
 *
 * Named by Larry McVoy, original implementation from Linus, extended by
 * Jens to support splicing to files, network, direct splicing, etc and
 * fixing lots of bugs.
 *
 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
 *
 */
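/*
 * Illustrative userspace sketch (an assumption-laden example, not part of
 * this file): the splice(2) syscall implemented below moves file data to a
 * socket through a pipe without copying it through user memory. 'file_fd'
 * and 'sock_fd' are hypothetical descriptors; error handling and short
 * writes are omitted for brevity:
 *
 *	int pfd[2];
 *	loff_t off = 0;
 *	ssize_t n;
 *
 *	pipe(pfd);
 *	for (;;) {
 *		n = splice(file_fd, &off, pfd[1], NULL, 65536,
 *			   SPLICE_F_MOVE | SPLICE_F_MORE);
 *		if (n <= 0)
 *			break;
 *		splice(pfd[0], NULL, sock_fd, NULL, n,
 *		       SPLICE_F_MOVE | SPLICE_F_MORE);
 *	}
 */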
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/pipe_fs_i.h>
#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/module.h>
#include <linux/syscalls.h>
#include <linux/uio.h>
#include <linux/security.h>
struct partial_page {
	unsigned int offset;
	unsigned int len;
};

/*
 * Passed to splice_to_pipe
 */
struct splice_pipe_desc {
	struct page **pages;		/* page map */
	struct partial_page *partial;	/* pages[] may not be contig */
	int nr_pages;			/* number of pages in map */
	unsigned int flags;		/* splice flags */
	const struct pipe_buf_operations *ops;/* ops associated with output pipe */
};
/*
 * Attempt to steal a page from a pipe buffer. This should perhaps go into
 * a vm helper function, it's already simplified quite a bit by the
 * addition of remove_mapping(). If success is returned, the caller may
 * attempt to reuse this page for another destination.
 */
static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
				     struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	struct address_space *mapping;

	lock_page(page);

	mapping = page_mapping(page);
	if (mapping) {
		WARN_ON(!PageUptodate(page));

		/*
		 * At least for ext2 with nobh option, we need to wait on
		 * writeback completing on this page, since we'll remove it
		 * from the pagecache. Otherwise truncate won't wait on the
		 * page, allowing the disk blocks to be reused by someone else
		 * before we actually wrote our data to them. fs corruption
		 * ensues.
		 */
		wait_on_page_writeback(page);

		if (PagePrivate(page))
			try_to_release_page(page, GFP_KERNEL);

		/*
		 * If we succeeded in removing the mapping, set LRU flag
		 * and return good.
		 */
		if (remove_mapping(mapping, page)) {
			buf->flags |= PIPE_BUF_FLAG_LRU;
			return 0;
		}
	}

	/*
	 * Raced with truncate or failed to remove page from current
	 * address space, unlock and return failure.
	 */
	unlock_page(page);
	return 1;
}
static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
					struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
	buf->flags &= ~PIPE_BUF_FLAG_LRU;
}
static int page_cache_pipe_buf_pin(struct pipe_inode_info *pipe,
				   struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	int err;

	if (!PageUptodate(page)) {
		lock_page(page);

		/*
		 * Page got truncated/unhashed. This will cause a 0-byte
		 * splice, if this is the first page.
		 */
		if (!page->mapping) {
			err = -ENODATA;
			goto error;
		}

		/*
		 * Uh oh, read-error from disk.
		 */
		if (!PageUptodate(page)) {
			err = -EIO;
			goto error;
		}

		/*
		 * Page is ok after all, we are done.
		 */
		unlock_page(page);
	}

	return 0;
error:
	unlock_page(page);
	return err;
}
static const struct pipe_buf_operations page_cache_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.pin = page_cache_pipe_buf_pin,
	.release = page_cache_pipe_buf_release,
	.steal = page_cache_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};
static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
		return 1;

	buf->flags |= PIPE_BUF_FLAG_LRU;
	return generic_pipe_buf_steal(pipe, buf);
}

static const struct pipe_buf_operations user_page_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.pin = generic_pipe_buf_pin,
	.release = page_cache_pipe_buf_release,
	.steal = user_page_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};
/*
 * Pipe output worker. This sets up our pipe format with the page cache
 * pipe buffer operations. Otherwise very similar to the regular pipe_writev().
 */
static ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
			      struct splice_pipe_desc *spd)
{
	unsigned int spd_pages = spd->nr_pages;
	int ret, do_wakeup, page_nr;

	ret = 0;
	do_wakeup = 0;
	page_nr = 0;

	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);

	for (;;) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		if (pipe->nrbufs < PIPE_BUFFERS) {
			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;

			buf->page = spd->pages[page_nr];
			buf->offset = spd->partial[page_nr].offset;
			buf->len = spd->partial[page_nr].len;
			buf->ops = spd->ops;
			if (spd->flags & SPLICE_F_GIFT)
				buf->flags |= PIPE_BUF_FLAG_GIFT;

			pipe->nrbufs++;
			page_nr++;
			ret += buf->len;

			if (pipe->inode)
				do_wakeup = 1;

			if (!--spd->nr_pages)
				break;
			if (pipe->nrbufs < PIPE_BUFFERS)
				continue;

			break;
		}

		if (spd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}

		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	if (pipe->inode) {
		mutex_unlock(&pipe->inode->i_mutex);

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		}
	}

	while (page_nr < spd_pages)
		page_cache_release(spd->pages[page_nr++]);

	return ret;
}
static int
__generic_file_splice_read(struct file *in, loff_t *ppos,
			   struct pipe_inode_info *pipe, size_t len,
			   unsigned int flags)
{
	struct address_space *mapping = in->f_mapping;
	unsigned int loff, nr_pages;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct page *page;
	pgoff_t index, end_index;
	loff_t isize;
	int error, page_nr;
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &page_cache_pipe_buf_ops,
	};

	index = *ppos >> PAGE_CACHE_SHIFT;
	loff = *ppos & ~PAGE_CACHE_MASK;
	nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	if (nr_pages > PIPE_BUFFERS)
		nr_pages = PIPE_BUFFERS;

	/*
	 * Don't try to second-guess the read-ahead logic, call into
	 * page_cache_readahead() like the page cache reads would do.
	 */
	page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages);

	/*
	 * Now fill in the holes:
	 */
	error = 0;

	/*
	 * Lookup the (hopefully) full range of pages we need.
	 */
	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);

	/*
	 * If find_get_pages_contig() returned fewer pages than we needed,
	 * allocate the rest.
	 */
	index += spd.nr_pages;
	while (spd.nr_pages < nr_pages) {
		/*
		 * Page could be there, find_get_pages_contig() breaks on
		 * the first hole.
		 */
		page = find_get_page(mapping, index);
		if (!page) {
			/*
			 * Make sure the read-ahead engine is notified
			 * about this failure.
			 */
			handle_ra_miss(mapping, &in->f_ra, index);

			/*
			 * page didn't exist, allocate one.
			 */
			page = page_cache_alloc_cold(mapping);
			if (!page)
				break;

			error = add_to_page_cache_lru(page, mapping, index,
					mapping_gfp_mask(mapping));
			if (unlikely(error)) {
				page_cache_release(page);
				if (error == -EEXIST)
					continue;
				break;
			}
			/*
			 * add_to_page_cache() locks the page, unlock it
			 * to avoid convoluting the logic below even more.
			 */
			unlock_page(page);
		}

		pages[spd.nr_pages++] = page;
		index++;
	}

	/*
	 * Now loop over the map and see if we need to start IO on any
	 * pages, fill in the partial map, etc.
	 */
	index = *ppos >> PAGE_CACHE_SHIFT;
	nr_pages = spd.nr_pages;
	spd.nr_pages = 0;
	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
		unsigned int this_len;

		if (!len)
			break;

		/*
		 * this_len is the max we'll use from this page
		 */
		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
		page = pages[page_nr];

		/*
		 * If the page isn't uptodate, we may need to start io on it
		 */
		if (!PageUptodate(page)) {
			/*
			 * If in nonblock mode then don't block on waiting
			 * for an in-flight io page
			 */
			if (flags & SPLICE_F_NONBLOCK) {
				if (TestSetPageLocked(page))
					break;
			} else
				lock_page(page);

			/*
			 * page was truncated, stop here. if this isn't the
			 * first page, we'll just complete what we already
			 * added
			 */
			if (!page->mapping) {
				unlock_page(page);
				break;
			}
			/*
			 * page was already under io and is now done, great
			 */
			if (PageUptodate(page)) {
				unlock_page(page);
				goto fill_it;
			}

			/*
			 * need to read in the page
			 */
			error = mapping->a_ops->readpage(in, page);
			if (unlikely(error)) {
				/*
				 * We really should re-lookup the page here,
				 * but it complicates things a lot. Instead
				 * let's just do what we already stored, and
				 * we'll get it the next time we are called.
				 */
				if (error == AOP_TRUNCATED_PAGE)
					error = 0;

				break;
			}
		}
fill_it:
		/*
		 * i_size must be checked after PageUptodate.
		 */
		isize = i_size_read(mapping->host);
		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
		if (unlikely(!isize || index > end_index))
			break;

		/*
		 * if this is the last page, see if we need to shrink
		 * the length and stop
		 */
		if (end_index == index) {
			unsigned int plen;

			/*
			 * max good bytes in this page
			 */
			plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			if (plen <= loff)
				break;

			/*
			 * force quit after adding this page
			 */
			this_len = min(this_len, plen - loff);
			len = this_len;
		}

		partial[page_nr].offset = loff;
		partial[page_nr].len = this_len;
		len -= this_len;
		loff = 0;
		spd.nr_pages++;
		index++;
	}

	/*
	 * Release any pages at the end, if we quit early. 'page_nr' is how far
	 * we got, 'nr_pages' is how many pages are in the map.
	 */
	while (page_nr < nr_pages)
		page_cache_release(pages[page_nr++]);

	if (spd.nr_pages)
		return splice_to_pipe(pipe, &spd);

	return error;
}
/**
 * generic_file_splice_read - splice data from file to a pipe
 * @in:		file to splice from
 * @ppos:	position in @in to read from
 * @pipe:	pipe to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Will read pages from given file and fill them into a pipe.
 */
ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
				 struct pipe_inode_info *pipe, size_t len,
				 unsigned int flags)
{
	ssize_t spliced;
	int ret;
	loff_t isize, left;

	isize = i_size_read(in->f_mapping->host);
	if (unlikely(*ppos >= isize))
		return 0;

	left = isize - *ppos;
	if (unlikely(left < len))
		len = left;

	ret = 0;
	spliced = 0;
	while (len) {
		ret = __generic_file_splice_read(in, ppos, pipe, len, flags);

		if (ret < 0)
			break;
		else if (!ret) {
			if (spliced)
				break;
			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}

		*ppos += ret;
		len -= ret;
		spliced += ret;
	}

	if (spliced)
		return spliced;

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_read);
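/*
 * Userspace-visible behavior of the above, as an illustrative sketch
 * ('file_fd' and 'pfd' are hypothetical descriptors): a splice from a
 * regular file into a pipe is bounded by i_size, may return less than
 * requested, and returns 0 at EOF. On success the offset is advanced:
 *
 *	loff_t off = 0;
 *	ssize_t n = splice(file_fd, &off, pfd[1], NULL, 65536, 0);
 *	(n == 0 at EOF; otherwise 'off' has been advanced by n)
 */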
/*
 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
 * using sendpage(). Return the number of bytes sent.
 */
static int pipe_to_sendpage(struct pipe_inode_info *pipe,
			    struct pipe_buffer *buf, struct splice_desc *sd)
{
	struct file *file = sd->file;
	loff_t pos = sd->pos;
	int ret, more;

	ret = buf->ops->pin(pipe, buf);
	if (!ret) {
		more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;

		ret = file->f_op->sendpage(file, buf->page, buf->offset,
					   sd->len, &pos, more);
	}

	return ret;
}
/*
 * This is a little more tricky than the file -> pipe splicing. There are
 * basically three cases:
 *
 *	- Destination page already exists in the address space and there
 *	  are users of it. For that case we have no other option than
 *	  copying the data. Tough luck.
 *	- Destination page already exists in the address space, but there
 *	  are no users of it. Make sure it's uptodate, then drop it. Fall
 *	  through to last case.
 *	- Destination page does not exist, we can add the pipe page to
 *	  the page cache and avoid the copy.
 *
 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
 * sd->flags), we attempt to migrate pages from the pipe to the output
 * file address space page cache. This is possible if no one else has
 * the pipe page referenced outside of the pipe and page cache. If
 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
 * a new page in the output file page cache and fill/dirty that.
 */
static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	struct file *file = sd->file;
	struct address_space *mapping = file->f_mapping;
	unsigned int offset, this_len;
	struct page *page;
	pgoff_t index;
	int ret;

	/*
	 * make sure the data in this buffer is uptodate
	 */
	ret = buf->ops->pin(pipe, buf);
	if (unlikely(ret))
		return ret;

	index = sd->pos >> PAGE_CACHE_SHIFT;
	offset = sd->pos & ~PAGE_CACHE_MASK;

	this_len = sd->len;
	if (this_len + offset > PAGE_CACHE_SIZE)
		this_len = PAGE_CACHE_SIZE - offset;

find_page:
	page = find_lock_page(mapping, index);
	if (!page) {
		ret = -ENOMEM;
		page = page_cache_alloc_cold(mapping);
		if (unlikely(!page))
			goto out_ret;

		/*
		 * This will also lock the page
		 */
		ret = add_to_page_cache_lru(page, mapping, index,
					    GFP_KERNEL);
		if (unlikely(ret))
			goto out_release;
	}

	ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
	if (unlikely(ret)) {
		loff_t isize = i_size_read(mapping->host);

		if (ret != AOP_TRUNCATED_PAGE)
			unlock_page(page);
		page_cache_release(page);
		if (ret == AOP_TRUNCATED_PAGE)
			goto find_page;

		/*
		 * prepare_write() may have instantiated a few blocks
		 * outside i_size. Trim these off again.
		 */
		if (sd->pos + this_len > isize)
			vmtruncate(mapping->host, isize);

		goto out_ret;
	}

	if (buf->page != page) {
		/*
		 * Careful, ->map() uses KM_USER0!
		 */
		char *src = buf->ops->map(pipe, buf, 1);
		char *dst = kmap_atomic(page, KM_USER1);

		memcpy(dst + offset, src + buf->offset, this_len);
		flush_dcache_page(page);
		kunmap_atomic(dst, KM_USER1);
		buf->ops->unmap(pipe, buf, src);
	}

	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
	if (ret) {
		if (ret == AOP_TRUNCATED_PAGE) {
			page_cache_release(page);
			goto find_page;
		}
		if (ret < 0)
			goto out;
		/*
		 * Partial write has happened, so 'ret' is already initialized
		 * to the number of bytes written, there is nothing more we
		 * have to do here.
		 */
	} else
		ret = this_len;

	/*
	 * Return the number of bytes written and mark page as
	 * accessed, we are now done!
	 */
	mark_page_accessed(page);
out:
	unlock_page(page);
out_release:
	page_cache_release(page);
out_ret:
	return ret;
}
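/*
 * Illustrative userspace sketch (not part of this file): SPLICE_F_MOVE
 * asks pipe_to_file() above to try stealing the pipe page instead of
 * copying it. It is only a hint; the kernel falls back to copying whenever
 * the page cannot be stolen. 'pfd' and 'out_fd' are hypothetical:
 *
 *	loff_t off = 0;
 *	ssize_t n = splice(pfd[0], NULL, out_fd, &off, 4096, SPLICE_F_MOVE);
 */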
/*
 * Pipe input worker. Most of this logic works like a regular pipe, the
 * key here is the 'actor' worker passed in that actually moves the data
 * to the wanted destination. See pipe_to_file/pipe_to_sendpage above.
 */
ssize_t __splice_from_pipe(struct pipe_inode_info *pipe,
			   struct file *out, loff_t *ppos, size_t len,
			   unsigned int flags, splice_actor *actor)
{
	int ret, do_wakeup, err;
	struct splice_desc sd;

	ret = 0;
	do_wakeup = 0;

	sd.total_len = len;
	sd.flags = flags;
	sd.file = out;
	sd.pos = *ppos;

	for (;;) {
		if (pipe->nrbufs) {
			struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
			const struct pipe_buf_operations *ops = buf->ops;

			sd.len = buf->len;
			if (sd.len > sd.total_len)
				sd.len = sd.total_len;

			err = actor(pipe, buf, &sd);
			if (err <= 0) {
				if (!ret && err != -ENODATA)
					ret = err;

				break;
			}

			ret += err;
			buf->offset += err;
			buf->len -= err;

			sd.len -= err;
			sd.pos += err;
			sd.total_len -= err;
			if (sd.len)
				continue;

			if (!buf->len) {
				buf->ops = NULL;
				ops->release(pipe, buf);
				pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
				pipe->nrbufs--;
				if (pipe->inode)
					do_wakeup = 1;
			}

			if (!sd.total_len)
				break;
		}

		if (pipe->nrbufs)
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			if (ret)
				break;
		}

		if (flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
			do_wakeup = 0;
		}

		pipe_wait(pipe);
	}

	if (do_wakeup) {
		smp_mb();
		if (waitqueue_active(&pipe->wait))
			wake_up_interruptible(&pipe->wait);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}

	return ret;
}

EXPORT_SYMBOL(__splice_from_pipe);
ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
			 loff_t *ppos, size_t len, unsigned int flags,
			 splice_actor *actor)
{
	ssize_t ret;
	struct inode *inode = out->f_mapping->host;

	/*
	 * The actor worker might be calling ->prepare_write and
	 * ->commit_write. Most of the time, these expect i_mutex to
	 * be held. Since this may result in an ABBA deadlock with
	 * pipe->inode, we have to order lock acquisition here.
	 */
	inode_double_lock(inode, pipe->inode);
	ret = __splice_from_pipe(pipe, out, ppos, len, flags, actor);
	inode_double_unlock(inode, pipe->inode);

	return ret;
}
/**
 * generic_file_splice_write_nolock - generic_file_splice_write without mutexes
 * @pipe:	pipe info
 * @out:	file to write to
 * @ppos:	position in @out to write to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Will either move or copy pages (determined by @flags options) from
 * the given pipe inode to the given file. The caller is responsible
 * for acquiring i_mutex on both inodes.
 *
 */
ssize_t
generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
				 loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t ret;
	int err;

	err = remove_suid(out->f_path.dentry);
	if (unlikely(err))
		return err;

	ret = __splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
	if (ret > 0) {
		unsigned long nr_pages;

		*ppos += ret;
		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);

			if (err)
				ret = err;
		}
		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write_nolock);
/**
 * generic_file_splice_write - splice data from a pipe to a file
 * @pipe:	pipe info
 * @out:	file to write to
 * @ppos:	position in @out to write to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Will either move or copy pages (determined by @flags options) from
 * the given pipe inode to the given file.
 *
 */
ssize_t
generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
			  loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t ret;
	int err;

	err = should_remove_suid(out->f_path.dentry);
	if (unlikely(err)) {
		mutex_lock(&inode->i_mutex);
		err = __remove_suid(out->f_path.dentry, err);
		mutex_unlock(&inode->i_mutex);
		if (err)
			return err;
	}

	ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
	if (ret > 0) {
		unsigned long nr_pages;

		*ppos += ret;
		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			mutex_lock(&inode->i_mutex);
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);
			mutex_unlock(&inode->i_mutex);

			if (err)
				ret = err;
		}
		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write);
/**
 * generic_splice_sendpage - splice data from a pipe to a socket
 * @pipe:	pipe to splice from
 * @out:	socket to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Will send @len bytes from the pipe to a network socket. No data copying
 * is involved.
 *
 */
ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
				loff_t *ppos, size_t len, unsigned int flags)
{
	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
}

EXPORT_SYMBOL(generic_splice_sendpage);
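/*
 * Illustrative userspace sketch (not part of this file): when splicing to
 * a socket via the sendpage path above, SPLICE_F_MORE tells the network
 * layer that more data will follow, similar to MSG_MORE. 'pfd', 'sock_fd'
 * and 'chunk' are hypothetical:
 *
 *	ssize_t n = splice(pfd[0], NULL, sock_fd, NULL, chunk,
 *			   SPLICE_F_MORE);
 */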
/*
 * Attempt to initiate a splice from pipe to file.
 */
static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
			   loff_t *ppos, size_t len, unsigned int flags)
{
	int ret;

	if (unlikely(!out->f_op || !out->f_op->splice_write))
		return -EINVAL;

	if (unlikely(!(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	ret = rw_verify_area(WRITE, out, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	ret = security_file_permission(out, MAY_WRITE);
	if (unlikely(ret < 0))
		return ret;

	return out->f_op->splice_write(pipe, out, ppos, len, flags);
}
/*
 * Attempt to initiate a splice from a file to a pipe.
 */
static long do_splice_to(struct file *in, loff_t *ppos,
			 struct pipe_inode_info *pipe, size_t len,
			 unsigned int flags)
{
	int ret;

	if (unlikely(!in->f_op || !in->f_op->splice_read))
		return -EINVAL;

	if (unlikely(!(in->f_mode & FMODE_READ)))
		return -EBADF;

	ret = rw_verify_area(READ, in, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	ret = security_file_permission(in, MAY_READ);
	if (unlikely(ret < 0))
		return ret;

	return in->f_op->splice_read(in, ppos, pipe, len, flags);
}
long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	long ret, bytes;
	loff_t out_off;
	umode_t i_mode;
	int i;

	/*
	 * We require the input to be a regular file, as we don't want to
	 * randomly drop data for eg socket -> socket splicing. Use the
	 * piped splicing for that!
	 */
	i_mode = in->f_path.dentry->d_inode->i_mode;
	if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
		return -EINVAL;

	/*
	 * neither in nor out is a pipe, setup an internal pipe attached to
	 * 'out' and transfer the wanted data from 'in' to 'out' through that
	 */
	pipe = current->splice_pipe;
	if (unlikely(!pipe)) {
		pipe = alloc_pipe_info(NULL);
		if (!pipe)
			return -ENOMEM;

		/*
		 * We don't have an immediate reader, but we'll read the stuff
		 * out of the pipe right after the splice_to_pipe(). So set
		 * PIPE_READERS appropriately.
		 */
		pipe->readers = 1;

		current->splice_pipe = pipe;
	}

	/*
	 * Do the splice.
	 */
	ret = 0;
	bytes = 0;
	out_off = 0;

	while (len) {
		size_t read_len, max_read_len;

		/*
		 * Do at most PIPE_BUFFERS pages worth of transfer:
		 */
		max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE));

		ret = do_splice_to(in, ppos, pipe, max_read_len, flags);
		if (unlikely(ret <= 0))
			goto out_release;

		read_len = ret;

		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode as then we
		 * could get stuck data in the internal pipe:
		 */
		ret = do_splice_from(pipe, out, &out_off, read_len,
				     flags & ~SPLICE_F_NONBLOCK);
		if (unlikely(ret <= 0))
			goto out_release;

		bytes += ret;
		len -= ret;

		/*
		 * In nonblocking mode, if we got back a short read then
		 * that was due to either an IO error or due to the
		 * pagecache entry not being there. In the IO error case
		 * the _next_ splice attempt will produce a clean IO error
		 * return value (not a short read), so in both cases it's
		 * correct to break out of the loop here:
		 */
		if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len))
			break;
	}

	pipe->nrbufs = pipe->curbuf = 0;

	return bytes;

out_release:
	/*
	 * If we did an incomplete transfer we must release
	 * the pipe buffers in question:
	 */
	for (i = 0; i < PIPE_BUFFERS; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;

		if (buf->ops) {
			buf->ops->release(pipe, buf);
			buf->ops = NULL;
		}
	}
	pipe->nrbufs = pipe->curbuf = 0;

	/*
	 * If we transferred some data, return the number of bytes:
	 */
	if (bytes > 0)
		return bytes;

	return ret;
}
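/*
 * The loop above is roughly equivalent to the following userspace pattern
 * (a hedged sketch; 'in_fd', 'out_fd' and 'pfd' are hypothetical), except
 * that do_splice_direct() reuses one internal pipe per task and never
 * exposes it as a file descriptor:
 *
 *	while (len) {
 *		ssize_t n = splice(in_fd, &in_off, pfd[1], NULL, len, flags);
 *		if (n <= 0)
 *			break;
 *		splice(pfd[0], NULL, out_fd, &out_off, n,
 *		       flags & ~SPLICE_F_NONBLOCK);
 *		len -= n;
 *	}
 */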
/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
static inline struct pipe_inode_info *pipe_info(struct inode *inode)
{
	if (S_ISFIFO(inode->i_mode))
		return inode->i_pipe;

	return NULL;
}
/*
 * Determine where to splice to/from.
 */
static long do_splice(struct file *in, loff_t __user *off_in,
		      struct file *out, loff_t __user *off_out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	loff_t offset, *off;
	long ret;

	pipe = pipe_info(in->f_path.dentry->d_inode);
	if (pipe) {
		if (off_in)
			return -ESPIPE;
		if (off_out) {
			if (out->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &out->f_pos;

		ret = do_splice_from(pipe, out, off, len, flags);

		if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	pipe = pipe_info(out->f_path.dentry->d_inode);
	if (pipe) {
		if (off_out)
			return -ESPIPE;
		if (off_in) {
			if (in->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &in->f_pos;

		ret = do_splice_to(in, off, pipe, len, flags);

		if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	return -EINVAL;
}
/*
 * Map an iov into an array of pages and offset/length tuples. With the
 * partial_page structure, we can map several non-contiguous ranges into
 * our one pages[] map instead of splitting that operation into pieces.
 * Could easily be exported as a generic helper for other users, in which
 * case one would probably want to add a 'max_nr_pages' parameter as well.
 */
static int get_iovec_page_array(const struct iovec __user *iov,
				unsigned int nr_vecs, struct page **pages,
				struct partial_page *partial, int aligned)
{
	int buffers = 0, error = 0;

	/*
	 * It's ok to take the mmap_sem for reading, even
	 * across a "get_user()".
	 */
	down_read(&current->mm->mmap_sem);

	while (nr_vecs) {
		unsigned long off, npages;
		void __user *base;
		size_t len;
		int i;

		/*
		 * Get user address base and length for this iovec.
		 */
		error = get_user(base, &iov->iov_base);
		if (unlikely(error))
			break;
		error = get_user(len, &iov->iov_len);
		if (unlikely(error))
			break;

		/*
		 * Sanity check this iovec. 0 read succeeds.
		 */
		if (unlikely(!len))
			break;
		error = -EFAULT;
		if (unlikely(!base))
			break;

		if (!access_ok(VERIFY_READ, base, len))
			break;

		/*
		 * Get this base offset and number of pages, then map
		 * in the user pages.
		 */
		off = (unsigned long) base & ~PAGE_MASK;

		/*
		 * If asked for alignment, the offset must be zero and the
		 * length a multiple of the PAGE_SIZE.
		 */
		error = -EINVAL;
		if (aligned && (off || len & ~PAGE_MASK))
			break;

		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (npages > PIPE_BUFFERS - buffers)
			npages = PIPE_BUFFERS - buffers;

		error = get_user_pages(current, current->mm,
				       (unsigned long) base, npages, 0, 0,
				       &pages[buffers], NULL);

		if (unlikely(error <= 0))
			break;

		/*
		 * Fill this contiguous range into the partial page map.
		 */
		for (i = 0; i < error; i++) {
			const int plen = min_t(size_t, len, PAGE_SIZE - off);

			partial[buffers].offset = off;
			partial[buffers].len = plen;

			off = 0;
			len -= plen;
			buffers++;
		}

		/*
		 * We didn't complete this iov, stop here since it probably
		 * means we have to move some of this into a pipe to
		 * be able to continue.
		 */
		if (len)
			break;

		/*
		 * Don't continue if we mapped fewer pages than we asked for,
		 * or if we mapped the max number of pages that we have
		 * room for.
		 */
		if (error < npages || buffers == PIPE_BUFFERS)
			break;

		nr_vecs--;
		iov++;
	}

	up_read(&current->mm->mmap_sem);

	if (buffers)
		return buffers;

	return error;
}
/*
 * vmsplice splices a user address range into a pipe. It can be thought of
 * as splice-from-memory, where the regular splice is splice-from-file (or
 * to file). In both cases the output is a pipe, naturally.
 *
 * Note that vmsplice only supports splicing _from_ user memory to a pipe,
 * not the other way around. Splicing from user memory is a simple operation
 * that can be supported without any funky alignment restrictions or nasty
 * vm tricks. We simply map in the user memory and fill it into a pipe.
 * The reverse isn't quite as easy, though. There are two possible solutions
 * for that:
 *
 *	- memcpy() the data internally, at which point we might as well just
 *	  do a regular read() on the buffer anyway.
 *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
 *	  imposes restrictions on both ends of the pipe).
 *
 * Alas, it isn't here.
 *
 */
static long do_vmsplice(struct file *file, const struct iovec __user *iov,
			unsigned long nr_segs, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &user_page_pipe_buf_ops,
	};

	pipe = pipe_info(file->f_path.dentry->d_inode);
	if (!pipe)
		return -EBADF;
	if (unlikely(nr_segs > UIO_MAXIOV))
		return -EINVAL;
	else if (unlikely(!nr_segs))
		return 0;

	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
					    flags & SPLICE_F_GIFT);
	if (spd.nr_pages <= 0)
		return spd.nr_pages;

	return splice_to_pipe(pipe, &spd);
}
asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct file *file;
	long error;
	int fput;

	error = -EBADF;
	file = fget_light(fd, &fput);
	if (file) {
		if (file->f_mode & FMODE_WRITE)
			error = do_vmsplice(file, iov, nr_segs, flags);

		fput_light(file, fput);
	}

	return error;
}
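/*
 * Illustrative userspace sketch of vmsplice(2) as implemented above
 * ('pfd', 'buf' and 'buflen' are hypothetical). SPLICE_F_GIFT promises
 * the kernel that userspace will not touch the gifted pages again, which
 * enables page stealing further down the splice chain:
 *
 *	struct iovec iov = { .iov_base = buf, .iov_len = buflen };
 *	ssize_t n = vmsplice(pfd[1], &iov, 1, 0);
 */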
asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
			   int fd_out, loff_t __user *off_out,
			   size_t len, unsigned int flags)
{
	long error;
	struct file *in, *out;
	int fput_in, fput_out;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fget_light(fd_in, &fput_in);
	if (in) {
		if (in->f_mode & FMODE_READ) {
			out = fget_light(fd_out, &fput_out);
			if (out) {
				if (out->f_mode & FMODE_WRITE)
					error = do_splice(in, off_in,
							  out, off_out,
							  len, flags);
				fput_light(out, fput_out);
			}
		}

		fput_light(in, fput_in);
	}

	return error;
}
/*
 * Make sure there's data to read. Wait for input if we can, otherwise
 * return an appropriate error.
 */
static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check ->nrbufs without the inode lock first. This function
	 * is speculative anyway, so missing one is ok.
	 */
	if (pipe->nrbufs)
		return 0;

	ret = 0;
	mutex_lock(&pipe->inode->i_mutex);

	while (!pipe->nrbufs) {
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		pipe_wait(pipe);
	}

	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}
/*
 * Make sure there's writable room. Wait for room if we can, otherwise
 * return an appropriate error.
 */
static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check ->nrbufs without the inode lock first. This function
	 * is speculative anyway, so missing one is ok.
	 */
	if (pipe->nrbufs < PIPE_BUFFERS)
		return 0;

	ret = 0;
	mutex_lock(&pipe->inode->i_mutex);

	while (pipe->nrbufs >= PIPE_BUFFERS) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			ret = -EPIPE;
			break;
		}
		if (flags & SPLICE_F_NONBLOCK) {
			ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}
/*
 * Link contents of ipipe to opipe.
 */
static int link_pipe(struct pipe_inode_info *ipipe,
		     struct pipe_inode_info *opipe,
		     size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	int ret = 0, i = 0, nbuf;

	/*
	 * Potential ABBA deadlock, work around it by ordering lock
	 * grabbing by inode address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	inode_double_lock(ipipe->inode, opipe->inode);

	do {
		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		/*
		 * If we have iterated all input buffers or ran out of
		 * output room, break.
		 */
		if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
			break;

		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
		nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);

		/*
		 * Get a reference to this pipe buffer,
		 * so we can copy the contents over.
		 */
		ibuf->ops->get(ipipe, ibuf);

		obuf = opipe->bufs + nbuf;
		*obuf = *ibuf;

		/*
		 * Don't inherit the gift flag, we need to
		 * prevent multiple steals of this page.
		 */
		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;

		if (obuf->len > len)
			obuf->len = len;

		opipe->nrbufs++;
		ret += obuf->len;
		len -= obuf->len;
		i++;
	} while (len);

	/*
	 * return EAGAIN if we have the potential of some data in the
	 * future, otherwise just return 0
	 */
	if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
		ret = -EAGAIN;

	inode_double_unlock(ipipe->inode, opipe->inode);

	/*
	 * If we put data in the output pipe, wakeup any potential readers.
	 */
	if (ret > 0) {
		smp_mb();
		if (waitqueue_active(&opipe->wait))
			wake_up_interruptible(&opipe->wait);
		kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
	}

	return ret;
}
/*
 * This is a tee(1) implementation that works on pipes. It doesn't copy
 * any data, it simply references the 'in' pages on the 'out' pipe.
 * The 'flags' used are the SPLICE_F_* variants, currently the only
 * applicable one is SPLICE_F_NONBLOCK.
 */
static long do_tee(struct file *in, struct file *out, size_t len,
		   unsigned int flags)
{
	struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode);
	struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode);
	int ret = -EINVAL;

	/*
	 * Duplicate the contents of ipipe to opipe without actually
	 * copying the data.
	 */
	if (ipipe && opipe && ipipe != opipe) {
		/*
		 * Keep going, unless we encounter an error. The ipipe/opipe
		 * ordering doesn't really matter.
		 */
		ret = link_ipipe_prep(ipipe, flags);
		if (!ret) {
			ret = link_opipe_prep(opipe, flags);
			if (!ret)
				ret = link_pipe(ipipe, opipe, len, flags);
		}
	}

	return ret;
}
asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags)
{
	struct file *in;
	int error, fput_in;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fget_light(fdin, &fput_in);
	if (in) {
		if (in->f_mode & FMODE_READ) {
			int fput_out;
			struct file *out = fget_light(fdout, &fput_out);

			if (out) {
				if (out->f_mode & FMODE_WRITE)
					error = do_tee(in, out, len, flags);
				fput_light(out, fput_out);
			}
		}
		fput_light(in, fput_in);
	}

	return error;
}
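/*
 * Illustrative userspace sketch of tee(2) as implemented above: duplicate
 * up to 'len' bytes from one pipe to another without consuming the input.
 * 'pfd_in' and 'pfd_out' are hypothetical pipe descriptor pairs:
 *
 *	ssize_t n = tee(pfd_in[0], pfd_out[1], len, SPLICE_F_NONBLOCK);
 *	(n > 0 bytes are now readable from both pipes)
 */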