expand doc for non-blocking splice into a pipe
[ruby_io_splice.git] / ext / io_splice / io_splice_ext.c
blob23cc34f811efedc58b7fdcb73e28c8485bf8714c
1 #include "ruby.h"
2 #ifdef HAVE_RUBY_IO_H
3 # include "ruby/io.h"
4 #else
5 # include "rubyio.h"
6 #endif
7 #include <errno.h>
8 #include <fcntl.h>
9 #include <assert.h>
10 #include <sys/uio.h>
11 #include <limits.h>
12 #include <alloca.h>
13 #include <sys/utsname.h>
15 static VALUE sym_EAGAIN;
16 #define WAITALL 0x4000000
18 #ifndef F_LINUX_SPECIFIC_BASE
19 # define F_LINUX_SPECIFIC_BASE 1024
20 #endif
22 #ifndef F_GETPIPE_SZ
23 # define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7)
24 # define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8)
25 #endif
27 #if ! HAVE_RB_IO_T
28 # define rb_io_t OpenFile
29 #endif
31 #ifdef GetReadFile
32 # define FPTR_TO_FD(fptr) (fileno(GetReadFile(fptr)))
33 #else
34 # if !HAVE_RB_IO_T || (RUBY_VERSION_MAJOR == 1 && RUBY_VERSION_MINOR == 8)
35 # define FPTR_TO_FD(fptr) fileno(fptr->f)
36 # else
37 # define FPTR_TO_FD(fptr) fptr->fd
38 # endif
39 #endif
41 #ifndef SSIZET2NUM
42 # define SSIZET2NUM(x) LONG2NUM(x)
43 #endif
44 #ifndef NUM2SSIZET
45 # define NUM2SSIZET(x) NUM2LONG(x)
46 #endif
47 #ifndef SIZET2NUM
48 # define SIZET2NUM(x) ULONG2NUM(x)
49 #endif
50 #ifndef NUM2SIZET
51 # define NUM2SIZET(x) NUM2ULONG(x)
52 #endif
54 static int my_fileno(VALUE io)
56 rb_io_t *fptr;
58 for (;;) {
59 switch (TYPE(io)) {
60 case T_FIXNUM: return FIX2INT(io);
61 case T_FILE: {
62 GetOpenFile(io, fptr);
63 return FPTR_TO_FD(fptr);
65 default:
66 io = rb_convert_type(io, T_FILE, "IO", "to_io");
67 /* retry */
72 static int check_fileno(VALUE io)
74 int saved_errno = errno;
75 int fd = my_fileno(io);
76 errno = saved_errno;
77 return fd;
79 #ifndef HAVE_RB_THREAD_BLOCKING_REGION
80 /* partial emulation of the 1.9 rb_thread_blocking_region under 1.8 */
81 # include <rubysig.h>
82 # define RUBY_UBF_IO ((rb_unblock_function_t *)-1)
83 typedef void rb_unblock_function_t(void *);
84 typedef VALUE rb_blocking_function_t(void *);
85 static VALUE
86 rb_thread_blocking_region(
87 rb_blocking_function_t *fn, void *data1,
88 rb_unblock_function_t *ubf, void *data2)
90 VALUE rv;
92 assert(RUBY_UBF_IO == ubf && "RUBY_UBF_IO required for emulation");
94 TRAP_BEG;
95 rv = fn(data1);
96 TRAP_END;
98 return rv;
100 #endif /* ! HAVE_RB_THREAD_BLOCKING_REGION */
102 #ifndef RSTRING_PTR
103 # define RSTRING_PTR(s) (RSTRING(s)->ptr)
104 #endif
105 #ifndef RSTRING_LEN
106 # define RSTRING_LEN(s) (RSTRING(s)->len)
107 #endif
108 #ifndef RARRAY_PTR
109 # define RARRAY_PTR(s) (RARRAY(s)->ptr)
110 #endif
111 #ifndef RARRAY_LEN
112 # define RARRAY_LEN(s) (RARRAY(s)->len)
113 #endif
115 static VALUE io_run(rb_blocking_function_t *fn, void *data)
117 return rb_thread_blocking_region(fn, data, RUBY_UBF_IO, 0);
/*
 * Arguments for a single splice(2) call, bundled into one struct so
 * they can travel through the void pointer handed to nogvl_splice().
 */
struct splice_args {
	int fd_in;       /* descriptor to read from */
	off_t *off_in;   /* input offset, or NULL */
	int fd_out;      /* descriptor to write to */
	off_t *off_out;  /* output offset, or NULL */
	size_t len;      /* number of bytes requested */
	unsigned flags;  /* flags passed to splice(2) */
};
129 static VALUE nogvl_splice(void *ptr)
131 struct splice_args *a = ptr;
133 return (VALUE)splice(a->fd_in, a->off_in, a->fd_out, a->off_out,
134 a->len, a->flags);
137 static ssize_t do_splice(int argc, VALUE *argv, unsigned dflags)
139 off_t i = 0, o = 0;
140 VALUE io_in, off_in, io_out, off_out, len, flags;
141 struct splice_args a;
142 ssize_t bytes;
143 ssize_t total = 0;
144 unsigned waitall;
146 rb_scan_args(argc, argv, "51",
147 &io_in, &off_in, &io_out, &off_out, &len, &flags);
149 a.off_in = NIL_P(off_in) ? NULL : (i = NUM2OFFT(off_in), &i);
150 a.off_out = NIL_P(off_out) ? NULL : (o = NUM2OFFT(off_out), &o);
151 a.len = NUM2SIZET(len);
152 a.flags = NIL_P(flags) ? dflags : NUM2UINT(flags) | dflags;
153 waitall = a.flags & WAITALL;
154 if (waitall)
155 a.flags ^= WAITALL;
157 for (;;) {
158 a.fd_in = check_fileno(io_in);
159 a.fd_out = check_fileno(io_out);
160 bytes = (ssize_t)io_run(nogvl_splice, &a);
161 if (bytes == -1) {
162 if (errno == EINTR)
163 continue;
164 if (waitall && errno == EAGAIN) {
165 rb_io_wait_readable(check_fileno(io_in));
166 errno = EAGAIN;
167 rb_io_wait_writable(check_fileno(io_out));
168 continue;
170 if (total > 0)
171 return total;
172 return bytes;
173 } else if (bytes == 0) {
174 break;
175 } else if (waitall) {
176 total += bytes;
177 if ((a.len -= bytes) == 0)
178 return total;
179 i += bytes;
180 o += bytes;
181 } else {
182 return bytes;
186 return total;
190 * call-seq:
191 * IO.splice(io_in, off_in, io_out, off_out, len) => integer
192 * IO.splice(io_in, off_in, io_out, off_out, len, flags) => integer
194 * Splice +len+ bytes from/to a pipe. Either +io_in+ or +io_out+
195 * MUST be a pipe. +io_in+ and +io_out+ may BOTH be pipes as of
196 * Linux 2.6.31 or later.
198 * +off_in+ and +off_out+ if non-nil may be used to
199 * specify an offset for the non-pipe file descriptor.
201 * +flags+ defaults to zero if unspecified.
202 * +flags+ may be a bitmask of the following flags:
204 * * IO::Splice::F_MOVE
205 * * IO::Splice::F_NONBLOCK
206 * * IO::Splice::F_MORE
207 * * IO::Splice::WAITALL
209 * Returns the number of bytes spliced.
210 * Raises EOFError when +io_in+ has reached end of file.
211 * Raises Errno::EAGAIN if the IO::Splice::F_NONBLOCK flag is set
212 * and the pipe has no data to read from or space to write to. May
213 * also raise Errno::EAGAIN if the non-pipe descriptor has no data
214 * to read from or space to write to.
216 * As splice never exposes buffers to userspace, it will not take
217 * into account userspace buffering done by Ruby or stdio. It is
218 * also not subject to encoding/decoding filters under Ruby 1.9.
220 * Consider using IO.trysplice if +io_out+ is a pipe or if you are using
221 * non-blocking I/O on both descriptors as it avoids the cost of raising
222 * common Errno::EAGAIN exceptions.
224 * See manpage for full documentation:
225 * http://kernel.org/doc/man-pages/online/pages/man2/splice.2.html
227 static VALUE my_splice(int argc, VALUE *argv, VALUE self)
229 ssize_t n = do_splice(argc, argv, 0);
231 if (n == 0)
232 rb_eof_error();
233 if (n == -1)
234 rb_sys_fail("splice");
235 return SSIZET2NUM(n);
239 * call-seq:
240 * IO.trysplice(io_in, off_in, io_out, off_out, len) => integer
241 * IO.trysplice(io_in, off_in, io_out, off_out, len, flags) => integer
243 * Exactly like IO.splice, except +:EAGAIN+ is returned when either
244 * the read or write end would block instead of raising Errno::EAGAIN.
246 * IO::Splice::F_NONBLOCK is always passed for the pipe descriptor,
247 * but this can still block if the non-pipe descriptor is blocking.
249 * See IO.splice documentation for more details.
251 * This method is recommended whenever +io_out+ is a pipe.
253 static VALUE trysplice(int argc, VALUE *argv, VALUE self)
255 ssize_t n = do_splice(argc, argv, SPLICE_F_NONBLOCK);
257 if (n == 0)
258 return Qnil;
259 if (n == -1) {
260 if (errno == EAGAIN)
261 return sym_EAGAIN;
262 rb_sys_fail("splice");
264 return SSIZET2NUM(n);
/*
 * Arguments for a single tee(2) call, bundled into one struct so they
 * can travel through the void pointer handed to nogvl_tee().
 */
struct tee_args {
	int fd_in;       /* descriptor to duplicate data from */
	int fd_out;      /* descriptor receiving the duplicate */
	size_t len;      /* number of bytes requested */
	unsigned flags;  /* flags passed to tee(2) */
};
274 /* runs without GVL */
275 static VALUE nogvl_tee(void *ptr)
277 struct tee_args *a = ptr;
279 return (VALUE)tee(a->fd_in, a->fd_out, a->len, a->flags);
282 static ssize_t do_tee(int argc, VALUE *argv, unsigned dflags)
284 VALUE io_in, io_out, len, flags;
285 struct tee_args a;
286 ssize_t bytes;
287 ssize_t total = 0;
288 unsigned waitall;
290 rb_scan_args(argc, argv, "31", &io_in, &io_out, &len, &flags);
291 a.len = (size_t)NUM2SIZET(len);
292 a.flags = NIL_P(flags) ? dflags : NUM2UINT(flags) | dflags;
293 waitall = a.flags & WAITALL;
294 if (waitall)
295 a.flags ^= WAITALL;
297 for (;;) {
298 a.fd_in = check_fileno(io_in);
299 a.fd_out = check_fileno(io_out);
300 bytes = (ssize_t)io_run(nogvl_tee, &a);
301 if (bytes == -1) {
302 if (errno == EINTR)
303 continue;
304 if (waitall && errno == EAGAIN) {
305 rb_io_wait_readable(check_fileno(io_in));
306 errno = EAGAIN;
307 rb_io_wait_writable(check_fileno(io_out));
308 continue;
310 if (total > 0)
311 return total;
312 return bytes;
313 } else if (bytes == 0) {
314 break;
315 } else if (waitall) {
316 total += bytes;
317 if ((a.len -= bytes) == 0)
318 return total;
319 } else {
320 return bytes;
324 return total;
328 * call-seq:
329 * IO.tee(io_in, io_out, len) => integer
330 * IO.tee(io_in, io_out, len, flags) => integer
332 * Copies up to +len+ bytes of data from +io_in+ to +io_out+. +io_in+
333 * and +io_out+ must both refer to pipe descriptors. +io_in+ and +io_out+
334 * may not be endpoints of the same pipe.
336 * +flags+ may be zero (the default) or a combination of:
337 * * IO::Splice::F_NONBLOCK
338 * * IO::Splice::WAITALL
340 * Other IO::Splice flags are currently unimplemented or have no effect.
342 * Returns the number of bytes duplicated if successful.
343 * Raises EOFError when +io_in+ is closed and emptied.
344 * Raises Errno::EAGAIN when +io_in+ is empty and/or +io_out+ is full
345 * and +flags+ contains IO::Splice::F_NONBLOCK
347 * Consider using IO.trytee if you are using IO::Splice::F_NONBLOCK
348 * as it avoids the cost of raising common Errno::EAGAIN exceptions.
350 * See manpage for full documentation:
351 * http://kernel.org/doc/man-pages/online/pages/man2/tee.2.html
353 static VALUE my_tee(int argc, VALUE *argv, VALUE self)
355 ssize_t n = do_tee(argc, argv, 0);
357 if (n == 0)
358 rb_eof_error();
359 if (n == -1)
360 rb_sys_fail("tee");
362 return SSIZET2NUM(n);
366 * call-seq:
367 * IO.trytee(io_in, io_out, len) => integer
368 * IO.trytee(io_in, io_out, len, flags) => integer
370 * Exactly like IO.tee, except +:EAGAIN+ is returned when either
371 * the read or write end would block instead of raising Errno::EAGAIN.
373 * IO::Splice::F_NONBLOCK is always passed for the pipe descriptor,
374 * but this can still block if the non-pipe descriptor is blocking.
376 * See IO.tee documentation for more details.
378 static VALUE trytee(int argc, VALUE *argv, VALUE self)
380 ssize_t n = do_tee(argc, argv, SPLICE_F_NONBLOCK);
382 if (n == 0)
383 return Qnil;
384 if (n == -1) {
385 if (errno == EAGAIN)
386 return sym_EAGAIN;
387 rb_sys_fail("tee");
390 return SSIZET2NUM(n);
/*
 * Arguments for a single vmsplice(2) call, bundled into one struct so
 * they can travel through the void pointer handed to nogvl_vmsplice().
 */
struct vmsplice_args {
	int fd;                 /* descriptor to write into */
	struct iovec *iov;      /* first segment still to be written */
	unsigned long nr_segs;  /* number of segments at iov */
	unsigned flags;         /* flags passed to vmsplice(2) */
};
400 static VALUE nogvl_vmsplice(void *ptr)
402 struct vmsplice_args *a = ptr;
404 return (VALUE)vmsplice(a->fd, a->iov, a->nr_segs, a->flags);
/*
 * Builds an iovec array from the Ruby Array +ary+ of Strings.
 *
 * On completion +iov+ points at the new array, +iovcnt+ holds the
 * number of elements and +expect+ holds the sum of all string lengths.
 * Raises ArgumentError if the array has more than IOV_MAX elements;
 * every element is checked with Check_Type(..., T_STRING).
 *
 * this can't be a function since we use alloca(): the iovec array
 * must live in the stack frame of the caller.
 */
#define ARY2IOVEC(iov,iovcnt,expect,ary) \
do { \
	VALUE *str_ = RARRAY_PTR(ary); \
	long count_ = RARRAY_LEN(ary); \
	struct iovec *vec_; \
	long idx_; \
	if (count_ > IOV_MAX) \
		rb_raise(rb_eArgError, "array is larger than IOV_MAX"); \
	vec_ = alloca(sizeof(struct iovec) * count_); \
	iov = vec_; \
	expect = 0; \
	iovcnt = count_; \
	for (idx_ = 0; idx_ < count_; idx_++) { \
		Check_Type(str_[idx_], T_STRING); \
		vec_[idx_].iov_base = RSTRING_PTR(str_[idx_]); \
		vec_[idx_].iov_len = RSTRING_LEN(str_[idx_]); \
		expect += vec_[idx_].iov_len; \
	} \
} while (0)
428 static void advance_vmsplice_args(struct vmsplice_args *a, long n)
430 struct iovec *new_iov = a->iov;
431 unsigned long i;
433 /* skip over iovecs we've already written completely */
434 for (i = 0; i < a->nr_segs; i++, new_iov++) {
435 if (n == 0)
436 break;
438 * partially written iov,
439 * modify and retry with current iovec in
440 * front
442 if (new_iov->iov_len > (size_t)n) {
443 VALUE base = (VALUE)new_iov->iov_base;
445 new_iov->iov_len -= n;
446 new_iov->iov_base = (void *)(base + n);
447 break;
450 n -= new_iov->iov_len;
453 /* setup to retry without the already-written iovecs */
454 a->nr_segs -= i;
455 a->iov = new_iov;
459 * call-seq:
460 * IO.vmsplice(io, string_array) => integer
461 * IO.vmsplice(io, string_array, flags) => integer
462 * IO.vmsplice(io, string) => integer
463 * IO.vmsplice(io, string, flags) => integer
465 * Transfers an array of strings into the pipe descriptor given by io.
466 * +io+ must be the writable end of a pipe.
468 * This may allow the kernel to avoid data copies in some cases.
469 * but is (probably) of limited usefulness in Ruby. If you have
470 * use cases or ideas for making this more useful for Ruby users,
471 * please tell us at ruby.io.splice@librelist.com!
473 * Also consider the "sendfile" RubyGem or IO.copy_stream in Ruby 1.9
474 * if you want to do zero-copy file transfers to pipes or sockets. As
475 * of Linux 2.6.33, sendfile(2) can copy to any output descriptor,
476 * not just sockets.
478 * See manpage for full documentation:
479 * http://kernel.org/doc/man-pages/online/pages/man2/vmsplice.2.html
481 static VALUE my_vmsplice(int argc, VALUE * argv, VALUE self)
483 ssize_t rv = 0;
484 ssize_t left;
485 struct vmsplice_args a;
486 VALUE io, data, flags;
488 rb_scan_args(argc, argv, "21", &io, &data, &flags);
490 switch (TYPE(data)) {
491 case T_STRING: {
492 struct iovec iov;
494 iov.iov_base = RSTRING_PTR(data);
495 iov.iov_len = (size_t)(left = (ssize_t)RSTRING_LEN(data));
496 a.iov = &iov;
497 a.nr_segs = 1;
499 break;
500 case T_ARRAY:
501 ARY2IOVEC(a.iov, a.nr_segs, left, data);
502 break;
503 default:
504 rb_raise(rb_eTypeError, "wrong argument type %s "
505 "(expected a String or Array of strings)",
506 rb_obj_classname(data));
508 a.fd = my_fileno(io);
509 a.flags = NIL_P(flags) ? 0 : NUM2UINT(flags);
511 for (;;) {
512 ssize_t n = (ssize_t)io_run(nogvl_vmsplice, &a);
514 if (n == -1) {
515 if (errno == EAGAIN) {
516 if (a.flags & SPLICE_F_NONBLOCK) {
517 rb_sys_fail("vmsplice");
518 } else {
519 a.fd = check_fileno(io);
520 if (rb_io_wait_writable(a.fd))
521 continue;
523 /* fall through on error */
526 * unlikely to hit this case, return the
527 * already written bytes, we'll let the next
528 * write (or close) fail instead
530 if (rv > 0)
531 break;
532 if (errno == EINTR) {
533 a.fd = check_fileno(io);
534 continue;
536 rb_sys_fail("vmsplice");
539 rv += n;
540 left -= n;
541 if (left == 0)
542 break;
543 advance_vmsplice_args(&a, n);
546 return SSIZET2NUM(rv);
550 * call-seq:
551 * reader, writer = IO.pipe
552 * reader.pipe_size => integer
554 * Returns the pipe capacity of the underlying pipe in bytes. The
555 * default capacity is 65536 bytes since Linux 2.6.11, and 4096 bytes
556 * in previous kernels.
558 * Since the pipe is a circular buffer in the same kernel, the size
559 * of the reader is exactly the same as the size of the writer.
561 * This method is only exposed on Linux 2.6.35 or later.
563 static VALUE pipe_size(VALUE self)
565 int size = fcntl(my_fileno(self), F_GETPIPE_SZ);
567 if (size < 0)
568 rb_sys_fail("fcntl(F_GETPIPE_SZ)");
570 return INT2NUM(size);
574 * call-seq:
575 * reader, writer = IO.pipe
576 * reader.pipe_size = integer
578 * Sets and returns the pipe capacity of the underlying pipe in bytes.
580 * This MUST be a power-of-two, or Errno::EINVAL will be raised.
581 * Linux will silently increase this to be equal to the page size
582 * (4096 bytes on most architectures) if the specified value is
583 * less than the size of a page.
585 * For users without CAP_SYS_RESOURCE, this raises Errno::EPERM when
586 * attempting to specify a value greater than the value in
587 * /proc/sys/fs/pipe-max-size.
589 * Since the pipe is a circular buffer in the same kernel, the size
590 * of the reader is exactly the same as the size of the writer.
592 * Raises Errno::EBUSY if the assigned value is less than
593 * the currently filled portion of the pipe.
595 * This method is only exposed on Linux 2.6.35 or later.
597 static VALUE set_pipe_size(VALUE self, VALUE size)
599 int fd = my_fileno(self);
600 int bytes = NUM2INT(size);
601 int rv = fcntl(fd, F_SETPIPE_SZ, bytes);
603 if (rv < 0) {
604 if (errno == ENOMEM) {
605 rb_gc();
606 rv = fcntl(fd, F_SETPIPE_SZ, bytes);
608 if (rv < 0)
609 rb_sys_fail("fcntl(F_SETPIPE_SZ)");
612 return size;
615 void Init_io_splice_ext(void)
617 VALUE mSplice = rb_define_module_under(rb_cIO, "Splice");
618 struct utsname utsname;
620 rb_define_singleton_method(rb_cIO, "splice", my_splice, -1);
621 rb_define_singleton_method(rb_cIO, "trysplice", trysplice, -1);
622 rb_define_singleton_method(rb_cIO, "tee", my_tee, -1);
623 rb_define_singleton_method(rb_cIO, "trytee", trytee, -1);
624 rb_define_singleton_method(rb_cIO, "vmsplice", my_vmsplice, -1);
627 * Attempt to move pages instead of copying. This is only a hint
628 * and support for it was removed in Linux 2.6.21. It will be
629 * re-added for FUSE filesystems only in Linux 2.6.35.
631 rb_define_const(mSplice, "F_MOVE", UINT2NUM(SPLICE_F_MOVE));
632 assert(WAITALL != SPLICE_F_MOVE && "WAITALL == F_MOVE");
635 * Do not block on pipe I/O. This flag only affects the pipe(s)
636 * being spliced from/to and has no effect on the non-pipe
637 * descriptor (which requires non-blocking operation to be set
638 * explicitly).
640 * The non-blocking flag (O_NONBLOCK) on the pipe descriptors
641 * themselves are ignored by this family of functions, and
642 * using this flag is the only way to get non-blocking operation
643 * out of them.
645 * It is highly recommended this flag be set (or IO.trysplice used)
646 * whenever splicing from a socket into a pipe unless there is
647 * another (native) thread or process doing a blocking read on that
648 * pipe. Otherwise it is possible to block a single-threaded process
649 * if the socket buffers are larger than the pipe buffers.
651 rb_define_const(mSplice, "F_NONBLOCK", UINT2NUM(SPLICE_F_NONBLOCK));
652 assert(WAITALL != SPLICE_F_NONBLOCK && "WAITALL == F_NONBLOCK");
655 * Indicate that there may be more data coming into the outbound
656 * descriptor. This can allow the kernel to avoid sending partial
657 * frames from sockets. Currently only used with splice.
659 rb_define_const(mSplice, "F_MORE", UINT2NUM(SPLICE_F_MORE));
660 assert(WAITALL != SPLICE_F_MORE && "WAITALL == F_MORE");
663 * Only usable by vmsplice. This flag probably not useful in the
664 * context of Ruby applications which cannot control alignment.
666 rb_define_const(mSplice, "F_GIFT", UINT2NUM(SPLICE_F_GIFT));
667 assert(WAITALL != SPLICE_F_GIFT && "WAITALL == F_GIFT");
670 * Retry until the requested transfer is complete, this will
671 * cause IO.splice/IO.tee to never return less than the requested
672 * transfer size unless an error occored.
674 * IO.vmsplice always defaults to this behavior.
676 rb_define_const(mSplice, "WAITALL", UINT2NUM(WAITALL));
679 * The maximum size of an atomic write to a pipe
680 * POSIX requires this to be at least 512 bytes.
681 * Under Linux, this is 4096 bytes.
683 rb_define_const(mSplice, "PIPE_BUF", UINT2NUM(PIPE_BUF));
685 if (uname(&utsname) == -1)
686 rb_sys_fail("uname");
688 /* includes 2.6.35-rc[1-6] */
689 if (strcmp(utsname.release, "2.6.35") >= 0) {
690 rb_define_method(rb_cIO, "pipe_size", pipe_size, 0);
691 rb_define_method(rb_cIO, "pipe_size=", set_pipe_size, 1);
694 * fcntl() command constant used to return the size of a pipe.
695 * This constant is only defined when running Linux 2.6.35
696 * or later. For convenience, use IO#pipe_size instead.
698 rb_define_const(mSplice, "F_GETPIPE_SZ",
699 UINT2NUM(F_GETPIPE_SZ));
702 * fcntl() command constant used to set the size of a pipe.
703 * This constant is only defined when running Linux 2.6.35
704 * or later. For convenience, use IO#pipe_size= instead.
706 rb_define_const(mSplice, "F_SETPIPE_SZ",
707 UINT2NUM(F_SETPIPE_SZ));
710 sym_EAGAIN = ID2SYM(rb_intern("EAGAIN"));