README: update with extra disclaimer
[raindrops.git] / ext / raindrops / linux_inet_diag.c
blob79f24bbec6bb2d2470bece603d1183bfe6b90338
1 #include <ruby.h>
2 #include <stdarg.h>
3 #include "my_fileno.h"
4 #ifdef __linux__
6 #ifdef HAVE_RB_THREAD_IO_BLOCKING_REGION
7 /* Ruby 1.9.3 and 2.0.0 */
8 VALUE rb_thread_io_blocking_region(rb_blocking_function_t *, void *, int);
9 # define rd_fd_region(fn,data,fd) \
10 rb_thread_io_blocking_region((fn),(data),(fd))
11 #elif defined(HAVE_RB_THREAD_CALL_WITHOUT_GVL) && \
12 defined(HAVE_RUBY_THREAD_H) && HAVE_RUBY_THREAD_H
13 /* in case Ruby 2.0+ ever drops rb_thread_io_blocking_region: */
14 # include <ruby/thread.h>
15 # define COMPAT_FN (void *(*)(void *))
16 # define rd_fd_region(fn,data,fd) \
17 rb_thread_call_without_gvl(COMPAT_FN(fn),(data),RUBY_UBF_IO,NULL)
18 #else
19 # error Ruby <= 1.8 not supported
20 #endif
22 #include <assert.h>
23 #include <errno.h>
24 #include <sys/socket.h>
25 #include <sys/types.h>
26 #include <netdb.h>
27 #include <unistd.h>
28 #include <fcntl.h>
29 #include <string.h>
30 #include <asm/types.h>
31 #include <netinet/in.h>
32 #include <arpa/inet.h>
33 #include <netinet/tcp.h>
34 #include <linux/netlink.h>
35 #include <linux/rtnetlink.h>
36 #include <linux/inet_diag.h>
/*
 * One socket address viewed through every sockaddr flavor this file
 * needs; parse_addr() fills it and gen_bytecode() reads it back.
 */
union any_addr {
	struct sockaddr_storage ss;	/* generic view, .ss_family selects below */
	struct sockaddr sa;
	struct sockaddr_in in;		/* AF_INET */
	struct sockaddr_in6 in6;	/* AF_INET6 */
};
45 static size_t page_size;
46 static unsigned g_seq;
47 static VALUE cListenStats, cIDSock;
48 static ID id_new;
/* per-address counters accumulated from inet_diag replies */
struct listen_stats {
	uint32_t active;	/* bumped once per TCP_ESTABLISHED socket */
	uint32_t queued;	/* sum of idiag_rqueue over non-established sockets */
	uint32_t listener_p;	/* set to 1 once a non-established socket was seen */
};
56 /* override khashl.h defaults, these run w/o GVL */
57 #define kcalloc(N,Z) xcalloc(N,Z)
58 #define kmalloc(Z) xmalloc(Z)
59 #define krealloc(P,Z) abort() /* never called, we use ruby_xrealloc2 */
60 #define kfree(P) xfree(P)
62 #include "khashl.h"
63 KHASHL_CMAP_INIT(KH_LOCAL, addr2stats /* type */, a2s /* pfx */,
64 char * /* key */, struct listen_stats * /* val */,
65 kh_hash_str, kh_eq_str)
/* size in bytes of the inet_diag bytecode written by gen_bytecode*() */
#define OPLEN \
	(sizeof(struct inet_diag_bc_op) + \
	 sizeof(struct inet_diag_hostcond) + \
	 sizeof(struct sockaddr_storage))
71 struct nogvl_args {
72 addr2stats *a2s;
73 struct iovec iov[3]; /* last iov holds inet_diag bytecode */
74 struct listen_stats stats;
75 int fd;
/*
 * my_SOCK_RAW is the socket type passed to Socket.new; FORCE_CLOEXEC(io)
 * evaluates to +io+.  When SOCK_CLOEXEC is unavailable, FD_CLOEXEC is set
 * with fcntl() after the socket exists instead.
 */
#ifdef SOCK_CLOEXEC
#  define my_SOCK_RAW (SOCK_RAW|SOCK_CLOEXEC)
#  define FORCE_CLOEXEC(v) (v)
#else
#  define my_SOCK_RAW SOCK_RAW
static VALUE FORCE_CLOEXEC(VALUE io)
{
	int fd = my_fileno(io);

	if (fcntl(fd, F_SETFD, FD_CLOEXEC) == -1)
		rb_sys_fail("fcntl(F_SETFD, FD_CLOEXEC)");

	return io;
}
#endif
94 * call-seq:
95 * Raindrops::InetDiagSocket.new -> Socket
97 * Creates a new Socket object for the netlink inet_diag facility
99 static VALUE ids_s_new(VALUE klass)
101 VALUE argv[3];
103 argv[0] = INT2NUM(AF_NETLINK);
104 argv[1] = INT2NUM(my_SOCK_RAW);
105 argv[2] = INT2NUM(NETLINK_INET_DIAG);
107 return FORCE_CLOEXEC(rb_call_super(3, argv));
110 /* creates a Ruby ListenStats Struct based on our internal listen_stats */
111 static VALUE rb_listen_stats(struct listen_stats *stats)
113 VALUE active = UINT2NUM(stats->active);
114 VALUE queued = UINT2NUM(stats->queued);
116 return rb_struct_new(cListenStats, active, queued);
120 * call-seq:
121 * remove_scope_id(ip_address)
123 * Returns copy of IP address with Scope ID removed,
124 * if address has it (only IPv6 actually may have it).
126 static VALUE remove_scope_id(const char *addr)
128 VALUE rv = rb_str_new2(addr);
129 long len = RSTRING_LEN(rv);
130 char *ptr = RSTRING_PTR(rv);
131 char *pct = memchr(ptr, '%', len);
134 * remove scoped portion
135 * Ruby equivalent: rv.sub!(/%([^\]]*)\]/, "]")
137 if (pct) {
138 size_t newlen = pct - ptr;
139 char *rbracket = memchr(pct, ']', len - newlen);
141 if (rbracket) {
142 size_t move = len - (rbracket - ptr);
144 memmove(pct, rbracket, move);
145 newlen += move;
147 rb_str_set_len(rv, newlen);
148 } else {
149 rb_raise(rb_eArgError,
150 "']' not found in IPv6 addr=%s", ptr);
153 return rv;
/*
 * Returns the textual wildcard address for +family+: "0.0.0.0" for
 * AF_INET, "[::]" otherwise (asserting that it really is AF_INET6).
 * The returned string has static storage duration.
 */
static const char *addr_any(sa_family_t family)
{
	static const char any4[] = "0.0.0.0";
	static const char any6[] = "[::]";

	switch (family) {
	case AF_INET:
		return any4;
	default:
		assert(family == AF_INET6 && "unknown family");
		return any6;
	}
}
#ifdef __GNUC__
static void bug_warn_nogvl(const char *, ...)
	__attribute__((format(printf,1,2)));
#endif
/*
 * printf-style diagnostic written to stderr, followed by a request to
 * report the problem; stderr is flushed before returning.  Used from
 * code paths that cannot raise a Ruby exception.
 */
static void bug_warn_nogvl(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);

	fputs("Please report how you produced this at "
	      "raindrops-public@yhbt.net\n", stderr);
	fflush(stderr);
}
184 static struct listen_stats *stats_for(addr2stats *a2s, struct inet_diag_msg *r)
186 char *host, *key, *port, *old_key;
187 struct listen_stats *stats;
188 socklen_t hostlen;
189 socklen_t portlen = (socklen_t)sizeof("65535");
190 int n, absent;
191 const void *src = r->id.idiag_src;
192 char buf[INET6_ADDRSTRLEN];
193 size_t buf_len;
194 khint_t ki;
196 switch (r->idiag_family) {
197 case AF_INET: {
198 hostlen = INET_ADDRSTRLEN;
199 buf_len = hostlen + portlen;
200 host = key = buf;
201 break;
203 case AF_INET6: {
204 hostlen = INET6_ADDRSTRLEN;
205 buf_len = 1 + hostlen + 1 + portlen;
206 key = buf;
207 host = key + 1;
208 break;
210 default:
211 fprintf(stderr, "unsupported .idiag_family: %u\n",
212 (unsigned)r->idiag_family);
213 return NULL; /* can't raise w/o GVL */
215 if (!inet_ntop(r->idiag_family, src, host, hostlen)) {
216 bug_warn_nogvl("BUG: inet_ntop: %s\n", strerror(errno));
217 *key = '\0';
218 *host = '\0';
220 hostlen = (socklen_t)strlen(host);
221 switch (r->idiag_family) {
222 case AF_INET:
223 host[hostlen] = ':';
224 port = host + hostlen + 1;
225 break;
226 case AF_INET6:
227 key[0] = '[';
228 host[hostlen] = ']';
229 host[hostlen + 1] = ':';
230 port = host + hostlen + 2;
231 break;
232 default:
233 assert(0 && "should never get here (returned above)");
234 abort();
237 n = snprintf(port, portlen, "%u", ntohs(r->id.idiag_sport));
238 if (n <= 0) {
239 bug_warn_nogvl("BUG: snprintf port: %d\n", n);
240 *key = '\0';
243 ki = a2s_get(a2s, key);
244 if (ki < kh_end(a2s))
245 return kh_val(a2s, ki);
247 old_key = key;
249 if (r->idiag_state == TCP_ESTABLISHED) {
250 n = snprintf(key, buf_len, "%s:%u",
251 addr_any(r->idiag_family),
252 ntohs(r->id.idiag_sport));
253 if (n <= 0) {
254 bug_warn_nogvl("BUG: snprintf: %d\n", n);
255 *key = '\0';
258 ki = a2s_get(a2s, key);
259 if (ki < kh_end(a2s))
260 return kh_val(a2s, ki);
261 if (n <= 0) {
262 key = xmalloc(1);
263 *key = '\0';
264 } else {
265 old_key = key;
266 key = xmalloc(n + 1);
267 memcpy(key, old_key, n + 1);
269 } else {
270 size_t old_len = strlen(old_key) + 1;
271 key = xmalloc(old_len);
272 memcpy(key, old_key, old_len);
274 stats = xcalloc(1, sizeof(struct listen_stats));
275 ki = a2s_put(a2s, key, &absent); /* fails on OOM due to xrealloc */
276 assert(absent > 0 && "redundant put");
277 kh_val(a2s, ki) = stats;
278 return stats;
281 static void table_incr_active(addr2stats *a2s, struct inet_diag_msg *r)
283 struct listen_stats *stats = stats_for(a2s, r);
284 if (!stats) return;
285 ++stats->active;
288 static void table_set_queued(addr2stats *a2s, struct inet_diag_msg *r)
290 struct listen_stats *stats = stats_for(a2s, r);
291 if (!stats) return;
292 stats->listener_p = 1;
293 stats->queued += r->idiag_rqueue;
296 /* inner loop of inet_diag, called for every socket returned by netlink */
297 static inline void r_acc(struct nogvl_args *args, struct inet_diag_msg *r)
300 * inode == 0 means the connection is still in the listen queue
301 * and has not yet been accept()-ed by the server. The
302 * inet_diag bytecode cannot filter this for us.
304 if (r->idiag_inode == 0)
305 return;
306 if (r->idiag_state == TCP_ESTABLISHED) {
307 if (args->a2s)
308 table_incr_active(args->a2s, r);
309 else
310 args->stats.active++;
311 } else { /* if (r->idiag_state == TCP_LISTEN) */
312 if (args->a2s)
313 table_set_queued(args->a2s, r);
314 else
315 args->stats.queued += r->idiag_rqueue;
318 * we wont get anything else because of the idiag_states filter
/*
 * Failure markers returned by diag(); nl_errcheck() compares against
 * err_nlmsg by address and passes the other two to rb_sys_fail().
 */
static const char err_sendmsg[] = "sendmsg";
static const char err_recvmsg[] = "recvmsg";
static const char err_nlmsg[] = "nlmsg";
326 struct diag_req {
327 struct nlmsghdr nlh;
328 struct inet_diag_req r;
331 static void prep_msghdr(
332 struct msghdr *msg,
333 struct nogvl_args *args,
334 struct sockaddr_nl *nladdr,
335 size_t iovlen)
337 memset(msg, 0, sizeof(struct msghdr));
338 msg->msg_name = (void *)nladdr;
339 msg->msg_namelen = sizeof(struct sockaddr_nl);
340 msg->msg_iov = args->iov;
341 msg->msg_iovlen = iovlen;
344 static void prep_diag_args(
345 struct nogvl_args *args,
346 struct sockaddr_nl *nladdr,
347 struct rtattr *rta,
348 struct diag_req *req,
349 struct msghdr *msg)
351 memset(req, 0, sizeof(struct diag_req));
352 memset(nladdr, 0, sizeof(struct sockaddr_nl));
354 nladdr->nl_family = AF_NETLINK;
356 req->nlh.nlmsg_len = (unsigned int)(sizeof(struct diag_req) +
357 RTA_LENGTH(args->iov[2].iov_len));
358 req->nlh.nlmsg_type = TCPDIAG_GETSOCK;
359 req->nlh.nlmsg_flags = NLM_F_ROOT | NLM_F_MATCH | NLM_F_REQUEST;
360 req->nlh.nlmsg_pid = getpid();
361 req->r.idiag_states = (1<<TCP_ESTABLISHED) | (1<<TCP_LISTEN);
362 rta->rta_type = INET_DIAG_REQ_BYTECODE;
363 rta->rta_len = RTA_LENGTH(args->iov[2].iov_len);
365 args->iov[0].iov_base = req;
366 args->iov[0].iov_len = sizeof(struct diag_req);
367 args->iov[1].iov_base = rta;
368 args->iov[1].iov_len = sizeof(struct rtattr);
370 prep_msghdr(msg, args, nladdr, 3);
373 static void prep_recvmsg_buf(struct nogvl_args *args)
375 /* reuse buffer that was allocated for bytecode */
376 args->iov[0].iov_len = page_size;
377 args->iov[0].iov_base = args->iov[2].iov_base;
380 /* does the inet_diag stuff with netlink(), this is called w/o GVL */
381 static VALUE diag(void *ptr)
383 struct nogvl_args *args = ptr;
384 struct sockaddr_nl nladdr;
385 struct rtattr rta;
386 struct diag_req req;
387 struct msghdr msg;
388 const char *err = NULL;
389 unsigned seq = ++g_seq;
391 prep_diag_args(args, &nladdr, &rta, &req, &msg);
392 req.nlh.nlmsg_seq = seq;
394 if (sendmsg(args->fd, &msg, 0) < 0) {
395 err = err_sendmsg;
396 goto out;
399 prep_recvmsg_buf(args);
401 while (1) {
402 ssize_t readed;
403 size_t r;
404 struct nlmsghdr *h = (struct nlmsghdr *)args->iov[0].iov_base;
406 prep_msghdr(&msg, args, &nladdr, 1);
407 readed = recvmsg(args->fd, &msg, 0);
408 if (readed < 0) {
409 if (errno == EINTR)
410 continue;
411 err = err_recvmsg;
412 goto out;
414 if (readed == 0)
415 goto out;
416 r = (size_t)readed;
417 for ( ; NLMSG_OK(h, r); h = NLMSG_NEXT(h, r)) {
418 if (h->nlmsg_seq != seq)
419 continue;
420 if (h->nlmsg_type == NLMSG_DONE)
421 goto out;
422 if (h->nlmsg_type == NLMSG_ERROR) {
423 err = err_nlmsg;
424 goto out;
426 r_acc(args, NLMSG_DATA(h));
429 out:
430 /* prepare to raise, free memory before reacquiring GVL */
431 if (err && args->a2s) {
432 int save_errno = errno;
433 khint_t ki;
435 /* no kh_foreach* in khashl.h (unlike original khash.h) */
436 for (ki = 0; ki < kh_end(args->a2s); ki++) {
437 if (!kh_exist(args->a2s, ki)) continue;
439 xfree(kh_key(args->a2s, ki));
440 xfree(kh_val(args->a2s, ki));
442 a2s_destroy(args->a2s);
443 errno = save_errno;
445 return (VALUE)err;
448 /* populates sockaddr_storage struct by parsing +addr+ */
449 static void parse_addr(union any_addr *inet, VALUE addr)
451 char *host_ptr;
452 char *check;
453 char *colon = NULL;
454 char *rbracket = NULL;
455 void *dst;
456 long host_len;
457 int af, rc;
458 uint16_t *portdst;
459 unsigned long port;
461 Check_Type(addr, T_STRING);
462 host_ptr = StringValueCStr(addr);
463 host_len = RSTRING_LEN(addr);
464 if (*host_ptr == '[') { /* ipv6 address format (rfc2732) */
465 rbracket = memchr(host_ptr + 1, ']', host_len - 1);
467 if (rbracket == NULL)
468 rb_raise(rb_eArgError, "']' not found in IPv6 addr=%s",
469 host_ptr);
470 if (rbracket[1] != ':')
471 rb_raise(rb_eArgError, "':' not found in IPv6 addr=%s",
472 host_ptr);
473 colon = rbracket + 1;
474 host_ptr++;
475 *rbracket = 0;
476 inet->ss.ss_family = af = AF_INET6;
477 dst = &inet->in6.sin6_addr;
478 portdst = &inet->in6.sin6_port;
479 } else { /* ipv4 */
480 colon = memchr(host_ptr, ':', host_len);
481 inet->ss.ss_family = af = AF_INET;
482 dst = &inet->in.sin_addr;
483 portdst = &inet->in.sin_port;
486 if (!colon)
487 rb_raise(rb_eArgError, "port not found in: `%s'", host_ptr);
488 port = strtoul(colon + 1, &check, 10);
489 *colon = 0;
490 rc = inet_pton(af, host_ptr, dst);
491 *colon = ':';
492 if (rbracket) *rbracket = ']';
493 if (*check || ((uint16_t)port != port))
494 rb_raise(rb_eArgError, "invalid port: %s", colon + 1);
495 if (rc != 1)
496 rb_raise(rb_eArgError, "inet_pton failed for: `%s' with %d",
497 host_ptr, rc);
498 *portdst = ntohs((uint16_t)port);
501 /* generates inet_diag bytecode to match all addrs */
502 static void gen_bytecode_all(struct iovec *iov)
504 struct inet_diag_bc_op *op;
505 struct inet_diag_hostcond *cond;
507 /* iov_len was already set and base allocated in a parent function */
508 assert(iov->iov_len == OPLEN && iov->iov_base && "iov invalid");
509 op = iov->iov_base;
510 op->code = INET_DIAG_BC_S_COND;
511 op->yes = OPLEN;
512 op->no = sizeof(struct inet_diag_bc_op) + OPLEN;
513 cond = (struct inet_diag_hostcond *)(op + 1);
514 cond->family = AF_UNSPEC;
515 cond->port = -1;
516 cond->prefix_len = 0;
519 /* generates inet_diag bytecode to match a single addr */
520 static void gen_bytecode(struct iovec *iov, union any_addr *inet)
522 struct inet_diag_bc_op *op;
523 struct inet_diag_hostcond *cond;
525 /* iov_len was already set and base allocated in a parent function */
526 assert(iov->iov_len == OPLEN && iov->iov_base && "iov invalid");
527 op = iov->iov_base;
528 op->code = INET_DIAG_BC_S_COND;
529 op->yes = OPLEN;
530 op->no = sizeof(struct inet_diag_bc_op) + OPLEN;
532 cond = (struct inet_diag_hostcond *)(op + 1);
533 cond->family = inet->ss.ss_family;
534 switch (inet->ss.ss_family) {
535 case AF_INET: {
536 cond->port = ntohs(inet->in.sin_port);
537 cond->prefix_len = inet->in.sin_addr.s_addr == 0 ? 0 :
538 sizeof(inet->in.sin_addr.s_addr) * CHAR_BIT;
539 *cond->addr = inet->in.sin_addr.s_addr;
541 break;
542 case AF_INET6: {
543 cond->port = ntohs(inet->in6.sin6_port);
544 cond->prefix_len = memcmp(&in6addr_any, &inet->in6.sin6_addr,
545 sizeof(struct in6_addr)) == 0 ?
546 0 : sizeof(inet->in6.sin6_addr) * CHAR_BIT;
547 memcpy(&cond->addr, &inet->in6.sin6_addr,
548 sizeof(struct in6_addr));
550 break;
551 default:
552 assert(0 && "unsupported address family, could that be IPv7?!");
557 * n.b. we may safely raise here because an error will cause diag()
558 * to free args->a2s
560 static void nl_errcheck(VALUE r)
562 const char *err = (const char *)r;
564 if (err) {
565 if (err == err_nlmsg)
566 rb_raise(rb_eRuntimeError, "NLMSG_ERROR");
567 else
568 rb_sys_fail(err);
572 static VALUE tcp_stats(struct nogvl_args *args, VALUE addr)
574 union any_addr query_addr;
576 parse_addr(&query_addr, addr);
577 gen_bytecode(&args->iov[2], &query_addr);
579 memset(&args->stats, 0, sizeof(struct listen_stats));
580 nl_errcheck(rd_fd_region(diag, args, args->fd));
582 return rb_listen_stats(&args->stats);
585 /* part of the Ruby rb_hash_* API still relies on st_data_t... */
586 static int drop_placeholders(st_data_t k, st_data_t v, st_data_t ign)
588 if ((VALUE)v == Qtrue)
589 return ST_DELETE;
590 return ST_CONTINUE;
594 * call-seq:
595 * Raindrops::Linux.tcp_listener_stats([addrs[, sock]]) => hash
597 * If specified, +addr+ may be a string or array of strings representing
598 * listen addresses to filter for. Returns a hash with given addresses as
599 * keys and ListenStats objects as the values or a hash of all addresses.
601 * addrs = %w(0.0.0.0:80 127.0.0.1:8080)
603 * If +addr+ is nil or not specified, all (IPv4) addresses are returned.
604 * If +sock+ is specified, it should be a Raindrops::InetDiagSock object.
606 static VALUE tcp_listener_stats(int argc, VALUE *argv, VALUE self)
608 VALUE rv = rb_hash_new();
609 struct nogvl_args args;
610 VALUE addrs, sock, buf;
611 khint_t ki;
612 struct listen_stats *stats;
613 char *key;
615 rb_scan_args(argc, argv, "02", &addrs, &sock);
618 * allocating page_size instead of OP_LEN since we'll reuse the
619 * buffer for recvmsg() later, we already checked for
620 * OPLEN <= page_size at initialization
622 buf = rb_str_buf_new(page_size);
623 args.iov[2].iov_len = OPLEN;
624 args.iov[2].iov_base = RSTRING_PTR(buf);
625 args.a2s = NULL;
626 sock = NIL_P(sock) ? rb_funcall(cIDSock, id_new, 0)
627 : rb_io_get_io(sock);
628 args.fd = my_fileno(sock);
630 switch (TYPE(addrs)) {
631 case T_STRING:
632 rb_hash_aset(rv, addrs, tcp_stats(&args, addrs));
633 goto out;
634 case T_ARRAY: {
635 long i;
636 long len = RARRAY_LEN(addrs);
638 if (len == 1) {
639 VALUE cur = rb_ary_entry(addrs, 0);
641 rb_hash_aset(rv, cur, tcp_stats(&args, cur));
642 goto out;
644 for (i = 0; i < len; i++) {
645 union any_addr check;
646 VALUE cur = rb_ary_entry(addrs, i);
648 parse_addr(&check, cur);
649 rb_hash_aset(rv, cur, Qtrue /* placeholder */);
651 /* fall through */
653 case T_NIL:
654 args.a2s = a2s_init();
655 gen_bytecode_all(&args.iov[2]);
656 break;
657 default:
658 if (argc < 2) rb_io_close(sock);
659 rb_raise(rb_eArgError,
660 "addr must be an array of strings, a string, or nil");
663 nl_errcheck(rd_fd_region(diag, &args, args.fd));
665 /* no kh_foreach* in khashl.h (unlike original khash.h) */
666 for (ki = 0; ki < kh_end(args.a2s); ki++) {
667 if (!kh_exist(args.a2s, ki)) continue;
668 key = kh_key(args.a2s, ki);
669 stats = kh_val(args.a2s, ki);
670 if (stats->listener_p) {
671 VALUE k = remove_scope_id(key);
672 if (NIL_P(addrs) || rb_hash_lookup(rv, k) == Qtrue)
673 rb_hash_aset(rv, k, rb_listen_stats(stats));
675 xfree(key);
676 xfree(stats);
678 a2s_destroy(args.a2s);
680 if (RHASH_SIZE(rv) > 1)
681 rb_hash_foreach(rv, drop_placeholders, Qfalse);
683 out:
684 /* let GC deal with corner cases */
685 rb_str_resize(buf, 0);
686 if (argc < 2) rb_io_close(sock);
687 return rv;
690 void Init_raindrops_linux_inet_diag(void)
692 VALUE cRaindrops = rb_define_class("Raindrops", rb_cObject);
693 VALUE mLinux = rb_define_module_under(cRaindrops, "Linux");
694 VALUE Socket;
696 rb_require("socket");
697 Socket = rb_const_get(rb_cObject, rb_intern("Socket"));
698 id_new = rb_intern("new");
701 * Document-class: Raindrops::InetDiagSocket
703 * This is a subclass of +Socket+ specifically for talking
704 * to the inet_diag facility of Netlink.
706 cIDSock = rb_define_class_under(cRaindrops, "InetDiagSocket", Socket);
707 rb_define_singleton_method(cIDSock, "new", ids_s_new, 0);
709 cListenStats = rb_const_get(cRaindrops, rb_intern("ListenStats"));
710 rb_gc_register_mark_object(cListenStats); /* pin */
712 rb_define_module_function(mLinux, "tcp_listener_stats",
713 tcp_listener_stats, -1);
715 page_size = getpagesize();
717 assert(OPLEN <= page_size && "bytecode OPLEN is not <= PAGE_SIZE");
719 #endif /* __linux__ */