linux_inet_diag: reduce stack usage and simplify
[raindrops.git] / ext / raindrops / linux_inet_diag.c
blob f2db6a5369e7db1b451fe8f34637fb05045b64fa
1 #include <ruby.h>
2 #include <stdarg.h>
3 #ifdef HAVE_RUBY_ST_H
4 # include <ruby/st.h>
5 #else
6 # include <st.h>
7 #endif
8 #include "my_fileno.h"
9 #ifdef __linux__
/*
 * Ruby 1.8.6+ macros (for compatibility with Ruby 1.9):
 * supply RSTRING_LEN ourselves when the Ruby headers did not,
 * by reading the len field behind RSTRING().
 */
#ifndef RSTRING_LEN
#  define RSTRING_LEN(s) (RSTRING(s)->len)
#endif
16 /* partial emulation of the 1.9 rb_thread_blocking_region under 1.8 */
17 #if !defined(HAVE_RB_THREAD_BLOCKING_REGION) && \
18 !defined(HAVE_RB_THREAD_IO_BLOCKING_REGION)
19 # include <rubysig.h>
20 # define RUBY_UBF_IO ((rb_unblock_function_t *)-1)
21 typedef void rb_unblock_function_t(void *);
22 typedef VALUE rb_blocking_function_t(void *);
23 static VALUE
24 rb_thread_blocking_region(
25 rb_blocking_function_t *func, void *data1,
26 rb_unblock_function_t *ubf, void *data2)
28 VALUE rv;
30 TRAP_BEG;
31 rv = func(data1);
32 TRAP_END;
34 return rv;
36 #endif /* ! HAVE_RB_THREAD_BLOCKING_REGION */
/*
 * rb_thread_io_blocking_region: declared here when the Ruby build
 * provides it; otherwise mapped onto rb_thread_blocking_region with
 * RUBY_UBF_IO as the unblock function (the fd argument is dropped).
 */
#ifdef HAVE_RB_THREAD_IO_BLOCKING_REGION
VALUE rb_thread_io_blocking_region(rb_blocking_function_t *, void *, int);
#else
#  define rb_thread_io_blocking_region(fn,data,fd) \
	rb_thread_blocking_region((fn),(data),RUBY_UBF_IO,0)
#endif /* HAVE_RB_THREAD_IO_BLOCKING_REGION */
45 #include <assert.h>
46 #include <errno.h>
47 #include <sys/socket.h>
48 #include <sys/types.h>
49 #include <netdb.h>
50 #include <unistd.h>
51 #include <fcntl.h>
52 #include <string.h>
53 #include <asm/types.h>
54 #include <netinet/in.h>
55 #include <arpa/inet.h>
56 #include <netinet/tcp.h>
57 #include <linux/netlink.h>
58 #include <linux/rtnetlink.h>
59 #include <linux/inet_diag.h>
/*
 * A single socket address, reachable through whichever view the caller
 * needs: generic storage, generic sockaddr, IPv4 or IPv6.
 */
union any_addr {
	struct sockaddr_storage ss;
	struct sockaddr sa;
	struct sockaddr_in in;
	struct sockaddr_in6 in6;
};
68 static size_t page_size;
69 static unsigned g_seq;
70 static VALUE cListenStats, cIDSock;
71 static ID id_new;
/* per-address counters accumulated while walking the inet_diag replies */
struct listen_stats {
	uint32_t active;	/* incremented once per TCP_ESTABLISHED socket */
	uint32_t queued;	/* idiag_rqueue reported by the listening socket */
	uint32_t listener_p;	/* set to 1 once a listening socket was seen */
};
/*
 * Size in bytes of the single bytecode operation we send: one op header,
 * one host condition, and room for the largest socket address.
 */
#define OPLEN (sizeof(struct inet_diag_bc_op) + \
	       sizeof(struct inet_diag_hostcond) + \
	       sizeof(struct sockaddr_storage))
83 struct nogvl_args {
84 st_table *table;
85 struct iovec iov[3]; /* last iov holds inet_diag bytecode */
86 struct listen_stats stats;
87 int fd;
/*
 * Close-on-exec handling for the netlink socket: when SOCK_CLOEXEC is
 * available it is folded into the socket type and FORCE_CLOEXEC is a
 * no-op; otherwise FORCE_CLOEXEC sets FD_CLOEXEC with fcntl() after the
 * socket has been created.
 */
#ifdef SOCK_CLOEXEC
#  define my_SOCK_RAW (SOCK_RAW|SOCK_CLOEXEC)
#  define FORCE_CLOEXEC(v) (v)
#else
#  define my_SOCK_RAW SOCK_RAW
/* sets FD_CLOEXEC on io's descriptor, raising a system error on failure */
static VALUE FORCE_CLOEXEC(VALUE io)
{
	if (fcntl(my_fileno(io), F_SETFD, FD_CLOEXEC) == -1)
		rb_sys_fail("fcntl(F_SETFD, FD_CLOEXEC)");
	return io;
}
#endif
106 * call-seq:
107 * Raindrops::InetDiagSocket.new -> Socket
109 * Creates a new Socket object for the netlink inet_diag facility
111 static VALUE ids_s_new(VALUE klass)
113 VALUE argv[3];
115 argv[0] = INT2NUM(AF_NETLINK);
116 argv[1] = INT2NUM(my_SOCK_RAW);
117 argv[2] = INT2NUM(NETLINK_INET_DIAG);
119 return FORCE_CLOEXEC(rb_call_super(3, argv));
122 /* creates a Ruby ListenStats Struct based on our internal listen_stats */
123 static VALUE rb_listen_stats(struct listen_stats *stats)
125 VALUE active = UINT2NUM(stats->active);
126 VALUE queued = UINT2NUM(stats->queued);
128 return rb_struct_new(cListenStats, active, queued);
131 static int st_free_data(st_data_t key, st_data_t value, st_data_t ignored)
133 xfree((void *)key);
134 xfree((void *)value);
136 return ST_DELETE;
140 * call-seq:
141 * remove_scope_id(ip_address)
143 * Returns copy of IP address with Scope ID removed,
144 * if address has it (only IPv6 actually may have it).
146 static VALUE remove_scope_id(const char *addr)
148 VALUE rv = rb_str_new2(addr);
149 long len = RSTRING_LEN(rv);
150 char *ptr = RSTRING_PTR(rv);
151 char *pct = memchr(ptr, '%', len);
154 * remove scoped portion
155 * Ruby equivalent: rv.sub!(/%([^\]]*)\]/, "]")
157 if (pct) {
158 size_t newlen = pct - ptr;
159 char *rbracket = memchr(pct, ']', len - newlen);
161 if (rbracket) {
162 size_t move = len - (rbracket - ptr);
164 memmove(pct, rbracket, move);
165 newlen += move;
167 rb_str_set_len(rv, newlen);
168 } else {
169 rb_raise(rb_eArgError,
170 "']' not found in IPv6 addr=%s", ptr);
173 return rv;
176 static int st_to_hash(st_data_t key, st_data_t value, VALUE hash)
178 struct listen_stats *stats = (struct listen_stats *)value;
180 if (stats->listener_p) {
181 VALUE k = remove_scope_id((const char *)key);
182 VALUE v = rb_listen_stats(stats);
184 OBJ_FREEZE(k);
185 rb_hash_aset(hash, k, v);
187 return st_free_data(key, value, 0);
190 static int st_AND_hash(st_data_t key, st_data_t value, VALUE hash)
192 struct listen_stats *stats = (struct listen_stats *)value;
194 if (stats->listener_p) {
195 VALUE k = remove_scope_id((const char *)key);
197 if (rb_hash_lookup(hash, k) == Qtrue) {
198 VALUE v = rb_listen_stats(stats);
199 OBJ_FREEZE(k);
200 rb_hash_aset(hash, k, v);
203 return st_free_data(key, value, 0);
/*
 * Returns the textual wildcard address for +family+: "0.0.0.0" for
 * AF_INET, "[::]" otherwise (asserting that it really is AF_INET6).
 * The returned strings are static and must not be freed.
 */
static const char *addr_any(sa_family_t family)
{
	static const char ipv4[] = "0.0.0.0";
	static const char ipv6[] = "[::]";

	if (family != AF_INET) {
		assert(family == AF_INET6 && "unknown family");
		return ipv6;
	}
	return ipv4;
}
#ifdef __GNUC__
static void bug_warn_nogvl(const char *, ...)
	__attribute__((format(printf,1,2)));
#endif
/*
 * printf-style warning written straight to stderr, followed by a
 * request to report the problem.  It uses only stdio calls and no Ruby
 * API.
 */
static void bug_warn_nogvl(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);

	fputs("Please report how you produced this at "
	      "raindrops-public@bogomips.org\n", stderr);
	fflush(stderr);
}
234 static struct listen_stats *stats_for(st_table *table, struct inet_diag_msg *r)
236 char *host, *key, *port, *old_key;
237 size_t alloca_len;
238 struct listen_stats *stats;
239 socklen_t hostlen;
240 socklen_t portlen = (socklen_t)sizeof("65535");
241 int n;
242 const void *src = r->id.idiag_src;
244 switch (r->idiag_family) {
245 case AF_INET: {
246 hostlen = INET_ADDRSTRLEN;
247 alloca_len = hostlen + portlen;
248 host = key = alloca(alloca_len);
249 break;
251 case AF_INET6: {
252 hostlen = INET6_ADDRSTRLEN;
253 alloca_len = 1 + hostlen + 1 + portlen;
254 key = alloca(alloca_len);
255 host = key + 1;
256 break;
258 default:
259 assert(0 && "unsupported address family, could that be IPv7?!");
261 if (!inet_ntop(r->idiag_family, src, host, hostlen)) {
262 bug_warn_nogvl("BUG: inet_ntop: %s\n", strerror(errno));
263 *key = '\0';
264 *host = '\0';
266 hostlen = (socklen_t)strlen(host);
267 switch (r->idiag_family) {
268 case AF_INET:
269 host[hostlen] = ':';
270 port = host + hostlen + 1;
271 break;
272 case AF_INET6:
273 key[0] = '[';
274 host[hostlen] = ']';
275 host[hostlen + 1] = ':';
276 port = host + hostlen + 2;
277 break;
278 default:
279 assert(0 && "unsupported address family, could that be IPv7?!");
282 n = snprintf(port, portlen, "%u", ntohs(r->id.idiag_sport));
283 if (n <= 0) {
284 bug_warn_nogvl("BUG: snprintf port: %d\n", n);
285 *key = '\0';
288 if (st_lookup(table, (st_data_t)key, (st_data_t *)&stats))
289 return stats;
291 old_key = key;
293 if (r->idiag_state == TCP_ESTABLISHED) {
294 n = snprintf(key, alloca_len, "%s:%u",
295 addr_any(r->idiag_family),
296 ntohs(r->id.idiag_sport));
297 if (n <= 0) {
298 bug_warn_nogvl("BUG: snprintf: %d\n", n);
299 *key = '\0';
301 if (st_lookup(table, (st_data_t)key, (st_data_t *)&stats))
302 return stats;
303 if (n <= 0) {
304 key = xmalloc(1);
305 *key = '\0';
306 } else {
307 old_key = key;
308 key = xmalloc(n + 1);
309 memcpy(key, old_key, n + 1);
311 } else {
312 size_t old_len = strlen(old_key) + 1;
313 key = xmalloc(old_len);
314 memcpy(key, old_key, old_len);
316 stats = xcalloc(1, sizeof(struct listen_stats));
317 st_insert(table, (st_data_t)key, (st_data_t)stats);
318 return stats;
321 static void table_incr_active(st_table *table, struct inet_diag_msg *r)
323 struct listen_stats *stats = stats_for(table, r);
324 ++stats->active;
327 static void table_set_queued(st_table *table, struct inet_diag_msg *r)
329 struct listen_stats *stats = stats_for(table, r);
330 stats->listener_p = 1;
331 stats->queued = r->idiag_rqueue;
334 /* inner loop of inet_diag, called for every socket returned by netlink */
335 static inline void r_acc(struct nogvl_args *args, struct inet_diag_msg *r)
338 * inode == 0 means the connection is still in the listen queue
339 * and has not yet been accept()-ed by the server. The
340 * inet_diag bytecode cannot filter this for us.
342 if (r->idiag_inode == 0)
343 return;
344 if (r->idiag_state == TCP_ESTABLISHED) {
345 if (args->table)
346 table_incr_active(args->table, r);
347 else
348 args->stats.active++;
349 } else { /* if (r->idiag_state == TCP_LISTEN) */
350 if (args->table)
351 table_set_queued(args->table, r);
352 else
353 args->stats.queued = r->idiag_rqueue;
356 * we wont get anything else because of the idiag_states filter
/*
 * Failure tags returned by diag().  nl_errcheck compares against
 * err_nlmsg by address, so these must remain distinct arrays.
 */
static const char err_sendmsg[] = "sendmsg";
static const char err_recvmsg[] = "recvmsg";
static const char err_nlmsg[] = "nlmsg";
364 struct diag_req {
365 struct nlmsghdr nlh;
366 struct inet_diag_req r;
369 static void prep_msghdr(
370 struct msghdr *msg,
371 struct nogvl_args *args,
372 struct sockaddr_nl *nladdr,
373 size_t iovlen)
375 memset(msg, 0, sizeof(struct msghdr));
376 msg->msg_name = (void *)nladdr;
377 msg->msg_namelen = sizeof(struct sockaddr_nl);
378 msg->msg_iov = args->iov;
379 msg->msg_iovlen = iovlen;
382 static void prep_diag_args(
383 struct nogvl_args *args,
384 struct sockaddr_nl *nladdr,
385 struct rtattr *rta,
386 struct diag_req *req,
387 struct msghdr *msg)
389 memset(req, 0, sizeof(struct diag_req));
390 memset(nladdr, 0, sizeof(struct sockaddr_nl));
392 nladdr->nl_family = AF_NETLINK;
394 req->nlh.nlmsg_len = (unsigned int)(sizeof(struct diag_req) +
395 RTA_LENGTH(args->iov[2].iov_len));
396 req->nlh.nlmsg_type = TCPDIAG_GETSOCK;
397 req->nlh.nlmsg_flags = NLM_F_ROOT | NLM_F_MATCH | NLM_F_REQUEST;
398 req->nlh.nlmsg_pid = getpid();
399 req->r.idiag_states = (1<<TCP_ESTABLISHED) | (1<<TCP_LISTEN);
400 rta->rta_type = INET_DIAG_REQ_BYTECODE;
401 rta->rta_len = RTA_LENGTH(args->iov[2].iov_len);
403 args->iov[0].iov_base = req;
404 args->iov[0].iov_len = sizeof(struct diag_req);
405 args->iov[1].iov_base = rta;
406 args->iov[1].iov_len = sizeof(struct rtattr);
408 prep_msghdr(msg, args, nladdr, 3);
411 static void prep_recvmsg_buf(struct nogvl_args *args)
413 /* reuse buffer that was allocated for bytecode */
414 args->iov[0].iov_len = page_size;
415 args->iov[0].iov_base = args->iov[2].iov_base;
418 /* does the inet_diag stuff with netlink(), this is called w/o GVL */
419 static VALUE diag(void *ptr)
421 struct nogvl_args *args = ptr;
422 struct sockaddr_nl nladdr;
423 struct rtattr rta;
424 struct diag_req req;
425 struct msghdr msg;
426 const char *err = NULL;
427 unsigned seq = ++g_seq;
429 prep_diag_args(args, &nladdr, &rta, &req, &msg);
430 req.nlh.nlmsg_seq = seq;
432 if (sendmsg(args->fd, &msg, 0) < 0) {
433 err = err_sendmsg;
434 goto out;
437 prep_recvmsg_buf(args);
439 while (1) {
440 ssize_t readed;
441 size_t r;
442 struct nlmsghdr *h = (struct nlmsghdr *)args->iov[0].iov_base;
444 prep_msghdr(&msg, args, &nladdr, 1);
445 readed = recvmsg(args->fd, &msg, 0);
446 if (readed < 0) {
447 if (errno == EINTR)
448 continue;
449 err = err_recvmsg;
450 goto out;
452 if (readed == 0)
453 goto out;
454 r = (size_t)readed;
455 for ( ; NLMSG_OK(h, r); h = NLMSG_NEXT(h, r)) {
456 if (h->nlmsg_seq != seq)
457 continue;
458 if (h->nlmsg_type == NLMSG_DONE)
459 goto out;
460 if (h->nlmsg_type == NLMSG_ERROR) {
461 err = err_nlmsg;
462 goto out;
464 r_acc(args, NLMSG_DATA(h));
467 out:
468 /* prepare to raise, free memory before reacquiring GVL */
469 if (err && args->table) {
470 int save_errno = errno;
472 st_foreach(args->table, st_free_data, 0);
473 st_free_table(args->table);
474 errno = save_errno;
476 return (VALUE)err;
479 /* populates sockaddr_storage struct by parsing +addr+ */
480 static void parse_addr(union any_addr *inet, VALUE addr)
482 char *host_ptr;
483 char *check;
484 char *colon = NULL;
485 char *rbracket = NULL;
486 void *dst;
487 long host_len;
488 int af, rc;
489 uint16_t *portdst;
490 unsigned long port;
492 Check_Type(addr, T_STRING);
493 host_ptr = StringValueCStr(addr);
494 host_len = RSTRING_LEN(addr);
495 if (*host_ptr == '[') { /* ipv6 address format (rfc2732) */
496 rbracket = memchr(host_ptr + 1, ']', host_len - 1);
498 if (rbracket == NULL)
499 rb_raise(rb_eArgError, "']' not found in IPv6 addr=%s",
500 host_ptr);
501 if (rbracket[1] != ':')
502 rb_raise(rb_eArgError, "':' not found in IPv6 addr=%s",
503 host_ptr);
504 colon = rbracket + 1;
505 host_ptr++;
506 *rbracket = 0;
507 inet->ss.ss_family = af = AF_INET6;
508 dst = &inet->in6.sin6_addr;
509 portdst = &inet->in6.sin6_port;
510 } else { /* ipv4 */
511 colon = memchr(host_ptr, ':', host_len);
512 inet->ss.ss_family = af = AF_INET;
513 dst = &inet->in.sin_addr;
514 portdst = &inet->in.sin_port;
517 if (!colon)
518 rb_raise(rb_eArgError, "port not found in: `%s'", host_ptr);
519 port = strtoul(colon + 1, &check, 10);
520 *colon = 0;
521 rc = inet_pton(af, host_ptr, dst);
522 *colon = ':';
523 if (rbracket) *rbracket = ']';
524 if (*check || ((uint16_t)port != port))
525 rb_raise(rb_eArgError, "invalid port: %s", colon + 1);
526 if (rc != 1)
527 rb_raise(rb_eArgError, "inet_pton failed for: `%s' with %d",
528 host_ptr, rc);
529 *portdst = ntohs((uint16_t)port);
532 /* generates inet_diag bytecode to match all addrs */
533 static void gen_bytecode_all(struct iovec *iov)
535 struct inet_diag_bc_op *op;
536 struct inet_diag_hostcond *cond;
538 /* iov_len was already set and base allocated in a parent function */
539 assert(iov->iov_len == OPLEN && iov->iov_base && "iov invalid");
540 op = iov->iov_base;
541 op->code = INET_DIAG_BC_S_COND;
542 op->yes = OPLEN;
543 op->no = sizeof(struct inet_diag_bc_op) + OPLEN;
544 cond = (struct inet_diag_hostcond *)(op + 1);
545 cond->family = AF_UNSPEC;
546 cond->port = -1;
547 cond->prefix_len = 0;
550 /* generates inet_diag bytecode to match a single addr */
551 static void gen_bytecode(struct iovec *iov, union any_addr *inet)
553 struct inet_diag_bc_op *op;
554 struct inet_diag_hostcond *cond;
556 /* iov_len was already set and base allocated in a parent function */
557 assert(iov->iov_len == OPLEN && iov->iov_base && "iov invalid");
558 op = iov->iov_base;
559 op->code = INET_DIAG_BC_S_COND;
560 op->yes = OPLEN;
561 op->no = sizeof(struct inet_diag_bc_op) + OPLEN;
563 cond = (struct inet_diag_hostcond *)(op + 1);
564 cond->family = inet->ss.ss_family;
565 switch (inet->ss.ss_family) {
566 case AF_INET: {
567 cond->port = ntohs(inet->in.sin_port);
568 cond->prefix_len = inet->in.sin_addr.s_addr == 0 ? 0 :
569 sizeof(inet->in.sin_addr.s_addr) * CHAR_BIT;
570 *cond->addr = inet->in.sin_addr.s_addr;
572 break;
573 case AF_INET6: {
574 cond->port = ntohs(inet->in6.sin6_port);
575 cond->prefix_len = memcmp(&in6addr_any, &inet->in6.sin6_addr,
576 sizeof(struct in6_addr)) == 0 ?
577 0 : sizeof(inet->in6.sin6_addr) * CHAR_BIT;
578 memcpy(&cond->addr, &inet->in6.sin6_addr,
579 sizeof(struct in6_addr));
581 break;
582 default:
583 assert(0 && "unsupported address family, could that be IPv7?!");
588 * n.b. we may safely raise here because an error will cause diag()
589 * to free args->table
591 static void nl_errcheck(VALUE r)
593 const char *err = (const char *)r;
595 if (err) {
596 if (err == err_nlmsg)
597 rb_raise(rb_eRuntimeError, "NLMSG_ERROR");
598 else
599 rb_sys_fail(err);
603 static VALUE tcp_stats(struct nogvl_args *args, VALUE addr)
605 union any_addr query_addr;
607 parse_addr(&query_addr, addr);
608 gen_bytecode(&args->iov[2], &query_addr);
610 memset(&args->stats, 0, sizeof(struct listen_stats));
611 nl_errcheck(rb_thread_io_blocking_region(diag, args, args->fd));
613 return rb_listen_stats(&args->stats);
616 static int drop_placeholders(st_data_t k, st_data_t v, st_data_t ign)
618 if ((VALUE)v == Qtrue)
619 return ST_DELETE;
620 return ST_CONTINUE;
624 * call-seq:
625 * Raindrops::Linux.tcp_listener_stats([addrs[, sock]]) => hash
627 * If specified, +addr+ may be a string or array of strings representing
628 * listen addresses to filter for. Returns a hash with given addresses as
629 * keys and ListenStats objects as the values or a hash of all addresses.
631 * addrs = %w(0.0.0.0:80 127.0.0.1:8080)
633 * If +addr+ is nil or not specified, all (IPv4) addresses are returned.
634 * If +sock+ is specified, it should be a Raindrops::InetDiagSock object.
636 static VALUE tcp_listener_stats(int argc, VALUE *argv, VALUE self)
638 VALUE rv = rb_hash_new();
639 struct nogvl_args args;
640 VALUE addrs, sock;
642 rb_scan_args(argc, argv, "02", &addrs, &sock);
645 * allocating page_size instead of OP_LEN since we'll reuse the
646 * buffer for recvmsg() later, we already checked for
647 * OPLEN <= page_size at initialization
649 args.iov[2].iov_len = OPLEN;
650 args.iov[2].iov_base = alloca(page_size);
651 args.table = NULL;
652 if (NIL_P(sock))
653 sock = rb_funcall(cIDSock, id_new, 0);
654 args.fd = my_fileno(sock);
656 switch (TYPE(addrs)) {
657 case T_STRING:
658 rb_hash_aset(rv, addrs, tcp_stats(&args, addrs));
659 return rv;
660 case T_ARRAY: {
661 long i;
662 long len = RARRAY_LEN(addrs);
664 if (len == 1) {
665 VALUE cur = rb_ary_entry(addrs, 0);
667 rb_hash_aset(rv, cur, tcp_stats(&args, cur));
668 return rv;
670 for (i = 0; i < len; i++) {
671 union any_addr check;
672 VALUE cur = rb_ary_entry(addrs, i);
674 parse_addr(&check, cur);
675 rb_hash_aset(rv, cur, Qtrue /* placeholder */);
677 /* fall through */
679 case T_NIL:
680 args.table = st_init_strtable();
681 gen_bytecode_all(&args.iov[2]);
682 break;
683 default:
684 rb_raise(rb_eArgError,
685 "addr must be an array of strings, a string, or nil");
688 nl_errcheck(rb_thread_io_blocking_region(diag, &args, args.fd));
690 st_foreach(args.table, NIL_P(addrs) ? st_to_hash : st_AND_hash, rv);
691 st_free_table(args.table);
693 if (RHASH_SIZE(rv) > 1)
694 rb_hash_foreach(rv, drop_placeholders, Qfalse);
696 /* let GC deal with corner cases */
697 if (argc < 2) rb_io_close(sock);
698 return rv;
701 void Init_raindrops_linux_inet_diag(void)
703 VALUE cRaindrops = rb_define_class("Raindrops", rb_cObject);
704 VALUE mLinux = rb_define_module_under(cRaindrops, "Linux");
705 VALUE Socket;
707 rb_require("socket");
708 Socket = rb_const_get(rb_cObject, rb_intern("Socket"));
709 id_new = rb_intern("new");
712 * Document-class: Raindrops::InetDiagSocket
714 * This is a subclass of +Socket+ specifically for talking
715 * to the inet_diag facility of Netlink.
717 cIDSock = rb_define_class_under(cRaindrops, "InetDiagSocket", Socket);
718 rb_define_singleton_method(cIDSock, "new", ids_s_new, 0);
720 cListenStats = rb_const_get(cRaindrops, rb_intern("ListenStats"));
722 rb_define_module_function(mLinux, "tcp_listener_stats",
723 tcp_listener_stats, -1);
725 page_size = getpagesize();
727 assert(OPLEN <= page_size && "bytecode OPLEN is not <= PAGE_SIZE");
729 #endif /* __linux__ */