ctdb-common: Set immediate mode for pcap capture
[Samba.git] / ctdb / common / system_socket.c
blob273b9c3400ee177fa70552c4c0d8807692cd3bef
1 /*
2 ctdb system specific code to manage raw sockets on linux
4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
6 Copyright (C) Marc Dequènes (Duck) 2009
7 Copyright (C) Volker Lendecke 2012
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 3 of the License, or
12 (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, see <http://www.gnu.org/licenses/>.
23 #include "replace.h"
/*
 * Use BSD struct tcphdr field names for portability.  Modern glibc
 * makes them available by default via <netinet/tcp.h> but older glibc
 * requires __FAVOR_BSD to be defined.
 *
 * __FAVOR_BSD is normally defined in <features.h> if _DEFAULT_SOURCE
 * (new) or _BSD_SOURCE (now deprecated) is set and _GNU_SOURCE is not
 * set.  Including "replace.h" above causes <features.h> to be
 * indirectly included and this will not set __FAVOR_BSD because
 * _GNU_SOURCE is set in Samba's "config.h" (which is included by
 * "replace.h").
 *
 * Therefore, set __FAVOR_BSD by hand below.
 */
39 #define __FAVOR_BSD 1
40 #include "system/network.h"
42 #ifdef HAVE_NETINET_IF_ETHER_H
43 #include <netinet/if_ether.h>
44 #endif
45 #ifdef HAVE_NETINET_IP6_H
46 #include <netinet/ip6.h>
47 #endif
48 #ifdef HAVE_NETINET_ICMP6_H
49 #include <netinet/icmp6.h>
50 #endif
51 #ifdef HAVE_LINUX_IF_PACKET_H
52 #include <linux/if_packet.h>
53 #endif
55 #ifndef ETHERTYPE_IP6
56 #define ETHERTYPE_IP6 0x86dd
57 #endif
59 #include "lib/util/debug.h"
60 #include "lib/util/blocking.h"
62 #include "protocol/protocol.h"
64 #include "common/logging.h"
65 #include "common/system_socket.h"
/*
 * Sum n bytes of data as 16-bit words in network byte order
 *
 * An odd trailing byte is treated as the high-order byte of a final
 * word whose low-order byte is zero.  The sum is returned unfolded;
 * folding carries and complementing is left to the caller.
 */
static uint32_t uint16_checksum(uint8_t *data, size_t n)
{
	uint32_t sum = 0;
	uint16_t value;

	while (n >= 2) {
		/* memcpy avoids a potentially unaligned 16-bit load */
		memcpy(&value, data, 2);
		sum += (uint32_t)ntohs(value);
		data += 2;
		n -= 2;
	}
	if (n == 1) {
		/*
		 * Shift explicitly instead of using ntohs(): ntohs() is
		 * the identity on big-endian hosts and would leave the
		 * byte in the low half of the word there.
		 */
		sum += (uint32_t)*data << 8;
	}
	return sum;
}
88 * See if the given IP is currently on an interface
90 bool ctdb_sys_have_ip(ctdb_sock_addr *_addr)
92 int s;
93 int ret;
94 ctdb_sock_addr __addr = *_addr;
95 ctdb_sock_addr *addr = &__addr;
96 socklen_t addrlen = 0;
98 switch (addr->sa.sa_family) {
99 case AF_INET:
100 addr->ip.sin_port = 0;
101 addrlen = sizeof(struct sockaddr_in);
102 break;
103 case AF_INET6:
104 addr->ip6.sin6_port = 0;
105 addrlen = sizeof(struct sockaddr_in6);
106 break;
109 s = socket(addr->sa.sa_family, SOCK_STREAM, IPPROTO_TCP);
110 if (s == -1) {
111 return false;
114 ret = bind(s, (struct sockaddr *)addr, addrlen);
116 close(s);
117 return ret == 0;
121 * simple TCP checksum - assumes data is multiple of 2 bytes long
123 static uint16_t ip_checksum(uint8_t *data, size_t n, struct ip *ip)
125 uint32_t sum = uint16_checksum(data, n);
126 uint16_t sum2;
128 sum += uint16_checksum((uint8_t *)&ip->ip_src, sizeof(ip->ip_src));
129 sum += uint16_checksum((uint8_t *)&ip->ip_dst, sizeof(ip->ip_dst));
130 sum += ip->ip_p + n;
131 sum = (sum & 0xFFFF) + (sum >> 16);
132 sum = (sum & 0xFFFF) + (sum >> 16);
133 sum2 = htons(sum);
134 sum2 = ~sum2;
135 if (sum2 == 0) {
136 return 0xFFFF;
138 return sum2;
/*
 * Upper-layer checksum for n bytes of data carried in an IPv6 packet
 *
 * Sums the source and destination addresses from ip6, a pseudo-header
 * holding the 32-bit length n and the next-header value, and the data
 * itself.  Carries are folded and the complement, converted with
 * htons(), is returned.  A result of 0 is returned as 0xFFFF.
 */
static uint16_t ip6_checksum(uint8_t *data, size_t n, struct ip6_hdr *ip6)
{
	uint16_t pseudo[3];
	uint32_t n_net = htonl(n);
	uint32_t acc;
	uint16_t cksum;
	int i;

	pseudo[0] = n_net & UINT16_MAX;
	pseudo[1] = (n_net >> 16) & UINT16_MAX;
	/* ip6_nxt is only 8 bits, so fits comfortably into a uint16_t */
	pseudo[2] = htons(ip6->ip6_nxt);

	acc = uint16_checksum((uint8_t *)&ip6->ip6_src, 16);
	acc += uint16_checksum((uint8_t *)&ip6->ip6_dst, 16);
	acc += uint16_checksum((uint8_t *)pseudo, sizeof(pseudo));
	acc += uint16_checksum(data, n);

	/* Fold the carries back into the low 16 bits, twice */
	for (i = 0; i < 2; i++) {
		acc = (acc & 0xFFFF) + (acc >> 16);
	}

	cksum = ~htons(acc);
	return (cksum == 0) ? 0xFFFF : cksum;
}
/*
 * Send gratuitous ARP request/reply or IPv6 neighbor advertisement
 */
174 #ifdef HAVE_PACKETSOCKET
/*
 * Create IPv4 ARP requests/replies or IPv6 neighbour advertisement
 * packets
 */
/*
 * Exact sizes of the frames built below.  The sums are parenthesized
 * so that the macros expand safely inside larger expressions.
 */
#define ARP_STRUCT_SIZE (sizeof(struct ether_header) + \
			 sizeof(struct ether_arp))

#define IP6_NA_STRUCT_SIZE (sizeof(struct ether_header) + \
			    sizeof(struct ip6_hdr) + \
			    sizeof(struct nd_neighbor_advert) + \
			    sizeof(struct nd_opt_hdr) + \
			    sizeof(struct ether_addr))

/* Buffers are never smaller than 64 bytes */
#define ARP_BUFFER_SIZE MAX(ARP_STRUCT_SIZE, 64)

#define IP6_NA_BUFFER_SIZE MAX(IP6_NA_STRUCT_SIZE, 64)
194 static int arp_build(uint8_t *buffer,
195 size_t buflen,
196 const struct sockaddr_in *addr,
197 const struct ether_addr *hwaddr,
198 bool reply,
199 struct ether_addr **ether_dhost,
200 size_t *len)
202 size_t l = ARP_BUFFER_SIZE;
203 struct ether_header *eh;
204 struct ether_arp *ea;
205 struct arphdr *ah;
207 if (addr->sin_family != AF_INET) {
208 return EINVAL;
211 if (buflen < l) {
212 return EMSGSIZE;
215 memset(buffer, 0 , l);
217 eh = (struct ether_header *)buffer;
218 memset(eh->ether_dhost, 0xff, ETH_ALEN);
219 memcpy(eh->ether_shost, hwaddr, ETH_ALEN);
220 eh->ether_type = htons(ETHERTYPE_ARP);
222 ea = (struct ether_arp *)(buffer + sizeof(struct ether_header));
223 ah = &ea->ea_hdr;
224 ah->ar_hrd = htons(ARPHRD_ETHER);
225 ah->ar_pro = htons(ETH_P_IP);
226 ah->ar_hln = ETH_ALEN;
227 ah->ar_pln = sizeof(ea->arp_spa);
229 if (! reply) {
230 ah->ar_op = htons(ARPOP_REQUEST);
231 memcpy(ea->arp_sha, hwaddr, ETH_ALEN);
232 memcpy(ea->arp_spa, &addr->sin_addr, sizeof(ea->arp_spa));
233 memset(ea->arp_tha, 0, ETH_ALEN);
234 memcpy(ea->arp_tpa, &addr->sin_addr, sizeof(ea->arp_tpa));
235 } else {
236 ah->ar_op = htons(ARPOP_REPLY);
237 memcpy(ea->arp_sha, hwaddr, ETH_ALEN);
238 memcpy(ea->arp_spa, &addr->sin_addr, sizeof(ea->arp_spa));
239 memcpy(ea->arp_tha, hwaddr, ETH_ALEN);
240 memcpy(ea->arp_tpa, &addr->sin_addr, sizeof(ea->arp_tpa));
243 *ether_dhost = (struct ether_addr *)eh->ether_dhost;
244 *len = l;
245 return 0;
248 static int ip6_na_build(uint8_t *buffer,
249 size_t buflen,
250 const struct sockaddr_in6 *addr,
251 const struct ether_addr *hwaddr,
252 struct ether_addr **ether_dhost,
253 size_t *len)
255 size_t l = IP6_NA_BUFFER_SIZE;
256 struct ether_header *eh;
257 struct ip6_hdr *ip6;
258 struct nd_neighbor_advert *nd_na;
259 struct nd_opt_hdr *nd_oh;
260 struct ether_addr *ea;
261 int ret;
263 if (addr->sin6_family != AF_INET6) {
264 return EINVAL;
267 if (buflen < l) {
268 return EMSGSIZE;
271 memset(buffer, 0 , l);
273 eh = (struct ether_header *)buffer;
275 * Ethernet multicast: 33:33:00:00:00:01 (see RFC2464,
276 * section 7) - note memset 0 above!
278 eh->ether_dhost[0] = 0x33;
279 eh->ether_dhost[1] = 0x33;
280 eh->ether_dhost[5] = 0x01;
281 memcpy(eh->ether_shost, hwaddr, ETH_ALEN);
282 eh->ether_type = htons(ETHERTYPE_IP6);
284 ip6 = (struct ip6_hdr *)(buffer + sizeof(struct ether_header));
285 ip6->ip6_vfc = 6 << 4;
286 ip6->ip6_plen = htons(sizeof(struct nd_neighbor_advert) +
287 sizeof(struct nd_opt_hdr) +
288 ETH_ALEN);
289 ip6->ip6_nxt = IPPROTO_ICMPV6;
290 ip6->ip6_hlim = 255;
291 ip6->ip6_src = addr->sin6_addr;
292 /* all-nodes multicast */
294 ret = inet_pton(AF_INET6, "ff02::1", &ip6->ip6_dst);
295 if (ret != 1) {
296 return EIO;
299 nd_na = (struct nd_neighbor_advert *)(buffer +
300 sizeof(struct ether_header) +
301 sizeof(struct ip6_hdr));
302 nd_na->nd_na_type = ND_NEIGHBOR_ADVERT;
303 nd_na->nd_na_code = 0;
304 nd_na->nd_na_flags_reserved = ND_NA_FLAG_OVERRIDE;
305 nd_na->nd_na_target = addr->sin6_addr;
307 /* Option: Target link-layer address */
308 nd_oh = (struct nd_opt_hdr *)(buffer +
309 sizeof(struct ether_header) +
310 sizeof(struct ip6_hdr) +
311 sizeof(struct nd_neighbor_advert));
312 nd_oh->nd_opt_type = ND_OPT_TARGET_LINKADDR;
313 nd_oh->nd_opt_len = 1; /* multiple of 8 octets */
315 ea = (struct ether_addr *)(buffer +
316 sizeof(struct ether_header) +
317 sizeof(struct ip6_hdr) +
318 sizeof(struct nd_neighbor_advert) +
319 sizeof(struct nd_opt_hdr));
320 memcpy(ea, hwaddr, ETH_ALEN);
322 nd_na->nd_na_cksum = ip6_checksum((uint8_t *)nd_na,
323 ntohs(ip6->ip6_plen),
324 ip6);
326 *ether_dhost = (struct ether_addr *)eh->ether_dhost;
327 *len = l;
328 return 0;
331 int ctdb_sys_send_arp(const ctdb_sock_addr *addr, const char *iface)
333 int s;
334 struct sockaddr_ll sall = {0};
335 struct ifreq if_hwaddr = {
336 .ifr_ifru = {
337 .ifru_flags = 0
340 uint8_t buffer[MAX(ARP_BUFFER_SIZE, IP6_NA_BUFFER_SIZE)];
341 struct ifreq ifr = {
342 .ifr_ifru = {
343 .ifru_flags = 0
346 struct ether_addr *hwaddr = NULL;
347 struct ether_addr *ether_dhost = NULL;
348 size_t len = 0;
349 int ret = 0;
351 s = socket(AF_PACKET, SOCK_RAW, 0);
352 if (s == -1) {
353 ret = errno;
354 DBG_ERR("Failed to open raw socket\n");
355 return ret;
357 DBG_DEBUG("Created SOCKET FD:%d for sending arp\n", s);
359 /* Find interface */
360 strlcpy(ifr.ifr_name, iface, sizeof(ifr.ifr_name));
361 if (ioctl(s, SIOCGIFINDEX, &ifr) < 0) {
362 ret = errno;
363 DBG_ERR("Interface '%s' not found\n", iface);
364 goto fail;
367 /* Get MAC address */
368 strlcpy(if_hwaddr.ifr_name, iface, sizeof(if_hwaddr.ifr_name));
369 ret = ioctl(s, SIOCGIFHWADDR, &if_hwaddr);
370 if ( ret < 0 ) {
371 ret = errno;
372 DBG_ERR("ioctl failed\n");
373 goto fail;
375 if (ARPHRD_LOOPBACK == if_hwaddr.ifr_hwaddr.sa_family) {
376 ret = 0;
377 D_DEBUG("Ignoring loopback arp request\n");
378 goto fail;
380 if (if_hwaddr.ifr_hwaddr.sa_family != ARPHRD_ETHER) {
381 ret = EINVAL;
382 DBG_ERR("Not an ethernet address family (0x%x)\n",
383 if_hwaddr.ifr_hwaddr.sa_family);
384 goto fail;;
387 /* Set up most of destination address structure */
388 sall.sll_family = AF_PACKET;
389 sall.sll_halen = sizeof(struct ether_addr);
390 sall.sll_protocol = htons(ETH_P_ALL);
391 sall.sll_ifindex = ifr.ifr_ifindex;
393 /* For clarity */
394 hwaddr = (struct ether_addr *)if_hwaddr.ifr_hwaddr.sa_data;
396 switch (addr->ip.sin_family) {
397 case AF_INET:
398 /* Send gratuitous ARP */
399 ret = arp_build(buffer,
400 sizeof(buffer),
401 &addr->ip,
402 hwaddr,
403 false,
404 &ether_dhost,
405 &len);
406 if (ret != 0) {
407 DBG_ERR("Failed to build ARP request\n");
408 goto fail;
411 memcpy(&sall.sll_addr[0], ether_dhost, sall.sll_halen);
413 ret = sendto(s,
414 buffer,
415 len,
417 (struct sockaddr *)&sall,
418 sizeof(sall));
419 if (ret < 0 ) {
420 ret = errno;
421 DBG_ERR("Failed sendto\n");
422 goto fail;
425 /* Send unsolicited ARP reply */
426 ret = arp_build(buffer,
427 sizeof(buffer),
428 &addr->ip,
429 hwaddr,
430 true,
431 &ether_dhost,
432 &len);
433 if (ret != 0) {
434 DBG_ERR("Failed to build ARP reply\n");
435 goto fail;
438 memcpy(&sall.sll_addr[0], ether_dhost, sall.sll_halen);
440 ret = sendto(s,
441 buffer,
442 len,
444 (struct sockaddr *)&sall,
445 sizeof(sall));
446 if (ret < 0 ) {
447 ret = errno;
448 DBG_ERR("Failed sendto\n");
449 goto fail;
452 close(s);
453 break;
455 case AF_INET6:
456 ret = ip6_na_build(buffer,
457 sizeof(buffer),
458 &addr->ip6,
459 hwaddr,
460 &ether_dhost,
461 &len);
462 if (ret != 0) {
463 DBG_ERR("Failed to build IPv6 neighbor advertisement\n");
464 goto fail;
467 memcpy(&sall.sll_addr[0], ether_dhost, sall.sll_halen);
469 ret = sendto(s,
470 buffer,
471 len,
473 (struct sockaddr *)&sall,
474 sizeof(sall));
475 if (ret < 0 ) {
476 ret = errno;
477 DBG_ERR("Failed sendto\n");
478 goto fail;
481 close(s);
482 break;
484 default:
485 ret = EINVAL;
486 DBG_ERR("Not an ipv4/ipv6 address (family is %u)\n",
487 addr->ip.sin_family);
488 goto fail;
491 return 0;
493 fail:
494 close(s);
495 return ret;
498 #else /* HAVE_PACKETSOCKET */
500 int ctdb_sys_send_arp(const ctdb_sock_addr *addr, const char *iface)
502 /* Not implemented */
503 return ENOSYS;
506 #endif /* HAVE_PACKETSOCKET */
/*
 * Sizes of the bare IP header + TCP header packets built below.  The
 * sums are parenthesized so that the macros expand safely inside
 * larger expressions.
 */
#define IP4_TCP_BUFFER_SIZE (sizeof(struct ip) + \
			     sizeof(struct tcphdr))

#define IP6_TCP_BUFFER_SIZE (sizeof(struct ip6_hdr) + \
			     sizeof(struct tcphdr))
515 static int tcp4_build(uint8_t *buf,
516 size_t buflen,
517 const struct sockaddr_in *src,
518 const struct sockaddr_in *dst,
519 uint32_t seq,
520 uint32_t ack,
521 int rst,
522 size_t *len)
524 size_t l = IP4_TCP_BUFFER_SIZE;
525 struct {
526 struct ip ip;
527 struct tcphdr tcp;
528 } *ip4pkt;
530 if (l != sizeof(*ip4pkt)) {
531 return EMSGSIZE;
534 if (buflen < l) {
535 return EMSGSIZE;
538 ip4pkt = (void *)buf;
539 memset(ip4pkt, 0, l);
541 ip4pkt->ip.ip_v = 4;
542 ip4pkt->ip.ip_hl = sizeof(ip4pkt->ip)/sizeof(uint32_t);
543 ip4pkt->ip.ip_len = htons(sizeof(ip4pkt));
544 ip4pkt->ip.ip_ttl = 255;
545 ip4pkt->ip.ip_p = IPPROTO_TCP;
546 ip4pkt->ip.ip_src.s_addr = src->sin_addr.s_addr;
547 ip4pkt->ip.ip_dst.s_addr = dst->sin_addr.s_addr;
548 ip4pkt->ip.ip_sum = 0;
550 ip4pkt->tcp.th_sport = src->sin_port;
551 ip4pkt->tcp.th_dport = dst->sin_port;
552 ip4pkt->tcp.th_seq = seq;
553 ip4pkt->tcp.th_ack = ack;
554 ip4pkt->tcp.th_flags = 0;
555 ip4pkt->tcp.th_flags |= TH_ACK;
556 if (rst) {
557 ip4pkt->tcp.th_flags |= TH_RST;
559 ip4pkt->tcp.th_off = sizeof(ip4pkt->tcp)/sizeof(uint32_t);
560 /* this makes it easier to spot in a sniffer */
561 ip4pkt->tcp.th_win = htons(1234);
562 ip4pkt->tcp.th_sum = ip_checksum((uint8_t *)&ip4pkt->tcp,
563 sizeof(ip4pkt->tcp),
564 &ip4pkt->ip);
566 *len = l;
567 return 0;
570 static int tcp6_build(uint8_t *buf,
571 size_t buflen,
572 const struct sockaddr_in6 *src,
573 const struct sockaddr_in6 *dst,
574 uint32_t seq,
575 uint32_t ack,
576 int rst,
577 size_t *len)
579 size_t l = IP6_TCP_BUFFER_SIZE;
580 struct {
581 struct ip6_hdr ip6;
582 struct tcphdr tcp;
583 } *ip6pkt;
585 if (l != sizeof(*ip6pkt)) {
586 return EMSGSIZE;
589 if (buflen < l) {
590 return EMSGSIZE;
593 ip6pkt = (void *)buf;
594 memset(ip6pkt, 0, l);
596 ip6pkt->ip6.ip6_vfc = 6 << 4;
597 ip6pkt->ip6.ip6_plen = htons(sizeof(struct tcphdr));
598 ip6pkt->ip6.ip6_nxt = IPPROTO_TCP;
599 ip6pkt->ip6.ip6_hlim = 64;
600 ip6pkt->ip6.ip6_src = src->sin6_addr;
601 ip6pkt->ip6.ip6_dst = dst->sin6_addr;
603 ip6pkt->tcp.th_sport = src->sin6_port;
604 ip6pkt->tcp.th_dport = dst->sin6_port;
605 ip6pkt->tcp.th_seq = seq;
606 ip6pkt->tcp.th_ack = ack;
607 ip6pkt->tcp.th_flags = 0;
608 ip6pkt->tcp.th_flags |= TH_ACK;
609 if (rst) {
610 ip6pkt->tcp.th_flags |= TH_RST;
612 ip6pkt->tcp.th_off = sizeof(ip6pkt->tcp)/sizeof(uint32_t);
613 /* this makes it easier to spot in a sniffer */
614 ip6pkt->tcp.th_win = htons(1234);
615 ip6pkt->tcp.th_sum = ip6_checksum((uint8_t *)&ip6pkt->tcp,
616 sizeof(ip6pkt->tcp),
617 &ip6pkt->ip6);
619 *len = l;
620 return 0;
624 * Send tcp segment from the specified IP/port to the specified
625 * destination IP/port.
627 * This is used to trigger the receiving host into sending its own ACK,
628 * which should trigger early detection of TCP reset by the client
629 * after IP takeover
631 * This can also be used to send RST segments (if rst is true) and also
632 * if correct seq and ack numbers are provided.
634 int ctdb_sys_send_tcp(const ctdb_sock_addr *dest,
635 const ctdb_sock_addr *src,
636 uint32_t seq,
637 uint32_t ack,
638 int rst)
640 uint8_t buf[MAX(IP4_TCP_BUFFER_SIZE, IP6_TCP_BUFFER_SIZE)];
641 size_t len = 0;
642 int ret;
643 int s;
644 uint32_t one = 1;
645 struct sockaddr_in6 tmpdest = { 0 };
646 int saved_errno;
648 switch (src->ip.sin_family) {
649 case AF_INET:
650 ret = tcp4_build(buf,
651 sizeof(buf),
652 &src->ip,
653 &dest->ip,
654 seq,
655 ack,
656 rst,
657 &len);
658 if (ret != 0) {
659 DBG_ERR("Failed to build TCP packet (%d)\n", ret);
660 return ret;
663 /* open a raw socket to send this segment from */
664 s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
665 if (s == -1) {
666 DBG_ERR("Failed to open raw socket (%s)\n",
667 strerror(errno));
668 return -1;
671 ret = setsockopt(s, IPPROTO_IP, IP_HDRINCL, &one, sizeof(one));
672 if (ret != 0) {
673 DBG_ERR("Failed to setup IP headers (%s)\n",
674 strerror(errno));
675 close(s);
676 return -1;
679 ret = sendto(s,
680 buf,
681 len,
683 (const struct sockaddr *)&dest->ip,
684 sizeof(dest->ip));
685 saved_errno = errno;
686 close(s);
687 if (ret == -1) {
688 D_ERR("Failed sendto (%s)\n", strerror(saved_errno));
689 return -1;
691 if ((size_t)ret != len) {
692 DBG_ERR("Failed sendto - didn't send full packet\n");
693 return -1;
695 break;
697 case AF_INET6:
698 ret = tcp6_build(buf,
699 sizeof(buf),
700 &src->ip6,
701 &dest->ip6,
702 seq,
703 ack,
704 rst,
705 &len);
706 if (ret != 0) {
707 DBG_ERR("Failed to build TCP packet (%d)\n", ret);
708 return ret;
711 s = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
712 if (s == -1) {
713 DBG_ERR("Failed to open sending socket\n");
714 return -1;
718 * sendto() on an IPv6 raw socket requires the port to
719 * be either 0 or a protocol value
721 tmpdest = dest->ip6;
722 tmpdest.sin6_port = 0;
724 ret = sendto(s,
725 buf,
726 len,
728 (const struct sockaddr *)&tmpdest,
729 sizeof(tmpdest));
730 saved_errno = errno;
731 close(s);
732 if (ret == -1) {
733 D_ERR("Failed sendto (%s)\n", strerror(saved_errno));
734 return -1;
736 if ((size_t)ret != len) {
737 DBG_ERR("Failed sendto - didn't send full packet\n");
738 return -1;
740 break;
742 default:
743 DBG_ERR("Not an ipv4/v6 address\n");
744 return -1;
747 return 0;
/*
 * Extract addresses, ports and selected TCP header fields from a
 * captured IPv4 packet
 *
 * ip_pkt/pktlen: start of the IPv4 header and number of bytes available
 * src, dst:      filled with family, address and port (ports and
 *                addresses are copied as found in the packet)
 * ack_seq, seq:  set to th_ack and th_seq as found in the packet
 * rst:           if not NULL, set non-zero when TH_RST is set
 * window:        if not NULL, set to th_win as found in the packet
 *
 * Returns 0 on success, EMSGSIZE if the packet is too short, ENOMSG
 * if it is not an unfragmented, well-formed IPv4 TCP packet.
 */
static int tcp4_extract(const uint8_t *ip_pkt,
			size_t pktlen,
			struct sockaddr_in *src,
			struct sockaddr_in *dst,
			uint32_t *ack_seq,
			uint32_t *seq,
			int *rst,
			uint16_t *window)
{
	const struct ip *ip;
	const struct tcphdr *tcp;
	size_t ip_hdr_len;

	if (pktlen < sizeof(struct ip)) {
		return EMSGSIZE;
	}

	ip = (const struct ip *)ip_pkt;

	/* IPv4 only */
	if (ip->ip_v != 4) {
		return ENOMSG;
	}
	/* Don't look at fragments */
	if ((ntohs(ip->ip_off)&0x1fff) != 0) {
		return ENOMSG;
	}
	/* TCP only */
	if (ip->ip_p != IPPROTO_TCP) {
		return ENOMSG;
	}

	/*
	 * The header length comes from the wire: a value shorter than
	 * the fixed header is malformed and would make the TCP header
	 * overlap the IP header.
	 */
	ip_hdr_len = ip->ip_hl * sizeof(uint32_t);
	if (ip_hdr_len < sizeof(struct ip)) {
		return ENOMSG;
	}

	/* Ensure there is enough of the packet to gather required fields */
	if (pktlen < ip_hdr_len + offsetof(struct tcphdr, th_sum)) {
		return EMSGSIZE;
	}

	tcp = (const struct tcphdr *)(ip_pkt + ip_hdr_len);

	src->sin_family = AF_INET;
	src->sin_addr.s_addr = ip->ip_src.s_addr;
	src->sin_port = tcp->th_sport;

	dst->sin_family = AF_INET;
	dst->sin_addr.s_addr = ip->ip_dst.s_addr;
	dst->sin_port = tcp->th_dport;

	*ack_seq = tcp->th_ack;
	*seq = tcp->th_seq;
	if (window != NULL) {
		*window = tcp->th_win;
	}
	if (rst != NULL) {
		*rst = tcp->th_flags & TH_RST;
	}

	return 0;
}
/*
 * Extract addresses, ports and selected TCP header fields from a
 * captured IPv6 packet whose TCP header directly follows the fixed
 * IPv6 header
 *
 * ip_pkt/pktlen: start of the IPv6 header and number of bytes available
 * src, dst:      filled with family, address and port as found in
 *                the packet
 * ack_seq, seq:  set to th_ack and th_seq as found in the packet
 * rst:           if not NULL, set non-zero when TH_RST is set
 * window:        if not NULL, set to th_win as found in the packet
 *
 * Returns 0 on success, EMSGSIZE if the packet is too short, ENOMSG
 * if it is not IPv6 or the next header is not TCP.
 */
static int tcp6_extract(const uint8_t *ip_pkt,
			size_t pktlen,
			struct sockaddr_in6 *src,
			struct sockaddr_in6 *dst,
			uint32_t *ack_seq,
			uint32_t *seq,
			int *rst,
			uint16_t *window)
{
	const size_t needed = sizeof(struct ip6_hdr) +
			      offsetof(struct tcphdr, th_sum);
	const struct ip6_hdr *hdr6;
	const struct tcphdr *th;

	/* Ensure there is enough of the packet to gather required fields */
	if (pktlen < needed) {
		return EMSGSIZE;
	}

	hdr6 = (const struct ip6_hdr *)ip_pkt;

	/* IPv6 carrying TCP only */
	if ((hdr6->ip6_vfc >> 4) != 6 || hdr6->ip6_nxt != IPPROTO_TCP) {
		return ENOMSG;
	}

	th = (const struct tcphdr *)(ip_pkt + sizeof(struct ip6_hdr));

	src->sin6_family = AF_INET6;
	src->sin6_port = th->th_sport;
	src->sin6_addr = hdr6->ip6_src;

	dst->sin6_family = AF_INET6;
	dst->sin6_port = th->th_dport;
	dst->sin6_addr = hdr6->ip6_dst;

	*ack_seq = th->th_ack;
	*seq = th->th_seq;

	if (window != NULL) {
		*window = th->th_win;
	}
	if (rst != NULL) {
		*rst = th->th_flags & TH_RST;
	}

	return 0;
}
/*
 * Packet capture
 *
 * If AF_PACKET is available then use a raw socket otherwise use pcap.
 * wscript has checked to make sure that pcap is available if needed.
 */
867 #if defined(HAVE_AF_PACKET) && !defined(ENABLE_PCAP)
870 * This function is used to open a raw socket to capture from
872 int ctdb_sys_open_capture_socket(const char *iface, void **private_data)
874 int s, ret;
876 /* Open a socket to capture all traffic */
877 s = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
878 if (s == -1) {
879 DBG_ERR("Failed to open raw socket\n");
880 return -1;
883 DBG_DEBUG("Opened raw socket for TCP tickle capture (fd=%d)\n", s);
885 ret = set_blocking(s, false);
886 if (ret != 0) {
887 DBG_ERR("Failed to set socket non-blocking (%s)\n",
888 strerror(errno));
889 close(s);
890 return -1;
893 set_close_on_exec(s);
895 return s;
/*
 * This function is used to do any additional cleanup required when closing
 * a capture socket.
 * Note that the socket itself is closed automatically in the caller.
 *
 * The AF_PACKET variant keeps no private state, so there is nothing
 * to release and 0 is always returned.
 */
int ctdb_sys_close_capture_socket(void *private_data)
{
	return 0;
}
910 * called when the raw socket becomes readable
912 int ctdb_sys_read_tcp_packet(int s, void *private_data,
913 ctdb_sock_addr *src,
914 ctdb_sock_addr *dst,
915 uint32_t *ack_seq,
916 uint32_t *seq,
917 int *rst,
918 uint16_t *window)
920 ssize_t nread;
921 uint8_t pkt[100]; /* Large enough for simple ACK/RST packets */
922 struct ether_header *eth;
923 int ret;
925 nread = recv(s, pkt, sizeof(pkt), MSG_TRUNC);
926 if (nread == -1) {
927 return errno;
929 if ((size_t)nread < sizeof(*eth)) {
930 return EMSGSIZE;
933 ZERO_STRUCTP(src);
934 ZERO_STRUCTP(dst);
936 /* Ethernet */
937 eth = (struct ether_header *)pkt;
939 /* we want either IPv4 or IPv6 */
940 if (ntohs(eth->ether_type) == ETHERTYPE_IP) {
941 ret = tcp4_extract(pkt + sizeof(struct ether_header),
942 (size_t)nread - sizeof(struct ether_header),
943 &src->ip,
944 &dst->ip,
945 ack_seq,
946 seq,
947 rst,
948 window);
949 return ret;
951 } else if (ntohs(eth->ether_type) == ETHERTYPE_IP6) {
952 ret = tcp6_extract(pkt + sizeof(struct ether_header),
953 (size_t)nread - sizeof(struct ether_header),
954 &src->ip6,
955 &dst->ip6,
956 ack_seq,
957 seq,
958 rst,
959 window);
960 return ret;
963 return ENOMSG;
966 #else /* defined(HAVE_AF_PACKET) && !defined(ENABLE_PCAP) */
968 #include <pcap.h>
/*
 * Assume this exists if pcap.h exists - it has been around for a
 * while
 */
974 #include <pcap/sll.h>
976 int ctdb_sys_open_capture_socket(const char *iface, void **private_data)
978 char errbuf[PCAP_ERRBUF_SIZE];
979 pcap_t *pt;
980 int pcap_packet_type;
981 const char *t = NULL;
982 int fd;
983 int ret;
985 pt = pcap_create(iface, errbuf);
986 if (pt == NULL) {
987 DBG_ERR("Failed to open pcap capture device %s (%s)\n",
988 iface,
989 errbuf);
990 return -1;
993 * pcap isn't very clear about defaults...
995 ret = pcap_set_snaplen(pt, 100);
996 if (ret < 0) {
997 DBG_ERR("Failed to set snaplen for pcap capture\n");
998 goto fail;
1000 ret = pcap_set_promisc(pt, 0);
1001 if (ret < 0) {
1002 DBG_ERR("Failed to unset promiscuous mode for pcap capture\n");
1003 goto fail;
1005 ret = pcap_set_timeout(pt, 0);
1006 if (ret < 0) {
1007 DBG_ERR("Failed to set timeout for pcap capture\n");
1008 goto fail;
1010 #ifdef HAVE_PCAP_SET_IMMEDIATE_MODE
1011 ret = pcap_set_immediate_mode(pt, 1);
1012 if (ret < 0) {
1013 DBG_ERR("Failed to set immediate mode for pcap capture\n");
1014 goto fail;
1016 #endif
1017 ret = pcap_activate(pt);
1018 if (ret < 0) {
1019 DBG_ERR("Failed to activate pcap capture\n");
1020 goto fail;
1023 pcap_packet_type = pcap_datalink(pt);
1024 switch (pcap_packet_type) {
1025 case DLT_EN10MB:
1026 t = "DLT_EN10MB";
1027 break;
1028 case DLT_LINUX_SLL:
1029 t = "DLT_LINUX_SLL";
1030 break;
1031 #ifdef DLT_LINUX_SLL2
1032 case DLT_LINUX_SLL2:
1033 t = "DLT_LINUX_SLL2";
1034 break;
1035 #endif /* DLT_LINUX_SLL2 */
1036 default:
1037 DBG_ERR("Unknown pcap packet type %d\n", pcap_packet_type);
1038 goto fail;
1041 fd = pcap_get_selectable_fd(pt);
1042 DBG_DEBUG("Opened pcap capture for TCP tickle (type=%s, fd=%d)\n",
1044 fd);
1046 *((pcap_t **)private_data) = pt;
1047 return fd;
1049 fail:
1050 pcap_close(pt);
1051 return -1;
1054 int ctdb_sys_close_capture_socket(void *private_data)
1056 pcap_t *pt = (pcap_t *)private_data;
1057 pcap_close(pt);
1058 return 0;
1061 int ctdb_sys_read_tcp_packet(int s,
1062 void *private_data,
1063 ctdb_sock_addr *src,
1064 ctdb_sock_addr *dst,
1065 uint32_t *ack_seq,
1066 uint32_t *seq,
1067 int *rst,
1068 uint16_t *window)
1070 int ret;
1071 struct pcap_pkthdr pkthdr;
1072 const u_char *buffer;
1073 pcap_t *pt = (pcap_t *)private_data;
1074 int pcap_packet_type;
1075 uint16_t ether_type;
1076 size_t ll_hdr_len;
1078 buffer=pcap_next(pt, &pkthdr);
1079 if (buffer==NULL) {
1080 return ENOMSG;
1083 ZERO_STRUCTP(src);
1084 ZERO_STRUCTP(dst);
1086 pcap_packet_type = pcap_datalink(pt);
1087 switch (pcap_packet_type) {
1088 case DLT_EN10MB: {
1089 const struct ether_header *eth =
1090 (const struct ether_header *)buffer;
1091 ether_type = ntohs(eth->ether_type);
1092 ll_hdr_len = sizeof(struct ether_header);
1093 break;
1095 case DLT_LINUX_SLL: {
1096 const struct sll_header *sll =
1097 (const struct sll_header *)buffer;
1098 uint16_t arphrd_type = ntohs(sll->sll_hatype);
1099 switch (arphrd_type) {
1100 case ARPHRD_ETHER:
1101 case ARPHRD_INFINIBAND:
1102 break;
1103 default:
1104 DBG_DEBUG("SLL: Unknown arphrd_type %"PRIu16"\n",
1105 arphrd_type);
1106 return EPROTONOSUPPORT;
1108 ether_type = ntohs(sll->sll_protocol);
1109 ll_hdr_len = SLL_HDR_LEN;
1110 break;
1112 #ifdef DLT_LINUX_SLL2
1113 case DLT_LINUX_SLL2: {
1114 const struct sll2_header *sll2 =
1115 (const struct sll2_header *)buffer;
1116 uint16_t arphrd_type = ntohs(sll2->sll2_hatype);
1117 switch (arphrd_type) {
1118 case ARPHRD_ETHER:
1119 case ARPHRD_INFINIBAND:
1120 break;
1121 default:
1122 DBG_DEBUG("SLL2: Unknown arphrd_type %"PRIu16"\n",
1123 arphrd_type);
1124 return EPROTONOSUPPORT;
1126 ether_type = ntohs(sll2->sll2_protocol);
1127 ll_hdr_len = SLL2_HDR_LEN;
1128 break;
1130 #endif /* DLT_LINUX_SLL2 */
1131 default:
1132 DBG_DEBUG("Unknown pcap packet type %d\n", pcap_packet_type);
1133 return EPROTONOSUPPORT;
1136 switch (ether_type) {
1137 case ETHERTYPE_IP:
1138 ret = tcp4_extract(buffer + ll_hdr_len,
1139 (size_t)pkthdr.caplen - ll_hdr_len,
1140 &src->ip,
1141 &dst->ip,
1142 ack_seq,
1143 seq,
1144 rst,
1145 window);
1146 break;
1147 case ETHERTYPE_IP6:
1148 ret = tcp6_extract(buffer + ll_hdr_len,
1149 (size_t)pkthdr.caplen - ll_hdr_len,
1150 &src->ip6,
1151 &dst->ip6,
1152 ack_seq,
1153 seq,
1154 rst,
1155 window);
1156 break;
1157 case ETHERTYPE_ARP:
1158 /* Silently ignore ARP packets */
1159 return EPROTO;
1160 default:
1161 DBG_DEBUG("Unknown ether type %"PRIu16"\n", ether_type);
1162 return EPROTO;
1165 return ret;
1168 #endif /* defined(HAVE_AF_PACKET) && !defined(ENABLE_PCAP) */