2 ctdb system specific code to manage raw sockets on linux
4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, see <http://www.gnu.org/licenses/>.
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/wait.h"
26 #include "lib/util/debug.h"
27 #include "lib/util/blocking.h"
29 #include "protocol/protocol.h"
31 #include <netinet/if_ether.h>
32 #include <netinet/ip6.h>
33 #include <netinet/icmp6.h>
34 #include <net/if_arp.h>
35 #include <netpacket/packet.h>
36 #include <sys/prctl.h>
38 #include "common/logging.h"
39 #include "common/system.h"
42 #define ETHERTYPE_IP6 0x86dd
/*
  calculate the tcp checksum for tcp over ipv6

  data/n describe the upper-layer payload (header included) in bytes,
  ip6 supplies the addresses and next-header value for the pseudo-header.
  The result is returned ready to be stored in the packet; an all-zero
  result is replaced by 0xFFFF.
*/
static uint16_t tcp_checksum6(uint16_t *data, size_t n, struct ip6_hdr *ip6)
{
	/* tail of the pseudo-header: upper-layer length, then next header */
	uint32_t pseudo[2] = { htonl(n), htonl(ip6->ip6_nxt) };
	uint32_t acc = 0;
	uint16_t folded;
	int pass;

	acc += uint16_checksum((uint16_t *)(void *)&ip6->ip6_src, 16);
	acc += uint16_checksum((uint16_t *)(void *)&ip6->ip6_dst, 16);
	acc += uint16_checksum((uint16_t *)pseudo, 8);
	acc += uint16_checksum(data, n);

	/* fold the carries back in; two passes are enough for 32 bits */
	for (pass = 0; pass < 2; pass++) {
		acc = (acc & 0xFFFF) + (acc >> 16);
	}

	folded = htons(acc);
	folded = ~folded;

	return (folded == 0) ? 0xFFFF : folded;
}
74 send gratuitous arp reply after we have taken over an ip address
76 saddr is the address we are trying to claim
77 iface is the interface name we will be using to claim the address
79 int ctdb_sys_send_arp(const ctdb_sock_addr
*addr
, const char *iface
)
82 struct sockaddr_ll sall
;
83 struct ether_header
*eh
;
86 struct nd_neighbor_advert
*nd_na
;
87 struct nd_opt_hdr
*nd_oh
;
88 struct ifreq if_hwaddr
;
89 /* Size of IPv6 neighbor advertisement (with option) */
90 unsigned char buffer
[sizeof(struct ether_header
) +
91 sizeof(struct ip6_hdr
) +
92 sizeof(struct nd_neighbor_advert
) +
93 sizeof(struct nd_opt_hdr
) + ETH_ALEN
];
95 char bdcast
[] = {0xff,0xff,0xff,0xff,0xff,0xff};
100 ZERO_STRUCT(if_hwaddr
);
102 switch (addr
->ip
.sin_family
) {
104 s
= socket(AF_PACKET
, SOCK_RAW
, 0);
106 DEBUG(DEBUG_CRIT
,(__location__
" failed to open raw socket\n"));
110 DEBUG(DEBUG_DEBUG
, (__location__
" Created SOCKET FD:%d for sending arp\n", s
));
111 strlcpy(ifr
.ifr_name
, iface
, sizeof(ifr
.ifr_name
));
112 if (ioctl(s
, SIOCGIFINDEX
, &ifr
) < 0) {
113 DEBUG(DEBUG_CRIT
,(__location__
" interface '%s' not found\n", iface
));
118 /* get the mac address */
119 strlcpy(if_hwaddr
.ifr_name
, iface
, sizeof(if_hwaddr
.ifr_name
));
120 ret
= ioctl(s
, SIOCGIFHWADDR
, &if_hwaddr
);
123 DEBUG(DEBUG_CRIT
,(__location__
" ioctl failed\n"));
126 if (ARPHRD_LOOPBACK
== if_hwaddr
.ifr_hwaddr
.sa_family
) {
127 DEBUG(DEBUG_DEBUG
,("Ignoring loopback arp request\n"));
131 if (if_hwaddr
.ifr_hwaddr
.sa_family
!= ARPHRD_ETHER
) {
134 DEBUG(DEBUG_CRIT
,(__location__
" not an ethernet address family (0x%x)\n",
135 if_hwaddr
.ifr_hwaddr
.sa_family
));
140 memset(buffer
, 0 , 64);
141 eh
= (struct ether_header
*)buffer
;
142 memset(eh
->ether_dhost
, 0xff, ETH_ALEN
);
143 memcpy(eh
->ether_shost
, if_hwaddr
.ifr_hwaddr
.sa_data
, ETH_ALEN
);
144 eh
->ether_type
= htons(ETHERTYPE_ARP
);
146 ah
= (struct arphdr
*)&buffer
[sizeof(struct ether_header
)];
147 ah
->ar_hrd
= htons(ARPHRD_ETHER
);
148 ah
->ar_pro
= htons(ETH_P_IP
);
149 ah
->ar_hln
= ETH_ALEN
;
152 /* send a gratious arp */
153 ah
->ar_op
= htons(ARPOP_REQUEST
);
154 ptr
= (char *)&ah
[1];
155 memcpy(ptr
, if_hwaddr
.ifr_hwaddr
.sa_data
, ETH_ALEN
);
157 memcpy(ptr
, &addr
->ip
.sin_addr
, 4);
159 memset(ptr
, 0, ETH_ALEN
);
161 memcpy(ptr
, &addr
->ip
.sin_addr
, 4);
164 sall
.sll_family
= AF_PACKET
;
166 memcpy(&sall
.sll_addr
[0], bdcast
, sall
.sll_halen
);
167 sall
.sll_protocol
= htons(ETH_P_ALL
);
168 sall
.sll_ifindex
= ifr
.ifr_ifindex
;
169 ret
= sendto(s
, buffer
, 64, 0, (struct sockaddr
*)&sall
, sizeof(sall
));
172 DEBUG(DEBUG_CRIT
,(__location__
" failed sendto\n"));
176 /* send unsolicited arp reply broadcast */
177 ah
->ar_op
= htons(ARPOP_REPLY
);
178 ptr
= (char *)&ah
[1];
179 memcpy(ptr
, if_hwaddr
.ifr_hwaddr
.sa_data
, ETH_ALEN
);
181 memcpy(ptr
, &addr
->ip
.sin_addr
, 4);
183 memcpy(ptr
, if_hwaddr
.ifr_hwaddr
.sa_data
, ETH_ALEN
);
185 memcpy(ptr
, &addr
->ip
.sin_addr
, 4);
188 ret
= sendto(s
, buffer
, 64, 0, (struct sockaddr
*)&sall
, sizeof(sall
));
190 DEBUG(DEBUG_CRIT
,(__location__
" failed sendto\n"));
198 s
= socket(AF_PACKET
, SOCK_RAW
, 0);
200 DEBUG(DEBUG_CRIT
,(__location__
" failed to open raw socket\n"));
204 DEBUG(DEBUG_DEBUG
, (__location__
" Created SOCKET FD:%d for sending arp\n", s
));
205 strlcpy(ifr
.ifr_name
, iface
, sizeof(ifr
.ifr_name
));
206 if (ioctl(s
, SIOCGIFINDEX
, &ifr
) < 0) {
207 DEBUG(DEBUG_CRIT
,(__location__
" interface '%s' not found\n", iface
));
212 /* get the mac address */
213 strlcpy(if_hwaddr
.ifr_name
, iface
, sizeof(if_hwaddr
.ifr_name
));
214 ret
= ioctl(s
, SIOCGIFHWADDR
, &if_hwaddr
);
217 DEBUG(DEBUG_CRIT
,(__location__
" ioctl failed\n"));
220 if (ARPHRD_LOOPBACK
== if_hwaddr
.ifr_hwaddr
.sa_family
) {
221 DEBUG(DEBUG_DEBUG
,("Ignoring loopback arp request\n"));
225 if (if_hwaddr
.ifr_hwaddr
.sa_family
!= ARPHRD_ETHER
) {
228 DEBUG(DEBUG_CRIT
,(__location__
" not an ethernet address family (0x%x)\n",
229 if_hwaddr
.ifr_hwaddr
.sa_family
));
233 memset(buffer
, 0 , sizeof(buffer
));
234 eh
= (struct ether_header
*)buffer
;
235 /* Ethernet multicast: 33:33:00:00:00:01 (see RFC2464,
236 * section 7) - note zeroes above! */
237 eh
->ether_dhost
[0] = eh
->ether_dhost
[1] = 0x33;
238 eh
->ether_dhost
[5] = 0x01;
239 memcpy(eh
->ether_shost
, if_hwaddr
.ifr_hwaddr
.sa_data
, ETH_ALEN
);
240 eh
->ether_type
= htons(ETHERTYPE_IP6
);
242 ip6
= (struct ip6_hdr
*)(eh
+1);
244 ip6
->ip6_plen
= htons(sizeof(*nd_na
) +
245 sizeof(struct nd_opt_hdr
) +
247 ip6
->ip6_nxt
= IPPROTO_ICMPV6
;
249 ip6
->ip6_src
= addr
->ip6
.sin6_addr
;
250 /* all-nodes multicast */
252 ret
= inet_pton(AF_INET6
, "ff02::1", &ip6
->ip6_dst
);
255 DEBUG(DEBUG_CRIT
,(__location__
" failed inet_pton\n"));
259 nd_na
= (struct nd_neighbor_advert
*)(ip6
+1);
260 nd_na
->nd_na_type
= ND_NEIGHBOR_ADVERT
;
261 nd_na
->nd_na_code
= 0;
262 nd_na
->nd_na_flags_reserved
= ND_NA_FLAG_OVERRIDE
;
263 nd_na
->nd_na_target
= addr
->ip6
.sin6_addr
;
264 /* Option: Target link-layer address */
265 nd_oh
= (struct nd_opt_hdr
*)(nd_na
+1);
266 nd_oh
->nd_opt_type
= ND_OPT_TARGET_LINKADDR
;
267 nd_oh
->nd_opt_len
= 1;
268 memcpy(&(nd_oh
+1)[0], if_hwaddr
.ifr_hwaddr
.sa_data
, ETH_ALEN
);
270 nd_na
->nd_na_cksum
= tcp_checksum6((uint16_t *)nd_na
,
271 ntohs(ip6
->ip6_plen
), ip6
);
273 sall
.sll_family
= AF_PACKET
;
275 memcpy(&sall
.sll_addr
[0], &eh
->ether_dhost
[0], sall
.sll_halen
);
276 sall
.sll_protocol
= htons(ETH_P_ALL
);
277 sall
.sll_ifindex
= ifr
.ifr_ifindex
;
278 ret
= sendto(s
, buffer
, sizeof(buffer
),
279 0, (struct sockaddr
*)&sall
, sizeof(sall
));
282 DEBUG(DEBUG_CRIT
,(__location__
" failed sendto\n"));
289 DEBUG(DEBUG_CRIT
,(__location__
" not an ipv4/ipv6 address (family is %u)\n", addr
->ip
.sin_family
));
/*
  simple TCP checksum - assumes data is multiple of 2 bytes long

  data/n describe the TCP segment in bytes, ip supplies the addresses and
  protocol for the pseudo-header.  An all-zero result is replaced by 0xFFFF.
 */
static uint16_t tcp_checksum(uint16_t *data, size_t n, struct iphdr *ip)
{
	uint32_t acc;
	uint16_t folded;
	int pass;

	/* pseudo-header: source, destination, protocol and segment length */
	acc  = ip->protocol + n;
	acc += uint16_checksum((uint16_t *)(void *)&ip->saddr,
			       sizeof(ip->saddr));
	acc += uint16_checksum((uint16_t *)(void *)&ip->daddr,
			       sizeof(ip->daddr));
	acc += uint16_checksum(data, n);

	/* fold the carries back in; two passes are enough for 32 bits */
	for (pass = 0; pass < 2; pass++) {
		acc = (acc & 0xFFFF) + (acc >> 16);
	}

	folded = htons(acc);
	folded = ~folded;

	return (folded == 0) ? 0xFFFF : folded;
}
320 Send tcp segment from the specified IP/port to the specified
323 This is used to trigger the receiving host into sending its own ACK,
324 which should trigger early detection of TCP reset by the client
327 This can also be used to send RST segments (if rst is true) and also
328 if correct seq and ack numbers are provided.
330 int ctdb_sys_send_tcp(const ctdb_sock_addr
*dest
,
331 const ctdb_sock_addr
*src
,
332 uint32_t seq
, uint32_t ack
, int rst
)
338 ctdb_sock_addr
*tmpdest
;
349 switch (src
->ip
.sin_family
) {
352 ip4pkt
.ip
.version
= 4;
353 ip4pkt
.ip
.ihl
= sizeof(ip4pkt
.ip
)/4;
354 ip4pkt
.ip
.tot_len
= htons(sizeof(ip4pkt
));
356 ip4pkt
.ip
.protocol
= IPPROTO_TCP
;
357 ip4pkt
.ip
.saddr
= src
->ip
.sin_addr
.s_addr
;
358 ip4pkt
.ip
.daddr
= dest
->ip
.sin_addr
.s_addr
;
361 ip4pkt
.tcp
.source
= src
->ip
.sin_port
;
362 ip4pkt
.tcp
.dest
= dest
->ip
.sin_port
;
363 ip4pkt
.tcp
.seq
= seq
;
364 ip4pkt
.tcp
.ack_seq
= ack
;
369 ip4pkt
.tcp
.doff
= sizeof(ip4pkt
.tcp
)/4;
370 /* this makes it easier to spot in a sniffer */
371 ip4pkt
.tcp
.window
= htons(1234);
372 ip4pkt
.tcp
.check
= tcp_checksum((uint16_t *)&ip4pkt
.tcp
, sizeof(ip4pkt
.tcp
), &ip4pkt
.ip
);
374 /* open a raw socket to send this segment from */
375 s
= socket(AF_INET
, SOCK_RAW
, IPPROTO_RAW
);
377 DEBUG(DEBUG_CRIT
,(__location__
" failed to open raw socket (%s)\n",
382 ret
= setsockopt(s
, SOL_IP
, IP_HDRINCL
, &one
, sizeof(one
));
384 DEBUG(DEBUG_CRIT
,(__location__
" failed to setup IP headers (%s)\n",
390 ret
= sendto(s
, &ip4pkt
, sizeof(ip4pkt
), 0,
391 (const struct sockaddr
*)&dest
->ip
,
395 if (ret
!= sizeof(ip4pkt
)) {
397 ("Failed sendto (%s)\n", strerror(saved_errno
)));
403 ip6pkt
.ip6
.ip6_vfc
= 0x60;
404 ip6pkt
.ip6
.ip6_plen
= htons(20);
405 ip6pkt
.ip6
.ip6_nxt
= IPPROTO_TCP
;
406 ip6pkt
.ip6
.ip6_hlim
= 64;
407 ip6pkt
.ip6
.ip6_src
= src
->ip6
.sin6_addr
;
408 ip6pkt
.ip6
.ip6_dst
= dest
->ip6
.sin6_addr
;
410 ip6pkt
.tcp
.source
= src
->ip6
.sin6_port
;
411 ip6pkt
.tcp
.dest
= dest
->ip6
.sin6_port
;
412 ip6pkt
.tcp
.seq
= seq
;
413 ip6pkt
.tcp
.ack_seq
= ack
;
418 ip6pkt
.tcp
.doff
= sizeof(ip6pkt
.tcp
)/4;
419 /* this makes it easier to spot in a sniffer */
420 ip6pkt
.tcp
.window
= htons(1234);
421 ip6pkt
.tcp
.check
= tcp_checksum6((uint16_t *)&ip6pkt
.tcp
, sizeof(ip6pkt
.tcp
), &ip6pkt
.ip6
);
423 s
= socket(AF_INET6
, SOCK_RAW
, IPPROTO_RAW
);
425 DEBUG(DEBUG_CRIT
, (__location__
" Failed to open sending socket\n"));
429 /* sendto() don't like if the port is set and the socket is
432 tmpdest
= discard_const(dest
);
433 tmpport
= tmpdest
->ip6
.sin6_port
;
435 tmpdest
->ip6
.sin6_port
= 0;
436 ret
= sendto(s
, &ip6pkt
, sizeof(ip6pkt
), 0,
437 (const struct sockaddr
*)&dest
->ip6
,
440 tmpdest
->ip6
.sin6_port
= tmpport
;
443 if (ret
!= sizeof(ip6pkt
)) {
445 ("Failed sendto (%s)\n", strerror(saved_errno
)));
451 DEBUG(DEBUG_CRIT
,(__location__
" not an ipv4/v6 address\n"));
459 This function is used to open a raw socket to capture from
461 int ctdb_sys_open_capture_socket(const char *iface
, void **private_data
)
465 /* Open a socket to capture all traffic */
466 s
= socket(AF_PACKET
, SOCK_RAW
, htons(ETH_P_ALL
));
468 DEBUG(DEBUG_CRIT
,(__location__
" failed to open raw socket\n"));
472 DEBUG(DEBUG_DEBUG
, (__location__
" Created RAW SOCKET FD:%d for tcp tickle\n", s
));
474 ret
= set_blocking(s
, false);
478 " failed to set socket non-blocking (%s)\n",
484 set_close_on_exec(s
);
/*
  This function is used to do any additional cleanup required when closing
  a capture socket.
  Note that the socket itself is closed automatically in the caller.

  Nothing extra is needed here, so this always returns 0.
*/
int ctdb_sys_close_capture_socket(void *private_data)
{
	(void)private_data;

	return 0;
}
501 called when the raw socket becomes readable
503 int ctdb_sys_read_tcp_packet(int s
, void *private_data
,
504 ctdb_sock_addr
*src
, ctdb_sock_addr
*dst
,
505 uint32_t *ack_seq
, uint32_t *seq
,
506 int *rst
, uint16_t *window
)
509 #define RCVPKTSIZE 100
510 char pkt
[RCVPKTSIZE
];
511 struct ether_header
*eth
;
516 ret
= recv(s
, pkt
, RCVPKTSIZE
, MSG_TRUNC
);
517 if (ret
< sizeof(*eth
)+sizeof(*ip
)) {
522 eth
= (struct ether_header
*)pkt
;
524 /* we want either IPv4 or IPv6 */
525 if (ntohs(eth
->ether_type
) == ETHERTYPE_IP
) {
527 ip
= (struct iphdr
*)(eth
+1);
529 /* We only want IPv4 packets */
530 if (ip
->version
!= 4) {
533 /* Dont look at fragments */
534 if ((ntohs(ip
->frag_off
)&0x1fff) != 0) {
537 /* we only want TCP */
538 if (ip
->protocol
!= IPPROTO_TCP
) {
542 /* make sure its not a short packet */
543 if (offsetof(struct tcphdr
, ack_seq
) + 4 +
544 (ip
->ihl
*4) + sizeof(*eth
) > ret
) {
548 tcp
= (struct tcphdr
*)((ip
->ihl
*4) + (char *)ip
);
550 /* tell the caller which one we've found */
551 src
->ip
.sin_family
= AF_INET
;
552 src
->ip
.sin_addr
.s_addr
= ip
->saddr
;
553 src
->ip
.sin_port
= tcp
->source
;
554 dst
->ip
.sin_family
= AF_INET
;
555 dst
->ip
.sin_addr
.s_addr
= ip
->daddr
;
556 dst
->ip
.sin_port
= tcp
->dest
;
557 *ack_seq
= tcp
->ack_seq
;
559 if (window
!= NULL
) {
560 *window
= tcp
->window
;
567 } else if (ntohs(eth
->ether_type
) == ETHERTYPE_IP6
) {
569 ip6
= (struct ip6_hdr
*)(eth
+1);
571 /* we only want TCP */
572 if (ip6
->ip6_nxt
!= IPPROTO_TCP
) {
577 tcp
= (struct tcphdr
*)(ip6
+1);
579 /* tell the caller which one we've found */
580 src
->ip6
.sin6_family
= AF_INET6
;
581 src
->ip6
.sin6_port
= tcp
->source
;
582 src
->ip6
.sin6_addr
= ip6
->ip6_src
;
584 dst
->ip6
.sin6_family
= AF_INET6
;
585 dst
->ip6
.sin6_port
= tcp
->dest
;
586 dst
->ip6
.sin6_addr
= ip6
->ip6_dst
;
588 *ack_seq
= tcp
->ack_seq
;
590 if (window
!= NULL
) {
591 *window
= tcp
->window
;
604 bool ctdb_sys_check_iface_exists(const char *iface
)
609 s
= socket(AF_PACKET
, SOCK_RAW
, 0);
611 /* We don't know if the interface exists, so assume yes */
612 DEBUG(DEBUG_CRIT
,(__location__
" failed to open raw socket\n"));
616 strlcpy(ifr
.ifr_name
, iface
, sizeof(ifr
.ifr_name
));
617 if (ioctl(s
, SIOCGIFINDEX
, &ifr
) < 0 && errno
== ENODEV
) {
618 DEBUG(DEBUG_CRIT
,(__location__
" interface '%s' not found\n", iface
));
627 int ctdb_get_peer_pid(const int fd
, pid_t
*peer_pid
)
630 socklen_t crl
= sizeof(struct ucred
);
632 if ((ret
= getsockopt(fd
, SOL_SOCKET
, SO_PEERCRED
, &cr
, &crl
)) == 0) {