2 ctdb system specific code to manage raw sockets on linux
4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, see <http://www.gnu.org/licenses/>.
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/wait.h"
25 #include "../include/ctdb_private.h"
26 #include <netinet/if_ether.h>
27 #include <netinet/ip6.h>
28 #include <netinet/icmp6.h>
29 #include <net/if_arp.h>
30 #include <netpacket/packet.h>
31 #include <sys/prctl.h>
34 #define ETHERTYPE_IP6 0x86dd
38 calculate the tcp checksum for tcp over ipv6
40 static uint16_t tcp_checksum6(uint16_t *data
, size_t n
, struct ip6_hdr
*ip6
)
46 sum
+= uint16_checksum((uint16_t *)(void *)&ip6
->ip6_src
, 16);
47 sum
+= uint16_checksum((uint16_t *)(void *)&ip6
->ip6_dst
, 16);
50 phdr
[1] = htonl(ip6
->ip6_nxt
);
51 sum
+= uint16_checksum((uint16_t *)phdr
, 8);
53 sum
+= uint16_checksum(data
, n
);
55 sum
= (sum
& 0xFFFF) + (sum
>> 16);
56 sum
= (sum
& 0xFFFF) + (sum
>> 16);
66 send gratuitous arp reply after we have taken over an ip address
68 addr is the address we are trying to claim
69 iface is the interface name we will be using to claim the address
71 int ctdb_sys_send_arp(const ctdb_sock_addr
*addr
, const char *iface
)
74 struct sockaddr_ll sall
;
75 struct ether_header
*eh
;
78 struct nd_neighbor_solicit
*nd_ns
;
79 struct ifreq if_hwaddr
;
80 unsigned char buffer
[78]; /* ipv6 neigh solicitation size */
82 char bdcast
[] = {0xff,0xff,0xff,0xff,0xff,0xff};
87 ZERO_STRUCT(if_hwaddr
);
89 switch (addr
->ip
.sin_family
) {
91 s
= socket(PF_PACKET
, SOCK_RAW
, htons(ETHERTYPE_ARP
));
93 DEBUG(DEBUG_CRIT
,(__location__
" failed to open raw socket\n"));
97 DEBUG(DEBUG_DEBUG
, (__location__
" Created SOCKET FD:%d for sending arp\n", s
));
98 strncpy(ifr
.ifr_name
, iface
, sizeof(ifr
.ifr_name
)-1);
99 if (ioctl(s
, SIOCGIFINDEX
, &ifr
) < 0) {
100 DEBUG(DEBUG_CRIT
,(__location__
" interface '%s' not found\n", iface
));
105 /* get the mac address */
106 strncpy(if_hwaddr
.ifr_name
, iface
, sizeof(if_hwaddr
.ifr_name
)-1);
107 ret
= ioctl(s
, SIOCGIFHWADDR
, &if_hwaddr
);
110 DEBUG(DEBUG_CRIT
,(__location__
" ioctl failed\n"));
113 if (ARPHRD_LOOPBACK
== if_hwaddr
.ifr_hwaddr
.sa_family
) {
114 DEBUG(DEBUG_DEBUG
,("Ignoring loopback arp request\n"));
118 if (if_hwaddr
.ifr_hwaddr
.sa_family
!= AF_LOCAL
) {
121 DEBUG(DEBUG_CRIT
,(__location__
" not an ethernet address family (0x%x)\n",
122 if_hwaddr
.ifr_hwaddr
.sa_family
));
127 memset(buffer
, 0 , 64);
128 eh
= (struct ether_header
*)buffer
;
129 memset(eh
->ether_dhost
, 0xff, ETH_ALEN
);
130 memcpy(eh
->ether_shost
, if_hwaddr
.ifr_hwaddr
.sa_data
, ETH_ALEN
);
131 eh
->ether_type
= htons(ETHERTYPE_ARP
);
133 ah
= (struct arphdr
*)&buffer
[sizeof(struct ether_header
)];
134 ah
->ar_hrd
= htons(ARPHRD_ETHER
);
135 ah
->ar_pro
= htons(ETH_P_IP
);
136 ah
->ar_hln
= ETH_ALEN
;
139 /* send a gratuitous arp */
140 ah
->ar_op
= htons(ARPOP_REQUEST
);
141 ptr
= (char *)&ah
[1];
142 memcpy(ptr
, if_hwaddr
.ifr_hwaddr
.sa_data
, ETH_ALEN
);
144 memcpy(ptr
, &addr
->ip
.sin_addr
, 4);
146 memset(ptr
, 0, ETH_ALEN
);
148 memcpy(ptr
, &addr
->ip
.sin_addr
, 4);
151 sall
.sll_family
= AF_PACKET
;
153 memcpy(&sall
.sll_addr
[0], bdcast
, sall
.sll_halen
);
154 sall
.sll_protocol
= htons(ETH_P_ALL
);
155 sall
.sll_ifindex
= ifr
.ifr_ifindex
;
156 ret
= sendto(s
, buffer
, 64, 0, (struct sockaddr
*)&sall
, sizeof(sall
));
159 DEBUG(DEBUG_CRIT
,(__location__
" failed sendto\n"));
163 /* send unsolicited arp reply broadcast */
164 ah
->ar_op
= htons(ARPOP_REPLY
);
165 ptr
= (char *)&ah
[1];
166 memcpy(ptr
, if_hwaddr
.ifr_hwaddr
.sa_data
, ETH_ALEN
);
168 memcpy(ptr
, &addr
->ip
.sin_addr
, 4);
170 memcpy(ptr
, if_hwaddr
.ifr_hwaddr
.sa_data
, ETH_ALEN
);
172 memcpy(ptr
, &addr
->ip
.sin_addr
, 4);
175 ret
= sendto(s
, buffer
, 64, 0, (struct sockaddr
*)&sall
, sizeof(sall
));
177 DEBUG(DEBUG_CRIT
,(__location__
" failed sendto\n"));
185 s
= socket(PF_PACKET
, SOCK_RAW
, htons(ETHERTYPE_ARP
));
187 DEBUG(DEBUG_CRIT
,(__location__
" failed to open raw socket\n"));
191 DEBUG(DEBUG_DEBUG
, (__location__
" Created SOCKET FD:%d for sending arp\n", s
));
192 strncpy(ifr
.ifr_name
, iface
, sizeof(ifr
.ifr_name
));
193 if (ioctl(s
, SIOCGIFINDEX
, &ifr
) < 0) {
194 DEBUG(DEBUG_CRIT
,(__location__
" interface '%s' not found\n", iface
));
199 /* get the mac address */
200 strncpy(if_hwaddr
.ifr_name
, iface
, sizeof(if_hwaddr
.ifr_name
)-1);
201 ret
= ioctl(s
, SIOCGIFHWADDR
, &if_hwaddr
);
204 DEBUG(DEBUG_CRIT
,(__location__
" ioctl failed\n"));
207 if (ARPHRD_LOOPBACK
== if_hwaddr
.ifr_hwaddr
.sa_family
) {
208 DEBUG(DEBUG_DEBUG
,("Ignoring loopback arp request\n"));
212 if (if_hwaddr
.ifr_hwaddr
.sa_family
!= AF_LOCAL
) {
215 DEBUG(DEBUG_CRIT
,(__location__
" not an ethernet address family (0x%x)\n",
216 if_hwaddr
.ifr_hwaddr
.sa_family
));
220 memset(buffer
, 0 , sizeof(buffer
));
221 eh
= (struct ether_header
*)buffer
;
222 memset(eh
->ether_dhost
, 0xff, ETH_ALEN
);
223 memcpy(eh
->ether_shost
, if_hwaddr
.ifr_hwaddr
.sa_data
, ETH_ALEN
);
224 eh
->ether_type
= htons(ETHERTYPE_IP6
);
226 ip6
= (struct ip6_hdr
*)(eh
+1);
228 ip6
->ip6_plen
= htons(sizeof(*nd_ns
));
229 ip6
->ip6_nxt
= IPPROTO_ICMPV6
;
231 ip6
->ip6_dst
= addr
->ip6
.sin6_addr
;
233 nd_ns
= (struct nd_neighbor_solicit
*)(ip6
+1);
234 nd_ns
->nd_ns_type
= ND_NEIGHBOR_SOLICIT
;
235 nd_ns
->nd_ns_code
= 0;
236 nd_ns
->nd_ns_reserved
= 0;
237 nd_ns
->nd_ns_target
= addr
->ip6
.sin6_addr
;
239 nd_ns
->nd_ns_cksum
= tcp_checksum6((uint16_t *)nd_ns
, ntohs(ip6
->ip6_plen
), ip6
);
241 sall
.sll_family
= AF_PACKET
;
243 memcpy(&sall
.sll_addr
[0], bdcast
, sall
.sll_halen
);
244 sall
.sll_protocol
= htons(ETH_P_ALL
);
245 sall
.sll_ifindex
= ifr
.ifr_ifindex
;
246 ret
= sendto(s
, buffer
, 78, 0, (struct sockaddr
*)&sall
, sizeof(sall
));
249 DEBUG(DEBUG_CRIT
,(__location__
" failed sendto\n"));
256 DEBUG(DEBUG_CRIT
,(__location__
" not an ipv4/ipv6 address (family is %u)\n", addr
->ip
.sin_family
));
265 simple TCP checksum - assumes data is multiple of 2 bytes long
267 static uint16_t tcp_checksum(uint16_t *data
, size_t n
, struct iphdr
*ip
)
269 uint32_t sum
= uint16_checksum(data
, n
);
271 sum
+= uint16_checksum((uint16_t *)(void *)&ip
->saddr
,
273 sum
+= uint16_checksum((uint16_t *)(void *)&ip
->daddr
,
275 sum
+= ip
->protocol
+ n
;
276 sum
= (sum
& 0xFFFF) + (sum
>> 16);
277 sum
= (sum
& 0xFFFF) + (sum
>> 16);
287 Send tcp segment from the specified IP/port to the specified destination IP/port.
290 This is used to trigger the receiving host into sending its own ACK,
291 which should trigger early detection of TCP reset by the client
294 This can also be used to send RST segments (if rst is true) and also
295 if correct seq and ack numbers are provided.
297 int ctdb_sys_send_tcp(const ctdb_sock_addr
*dest
,
298 const ctdb_sock_addr
*src
,
299 uint32_t seq
, uint32_t ack
, int rst
)
305 ctdb_sock_addr
*tmpdest
;
315 switch (src
->ip
.sin_family
) {
318 ip4pkt
.ip
.version
= 4;
319 ip4pkt
.ip
.ihl
= sizeof(ip4pkt
.ip
)/4;
320 ip4pkt
.ip
.tot_len
= htons(sizeof(ip4pkt
));
322 ip4pkt
.ip
.protocol
= IPPROTO_TCP
;
323 ip4pkt
.ip
.saddr
= src
->ip
.sin_addr
.s_addr
;
324 ip4pkt
.ip
.daddr
= dest
->ip
.sin_addr
.s_addr
;
327 ip4pkt
.tcp
.source
= src
->ip
.sin_port
;
328 ip4pkt
.tcp
.dest
= dest
->ip
.sin_port
;
329 ip4pkt
.tcp
.seq
= seq
;
330 ip4pkt
.tcp
.ack_seq
= ack
;
335 ip4pkt
.tcp
.doff
= sizeof(ip4pkt
.tcp
)/4;
336 /* this makes it easier to spot in a sniffer */
337 ip4pkt
.tcp
.window
= htons(1234);
338 ip4pkt
.tcp
.check
= tcp_checksum((uint16_t *)&ip4pkt
.tcp
, sizeof(ip4pkt
.tcp
), &ip4pkt
.ip
);
340 /* open a raw socket to send this segment from */
341 s
= socket(AF_INET
, SOCK_RAW
, htons(IPPROTO_RAW
));
343 DEBUG(DEBUG_CRIT
,(__location__
" failed to open raw socket (%s)\n",
348 ret
= setsockopt(s
, SOL_IP
, IP_HDRINCL
, &one
, sizeof(one
));
350 DEBUG(DEBUG_CRIT
,(__location__
" failed to setup IP headers (%s)\n",
357 set_close_on_exec(s
);
359 ret
= sendto(s
, &ip4pkt
, sizeof(ip4pkt
), 0,
360 (const struct sockaddr
*)&dest
->ip
,
363 if (ret
!= sizeof(ip4pkt
)) {
364 DEBUG(DEBUG_CRIT
,(__location__
" failed sendto (%s)\n", strerror(errno
)));
370 ip6pkt
.ip6
.ip6_vfc
= 0x60;
371 ip6pkt
.ip6
.ip6_plen
= htons(20);
372 ip6pkt
.ip6
.ip6_nxt
= IPPROTO_TCP
;
373 ip6pkt
.ip6
.ip6_hlim
= 64;
374 ip6pkt
.ip6
.ip6_src
= src
->ip6
.sin6_addr
;
375 ip6pkt
.ip6
.ip6_dst
= dest
->ip6
.sin6_addr
;
377 ip6pkt
.tcp
.source
= src
->ip6
.sin6_port
;
378 ip6pkt
.tcp
.dest
= dest
->ip6
.sin6_port
;
379 ip6pkt
.tcp
.seq
= seq
;
380 ip6pkt
.tcp
.ack_seq
= ack
;
385 ip6pkt
.tcp
.doff
= sizeof(ip6pkt
.tcp
)/4;
386 /* this makes it easier to spot in a sniffer */
387 ip6pkt
.tcp
.window
= htons(1234);
388 ip6pkt
.tcp
.check
= tcp_checksum6((uint16_t *)&ip6pkt
.tcp
, sizeof(ip6pkt
.tcp
), &ip6pkt
.ip6
);
390 s
= socket(PF_INET6
, SOCK_RAW
, IPPROTO_RAW
);
392 DEBUG(DEBUG_CRIT
, (__location__
" Failed to open sending socket\n"));
396 /* sendto() doesn't like it if the port is set and the socket is
399 tmpdest
= discard_const(dest
);
400 tmpport
= tmpdest
->ip6
.sin6_port
;
402 tmpdest
->ip6
.sin6_port
= 0;
403 ret
= sendto(s
, &ip6pkt
, sizeof(ip6pkt
), 0,
404 (const struct sockaddr
*)&dest
->ip6
,
406 tmpdest
->ip6
.sin6_port
= tmpport
;
409 if (ret
!= sizeof(ip6pkt
)) {
410 DEBUG(DEBUG_CRIT
,(__location__
" failed sendto (%s)\n", strerror(errno
)));
416 DEBUG(DEBUG_CRIT
,(__location__
" not an ipv4/v6 address\n"));
424 This function is used to open a raw socket to capture from
426 int ctdb_sys_open_capture_socket(const char *iface
, void **private_data
)
430 /* Open a socket to capture all traffic */
431 s
= socket(AF_PACKET
, SOCK_RAW
, htons(ETH_P_ALL
));
433 DEBUG(DEBUG_CRIT
,(__location__
" failed to open raw socket\n"));
437 DEBUG(DEBUG_DEBUG
, (__location__
" Created RAW SOCKET FD:%d for tcp tickle\n", s
));
440 set_close_on_exec(s
);
446 This function is used to do any additional cleanup required when closing
448 Note that the socket itself is closed automatically in the caller.
450 int ctdb_sys_close_capture_socket(void *private_data
)
457 called when the raw socket becomes readable
459 int ctdb_sys_read_tcp_packet(int s
, void *private_data
,
460 ctdb_sock_addr
*src
, ctdb_sock_addr
*dst
,
461 uint32_t *ack_seq
, uint32_t *seq
)
464 #define RCVPKTSIZE 100
465 char pkt
[RCVPKTSIZE
];
466 struct ether_header
*eth
;
471 ret
= recv(s
, pkt
, RCVPKTSIZE
, MSG_TRUNC
);
472 if (ret
< sizeof(*eth
)+sizeof(*ip
)) {
477 eth
= (struct ether_header
*)pkt
;
479 /* we want either IPv4 or IPv6 */
480 if (ntohs(eth
->ether_type
) == ETHERTYPE_IP
) {
482 ip
= (struct iphdr
*)(eth
+1);
484 /* We only want IPv4 packets */
485 if (ip
->version
!= 4) {
488 /* Dont look at fragments */
489 if ((ntohs(ip
->frag_off
)&0x1fff) != 0) {
492 /* we only want TCP */
493 if (ip
->protocol
!= IPPROTO_TCP
) {
497 /* make sure its not a short packet */
498 if (offsetof(struct tcphdr
, ack_seq
) + 4 +
499 (ip
->ihl
*4) + sizeof(*eth
) > ret
) {
503 tcp
= (struct tcphdr
*)((ip
->ihl
*4) + (char *)ip
);
505 /* tell the caller which one we've found */
506 src
->ip
.sin_family
= AF_INET
;
507 src
->ip
.sin_addr
.s_addr
= ip
->saddr
;
508 src
->ip
.sin_port
= tcp
->source
;
509 dst
->ip
.sin_family
= AF_INET
;
510 dst
->ip
.sin_addr
.s_addr
= ip
->daddr
;
511 dst
->ip
.sin_port
= tcp
->dest
;
512 *ack_seq
= tcp
->ack_seq
;
516 } else if (ntohs(eth
->ether_type
) == ETHERTYPE_IP6
) {
518 ip6
= (struct ip6_hdr
*)(eth
+1);
520 /* we only want TCP */
521 if (ip6
->ip6_nxt
!= IPPROTO_TCP
) {
526 tcp
= (struct tcphdr
*)(ip6
+1);
528 /* tell the caller which one we've found */
529 src
->ip6
.sin6_family
= AF_INET6
;
530 src
->ip6
.sin6_port
= tcp
->source
;
531 src
->ip6
.sin6_addr
= ip6
->ip6_src
;
533 dst
->ip6
.sin6_family
= AF_INET6
;
534 dst
->ip6
.sin6_port
= tcp
->dest
;
535 dst
->ip6
.sin6_addr
= ip6
->ip6_dst
;
537 *ack_seq
= tcp
->ack_seq
;
547 bool ctdb_sys_check_iface_exists(const char *iface
)
552 s
= socket(PF_PACKET
, SOCK_RAW
, 0);
554 /* We dont know if the interface exists, so assume yes */
555 DEBUG(DEBUG_CRIT
,(__location__
" failed to open raw socket\n"));
559 strncpy(ifr
.ifr_name
, iface
, sizeof(ifr
.ifr_name
)-1);
560 if (ioctl(s
, SIOCGIFINDEX
, &ifr
) < 0 && errno
== ENODEV
) {
561 DEBUG(DEBUG_CRIT
,(__location__
" interface '%s' not found\n", iface
));
/*
  look up the process ID of the peer of socket fd (SO_PEERCRED)

  Returns the result of getsockopt(): 0 on success, in which case
  *peer_pid holds the peer's pid, or -1 with errno set on failure, in
  which case *peer_pid is left untouched.
 */
int ctdb_get_peer_pid(const int fd, pid_t *peer_pid)
{
	struct ucred cr;
	socklen_t crl = sizeof(struct ucred);
	int ret;

	/* keep the call and the test apart: "ret = getsockopt(...) == 0"
	   stores the result of the comparison (1 on success, 0 on
	   failure) instead of the return value */
	ret = getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cr, &crl);
	if (ret == 0) {
		*peer_pid = cr.pid;
	}

	return ret;
}
/*
 * Find the process name from process ID
 *
 * Resolves the /proc/<pid>/exe symlink and returns a newly allocated
 * copy of its target, cut at the first space.  Returns NULL when the
 * link cannot be read, when nothing is left after cutting, or when the
 * copy cannot be allocated.  The caller must free() the result.
 */
char *ctdb_get_process_name(pid_t pid)
{
	char path[32];
	char buf[PATH_MAX];
	char *start;
	ssize_t n;

	snprintf(path, sizeof(path), "/proc/%d/exe", (int)pid);
	n = readlink(path, buf, sizeof(buf)-1);
	if (n < 0) {
		return NULL;
	}
	/* readlink() does not NUL terminate what it stores */
	buf[n] = '\0';

	/* Remove any extra fields: keep the first space-delimited token.
	   strspn()/strcspn() select the same token as strtok(buf, " ")
	   without relying on strtok()'s hidden static state. */
	start = buf + strspn(buf, " ");
	start[strcspn(start, " ")] = '\0';
	if (*start == '\0') {
		return NULL;
	}

	return strdup(start);
}
/*
 * Set the name of the calling thread via prctl(PR_SET_NAME)
 *
 * Only the first 15 characters of name are used; longer names are
 * truncated.  Returns the result of prctl(): 0 on success, -1 with
 * errno set on failure.  A NULL name is rejected with -1 / EINVAL.
 */
int ctdb_set_process_name(const char *name)
{
	char procname[16];

	if (name == NULL) {
		errno = EINVAL;
		return -1;
	}

	/* snprintf() always NUL terminates, which strncpy(.., 15) does not
	   do when name has 15 or more characters */
	snprintf(procname, sizeof(procname), "%s", name);

	return prctl(PR_SET_NAME, (unsigned long)procname, 0, 0, 0);
}
616 * Parsing a line from /proc/locks,
618 static bool parse_proc_locks_line(char *line
, pid_t
*pid
,
619 struct ctdb_lock_info
*curlock
)
623 /* output of /proc/locks
626 * 1: POSIX ADVISORY WRITE 25945 fd:00:6424820 212 212
629 * 1: -> POSIX ADVISORY WRITE 25946 fd:00:6424820 212 212
633 ptr
= strtok_r(line
, " ", &saveptr
);
634 if (ptr
== NULL
) return false;
637 ptr
= strtok_r(NULL
, " ", &saveptr
);
638 if (ptr
== NULL
) return false;
639 if (strcmp(ptr
, "->") == 0) {
640 curlock
->waiting
= true;
641 ptr
= strtok_r(NULL
, " ", &saveptr
);
643 curlock
->waiting
= false;
647 if (ptr
== NULL
|| strcmp(ptr
, "POSIX") != 0) {
652 ptr
= strtok_r(NULL
, " ", &saveptr
);
653 if (ptr
== NULL
) return false;
656 ptr
= strtok_r(NULL
, " ", &saveptr
);
657 if (ptr
== NULL
) return false;
658 if (strcmp(ptr
, "READ") == 0) {
659 curlock
->read_only
= true;
660 } else if (strcmp(ptr
, "WRITE") == 0) {
661 curlock
->read_only
= false;
667 ptr
= strtok_r(NULL
, " ", &saveptr
);
668 if (ptr
== NULL
) return false;
671 /* MAJOR:MINOR:INODE */
672 ptr
= strtok_r(NULL
, " :", &saveptr
);
673 if (ptr
== NULL
) return false;
674 ptr
= strtok_r(NULL
, " :", &saveptr
);
675 if (ptr
== NULL
) return false;
676 ptr
= strtok_r(NULL
, " :", &saveptr
);
677 if (ptr
== NULL
) return false;
678 curlock
->inode
= atol(ptr
);
681 ptr
= strtok_r(NULL
, " ", &saveptr
);
682 if (ptr
== NULL
) return false;
683 curlock
->start
= atol(ptr
);
686 ptr
= strtok_r(NULL
, " ", &saveptr
);
687 if (ptr
== NULL
) return false;
688 if (strncmp(ptr
, "EOF", 3) == 0) {
689 curlock
->end
= (off_t
)-1;
691 curlock
->end
= atol(ptr
);
698 * Find information of lock being waited on for given process ID
700 bool ctdb_get_lock_info(pid_t req_pid
, struct ctdb_lock_info
*lock_info
)
703 struct ctdb_lock_info curlock
;
708 if ((fp
= fopen("/proc/locks", "r")) == NULL
) {
709 DEBUG(DEBUG_ERR
, ("Failed to read locks information"));
712 while (fgets(buf
, sizeof(buf
), fp
) != NULL
) {
713 if (! parse_proc_locks_line(buf
, &pid
, &curlock
)) {
716 if (pid
== req_pid
&& curlock
.waiting
) {
717 *lock_info
= curlock
;
728 * Find process ID which holds an overlapping byte lock for required
729 * inode and byte range.
731 bool ctdb_get_blocker_pid(struct ctdb_lock_info
*reqlock
, pid_t
*blocker_pid
)
734 struct ctdb_lock_info curlock
;
739 if ((fp
= fopen("/proc/locks", "r")) == NULL
) {
740 DEBUG(DEBUG_ERR
, ("Failed to read locks information"));
743 while (fgets(buf
, sizeof(buf
), fp
) != NULL
) {
744 if (! parse_proc_locks_line(buf
, &pid
, &curlock
)) {
748 if (curlock
.waiting
) {
752 if (curlock
.inode
!= reqlock
->inode
) {
756 if (curlock
.start
> reqlock
->end
||
757 curlock
.end
< reqlock
->start
) {
758 /* Outside the required range */