2 monitoring links to all other nodes to detect dead nodes
5 Copyright (C) Ronnie Sahlberg 2007
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, see <http://www.gnu.org/licenses/>.
22 #include "system/filesys.h"
23 #include "system/network.h"
24 #include "system/time.h"
25 #include "system/wait.h"
30 #include "lib/util/debug.h"
31 #include "lib/util/samba_util.h"
33 #include "ctdb_private.h"
36 #include "common/common.h"
37 #include "common/logging.h"
40 static uint32_t keepalive_version(void)
42 return (SAMBA_VERSION_MAJOR
<< 16) | SAMBA_VERSION_MINOR
;
45 static uint32_t keepalive_uptime(struct ctdb_context
*ctdb
)
47 struct timeval current
= tevent_timeval_current();
49 return current
.tv_sec
- ctdb
->ctdbd_start_time
.tv_sec
;
/*
   send a keepalive packet to the other node
*/
/*
  Build a CTDB_REQ_KEEPALIVE packet addressed to destnode and queue it.

  The packet is allocated with ctdb_transport_allocate(), its header
  destnode is set, and it is stamped with this daemon's
  keepalive_version() and keepalive_uptime() so the receiver can
  compare them with its own. The keepalive_packets_sent statistic is
  incremented before the packet is handed to ctdb_queue_packet().

  ctdb     - daemon context
  destnode - number of the node the packet is addressed to

  NOTE(review): this view of the function is incomplete. The braces,
  the logging call that the "Transport is DOWN" message belongs to,
  whatever follows that message (presumably an early return) and any
  statements after ctdb_queue_packet() are not visible here - confirm
  against the full file before relying on this description.
*/
55 static void ctdb_send_keepalive(struct ctdb_context
*ctdb
, uint32_t destnode
)
57 struct ctdb_req_keepalive_old
*r
;
/* no transport methods are set, so there is nothing to send over */
59 if (ctdb
->methods
== NULL
) {
61 ("Failed to send keepalive. Transport is DOWN\n"));
/* request a packet sized for struct ctdb_req_keepalive_old */
65 r
= ctdb_transport_allocate(ctdb
, ctdb
, CTDB_REQ_KEEPALIVE
,
66 sizeof(struct ctdb_req_keepalive_old
),
67 struct ctdb_req_keepalive_old
);
/* presumably treats a NULL r as a fatal out-of-memory condition - TODO confirm */
68 CTDB_NO_MEMORY_FATAL(ctdb
, r
);
69 r
->hdr
.destnode
= destnode
;
/* payload: our version and uptime, checked by ctdb_request_keepalive() on the peer */
72 r
->version
= keepalive_version();
73 r
->uptime
= keepalive_uptime(ctdb
);
75 CTDB_INCREMENT_STAT(ctdb
, keepalive_packets_sent
);
77 ctdb_queue_packet(ctdb
, &r
->hdr
);
/*
  see if any nodes are dead
 */
/*
  Timer callback that checks every entry of ctdb->nodes for signs of
  life, sends keepalives, and then schedules itself again.

  private_data is the struct ctdb_context, recovered with
  talloc_get_type(). The ev, te and t arguments are not referenced in
  the visible code.

  For each node, as far as this view shows:
   - the NODE_FLAGS_DELETED flag and "is this the local node"
     (node->pnn == ctdb->pnn) are tested first;
   - a node flagged NODE_FLAGS_DISCONNECTED is passed to
     ctdb_node_connected() if its rx_cnt is non-zero;
   - rx_cnt and dead_count are examined and dead_count is reset to 0;
   - once dead_count reaches ctdb->tunable.keepalive_limit the node is
     reported, passed to ctdb_node_dead() and still sent a keepalive;
   - otherwise a keepalive is sent to the node.

  After the loop a new timer is added on ctdb->ev, owned by
  ctdb->keepalive_ctx, that calls this function again after
  ctdb->tunable.keepalive_interval (presumably seconds - TODO confirm
  the units of timeval_current_ofs()).

  NOTE(review): this view of the function is incomplete. The braces,
  the declaration of i, the bodies of several branches (presumably
  "continue" for the first three tests), the statements between the
  rx_cnt test and the dead_count reset, and the end of the last
  comment in the loop are not visible here. The summary above covers
  only what can be seen - confirm against the full file.
*/
85 static void ctdb_check_for_dead_nodes(struct tevent_context
*ev
,
86 struct tevent_timer
*te
,
87 struct timeval t
, void *private_data
)
89 struct ctdb_context
*ctdb
= talloc_get_type(private_data
, struct ctdb_context
);
92 /* send a keepalive to all other nodes, unless */
93 for (i
=0;i
<ctdb
->num_nodes
;i
++) {
94 struct ctdb_node
*node
= ctdb
->nodes
[i
];
/* node entry is flagged as deleted (branch body not visible in this view) */
96 if (node
->flags
& NODE_FLAGS_DELETED
) {
/* this entry is the local node (branch body not visible in this view) */
100 if (node
->pnn
== ctdb
->pnn
) {
104 if (node
->flags
& NODE_FLAGS_DISCONNECTED
) {
105 /* it might have come alive again */
106 if (node
->rx_cnt
!= 0) {
107 ctdb_node_connected(node
);
/*
  NOTE(review): the lines between this test and the dead_count reset
  below are missing from this view; presumably dead_count is
  incremented when rx_cnt is 0 and reset otherwise - confirm.
*/
113 if (node
->rx_cnt
== 0) {
116 node
->dead_count
= 0;
/* threshold reached: declare the node dead, but still send it a keepalive */
121 if (node
->dead_count
>= ctdb
->tunable
.keepalive_limit
) {
122 DEBUG(DEBUG_NOTICE
,("dead count reached for node %u\n", node
->pnn
));
123 ctdb_node_dead(node
);
124 ctdb_send_keepalive(ctdb
, node
->pnn
);
125 /* maybe tell the transport layer to kill the
131 DEBUG(DEBUG_DEBUG
,("sending keepalive to %u\n", node
->pnn
));
132 ctdb_send_keepalive(ctdb
, node
->pnn
);
137 tevent_add_timer(ctdb
->ev
, ctdb
->keepalive_ctx
,
138 timeval_current_ofs(ctdb
->tunable
.keepalive_interval
, 0),
139 ctdb_check_for_dead_nodes
, ctdb
);
/*
  Start keepalive monitoring.

  Creates a fresh talloc context as a child of ctdb and stores it in
  ctdb->keepalive_ctx, then adds a timer on ctdb->ev, owned by that
  context, that calls ctdb_check_for_dead_nodes() with ctdb as its
  private data after ctdb->tunable.keepalive_interval (presumably
  seconds - TODO confirm the units of timeval_current_ofs()). Both the
  context and the timer are checked with CTDB_NO_MEMORY_FATAL().

  A notice is logged once monitoring has started. When
  ctdb->tunable.allow_mixed_versions is 1, a further message about the
  cluster being configured for mixed versions is emitted.

  NOTE(review): the braces and the logging call that the "mixed
  versions configured" message belongs to are not visible in this
  view, so its log level cannot be stated here.
*/
143 void ctdb_start_keepalive(struct ctdb_context
*ctdb
)
145 struct tevent_timer
*te
;
/* the timer hangs off this context, so freeing the context removes the timer */
147 ctdb
->keepalive_ctx
= talloc_new(ctdb
);
148 CTDB_NO_MEMORY_FATAL(ctdb
, ctdb
->keepalive_ctx
);
150 te
= tevent_add_timer(ctdb
->ev
, ctdb
->keepalive_ctx
,
151 timeval_current_ofs(ctdb
->tunable
.keepalive_interval
, 0),
152 ctdb_check_for_dead_nodes
, ctdb
);
153 CTDB_NO_MEMORY_FATAL(ctdb
, te
);
155 DEBUG(DEBUG_NOTICE
,("Keepalive monitoring has been started\n"));
/* extra message when version checking of peers is switched off */
157 if (ctdb
->tunable
.allow_mixed_versions
== 1) {
159 ("CTDB cluster with mixed versions configured\n"));
163 void ctdb_stop_keepalive(struct ctdb_context
*ctdb
)
165 talloc_free(ctdb
->keepalive_ctx
);
166 ctdb
->keepalive_ctx
= NULL
;
/*
  Handle a keepalive packet received from another node by comparing the
  sender's version and uptime with our own.

  ctdb - daemon context
  hdr  - header of the received packet; it is also viewed as a
         struct ctdb_req_keepalive_old to read the version and uptime
         fields

  Checks visible in this view, in order:
   - ctdb->tunable.allow_mixed_versions == 1 (per the existing comment,
     nothing is checked in that case);
   - hdr->length equal to sizeof(struct ctdb_req_header), i.e. a packet
     that is only a header and so carries no version/uptime fields;
   - the sender's version differing from ours, refined by comparing the
     sender's uptime with ours and, on equal uptime, by which version
     is greater.

  Three messages appear below the checks: "Keepalive version missing",
  "Keepalive version mismatch" and "CTDB Cluster with mixed versions,
  cannot continue", followed by ctdb_shutdown_sequence(ctdb, 0).

  NOTE(review): this view of the function is incomplete. The braces,
  every branch body, the logging calls the messages belong to and the
  control flow that links each check to a message (presumably
  goto/labels or early returns) are not visible here. Which condition
  leads to which message, and to the shutdown, must be confirmed
  against the full file.
*/
169 void ctdb_request_keepalive(struct ctdb_context
*ctdb
,
170 struct ctdb_req_header
*hdr
)
172 struct ctdb_req_keepalive_old
*c
=
173 (struct ctdb_req_keepalive_old
*)hdr
;
174 uint32_t my_version
= keepalive_version();
175 uint32_t my_uptime
= keepalive_uptime(ctdb
);
177 /* Don't check anything if mixed versions are allowed */
178 if (ctdb
->tunable
.allow_mixed_versions
== 1) {
/* packet is exactly one bare header long: no version/uptime payload present */
182 if (hdr
->length
== sizeof(struct ctdb_req_header
)) {
/* versions differ: uptime, then version ordering, decide how this is treated */
187 if (c
->version
!= my_version
) {
188 if (c
->uptime
> my_uptime
) {
190 } else if (c
->uptime
== my_uptime
) {
191 if (c
->version
> my_version
) {
201 ("Keepalive version missing from node %u\n", hdr
->srcnode
));
206 ("Keepalive version mismatch 0x%08x != 0x%08x from node %u\n",
207 my_version
, c
->version
, hdr
->srcnode
));
212 ("CTDB Cluster with mixed versions, cannot continue\n"));
/* begin shutting this daemon down; the second argument is 0 */
213 ctdb_shutdown_sequence(ctdb
, 0);