2 monitoring links to all other nodes to detect dead nodes
5 Copyright (C) Ronnie Sahlberg 2007
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, see <http://www.gnu.org/licenses/>.
22 #include "system/filesys.h"
23 #include "system/network.h"
24 #include "system/time.h"
25 #include "system/wait.h"
30 #include "lib/util/debug.h"
31 #include "lib/util/samba_util.h"
33 #include "ctdb_private.h"
36 #include "common/common.h"
37 #include "common/logging.h"
/*
 * Return the version word this node places in the keepalive packets it
 * sends and compares against the version carried by packets it receives.
 *
 * The value is kept in a function-local static that starts at 0.  The
 * visible assignment builds it from SAMBA_VERSION_MAJOR shifted left by
 * 16 bits, OR-ed with SAMBA_VERSION_MINOR.
 *
 * The environment variable CTDB_TEST_SAMBA_VERSION is read as well, and
 * a warning is logged when it cannot be parsed.
 *
 * NOTE(review): the statements that parse the variable, apply it and
 * return the result are not visible in this extract.  Presumably a valid
 * value overrides the computed version for testing - confirm against the
 * complete file.
 */
40 static uint32_t keepalive_version(void)
42 static uint32_t version
= 0;
47 version
= (SAMBA_VERSION_MAJOR
<< 16) | SAMBA_VERSION_MINOR
;
49 t
= getenv("CTDB_TEST_SAMBA_VERSION");
55 DBG_WARNING("Failed to parse env var: %s\n", t
);
65 static uint32_t keepalive_uptime(struct ctdb_context
*ctdb
)
67 struct timeval current
= tevent_timeval_current();
69 return current
.tv_sec
- ctdb
->ctdbd_start_time
.tv_sec
;
73 /* send a keepalive packet to the other node */
/*
 * Build a keepalive request and queue it for delivery to one node.
 *
 * ctdb:     daemon context; its transport methods are checked and it is
 *           passed to the allocator and to ctdb_queue_packet().
 * destnode: stored in the packet header as the destination node.
 *
 * When ctdb->methods is NULL the function logs that the transport is
 * down.  NOTE(review): the statement that follows that log message is
 * not visible in this extract; presumably the function returns there
 * without sending - confirm.
 *
 * The packet is a struct ctdb_req_keepalive_old obtained from
 * ctdb_transport_allocate() with operation CTDB_REQ_KEEPALIVE.  An
 * allocation failure is handed to CTDB_NO_MEMORY_FATAL().  The payload
 * carries keepalive_version() and keepalive_uptime(ctdb).  The
 * keepalive_packets_sent statistic is incremented before the packet is
 * queued.
 *
 * NOTE(review): any header fields set between the destnode and version
 * assignments, and any release of the packet after queuing, are not
 * visible here.
 */
75 static void ctdb_send_keepalive(struct ctdb_context
*ctdb
, uint32_t destnode
)
77 struct ctdb_req_keepalive_old
*r
;
/* no transport methods: nothing to send through */
79 if (ctdb
->methods
== NULL
) {
81 ("Failed to send keepalive. Transport is DOWN\n"));
85 r
= ctdb_transport_allocate(ctdb
, ctdb
, CTDB_REQ_KEEPALIVE
,
86 sizeof(struct ctdb_req_keepalive_old
),
87 struct ctdb_req_keepalive_old
);
88 CTDB_NO_MEMORY_FATAL(ctdb
, r
);
89 r
->hdr
.destnode
= destnode
;
/* payload: our version word and our uptime in seconds */
92 r
->version
= keepalive_version();
93 r
->uptime
= keepalive_uptime(ctdb
);
95 CTDB_INCREMENT_STAT(ctdb
, keepalive_packets_sent
);
97 ctdb_queue_packet(ctdb
, &r
->hdr
);
103 /* see if any nodes are dead */
/*
 * Timer handler that inspects every known node, sends keepalives and
 * schedules itself again.
 *
 * ev, te, t:    event context, timer and time arguments of the handler;
 *               none of them is referenced in the visible lines.
 * private_data: the struct ctdb_context, recovered with
 *               talloc_get_type().
 *
 * For each entry of ctdb->nodes (ctdb->num_nodes entries) the visible
 * code tests, in this order:
 *   - NODE_FLAGS_DELETED set in node->flags;
 *   - node->pnn equal to ctdb->pnn, i.e. the entry is this node;
 *   - NODE_FLAGS_DISCONNECTED set: if node->rx_cnt is non-zero,
 *     ctdb_node_connected(node) is called;
 *   - node->rx_cnt equal to 0;
 *   - node->dead_count >= ctdb->tunable.keepalive_limit: a notice is
 *     logged, ctdb_node_dead(node) is called and one more keepalive is
 *     sent to that node.
 * A keepalive is also sent after the debug message
 * "sending keepalive to %u".
 *
 * Finally a new timer is added on ctdb->keepalive_ctx, due after
 * ctdb->tunable.keepalive_interval seconds, with this function as the
 * handler.
 *
 * NOTE(review): the bodies of most branches are missing from this
 * extract (presumably "continue" statements and the counter updates
 * for dead_count and rx_cnt), so the exact flow between the tests above
 * cannot be confirmed from here.
 */
105 static void ctdb_check_for_dead_nodes(struct tevent_context
*ev
,
106 struct tevent_timer
*te
,
107 struct timeval t
, void *private_data
)
109 struct ctdb_context
*ctdb
= talloc_get_type(private_data
, struct ctdb_context
);
112 /* send a keepalive to all other nodes, unless */
113 for (i
=0;i
<ctdb
->num_nodes
;i
++) {
114 struct ctdb_node
*node
= ctdb
->nodes
[i
];
/* entry flagged as deleted */
116 if (node
->flags
& NODE_FLAGS_DELETED
) {
/* entry describes this node itself */
120 if (node
->pnn
== ctdb
->pnn
) {
124 if (node
->flags
& NODE_FLAGS_DISCONNECTED
) {
125 /* it might have come alive again */
126 if (node
->rx_cnt
!= 0) {
127 ctdb_node_connected(node
);
/* nothing counted in rx_cnt for this node */
133 if (node
->rx_cnt
== 0) {
136 node
->dead_count
= 0;
/* dead_count has reached the keepalive_limit tunable */
141 if (node
->dead_count
>= ctdb
->tunable
.keepalive_limit
) {
142 DEBUG(DEBUG_NOTICE
,("dead count reached for node %u\n", node
->pnn
));
143 ctdb_node_dead(node
);
144 ctdb_send_keepalive(ctdb
, node
->pnn
);
/* NOTE(review): the comment that starts on the next line is cut off in this extract; its remaining text and terminator are not visible. */
145 /* maybe tell the transport layer to kill the
151 DEBUG(DEBUG_DEBUG
,("sending keepalive to %u\n", node
->pnn
));
152 ctdb_send_keepalive(ctdb
, node
->pnn
);
157 tevent_add_timer(ctdb
->ev
, ctdb
->keepalive_ctx
,
158 timeval_current_ofs(ctdb
->tunable
.keepalive_interval
, 0),
159 ctdb_check_for_dead_nodes
, ctdb
);
/*
 * Start periodic keepalive monitoring.
 *
 * A fresh talloc context is created as a child of ctdb and stored in
 * ctdb->keepalive_ctx.  A timer is then added on that context, due after
 * ctdb->tunable.keepalive_interval seconds, with
 * ctdb_check_for_dead_nodes as its handler and ctdb as its private
 * data.  Both results are passed to CTDB_NO_MEMORY_FATAL().
 *
 * A notice is logged once monitoring has been set up.  When the
 * allow_mixed_versions tunable equals 1, a further message reports that
 * the cluster is configured for mixed versions.
 *
 * NOTE(review): the head of that last logging call, including its log
 * level, is not visible in this extract.
 */
163 void ctdb_start_keepalive(struct ctdb_context
*ctdb
)
165 struct tevent_timer
*te
;
/* the timer below is allocated on this context */
167 ctdb
->keepalive_ctx
= talloc_new(ctdb
);
168 CTDB_NO_MEMORY_FATAL(ctdb
, ctdb
->keepalive_ctx
);
170 te
= tevent_add_timer(ctdb
->ev
, ctdb
->keepalive_ctx
,
171 timeval_current_ofs(ctdb
->tunable
.keepalive_interval
, 0),
172 ctdb_check_for_dead_nodes
, ctdb
);
173 CTDB_NO_MEMORY_FATAL(ctdb
, te
);
175 DEBUG(DEBUG_NOTICE
,("Keepalive monitoring has been started\n"));
177 if (ctdb
->tunable
.allow_mixed_versions
== 1) {
179 ("CTDB cluster with mixed versions configured\n"));
183 void ctdb_stop_keepalive(struct ctdb_context
*ctdb
)
185 talloc_free(ctdb
->keepalive_ctx
);
186 ctdb
->keepalive_ctx
= NULL
;
/*
 * Handle a keepalive request received from another node.
 *
 * ctdb: daemon context; supplies the allow_mixed_versions tunable and
 *       the start time used by keepalive_uptime().
 * hdr:  header of the received packet; it is also viewed as a
 *       struct ctdb_req_keepalive_old to read the sender's version and
 *       uptime.
 *
 * The visible checks are:
 *   - allow_mixed_versions == 1: per the existing comment, nothing is
 *     checked in that case;
 *   - hdr->length equal to sizeof(struct ctdb_req_header), i.e. the
 *     packet is exactly one bare header long;
 *   - c->version different from this node's version, followed by a
 *     comparison of uptimes and, when the uptimes are equal, of the
 *     versions.
 *
 * The visible tail has messages for a missing version and for a
 * version mismatch, a final "cannot continue" message, and a call to
 * ctdb_shutdown_sequence(ctdb, 0).
 *
 * NOTE(review): every branch body and the heads of the logging calls
 * are missing from this extract, so which combination of uptime and
 * version leads to the shutdown call, and which side is expected to
 * shut down, cannot be determined from here - confirm against the
 * complete file.
 */
189 void ctdb_request_keepalive(struct ctdb_context
*ctdb
,
190 struct ctdb_req_header
*hdr
)
192 struct ctdb_req_keepalive_old
*c
=
193 (struct ctdb_req_keepalive_old
*)hdr
;
194 uint32_t my_version
= keepalive_version();
195 uint32_t my_uptime
= keepalive_uptime(ctdb
);
197 /* Don't check anything if mixed versions are allowed */
198 if (ctdb
->tunable
.allow_mixed_versions
== 1) {
/* packet is exactly one header long */
202 if (hdr
->length
== sizeof(struct ctdb_req_header
)) {
207 if (c
->version
!= my_version
) {
208 if (c
->uptime
> my_uptime
) {
210 } else if (c
->uptime
== my_uptime
) {
211 if (c
->version
> my_version
) {
221 ("Keepalive version missing from node %u\n", hdr
->srcnode
));
226 ("Keepalive version mismatch 0x%08x != 0x%08x from node %u\n",
227 my_version
, c
->version
, hdr
->srcnode
));
232 ("CTDB Cluster with mixed versions, cannot continue\n"));
233 ctdb_shutdown_sequence(ctdb
, 0);