4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
31 #include "lib/tdb_wrap/tdb_wrap.h"
32 #include "lib/util/dlinklist.h"
33 #include "lib/util/debug.h"
34 #include "lib/util/samba_util.h"
35 #include "lib/util/sys_rw.h"
36 #include "lib/util/util_process.h"
38 #include "ctdb_private.h"
39 #include "ctdb_client.h"
41 #include "common/system_socket.h"
42 #include "common/common.h"
43 #include "common/logging.h"
45 #include "server/ctdb_config.h"
47 #include "ctdb_cluster_mutex.h"
/* List of SRVID requests that need to be processed */
struct srvid_list {
	struct srvid_list *next, *prev;
	struct ctdb_srvid_message *request;
};

struct srvid_requests {
	struct srvid_list *requests;
};
59 static void srvid_request_reply(struct ctdb_context
*ctdb
,
60 struct ctdb_srvid_message
*request
,
63 /* Someone that sent srvid==0 does not want a reply */
64 if (request
->srvid
== 0) {
69 if (ctdb_client_send_message(ctdb
, request
->pnn
, request
->srvid
,
71 DEBUG(DEBUG_INFO
,("Sent SRVID reply to %u:%llu\n",
72 (unsigned)request
->pnn
,
73 (unsigned long long)request
->srvid
));
75 DEBUG(DEBUG_ERR
,("Failed to send SRVID reply to %u:%llu\n",
76 (unsigned)request
->pnn
,
77 (unsigned long long)request
->srvid
));
83 static void srvid_requests_reply(struct ctdb_context
*ctdb
,
84 struct srvid_requests
**requests
,
89 if (*requests
== NULL
) {
93 for (r
= (*requests
)->requests
; r
!= NULL
; r
= r
->next
) {
94 srvid_request_reply(ctdb
, r
->request
, result
);
97 /* Free the list structure... */
98 TALLOC_FREE(*requests
);
101 static void srvid_request_add(struct ctdb_context
*ctdb
,
102 struct srvid_requests
**requests
,
103 struct ctdb_srvid_message
*request
)
105 struct srvid_list
*t
;
109 if (*requests
== NULL
) {
110 *requests
= talloc_zero(ctdb
, struct srvid_requests
);
111 if (*requests
== NULL
) {
116 t
= talloc_zero(*requests
, struct srvid_list
);
118 /* If *requests was just allocated above then free it */
119 if ((*requests
)->requests
== NULL
) {
120 TALLOC_FREE(*requests
);
125 t
->request
= (struct ctdb_srvid_message
*)talloc_steal(t
, request
);
126 DLIST_ADD((*requests
)->requests
, t
);
131 /* Failed to add the request to the list. Send a fail. */
132 DEBUG(DEBUG_ERR
, (__location__
133 " Out of memory, failed to queue SRVID request\n"));
135 result
.dsize
= sizeof(ret
);
136 result
.dptr
= (uint8_t *)&ret
;
137 srvid_request_reply(ctdb
, request
, result
);
/* An abstraction to allow an operation (takeover runs, recoveries,
 * ...) to be disabled for a given timeout */
struct ctdb_op_state {
	struct tevent_timer *timer;	/* non-NULL while disabled */
	bool in_progress;
	const char *name;		/* for log messages */
};
148 static struct ctdb_op_state
*ctdb_op_init(TALLOC_CTX
*mem_ctx
, const char *name
)
150 struct ctdb_op_state
*state
= talloc_zero(mem_ctx
, struct ctdb_op_state
);
153 state
->in_progress
= false;
160 static bool ctdb_op_is_disabled(struct ctdb_op_state
*state
)
162 return state
->timer
!= NULL
;
165 static bool ctdb_op_begin(struct ctdb_op_state
*state
)
167 if (ctdb_op_is_disabled(state
)) {
169 ("Unable to begin - %s are disabled\n", state
->name
));
173 state
->in_progress
= true;
177 static bool ctdb_op_end(struct ctdb_op_state
*state
)
179 return state
->in_progress
= false;
182 static bool ctdb_op_is_in_progress(struct ctdb_op_state
*state
)
184 return state
->in_progress
;
187 static void ctdb_op_enable(struct ctdb_op_state
*state
)
189 TALLOC_FREE(state
->timer
);
192 static void ctdb_op_timeout_handler(struct tevent_context
*ev
,
193 struct tevent_timer
*te
,
194 struct timeval yt
, void *p
)
196 struct ctdb_op_state
*state
=
197 talloc_get_type(p
, struct ctdb_op_state
);
199 DEBUG(DEBUG_NOTICE
,("Reenabling %s after timeout\n", state
->name
));
200 ctdb_op_enable(state
);
203 static int ctdb_op_disable(struct ctdb_op_state
*state
,
204 struct tevent_context
*ev
,
208 DEBUG(DEBUG_NOTICE
,("Reenabling %s\n", state
->name
));
209 ctdb_op_enable(state
);
213 if (state
->in_progress
) {
215 ("Unable to disable %s - in progress\n", state
->name
));
219 DEBUG(DEBUG_NOTICE
,("Disabling %s for %u seconds\n",
220 state
->name
, timeout
));
222 /* Clear any old timers */
223 talloc_free(state
->timer
);
225 /* Arrange for the timeout to occur */
226 state
->timer
= tevent_add_timer(ev
, state
,
227 timeval_current_ofs(timeout
, 0),
228 ctdb_op_timeout_handler
, state
);
229 if (state
->timer
== NULL
) {
230 DEBUG(DEBUG_ERR
,(__location__
" Unable to setup timer\n"));
/* Per-node record of accumulated "banning credits". */
struct ctdb_banning_state {
	uint32_t count;
	struct timeval last_reported_time;
};

struct ctdb_recovery_lock_handle;
/*
  private state of recovery daemon

  NOTE(review): several member lines were elided in the garbled
  extraction; recmaster/need_recovery/node_flags are restored from the
  fields the visible code reads and writes — verify against upstream.
 */
struct ctdb_recoverd {
	struct ctdb_context *ctdb;
	uint32_t recmaster;
	uint32_t last_culprit_node;
	struct ctdb_node_map_old *nodemap;
	struct timeval priority_time;
	bool need_takeover_run;
	bool need_recovery;
	uint32_t node_flags;
	struct tevent_timer *send_election_te;
	struct tevent_timer *election_timeout;
	struct srvid_requests *reallocate_requests;
	struct ctdb_op_state *takeover_run;
	struct ctdb_op_state *recovery;
	struct ctdb_iface_list_old *ifaces;
	uint32_t *force_rebalance_nodes;
	struct ctdb_node_capabilities *caps;
	bool frozen_on_inactive;
	struct ctdb_recovery_lock_handle *recovery_lock_handle;
};

#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
#define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
/* Forward declaration: defined later in this file. */
static void ctdb_restart_recd(struct tevent_context *ev,
			      struct tevent_timer *te, struct timeval t,
			      void *private_data);
276 ban a node for a period of time
278 static void ctdb_ban_node(struct ctdb_recoverd
*rec
, uint32_t pnn
, uint32_t ban_time
)
281 struct ctdb_context
*ctdb
= rec
->ctdb
;
282 struct ctdb_ban_state bantime
;
284 if (!ctdb_validate_pnn(ctdb
, pnn
)) {
285 DEBUG(DEBUG_ERR
,("Bad pnn %u in ctdb_ban_node\n", pnn
));
289 DEBUG(DEBUG_NOTICE
,("Banning node %u for %u seconds\n", pnn
, ban_time
));
292 bantime
.time
= ban_time
;
294 ret
= ctdb_ctrl_set_ban(ctdb
, CONTROL_TIMEOUT(), pnn
, &bantime
);
296 DEBUG(DEBUG_ERR
,(__location__
" Failed to ban node %d\n", pnn
));
/* Outcome of one monitoring pass over the cluster. */
enum monitor_result {
	MONITOR_OK,
	MONITOR_RECOVERY_NEEDED,
	MONITOR_ELECTION_NEEDED,
	MONITOR_FAILED
};
306 remember the trouble maker
308 static void ctdb_set_culprit_count(struct ctdb_recoverd
*rec
, uint32_t culprit
, uint32_t count
)
310 struct ctdb_context
*ctdb
= talloc_get_type(rec
->ctdb
, struct ctdb_context
);
311 struct ctdb_banning_state
*ban_state
;
313 if (culprit
> ctdb
->num_nodes
) {
314 DEBUG(DEBUG_ERR
,("Trying to set culprit %d but num_nodes is %d\n", culprit
, ctdb
->num_nodes
));
318 /* If we are banned or stopped, do not set other nodes as culprits */
319 if (rec
->node_flags
& NODE_FLAGS_INACTIVE
) {
320 DEBUG(DEBUG_NOTICE
, ("This node is INACTIVE, cannot set culprit node %d\n", culprit
));
324 if (ctdb
->nodes
[culprit
]->ban_state
== NULL
) {
325 ctdb
->nodes
[culprit
]->ban_state
= talloc_zero(ctdb
->nodes
[culprit
], struct ctdb_banning_state
);
326 CTDB_NO_MEMORY_VOID(ctdb
, ctdb
->nodes
[culprit
]->ban_state
);
330 ban_state
= ctdb
->nodes
[culprit
]->ban_state
;
331 if (timeval_elapsed(&ban_state
->last_reported_time
) > ctdb
->tunable
.recovery_grace_period
) {
332 /* this was the first time in a long while this node
333 misbehaved so we will forgive any old transgressions.
335 ban_state
->count
= 0;
338 ban_state
->count
+= count
;
339 ban_state
->last_reported_time
= timeval_current();
340 rec
->last_culprit_node
= culprit
;
/*
  remember the trouble maker - convenience wrapper adding one credit
 */
static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
{
	ctdb_set_culprit_count(rec, culprit, 1);
}
352 Retrieve capabilities from all connected nodes
354 static int update_capabilities(struct ctdb_recoverd
*rec
,
355 struct ctdb_node_map_old
*nodemap
)
359 struct ctdb_node_capabilities
*caps
;
360 struct ctdb_context
*ctdb
= rec
->ctdb
;
362 tmp_ctx
= talloc_new(rec
);
363 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
365 caps
= ctdb_get_capabilities(ctdb
, tmp_ctx
,
366 CONTROL_TIMEOUT(), nodemap
);
370 (__location__
" Failed to get node capabilities\n"));
371 talloc_free(tmp_ctx
);
375 capp
= ctdb_get_node_capabilities(caps
, ctdb_get_pnn(ctdb
));
379 " Capabilities don't include current node.\n"));
380 talloc_free(tmp_ctx
);
383 ctdb
->capabilities
= *capp
;
385 TALLOC_FREE(rec
->caps
);
386 rec
->caps
= talloc_steal(rec
, caps
);
388 talloc_free(tmp_ctx
);
393 change recovery mode on all nodes
395 static int set_recovery_mode(struct ctdb_context
*ctdb
,
396 struct ctdb_recoverd
*rec
,
397 struct ctdb_node_map_old
*nodemap
,
404 tmp_ctx
= talloc_new(ctdb
);
405 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
407 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
409 data
.dsize
= sizeof(uint32_t);
410 data
.dptr
= (unsigned char *)&rec_mode
;
412 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_SET_RECMODE
,
418 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode. Recovery failed.\n"));
419 talloc_free(tmp_ctx
);
423 talloc_free(tmp_ctx
);
428 * Update flags on all connected nodes
430 static int update_flags_on_all_nodes(struct ctdb_context
*ctdb
,
436 ret
= ctdb_ctrl_modflags(ctdb
, CONTROL_TIMEOUT(), pnn
, flags
, ~flags
);
438 DEBUG(DEBUG_ERR
, (__location__
" Unable to update nodeflags on remote nodes\n"));
/*
  called when ctdb_wait_timeout should finish
 */
static void ctdb_wait_handler(struct tevent_context *ev,
			      struct tevent_timer *te,
			      struct timeval yt, void *p)
{
	uint32_t *timed_out = (uint32_t *)p;
	(*timed_out) = 1;
}
457 wait for a given number of seconds
459 static void ctdb_wait_timeout(struct ctdb_context
*ctdb
, double secs
)
461 uint32_t timed_out
= 0;
462 time_t usecs
= (secs
- (time_t)secs
) * 1000000;
463 tevent_add_timer(ctdb
->ev
, ctdb
, timeval_current_ofs(secs
, usecs
),
464 ctdb_wait_handler
, &timed_out
);
466 tevent_loop_once(ctdb
->ev
);
471 called when an election times out (ends)
473 static void ctdb_election_timeout(struct tevent_context
*ev
,
474 struct tevent_timer
*te
,
475 struct timeval t
, void *p
)
477 struct ctdb_recoverd
*rec
= talloc_get_type(p
, struct ctdb_recoverd
);
478 rec
->election_timeout
= NULL
;
481 D_WARNING("Election period ended, master=%u\n", rec
->recmaster
);
486 wait for an election to finish. It finished election_timeout seconds after
487 the last election packet is received
489 static void ctdb_wait_election(struct ctdb_recoverd
*rec
)
491 struct ctdb_context
*ctdb
= rec
->ctdb
;
492 while (rec
->election_timeout
) {
493 tevent_loop_once(ctdb
->ev
);
498 Update our local flags from all remote connected nodes.
499 This is only run when we are or we belive we are the recovery master
501 static int update_local_flags(struct ctdb_recoverd
*rec
, struct ctdb_node_map_old
*nodemap
)
504 struct ctdb_context
*ctdb
= rec
->ctdb
;
505 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
507 /* get the nodemap for all active remote nodes and verify
508 they are the same as for this node
510 for (j
=0; j
<nodemap
->num
; j
++) {
511 struct ctdb_node_map_old
*remote_nodemap
=NULL
;
514 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_DISCONNECTED
) {
517 if (nodemap
->nodes
[j
].pnn
== ctdb
->pnn
) {
521 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
522 mem_ctx
, &remote_nodemap
);
524 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from remote node %u\n",
525 nodemap
->nodes
[j
].pnn
));
526 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
527 talloc_free(mem_ctx
);
530 if (nodemap
->nodes
[j
].flags
!= remote_nodemap
->nodes
[j
].flags
) {
531 /* We should tell our daemon about this so it
532 updates its flags or else we will log the same
533 message again in the next iteration of recovery.
534 Since we are the recovery master we can just as
535 well update the flags on all nodes.
537 ret
= ctdb_ctrl_modflags(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
, remote_nodemap
->nodes
[j
].flags
, ~remote_nodemap
->nodes
[j
].flags
);
539 DEBUG(DEBUG_ERR
, (__location__
" Unable to update nodeflags on remote nodes\n"));
543 /* Update our local copy of the flags in the recovery
546 DEBUG(DEBUG_NOTICE
,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
547 nodemap
->nodes
[j
].pnn
, remote_nodemap
->nodes
[j
].flags
,
548 nodemap
->nodes
[j
].flags
));
549 nodemap
->nodes
[j
].flags
= remote_nodemap
->nodes
[j
].flags
;
551 talloc_free(remote_nodemap
);
553 talloc_free(mem_ctx
);
558 /* Create a new random generation id.
559 The generation id can not be the INVALID_GENERATION id
561 static uint32_t new_generation(void)
566 generation
= random();
568 if (generation
!= INVALID_GENERATION
) {
576 static bool ctdb_recovery_have_lock(struct ctdb_recoverd
*rec
)
578 return (rec
->recovery_lock_handle
!= NULL
);
/* Book-keeping for an in-flight or held recovery lock. */
struct ctdb_recovery_lock_handle {
	bool done;	/* helper has reported a result */
	bool locked;	/* lock was successfully taken */
	double latency;
	struct ctdb_cluster_mutex_handle *h;
	struct ctdb_recoverd *rec;
};
589 static void take_reclock_handler(char status
,
593 struct ctdb_recovery_lock_handle
*s
=
594 (struct ctdb_recovery_lock_handle
*) private_data
;
596 s
->locked
= (status
== '0') ;
599 * If unsuccessful then ensure the process has exited and that
600 * the file descriptor event handler has been cancelled
608 s
->latency
= latency
;
612 D_ERR("Unable to take recovery lock - contention\n");
616 D_ERR("Unable to take recovery lock - timeout\n");
620 D_ERR("Unable to take recover lock - unknown error\n");
623 struct ctdb_recoverd
*rec
= s
->rec
;
624 struct ctdb_context
*ctdb
= rec
->ctdb
;
625 uint32_t pnn
= ctdb_get_pnn(ctdb
);
627 D_ERR("Banning this node\n");
630 ctdb
->tunable
.recovery_ban_period
);
/* Forward declaration: defined later in this file. */
static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
			   struct ctdb_node_map_old *nodemap);
641 static void lost_reclock_handler(void *private_data
)
643 struct ctdb_recoverd
*rec
= talloc_get_type_abort(
644 private_data
, struct ctdb_recoverd
);
646 D_ERR("Recovery lock helper terminated, triggering an election\n");
647 TALLOC_FREE(rec
->recovery_lock_handle
);
649 force_election(rec
, ctdb_get_pnn(rec
->ctdb
), rec
->nodemap
);
652 static bool ctdb_recovery_lock(struct ctdb_recoverd
*rec
)
654 struct ctdb_context
*ctdb
= rec
->ctdb
;
655 struct ctdb_cluster_mutex_handle
*h
;
656 struct ctdb_recovery_lock_handle
*s
;
658 s
= talloc_zero(rec
, struct ctdb_recovery_lock_handle
);
660 DBG_ERR("Memory allocation error\n");
666 h
= ctdb_cluster_mutex(s
,
670 take_reclock_handler
,
672 lost_reclock_handler
,
679 rec
->recovery_lock_handle
= s
;
683 tevent_loop_once(ctdb
->ev
);
687 TALLOC_FREE(rec
->recovery_lock_handle
);
691 ctdb_ctrl_report_recd_lock_latency(ctdb
,
698 static void ctdb_recovery_unlock(struct ctdb_recoverd
*rec
)
700 if (rec
->recovery_lock_handle
== NULL
) {
704 if (! rec
->recovery_lock_handle
->done
) {
706 * Taking of recovery lock still in progress. Free
707 * the cluster mutex handle to release it but leave
708 * the recovery lock handle in place to allow taking
709 * of the lock to fail.
711 D_NOTICE("Cancelling recovery lock\n");
712 TALLOC_FREE(rec
->recovery_lock_handle
->h
);
713 rec
->recovery_lock_handle
->done
= true;
714 rec
->recovery_lock_handle
->locked
= false;
718 D_NOTICE("Releasing recovery lock\n");
719 TALLOC_FREE(rec
->recovery_lock_handle
);
722 static void ban_misbehaving_nodes(struct ctdb_recoverd
*rec
, bool *self_ban
)
724 struct ctdb_context
*ctdb
= rec
->ctdb
;
726 struct ctdb_banning_state
*ban_state
;
729 for (i
=0; i
<ctdb
->num_nodes
; i
++) {
730 if (ctdb
->nodes
[i
]->ban_state
== NULL
) {
733 ban_state
= (struct ctdb_banning_state
*)ctdb
->nodes
[i
]->ban_state
;
734 if (ban_state
->count
< 2*ctdb
->num_nodes
) {
738 DEBUG(DEBUG_NOTICE
,("Node %u reached %u banning credits - banning it for %u seconds\n",
739 ctdb
->nodes
[i
]->pnn
, ban_state
->count
,
740 ctdb
->tunable
.recovery_ban_period
));
741 ctdb_ban_node(rec
, ctdb
->nodes
[i
]->pnn
, ctdb
->tunable
.recovery_ban_period
);
742 ban_state
->count
= 0;
744 /* Banning ourself? */
745 if (ctdb
->nodes
[i
]->pnn
== rec
->ctdb
->pnn
) {
/* State shared with a forked helper process: the result pipe, the
 * child pid and its exit status. */
struct helper_state {
	int fd[2];
	pid_t pid;
	int result;
	bool done;
};
758 static void helper_handler(struct tevent_context
*ev
,
759 struct tevent_fd
*fde
,
760 uint16_t flags
, void *private_data
)
762 struct helper_state
*state
= talloc_get_type_abort(
763 private_data
, struct helper_state
);
766 ret
= sys_read(state
->fd
[0], &state
->result
, sizeof(state
->result
));
767 if (ret
!= sizeof(state
->result
)) {
768 state
->result
= EPIPE
;
774 static int helper_run(struct ctdb_recoverd
*rec
, TALLOC_CTX
*mem_ctx
,
775 const char *prog
, const char *arg
, const char *type
)
777 struct helper_state
*state
;
778 struct tevent_fd
*fde
;
781 uint32_t recmaster
= rec
->recmaster
;
783 state
= talloc_zero(mem_ctx
, struct helper_state
);
785 DEBUG(DEBUG_ERR
, (__location__
" memory error\n"));
791 ret
= pipe(state
->fd
);
794 ("Failed to create pipe for %s helper\n", type
));
798 set_close_on_exec(state
->fd
[0]);
801 args
= talloc_array(state
, const char *, nargs
);
803 DEBUG(DEBUG_ERR
, (__location__
" memory error\n"));
807 args
[0] = talloc_asprintf(args
, "%d", state
->fd
[1]);
808 if (args
[0] == NULL
) {
809 DEBUG(DEBUG_ERR
, (__location__
" memory error\n"));
812 args
[1] = rec
->ctdb
->daemon
.name
;
816 if (args
[2] == NULL
) {
820 state
->pid
= ctdb_vfork_exec(state
, rec
->ctdb
, prog
, nargs
, args
);
821 if (state
->pid
== -1) {
823 ("Failed to create child for %s helper\n", type
));
832 fde
= tevent_add_fd(rec
->ctdb
->ev
, state
, state
->fd
[0],
833 TEVENT_FD_READ
, helper_handler
, state
);
837 tevent_fd_set_auto_close(fde
);
839 while (!state
->done
) {
840 tevent_loop_once(rec
->ctdb
->ev
);
842 /* If recmaster changes, we have lost election */
843 if (recmaster
!= rec
->recmaster
) {
844 D_ERR("Recmaster changed to %u, aborting %s\n",
845 rec
->recmaster
, type
);
854 if (state
->result
!= 0) {
858 ctdb_kill(rec
->ctdb
, state
->pid
, SIGKILL
);
863 if (state
->fd
[0] != -1) {
866 if (state
->fd
[1] != -1) {
869 if (state
->pid
!= -1) {
870 ctdb_kill(rec
->ctdb
, state
->pid
, SIGKILL
);
877 static int ctdb_takeover(struct ctdb_recoverd
*rec
,
878 uint32_t *force_rebalance_nodes
)
880 static char prog
[PATH_MAX
+1] = "";
885 if (!ctdb_set_helper("takeover_helper", prog
, sizeof(prog
),
886 "CTDB_TAKEOVER_HELPER", CTDB_HELPER_BINDIR
,
887 "ctdb_takeover_helper")) {
888 ctdb_die(rec
->ctdb
, "Unable to set takeover helper\n");
892 for (i
= 0; i
< talloc_array_length(force_rebalance_nodes
); i
++) {
893 uint32_t pnn
= force_rebalance_nodes
[i
];
895 arg
= talloc_asprintf(rec
, "%u", pnn
);
897 arg
= talloc_asprintf_append(arg
, ",%u", pnn
);
900 DEBUG(DEBUG_ERR
, (__location__
" memory error\n"));
905 if (ctdb_config
.failover_disabled
) {
906 ret
= setenv("CTDB_DISABLE_IP_FAILOVER", "1", 1);
908 D_ERR("Failed to set CTDB_DISABLE_IP_FAILOVER variable\n");
913 return helper_run(rec
, rec
, prog
, arg
, "takeover");
916 static bool do_takeover_run(struct ctdb_recoverd
*rec
,
917 struct ctdb_node_map_old
*nodemap
)
919 uint32_t *nodes
= NULL
;
920 struct ctdb_disable_message dtr
;
923 uint32_t *rebalance_nodes
= rec
->force_rebalance_nodes
;
927 DEBUG(DEBUG_NOTICE
, ("Takeover run starting\n"));
929 if (ctdb_op_is_in_progress(rec
->takeover_run
)) {
930 DEBUG(DEBUG_ERR
, (__location__
931 " takeover run already in progress \n"));
936 if (!ctdb_op_begin(rec
->takeover_run
)) {
941 /* Disable IP checks (takeover runs, really) on other nodes
942 * while doing this takeover run. This will stop those other
943 * nodes from triggering takeover runs when think they should
944 * be hosting an IP but it isn't yet on an interface. Don't
945 * wait for replies since a failure here might cause some
946 * noise in the logs but will not actually cause a problem.
949 dtr
.srvid
= 0; /* No reply */
952 data
.dptr
= (uint8_t*)&dtr
;
953 data
.dsize
= sizeof(dtr
);
955 nodes
= list_of_connected_nodes(rec
->ctdb
, nodemap
, rec
, false);
957 /* Disable for 60 seconds. This can be a tunable later if
961 for (i
= 0; i
< talloc_array_length(nodes
); i
++) {
962 if (ctdb_client_send_message(rec
->ctdb
, nodes
[i
],
963 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
965 DEBUG(DEBUG_INFO
,("Failed to disable takeover runs\n"));
969 ret
= ctdb_takeover(rec
, rec
->force_rebalance_nodes
);
971 /* Reenable takeover runs and IP checks on other nodes */
973 for (i
= 0; i
< talloc_array_length(nodes
); i
++) {
974 if (ctdb_client_send_message(rec
->ctdb
, nodes
[i
],
975 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
977 DEBUG(DEBUG_INFO
,("Failed to re-enable takeover runs\n"));
982 DEBUG(DEBUG_ERR
, ("ctdb_takeover_run() failed\n"));
988 /* Takeover run was successful so clear force rebalance targets */
989 if (rebalance_nodes
== rec
->force_rebalance_nodes
) {
990 TALLOC_FREE(rec
->force_rebalance_nodes
);
993 ("Rebalance target nodes changed during takeover run - not clearing\n"));
996 rec
->need_takeover_run
= !ok
;
998 ctdb_op_end(rec
->takeover_run
);
1000 DEBUG(DEBUG_NOTICE
, ("Takeover run %s\n", ok
? "completed successfully" : "unsuccessful"));
1004 static int db_recovery_parallel(struct ctdb_recoverd
*rec
, TALLOC_CTX
*mem_ctx
)
1006 static char prog
[PATH_MAX
+1] = "";
1009 if (!ctdb_set_helper("recovery_helper", prog
, sizeof(prog
),
1010 "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR
,
1011 "ctdb_recovery_helper")) {
1012 ctdb_die(rec
->ctdb
, "Unable to set recovery helper\n");
1015 arg
= talloc_asprintf(mem_ctx
, "%u", new_generation());
1017 DEBUG(DEBUG_ERR
, (__location__
" memory error\n"));
1021 setenv("CTDB_DBDIR_STATE", rec
->ctdb
->db_directory_state
, 1);
1023 return helper_run(rec
, mem_ctx
, prog
, arg
, "recovery");
1027 we are the recmaster, and recovery is needed - start a recovery run
1029 static int do_recovery(struct ctdb_recoverd
*rec
,
1030 TALLOC_CTX
*mem_ctx
, uint32_t pnn
,
1031 struct ctdb_node_map_old
*nodemap
, struct ctdb_vnn_map
*vnnmap
)
1033 struct ctdb_context
*ctdb
= rec
->ctdb
;
1038 DEBUG(DEBUG_NOTICE
, (__location__
" Starting do_recovery\n"));
1040 /* Check if the current node is still the recmaster. It's possible that
1041 * re-election has changed the recmaster.
1043 if (pnn
!= rec
->recmaster
) {
1045 ("Recovery master changed to %u, aborting recovery\n",
1050 /* if recovery fails, force it again */
1051 rec
->need_recovery
= true;
1053 if (!ctdb_op_begin(rec
->recovery
)) {
1057 if (rec
->election_timeout
) {
1058 /* an election is in progress */
1059 DEBUG(DEBUG_ERR
, ("do_recovery called while election in progress - try again later\n"));
1063 ban_misbehaving_nodes(rec
, &self_ban
);
1065 DEBUG(DEBUG_NOTICE
, ("This node was banned, aborting recovery\n"));
1069 if (ctdb
->recovery_lock
!= NULL
) {
1070 if (ctdb_recovery_have_lock(rec
)) {
1071 D_NOTICE("Already holding recovery lock\n");
1075 D_NOTICE("Attempting to take recovery lock (%s)\n",
1076 ctdb
->recovery_lock
);
1078 ok
= ctdb_recovery_lock(rec
);
1080 D_ERR("Unable to take recovery lock\n");
1082 if (pnn
!= rec
->recmaster
) {
1083 D_NOTICE("Recovery master changed to %u,"
1084 " aborting recovery\n",
1086 rec
->need_recovery
= false;
1090 if (ctdb
->runstate
==
1091 CTDB_RUNSTATE_FIRST_RECOVERY
) {
1093 * First recovery? Perhaps
1094 * current node does not yet
1095 * know who the recmaster is.
1097 D_ERR("Retrying recovery\n");
1101 D_ERR("Abort recovery, "
1102 "ban this node for %u seconds\n",
1103 ctdb
->tunable
.recovery_ban_period
);
1106 ctdb
->tunable
.recovery_ban_period
);
1109 D_NOTICE("Recovery lock taken successfully\n");
1113 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery initiated due to problem with node %u\n", rec
->last_culprit_node
));
1115 /* Retrieve capabilities from all connected nodes */
1116 ret
= update_capabilities(rec
, nodemap
);
1118 DEBUG(DEBUG_ERR
, (__location__
" Unable to update node capabilities.\n"));
1123 update all nodes to have the same flags that we have
1125 for (i
=0;i
<nodemap
->num
;i
++) {
1126 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
) {
1130 ret
= update_flags_on_all_nodes(ctdb
,
1132 nodemap
->nodes
[i
].flags
);
1134 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_INACTIVE
) {
1135 DEBUG(DEBUG_WARNING
, (__location__
"Unable to update flags on inactive node %d\n", i
));
1137 DEBUG(DEBUG_ERR
, (__location__
" Unable to update flags on all nodes for node %d\n", i
));
1143 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - updated flags\n"));
1145 ret
= db_recovery_parallel(rec
, mem_ctx
);
1150 do_takeover_run(rec
, nodemap
);
1152 /* send a message to all clients telling them that the cluster
1153 has been reconfigured */
1154 ret
= ctdb_client_send_message(ctdb
, CTDB_BROADCAST_CONNECTED
,
1155 CTDB_SRVID_RECONFIGURE
, tdb_null
);
1157 DEBUG(DEBUG_ERR
, (__location__
" Failed to send reconfigure message\n"));
1161 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery complete\n"));
1163 rec
->need_recovery
= false;
1164 ctdb_op_end(rec
->recovery
);
1166 /* we managed to complete a full recovery, make sure to forgive
1167 any past sins by the nodes that could now participate in the
1170 DEBUG(DEBUG_ERR
,("Resetting ban count to 0 for all nodes\n"));
1171 for (i
=0;i
<nodemap
->num
;i
++) {
1172 struct ctdb_banning_state
*ban_state
;
1174 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
) {
1178 ban_state
= (struct ctdb_banning_state
*)ctdb
->nodes
[nodemap
->nodes
[i
].pnn
]->ban_state
;
1179 if (ban_state
== NULL
) {
1183 ban_state
->count
= 0;
1186 /* We just finished a recovery successfully.
1187 We now wait for rerecovery_timeout before we allow
1188 another recovery to take place.
1190 DEBUG(DEBUG_NOTICE
, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb
->tunable
.rerecovery_timeout
));
1191 ctdb_op_disable(rec
->recovery
, ctdb
->ev
,
1192 ctdb
->tunable
.rerecovery_timeout
);
1196 ctdb_op_end(rec
->recovery
);
/*
  elections are won by first checking the number of connected nodes, then
  the priority time, then the pnn
 */
struct election_message {
	uint32_t num_connected;
	struct timeval priority_time;
	uint32_t pnn;
	uint32_t node_flags;
};
1213 form this nodes election data
1215 static void ctdb_election_data(struct ctdb_recoverd
*rec
, struct election_message
*em
)
1219 struct ctdb_node_map_old
*nodemap
;
1220 struct ctdb_context
*ctdb
= rec
->ctdb
;
1224 em
->pnn
= rec
->ctdb
->pnn
;
1225 em
->priority_time
= rec
->priority_time
;
1227 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, rec
, &nodemap
);
1229 DEBUG(DEBUG_ERR
,(__location__
" unable to get node map\n"));
1233 rec
->node_flags
= nodemap
->nodes
[ctdb
->pnn
].flags
;
1234 em
->node_flags
= rec
->node_flags
;
1236 for (i
=0;i
<nodemap
->num
;i
++) {
1237 if (!(nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
)) {
1238 em
->num_connected
++;
1242 /* we shouldnt try to win this election if we cant be a recmaster */
1243 if ((ctdb
->capabilities
& CTDB_CAP_RECMASTER
) == 0) {
1244 em
->num_connected
= 0;
1245 em
->priority_time
= timeval_current();
1248 talloc_free(nodemap
);
1252 see if the given election data wins
1254 static bool ctdb_election_win(struct ctdb_recoverd
*rec
, struct election_message
*em
)
1256 struct election_message myem
;
1259 ctdb_election_data(rec
, &myem
);
1261 /* we cant win if we don't have the recmaster capability */
1262 if ((rec
->ctdb
->capabilities
& CTDB_CAP_RECMASTER
) == 0) {
1266 /* we cant win if we are banned */
1267 if (rec
->node_flags
& NODE_FLAGS_BANNED
) {
1271 /* we cant win if we are stopped */
1272 if (rec
->node_flags
& NODE_FLAGS_STOPPED
) {
1276 /* we will automatically win if the other node is banned */
1277 if (em
->node_flags
& NODE_FLAGS_BANNED
) {
1281 /* we will automatically win if the other node is banned */
1282 if (em
->node_flags
& NODE_FLAGS_STOPPED
) {
1286 /* then the longest running node */
1288 cmp
= timeval_compare(&em
->priority_time
, &myem
.priority_time
);
1292 cmp
= (int)myem
.pnn
- (int)em
->pnn
;
1299 send out an election request
1301 static int send_election_request(struct ctdb_recoverd
*rec
, uint32_t pnn
)
1304 TDB_DATA election_data
;
1305 struct election_message emsg
;
1307 struct ctdb_context
*ctdb
= rec
->ctdb
;
1309 srvid
= CTDB_SRVID_ELECTION
;
1311 ctdb_election_data(rec
, &emsg
);
1313 election_data
.dsize
= sizeof(struct election_message
);
1314 election_data
.dptr
= (unsigned char *)&emsg
;
1317 /* first we assume we will win the election and set
1318 recoverymaster to be ourself on the current node
1320 ret
= ctdb_ctrl_setrecmaster(ctdb
, CONTROL_TIMEOUT(),
1321 CTDB_CURRENT_NODE
, pnn
);
1323 DEBUG(DEBUG_ERR
, (__location__
" failed to set recmaster\n"));
1326 rec
->recmaster
= pnn
;
1328 /* send an election message to all active nodes */
1329 DEBUG(DEBUG_INFO
,(__location__
" Send election request to all active nodes\n"));
1330 return ctdb_client_send_message(ctdb
, CTDB_BROADCAST_ALL
, srvid
, election_data
);
1334 we think we are winning the election - send a broadcast election request
1336 static void election_send_request(struct tevent_context
*ev
,
1337 struct tevent_timer
*te
,
1338 struct timeval t
, void *p
)
1340 struct ctdb_recoverd
*rec
= talloc_get_type(p
, struct ctdb_recoverd
);
1343 ret
= send_election_request(rec
, ctdb_get_pnn(rec
->ctdb
));
1345 DEBUG(DEBUG_ERR
,("Failed to send election request!\n"));
1348 TALLOC_FREE(rec
->send_election_te
);
1352 handler for memory dumps
1354 static void mem_dump_handler(uint64_t srvid
, TDB_DATA data
, void *private_data
)
1356 struct ctdb_recoverd
*rec
= talloc_get_type(
1357 private_data
, struct ctdb_recoverd
);
1358 struct ctdb_context
*ctdb
= rec
->ctdb
;
1359 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
1362 struct ctdb_srvid_message
*rd
;
1364 if (data
.dsize
!= sizeof(struct ctdb_srvid_message
)) {
1365 DEBUG(DEBUG_ERR
, (__location__
" Wrong size of return address.\n"));
1366 talloc_free(tmp_ctx
);
1369 rd
= (struct ctdb_srvid_message
*)data
.dptr
;
1371 dump
= talloc_zero(tmp_ctx
, TDB_DATA
);
1373 DEBUG(DEBUG_ERR
, (__location__
" Failed to allocate memory for memdump\n"));
1374 talloc_free(tmp_ctx
);
1377 ret
= ctdb_dump_memory(ctdb
, dump
);
1379 DEBUG(DEBUG_ERR
, (__location__
" ctdb_dump_memory() failed\n"));
1380 talloc_free(tmp_ctx
);
1384 DEBUG(DEBUG_ERR
, ("recovery master memory dump\n"));
1386 ret
= ctdb_client_send_message(ctdb
, rd
->pnn
, rd
->srvid
, *dump
);
1388 DEBUG(DEBUG_ERR
,("Failed to send rd memdump reply message\n"));
1389 talloc_free(tmp_ctx
);
1393 talloc_free(tmp_ctx
);
1397 handler for reload_nodes
1399 static void reload_nodes_handler(uint64_t srvid
, TDB_DATA data
,
1402 struct ctdb_recoverd
*rec
= talloc_get_type(
1403 private_data
, struct ctdb_recoverd
);
1405 DEBUG(DEBUG_ERR
, (__location__
" Reload nodes file from recovery daemon\n"));
1407 ctdb_load_nodes_file(rec
->ctdb
);
1411 static void recd_node_rebalance_handler(uint64_t srvid
, TDB_DATA data
,
1414 struct ctdb_recoverd
*rec
= talloc_get_type(
1415 private_data
, struct ctdb_recoverd
);
1416 struct ctdb_context
*ctdb
= rec
->ctdb
;
1421 if (rec
->recmaster
!= ctdb_get_pnn(ctdb
)) {
1425 if (data
.dsize
!= sizeof(uint32_t)) {
1426 DEBUG(DEBUG_ERR
,(__location__
" Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data
.dsize
, sizeof(uint32_t)));
1430 pnn
= *(uint32_t *)&data
.dptr
[0];
1432 DEBUG(DEBUG_NOTICE
,("Setting up rebalance of IPs to node %u\n", pnn
));
1434 /* Copy any existing list of nodes. There's probably some
1435 * sort of realloc variant that will do this but we need to
1436 * make sure that freeing the old array also cancels the timer
1437 * event for the timeout... not sure if realloc will do that.
1439 len
= (rec
->force_rebalance_nodes
!= NULL
) ?
1440 talloc_array_length(rec
->force_rebalance_nodes
) :
1443 /* This allows duplicates to be added but they don't cause
1444 * harm. A call to add a duplicate PNN arguably means that
1445 * the timeout should be reset, so this is the simplest
1448 t
= talloc_zero_array(rec
, uint32_t, len
+1);
1449 CTDB_NO_MEMORY_VOID(ctdb
, t
);
1451 memcpy(t
, rec
->force_rebalance_nodes
, sizeof(uint32_t) * len
);
1455 talloc_free(rec
->force_rebalance_nodes
);
1457 rec
->force_rebalance_nodes
= t
;
1462 static void srvid_disable_and_reply(struct ctdb_context
*ctdb
,
1464 struct ctdb_op_state
*op_state
)
1466 struct ctdb_disable_message
*r
;
1471 /* Validate input data */
1472 if (data
.dsize
!= sizeof(struct ctdb_disable_message
)) {
1473 DEBUG(DEBUG_ERR
,(__location__
" Wrong size for data :%lu "
1474 "expecting %lu\n", (long unsigned)data
.dsize
,
1475 (long unsigned)sizeof(struct ctdb_srvid_message
)));
1478 if (data
.dptr
== NULL
) {
1479 DEBUG(DEBUG_ERR
,(__location__
" No data received\n"));
1483 r
= (struct ctdb_disable_message
*)data
.dptr
;
1484 timeout
= r
->timeout
;
1486 ret
= ctdb_op_disable(op_state
, ctdb
->ev
, timeout
);
1491 /* Returning our PNN tells the caller that we succeeded */
1492 ret
= ctdb_get_pnn(ctdb
);
1494 result
.dsize
= sizeof(int32_t);
1495 result
.dptr
= (uint8_t *)&ret
;
1496 srvid_request_reply(ctdb
, (struct ctdb_srvid_message
*)r
, result
);
1499 static void disable_takeover_runs_handler(uint64_t srvid
, TDB_DATA data
,
1502 struct ctdb_recoverd
*rec
= talloc_get_type(
1503 private_data
, struct ctdb_recoverd
);
1505 srvid_disable_and_reply(rec
->ctdb
, data
, rec
->takeover_run
);
1508 /* Backward compatibility for this SRVID */
1509 static void disable_ip_check_handler(uint64_t srvid
, TDB_DATA data
,
1512 struct ctdb_recoverd
*rec
= talloc_get_type(
1513 private_data
, struct ctdb_recoverd
);
1516 if (data
.dsize
!= sizeof(uint32_t)) {
1517 DEBUG(DEBUG_ERR
,(__location__
" Wrong size for data :%lu "
1518 "expecting %lu\n", (long unsigned)data
.dsize
,
1519 (long unsigned)sizeof(uint32_t)));
1522 if (data
.dptr
== NULL
) {
1523 DEBUG(DEBUG_ERR
,(__location__
" No data received\n"));
1527 timeout
= *((uint32_t *)data
.dptr
);
1529 ctdb_op_disable(rec
->takeover_run
, rec
->ctdb
->ev
, timeout
);
1532 static void disable_recoveries_handler(uint64_t srvid
, TDB_DATA data
,
1535 struct ctdb_recoverd
*rec
= talloc_get_type(
1536 private_data
, struct ctdb_recoverd
);
1538 srvid_disable_and_reply(rec
->ctdb
, data
, rec
->recovery
);
1542 handler for ip reallocate, just add it to the list of requests and
1543 handle this later in the monitor_cluster loop so we do not recurse
1544 with other requests to takeover_run()
1546 static void ip_reallocate_handler(uint64_t srvid
, TDB_DATA data
,
1549 struct ctdb_srvid_message
*request
;
1550 struct ctdb_recoverd
*rec
= talloc_get_type(
1551 private_data
, struct ctdb_recoverd
);
1553 if (data
.dsize
!= sizeof(struct ctdb_srvid_message
)) {
1554 DEBUG(DEBUG_ERR
, (__location__
" Wrong size of return address.\n"));
1558 request
= (struct ctdb_srvid_message
*)data
.dptr
;
1560 srvid_request_add(rec
->ctdb
, &rec
->reallocate_requests
, request
);
1563 static void process_ipreallocate_requests(struct ctdb_context
*ctdb
,
1564 struct ctdb_recoverd
*rec
)
1568 struct srvid_requests
*current
;
1570 /* Only process requests that are currently pending. More
1571 * might come in while the takeover run is in progress and
1572 * they will need to be processed later since they might
1573 * be in response flag changes.
1575 current
= rec
->reallocate_requests
;
1576 rec
->reallocate_requests
= NULL
;
1578 if (do_takeover_run(rec
, rec
->nodemap
)) {
1579 ret
= ctdb_get_pnn(ctdb
);
1584 result
.dsize
= sizeof(int32_t);
1585 result
.dptr
= (uint8_t *)&ret
;
1587 srvid_requests_reply(ctdb
, ¤t
, result
);
1591 * handler for assigning banning credits
1593 static void banning_handler(uint64_t srvid
, TDB_DATA data
, void *private_data
)
1595 struct ctdb_recoverd
*rec
= talloc_get_type(
1596 private_data
, struct ctdb_recoverd
);
1599 /* Ignore if we are not recmaster */
1600 if (rec
->ctdb
->pnn
!= rec
->recmaster
) {
1604 if (data
.dsize
!= sizeof(uint32_t)) {
1605 DEBUG(DEBUG_ERR
, (__location__
"invalid data size %zu\n",
1610 ban_pnn
= *(uint32_t *)data
.dptr
;
1612 ctdb_set_culprit_count(rec
, ban_pnn
, rec
->nodemap
->num
);
1616 handler for recovery master elections
1618 static void election_handler(uint64_t srvid
, TDB_DATA data
, void *private_data
)
1620 struct ctdb_recoverd
*rec
= talloc_get_type(
1621 private_data
, struct ctdb_recoverd
);
1622 struct ctdb_context
*ctdb
= rec
->ctdb
;
1624 struct election_message
*em
= (struct election_message
*)data
.dptr
;
1626 /* Ignore election packets from ourself */
1627 if (ctdb
->pnn
== em
->pnn
) {
1631 /* we got an election packet - update the timeout for the election */
1632 talloc_free(rec
->election_timeout
);
1633 rec
->election_timeout
= tevent_add_timer(
1636 timeval_current_ofs(0, 500000) :
1637 timeval_current_ofs(ctdb
->tunable
.election_timeout
, 0),
1638 ctdb_election_timeout
, rec
);
1640 /* someone called an election. check their election data
1641 and if we disagree and we would rather be the elected node,
1642 send a new election message to all other nodes
1644 if (ctdb_election_win(rec
, em
)) {
1645 if (!rec
->send_election_te
) {
1646 rec
->send_election_te
= tevent_add_timer(
1648 timeval_current_ofs(0, 500000),
1649 election_send_request
, rec
);
1655 TALLOC_FREE(rec
->send_election_te
);
1657 /* Release the recovery lock file */
1658 if (ctdb_recovery_have_lock(rec
)) {
1659 ctdb_recovery_unlock(rec
);
1662 /* ok, let that guy become recmaster then */
1663 ret
= ctdb_ctrl_setrecmaster(ctdb
, CONTROL_TIMEOUT(),
1664 CTDB_CURRENT_NODE
, em
->pnn
);
1666 DEBUG(DEBUG_ERR
, (__location__
" failed to set recmaster"));
1669 rec
->recmaster
= em
->pnn
;
1676 force the start of the election process
1678 static void force_election(struct ctdb_recoverd
*rec
, uint32_t pnn
,
1679 struct ctdb_node_map_old
*nodemap
)
1682 struct ctdb_context
*ctdb
= rec
->ctdb
;
1684 DEBUG(DEBUG_INFO
,(__location__
" Force an election\n"));
1686 /* set all nodes to recovery mode to stop all internode traffic */
1687 ret
= set_recovery_mode(ctdb
, rec
, nodemap
, CTDB_RECOVERY_ACTIVE
);
1689 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode to active on cluster\n"));
1693 talloc_free(rec
->election_timeout
);
1694 rec
->election_timeout
= tevent_add_timer(
1697 timeval_current_ofs(0, 500000) :
1698 timeval_current_ofs(ctdb
->tunable
.election_timeout
, 0),
1699 ctdb_election_timeout
, rec
);
1701 ret
= send_election_request(rec
, pnn
);
1703 DEBUG(DEBUG_ERR
, (__location__
" failed to initiate recmaster election"));
1707 /* wait for a few seconds to collect all responses */
1708 ctdb_wait_election(rec
);
1714 handler for when a node changes its flags
1716 static void monitor_handler(uint64_t srvid
, TDB_DATA data
, void *private_data
)
1718 struct ctdb_recoverd
*rec
= talloc_get_type(
1719 private_data
, struct ctdb_recoverd
);
1720 struct ctdb_context
*ctdb
= rec
->ctdb
;
1722 struct ctdb_node_flag_change
*c
= (struct ctdb_node_flag_change
*)data
.dptr
;
1723 struct ctdb_node_map_old
*nodemap
=NULL
;
1724 TALLOC_CTX
*tmp_ctx
;
1727 if (data
.dsize
!= sizeof(*c
)) {
1728 DEBUG(DEBUG_ERR
,(__location__
"Invalid data in ctdb_node_flag_change\n"));
1732 tmp_ctx
= talloc_new(ctdb
);
1733 CTDB_NO_MEMORY_VOID(ctdb
, tmp_ctx
);
1735 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, tmp_ctx
, &nodemap
);
1737 DEBUG(DEBUG_ERR
,(__location__
"ctdb_ctrl_getnodemap failed in monitor_handler\n"));
1738 talloc_free(tmp_ctx
);
1743 for (i
=0;i
<nodemap
->num
;i
++) {
1744 if (nodemap
->nodes
[i
].pnn
== c
->pnn
) break;
1747 if (i
== nodemap
->num
) {
1748 DEBUG(DEBUG_CRIT
,(__location__
"Flag change for non-existant node %u\n", c
->pnn
));
1749 talloc_free(tmp_ctx
);
1753 if (c
->old_flags
!= c
->new_flags
) {
1754 DEBUG(DEBUG_NOTICE
,("Node %u has changed flags - now 0x%x was 0x%x\n", c
->pnn
, c
->new_flags
, c
->old_flags
));
1757 nodemap
->nodes
[i
].flags
= c
->new_flags
;
1759 talloc_free(tmp_ctx
);
1763 handler for when we need to push out flag changes to all other nodes
1765 static void push_flags_handler(uint64_t srvid
, TDB_DATA data
,
1768 struct ctdb_recoverd
*rec
= talloc_get_type(
1769 private_data
, struct ctdb_recoverd
);
1770 struct ctdb_context
*ctdb
= rec
->ctdb
;
1772 struct ctdb_node_flag_change
*c
= (struct ctdb_node_flag_change
*)data
.dptr
;
1773 struct ctdb_node_map_old
*nodemap
=NULL
;
1774 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
1777 /* read the node flags from the recmaster */
1778 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), rec
->recmaster
,
1781 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from node %u\n", c
->pnn
));
1782 talloc_free(tmp_ctx
);
1785 if (c
->pnn
>= nodemap
->num
) {
1786 DEBUG(DEBUG_ERR
,(__location__
" Nodemap from recmaster does not contain node %d\n", c
->pnn
));
1787 talloc_free(tmp_ctx
);
1791 /* send the flags update to all connected nodes */
1792 nodes
= list_of_connected_nodes(ctdb
, nodemap
, tmp_ctx
, true);
1794 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_MODIFY_FLAGS
,
1795 nodes
, 0, CONTROL_TIMEOUT(),
1799 DEBUG(DEBUG_ERR
, (__location__
" ctdb_control to modify node flags failed\n"));
1801 talloc_free(tmp_ctx
);
1805 talloc_free(tmp_ctx
);
1809 struct verify_recmode_normal_data
{
1811 enum monitor_result status
;
1814 static void verify_recmode_normal_callback(struct ctdb_client_control_state
*state
)
1816 struct verify_recmode_normal_data
*rmdata
= talloc_get_type(state
->async
.private_data
, struct verify_recmode_normal_data
);
1819 /* one more node has responded with recmode data*/
1822 /* if we failed to get the recmode, then return an error and let
1823 the main loop try again.
1825 if (state
->state
!= CTDB_CONTROL_DONE
) {
1826 if (rmdata
->status
== MONITOR_OK
) {
1827 rmdata
->status
= MONITOR_FAILED
;
1832 /* if we got a response, then the recmode will be stored in the
1835 if (state
->status
!= CTDB_RECOVERY_NORMAL
) {
1836 DEBUG(DEBUG_NOTICE
, ("Node:%u was in recovery mode. Start recovery process\n", state
->c
->hdr
.destnode
));
1837 rmdata
->status
= MONITOR_RECOVERY_NEEDED
;
1844 /* verify that all nodes are in normal recovery mode */
1845 static enum monitor_result
verify_recmode(struct ctdb_context
*ctdb
, struct ctdb_node_map_old
*nodemap
)
1847 struct verify_recmode_normal_data
*rmdata
;
1848 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
1849 struct ctdb_client_control_state
*state
;
1850 enum monitor_result status
;
1853 rmdata
= talloc(mem_ctx
, struct verify_recmode_normal_data
);
1854 CTDB_NO_MEMORY_FATAL(ctdb
, rmdata
);
1856 rmdata
->status
= MONITOR_OK
;
1858 /* loop over all active nodes and send an async getrecmode call to
1860 for (j
=0; j
<nodemap
->num
; j
++) {
1861 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
1864 state
= ctdb_ctrl_getrecmode_send(ctdb
, mem_ctx
,
1866 nodemap
->nodes
[j
].pnn
);
1867 if (state
== NULL
) {
1868 /* we failed to send the control, treat this as
1869 an error and try again next iteration
1871 DEBUG(DEBUG_ERR
,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
1872 talloc_free(mem_ctx
);
1873 return MONITOR_FAILED
;
1876 /* set up the callback functions */
1877 state
->async
.fn
= verify_recmode_normal_callback
;
1878 state
->async
.private_data
= rmdata
;
1880 /* one more control to wait for to complete */
1885 /* now wait for up to the maximum number of seconds allowed
1886 or until all nodes we expect a response from has replied
1888 while (rmdata
->count
> 0) {
1889 tevent_loop_once(ctdb
->ev
);
1892 status
= rmdata
->status
;
1893 talloc_free(mem_ctx
);
1898 struct verify_recmaster_data
{
1899 struct ctdb_recoverd
*rec
;
1902 enum monitor_result status
;
1905 static void verify_recmaster_callback(struct ctdb_client_control_state
*state
)
1907 struct verify_recmaster_data
*rmdata
= talloc_get_type(state
->async
.private_data
, struct verify_recmaster_data
);
1910 /* one more node has responded with recmaster data*/
1913 /* if we failed to get the recmaster, then return an error and let
1914 the main loop try again.
1916 if (state
->state
!= CTDB_CONTROL_DONE
) {
1917 if (rmdata
->status
== MONITOR_OK
) {
1918 rmdata
->status
= MONITOR_FAILED
;
1923 /* if we got a response, then the recmaster will be stored in the
1926 if ((uint32_t)state
->status
!= rmdata
->pnn
) {
1927 DEBUG(DEBUG_ERR
,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state
->c
->hdr
.destnode
, state
->status
));
1928 ctdb_set_culprit(rmdata
->rec
, state
->c
->hdr
.destnode
);
1929 rmdata
->status
= MONITOR_ELECTION_NEEDED
;
1936 /* verify that all nodes agree that we are the recmaster */
1937 static enum monitor_result
verify_recmaster(struct ctdb_recoverd
*rec
, struct ctdb_node_map_old
*nodemap
, uint32_t pnn
)
1939 struct ctdb_context
*ctdb
= rec
->ctdb
;
1940 struct verify_recmaster_data
*rmdata
;
1941 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
1942 struct ctdb_client_control_state
*state
;
1943 enum monitor_result status
;
1946 rmdata
= talloc(mem_ctx
, struct verify_recmaster_data
);
1947 CTDB_NO_MEMORY_FATAL(ctdb
, rmdata
);
1951 rmdata
->status
= MONITOR_OK
;
1953 /* loop over all active nodes and send an async getrecmaster call to
1955 for (j
=0; j
<nodemap
->num
; j
++) {
1956 if (nodemap
->nodes
[j
].pnn
== rec
->recmaster
) {
1959 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
1962 state
= ctdb_ctrl_getrecmaster_send(ctdb
, mem_ctx
,
1964 nodemap
->nodes
[j
].pnn
);
1965 if (state
== NULL
) {
1966 /* we failed to send the control, treat this as
1967 an error and try again next iteration
1969 DEBUG(DEBUG_ERR
,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
1970 talloc_free(mem_ctx
);
1971 return MONITOR_FAILED
;
1974 /* set up the callback functions */
1975 state
->async
.fn
= verify_recmaster_callback
;
1976 state
->async
.private_data
= rmdata
;
1978 /* one more control to wait for to complete */
1983 /* now wait for up to the maximum number of seconds allowed
1984 or until all nodes we expect a response from has replied
1986 while (rmdata
->count
> 0) {
1987 tevent_loop_once(ctdb
->ev
);
1990 status
= rmdata
->status
;
1991 talloc_free(mem_ctx
);
1995 static bool interfaces_have_changed(struct ctdb_context
*ctdb
,
1996 struct ctdb_recoverd
*rec
)
1998 struct ctdb_iface_list_old
*ifaces
= NULL
;
1999 TALLOC_CTX
*mem_ctx
;
2002 mem_ctx
= talloc_new(NULL
);
2004 /* Read the interfaces from the local node */
2005 if (ctdb_ctrl_get_ifaces(ctdb
, CONTROL_TIMEOUT(),
2006 CTDB_CURRENT_NODE
, mem_ctx
, &ifaces
) != 0) {
2007 DEBUG(DEBUG_ERR
, ("Unable to get interfaces from local node %u\n", ctdb
->pnn
));
2008 /* We could return an error. However, this will be
2009 * rare so we'll decide that the interfaces have
2010 * actually changed, just in case.
2012 talloc_free(mem_ctx
);
2017 /* We haven't been here before so things have changed */
2018 DEBUG(DEBUG_NOTICE
, ("Initial interface fetched\n"));
2020 } else if (rec
->ifaces
->num
!= ifaces
->num
) {
2021 /* Number of interfaces has changed */
2022 DEBUG(DEBUG_NOTICE
, ("Interface count changed from %d to %d\n",
2023 rec
->ifaces
->num
, ifaces
->num
));
2026 /* See if interface names or link states have changed */
2028 for (i
= 0; i
< rec
->ifaces
->num
; i
++) {
2029 struct ctdb_iface
* iface
= &rec
->ifaces
->ifaces
[i
];
2030 if (strcmp(iface
->name
, ifaces
->ifaces
[i
].name
) != 0) {
2032 ("Interface in slot %d changed: %s => %s\n",
2033 i
, iface
->name
, ifaces
->ifaces
[i
].name
));
2037 if (iface
->link_state
!= ifaces
->ifaces
[i
].link_state
) {
2039 ("Interface %s changed state: %d => %d\n",
2040 iface
->name
, iface
->link_state
,
2041 ifaces
->ifaces
[i
].link_state
));
2048 talloc_free(rec
->ifaces
);
2049 rec
->ifaces
= talloc_steal(rec
, ifaces
);
2051 talloc_free(mem_ctx
);
2055 /* Check that the local allocation of public IP addresses is correct
2056 * and do some house-keeping */
2057 static int verify_local_ip_allocation(struct ctdb_context
*ctdb
,
2058 struct ctdb_recoverd
*rec
,
2060 struct ctdb_node_map_old
*nodemap
)
2062 TALLOC_CTX
*mem_ctx
= talloc_new(NULL
);
2065 bool need_takeover_run
= false;
2066 struct ctdb_public_ip_list_old
*ips
= NULL
;
2068 /* If we are not the recmaster then do some housekeeping */
2069 if (rec
->recmaster
!= pnn
) {
2070 /* Ignore any IP reallocate requests - only recmaster
2073 TALLOC_FREE(rec
->reallocate_requests
);
2074 /* Clear any nodes that should be force rebalanced in
2075 * the next takeover run. If the recovery master role
2076 * has moved then we don't want to process these some
2077 * time in the future.
2079 TALLOC_FREE(rec
->force_rebalance_nodes
);
2082 /* Return early if disabled... */
2083 if (ctdb_config
.failover_disabled
||
2084 ctdb_op_is_disabled(rec
->takeover_run
)) {
2085 talloc_free(mem_ctx
);
2089 if (interfaces_have_changed(ctdb
, rec
)) {
2090 need_takeover_run
= true;
2093 /* If there are unhosted IPs but this node can host them then
2094 * trigger an IP reallocation */
2096 /* Read *available* IPs from local node */
2097 ret
= ctdb_ctrl_get_public_ips_flags(
2098 ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, mem_ctx
,
2099 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE
, &ips
);
2101 DEBUG(DEBUG_ERR
, ("Unable to retrieve available public IPs\n"));
2102 talloc_free(mem_ctx
);
2106 for (j
=0; j
<ips
->num
; j
++) {
2107 if (ips
->ips
[j
].pnn
== CTDB_UNKNOWN_PNN
&&
2108 nodemap
->nodes
[pnn
].flags
== 0) {
2109 DEBUG(DEBUG_WARNING
,
2110 ("Unassigned IP %s can be served by this node\n",
2111 ctdb_addr_to_str(&ips
->ips
[j
].addr
)));
2112 need_takeover_run
= true;
2118 if (!ctdb
->do_checkpublicip
) {
2122 /* Validate the IP addresses that this node has on network
2123 * interfaces. If there is an inconsistency between reality
2124 * and the state expected by CTDB then try to fix it by
2125 * triggering an IP reallocation or releasing extraneous IP
2128 /* Read *known* IPs from local node */
2129 ret
= ctdb_ctrl_get_public_ips_flags(
2130 ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, mem_ctx
, 0, &ips
);
2132 DEBUG(DEBUG_ERR
, ("Unable to retrieve known public IPs\n"));
2133 talloc_free(mem_ctx
);
2137 for (j
=0; j
<ips
->num
; j
++) {
2138 if (ips
->ips
[j
].pnn
== pnn
) {
2139 if (!ctdb_sys_have_ip(&ips
->ips
[j
].addr
)) {
2141 ("Assigned IP %s not on an interface\n",
2142 ctdb_addr_to_str(&ips
->ips
[j
].addr
)));
2143 need_takeover_run
= true;
2146 if (ctdb_sys_have_ip(&ips
->ips
[j
].addr
)) {
2148 ("IP %s incorrectly on an interface\n",
2149 ctdb_addr_to_str(&ips
->ips
[j
].addr
)));
2150 need_takeover_run
= true;
2156 if (need_takeover_run
) {
2157 struct ctdb_srvid_message rd
;
2160 DEBUG(DEBUG_NOTICE
,("Trigger takeoverrun\n"));
2165 data
.dptr
= (uint8_t *)&rd
;
2166 data
.dsize
= sizeof(rd
);
2168 ret
= ctdb_client_send_message(ctdb
, rec
->recmaster
, CTDB_SRVID_TAKEOVER_RUN
, data
);
2171 ("Failed to send takeover run request\n"));
2174 talloc_free(mem_ctx
);
2179 static void async_getnodemap_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
2181 struct ctdb_node_map_old
**remote_nodemaps
= callback_data
;
2183 if (node_pnn
>= ctdb
->num_nodes
) {
2184 DEBUG(DEBUG_ERR
,(__location__
" pnn from invalid node\n"));
2188 remote_nodemaps
[node_pnn
] = (struct ctdb_node_map_old
*)talloc_steal(remote_nodemaps
, outdata
.dptr
);
2192 static int get_remote_nodemaps(struct ctdb_context
*ctdb
, TALLOC_CTX
*mem_ctx
,
2193 struct ctdb_node_map_old
*nodemap
,
2194 struct ctdb_node_map_old
**remote_nodemaps
)
2198 nodes
= list_of_active_nodes(ctdb
, nodemap
, mem_ctx
, true);
2199 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_GET_NODEMAP
,
2201 CONTROL_TIMEOUT(), false, tdb_null
,
2202 async_getnodemap_callback
,
2204 remote_nodemaps
) != 0) {
2205 DEBUG(DEBUG_ERR
, (__location__
" Unable to pull all remote nodemaps\n"));
2213 static bool validate_recovery_master(struct ctdb_recoverd
*rec
,
2214 TALLOC_CTX
*mem_ctx
)
2216 struct ctdb_context
*ctdb
= rec
->ctdb
;
2217 uint32_t pnn
= ctdb_get_pnn(ctdb
);
2218 struct ctdb_node_map_old
*nodemap
= rec
->nodemap
;
2219 struct ctdb_node_map_old
*recmaster_nodemap
= NULL
;
2222 /* When recovery daemon is started, recmaster is set to
2223 * "unknown" so it knows to start an election.
2225 if (rec
->recmaster
== CTDB_UNKNOWN_PNN
) {
2227 ("Initial recovery master set - forcing election\n"));
2228 force_election(rec
, pnn
, nodemap
);
2233 * If the current recmaster does not have CTDB_CAP_RECMASTER,
2234 * but we have, then force an election and try to become the new
2237 if (!ctdb_node_has_capabilities(rec
->caps
,
2239 CTDB_CAP_RECMASTER
) &&
2240 (rec
->ctdb
->capabilities
& CTDB_CAP_RECMASTER
) &&
2241 !(nodemap
->nodes
[pnn
].flags
& NODE_FLAGS_INACTIVE
)) {
2243 (" Current recmaster node %u does not have CAP_RECMASTER,"
2244 " but we (node %u) have - force an election\n",
2245 rec
->recmaster
, pnn
));
2246 force_election(rec
, pnn
, nodemap
);
2250 /* Verify that the master node has not been deleted. This
2251 * should not happen because a node should always be shutdown
2252 * before being deleted, causing a new master to be elected
2253 * before now. However, if something strange has happened
2254 * then checking here will ensure we don't index beyond the
2255 * end of the nodemap array. */
2256 if (rec
->recmaster
>= nodemap
->num
) {
2258 ("Recmaster node %u has been deleted. Force election\n",
2260 force_election(rec
, pnn
, nodemap
);
2264 /* if recovery master is disconnected/deleted we must elect a new recmaster */
2265 if (nodemap
->nodes
[rec
->recmaster
].flags
&
2266 (NODE_FLAGS_DISCONNECTED
|NODE_FLAGS_DELETED
)) {
2268 ("Recmaster node %u is disconnected/deleted. Force election\n",
2270 force_election(rec
, pnn
, nodemap
);
2274 /* get nodemap from the recovery master to check if it is inactive */
2275 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), rec
->recmaster
,
2276 mem_ctx
, &recmaster_nodemap
);
2280 " Unable to get nodemap from recovery master %u\n",
2282 /* No election, just error */
2287 if ((recmaster_nodemap
->nodes
[rec
->recmaster
].flags
& NODE_FLAGS_INACTIVE
) &&
2288 (rec
->node_flags
& NODE_FLAGS_INACTIVE
) == 0) {
2290 ("Recmaster node %u is inactive. Force election\n",
2293 * update our nodemap to carry the recmaster's notion of
2294 * its own flags, so that we don't keep freezing the
2295 * inactive recmaster node...
2297 nodemap
->nodes
[rec
->recmaster
].flags
=
2298 recmaster_nodemap
->nodes
[rec
->recmaster
].flags
;
2299 force_election(rec
, pnn
, nodemap
);
2306 static void main_loop(struct ctdb_context
*ctdb
, struct ctdb_recoverd
*rec
,
2307 TALLOC_CTX
*mem_ctx
)
2310 struct ctdb_node_map_old
*nodemap
=NULL
;
2311 struct ctdb_node_map_old
**remote_nodemaps
=NULL
;
2312 struct ctdb_vnn_map
*vnnmap
=NULL
;
2313 struct ctdb_vnn_map
*remote_vnnmap
=NULL
;
2314 uint32_t num_lmasters
;
2315 int32_t debug_level
;
2321 /* verify that the main daemon is still running */
2322 if (ctdb_kill(ctdb
, ctdb
->ctdbd_pid
, 0) != 0) {
2323 DEBUG(DEBUG_CRIT
,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2327 /* ping the local daemon to tell it we are alive */
2328 ctdb_ctrl_recd_ping(ctdb
);
2330 if (rec
->election_timeout
) {
2331 /* an election is in progress */
2335 /* read the debug level from the parent and update locally */
2336 ret
= ctdb_ctrl_get_debuglevel(ctdb
, CTDB_CURRENT_NODE
, &debug_level
);
2338 DEBUG(DEBUG_ERR
, (__location__
" Failed to read debuglevel from parent\n"));
2341 debuglevel_set(debug_level
);
2343 /* get relevant tunables */
2344 ret
= ctdb_ctrl_get_all_tunables(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, &ctdb
->tunable
);
2346 DEBUG(DEBUG_ERR
,("Failed to get tunables - retrying\n"));
2351 ret
= ctdb_ctrl_get_runstate(ctdb
, CONTROL_TIMEOUT(),
2352 CTDB_CURRENT_NODE
, &ctdb
->runstate
);
2354 DEBUG(DEBUG_ERR
, ("Failed to get runstate - retrying\n"));
2358 pnn
= ctdb_get_pnn(ctdb
);
2361 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), pnn
, rec
, &nodemap
);
2363 DBG_ERR("Unable to get nodemap from node %"PRIu32
"\n", pnn
);
2366 talloc_free(rec
->nodemap
);
2367 rec
->nodemap
= nodemap
;
2369 /* remember our own node flags */
2370 rec
->node_flags
= nodemap
->nodes
[pnn
].flags
;
2372 ban_misbehaving_nodes(rec
, &self_ban
);
2374 DEBUG(DEBUG_NOTICE
, ("This node was banned, restart main_loop\n"));
2378 ret
= ctdb_ctrl_getrecmode(ctdb
, mem_ctx
, CONTROL_TIMEOUT(),
2379 CTDB_CURRENT_NODE
, &ctdb
->recovery_mode
);
2381 D_ERR("Failed to read recmode from local node\n");
2385 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
2386 also frozen and that the recmode is set to active.
2388 if (rec
->node_flags
& (NODE_FLAGS_STOPPED
| NODE_FLAGS_BANNED
)) {
2389 /* If this node has become inactive then we want to
2390 * reduce the chances of it taking over the recovery
2391 * master role when it becomes active again. This
2392 * helps to stabilise the recovery master role so that
2393 * it stays on the most stable node.
2395 rec
->priority_time
= timeval_current();
2397 if (ctdb
->recovery_mode
== CTDB_RECOVERY_NORMAL
) {
2398 DEBUG(DEBUG_ERR
,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
2400 ret
= ctdb_ctrl_setrecmode(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, CTDB_RECOVERY_ACTIVE
);
2402 DEBUG(DEBUG_ERR
,(__location__
" Failed to activate recovery mode in STOPPED or BANNED state\n"));
2407 if (! rec
->frozen_on_inactive
) {
2408 ret
= ctdb_ctrl_freeze(ctdb
, CONTROL_TIMEOUT(),
2412 (__location__
" Failed to freeze node "
2413 "in STOPPED or BANNED state\n"));
2417 rec
->frozen_on_inactive
= true;
2420 /* If this node is stopped or banned then it is not the recovery
2421 * master, so don't do anything. This prevents stopped or banned
2422 * node from starting election and sending unnecessary controls.
2427 rec
->frozen_on_inactive
= false;
2429 /* Retrieve capabilities from all connected nodes */
2430 ret
= update_capabilities(rec
, nodemap
);
2432 DEBUG(DEBUG_ERR
, (__location__
" Unable to update node capabilities.\n"));
2436 if (! validate_recovery_master(rec
, mem_ctx
)) {
2440 if (ctdb
->recovery_mode
== CTDB_RECOVERY_NORMAL
) {
2441 /* Check if an IP takeover run is needed and trigger one if
2443 verify_local_ip_allocation(ctdb
, rec
, pnn
, nodemap
);
2446 /* if we are not the recmaster then we do not need to check
2447 if recovery is needed
2449 if (pnn
!= rec
->recmaster
) {
2454 /* ensure our local copies of flags are right */
2455 ret
= update_local_flags(rec
, nodemap
);
2457 DEBUG(DEBUG_ERR
,("Unable to update local flags\n"));
2461 if (ctdb
->num_nodes
!= nodemap
->num
) {
2462 DEBUG(DEBUG_ERR
, (__location__
" ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb
->num_nodes
, nodemap
->num
));
2463 ctdb_load_nodes_file(ctdb
);
2467 /* verify that all active nodes agree that we are the recmaster */
2468 switch (verify_recmaster(rec
, nodemap
, pnn
)) {
2469 case MONITOR_RECOVERY_NEEDED
:
2470 /* can not happen */
2472 case MONITOR_ELECTION_NEEDED
:
2473 force_election(rec
, pnn
, nodemap
);
2477 case MONITOR_FAILED
:
2482 /* get the vnnmap */
2483 ret
= ctdb_ctrl_getvnnmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, &vnnmap
);
2485 DEBUG(DEBUG_ERR
, (__location__
" Unable to get vnnmap from node %u\n", pnn
));
2489 if (rec
->need_recovery
) {
2490 /* a previous recovery didn't finish */
2491 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
2495 /* verify that all active nodes are in normal mode
2496 and not in recovery mode
2498 switch (verify_recmode(ctdb
, nodemap
)) {
2499 case MONITOR_RECOVERY_NEEDED
:
2500 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
2502 case MONITOR_FAILED
:
2504 case MONITOR_ELECTION_NEEDED
:
2505 /* can not happen */
2511 if (ctdb
->recovery_lock
!= NULL
) {
2512 /* We must already hold the recovery lock */
2513 if (!ctdb_recovery_have_lock(rec
)) {
2514 DEBUG(DEBUG_ERR
,("Failed recovery lock sanity check. Force a recovery\n"));
2515 ctdb_set_culprit(rec
, ctdb
->pnn
);
2516 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
2522 /* If recoveries are disabled then there is no use doing any
2523 * nodemap or flags checks. Recoveries might be disabled due
2524 * to "reloadnodes", so doing these checks might cause an
2525 * unnecessary recovery. */
2526 if (ctdb_op_is_disabled(rec
->recovery
)) {
2527 goto takeover_run_checks
;
2530 /* get the nodemap for all active remote nodes
2532 remote_nodemaps
= talloc_array(mem_ctx
, struct ctdb_node_map_old
*, nodemap
->num
);
2533 if (remote_nodemaps
== NULL
) {
2534 DEBUG(DEBUG_ERR
, (__location__
" failed to allocate remote nodemap array\n"));
2537 for(i
=0; i
<nodemap
->num
; i
++) {
2538 remote_nodemaps
[i
] = NULL
;
2540 if (get_remote_nodemaps(ctdb
, mem_ctx
, nodemap
, remote_nodemaps
) != 0) {
2541 DEBUG(DEBUG_ERR
,(__location__
" Failed to read remote nodemaps\n"));
2545 /* verify that all other nodes have the same nodemap as we have
2547 for (j
=0; j
<nodemap
->num
; j
++) {
2548 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
2552 if (remote_nodemaps
[j
] == NULL
) {
2553 DEBUG(DEBUG_ERR
,(__location__
" Did not get a remote nodemap for node %d, restarting monitoring\n", j
));
2554 ctdb_set_culprit(rec
, j
);
2559 /* if the nodes disagree on how many nodes there are
2560 then this is a good reason to try recovery
2562 if (remote_nodemaps
[j
]->num
!= nodemap
->num
) {
2563 DEBUG(DEBUG_ERR
, (__location__
" Remote node:%u has different node count. %u vs %u of the local node\n",
2564 nodemap
->nodes
[j
].pnn
, remote_nodemaps
[j
]->num
, nodemap
->num
));
2565 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
2566 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
2570 /* if the nodes disagree on which nodes exist and are
2571 active, then that is also a good reason to do recovery
2573 for (i
=0;i
<nodemap
->num
;i
++) {
2574 if (remote_nodemaps
[j
]->nodes
[i
].pnn
!= nodemap
->nodes
[i
].pnn
) {
2575 DEBUG(DEBUG_ERR
, (__location__
" Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
2576 nodemap
->nodes
[j
].pnn
, i
,
2577 remote_nodemaps
[j
]->nodes
[i
].pnn
, nodemap
->nodes
[i
].pnn
));
2578 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
2579 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
2587 * Update node flags obtained from each active node. This ensure we have
2588 * up-to-date information for all the nodes.
2590 for (j
=0; j
<nodemap
->num
; j
++) {
2591 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
2594 nodemap
->nodes
[j
].flags
= remote_nodemaps
[j
]->nodes
[j
].flags
;
2597 for (j
=0; j
<nodemap
->num
; j
++) {
2598 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
2602 /* verify the flags are consistent
2604 for (i
=0; i
<nodemap
->num
; i
++) {
2605 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
) {
2609 if (nodemap
->nodes
[i
].flags
!= remote_nodemaps
[j
]->nodes
[i
].flags
) {
2610 DEBUG(DEBUG_ERR
, (__location__
" Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
2611 nodemap
->nodes
[j
].pnn
,
2612 nodemap
->nodes
[i
].pnn
,
2613 remote_nodemaps
[j
]->nodes
[i
].flags
,
2614 nodemap
->nodes
[i
].flags
));
2616 DEBUG(DEBUG_ERR
,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps
[j
]->nodes
[i
].flags
, j
));
2617 update_flags_on_all_nodes(
2619 nodemap
->nodes
[i
].pnn
,
2620 remote_nodemaps
[j
]->nodes
[i
].flags
);
2621 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
2622 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
2626 DEBUG(DEBUG_ERR
,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap
->nodes
[i
].flags
, i
));
2627 update_flags_on_all_nodes(
2629 nodemap
->nodes
[i
].pnn
,
2630 nodemap
->nodes
[i
].flags
);
2631 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
2632 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
2641 /* count how many active nodes there are */
2643 for (i
=0; i
<nodemap
->num
; i
++) {
2644 if (!(nodemap
->nodes
[i
].flags
& NODE_FLAGS_INACTIVE
)) {
2645 if (ctdb_node_has_capabilities(rec
->caps
,
2646 ctdb
->nodes
[i
]->pnn
,
2647 CTDB_CAP_LMASTER
)) {
2654 /* There must be the same number of lmasters in the vnn map as
2655 * there are active nodes with the lmaster capability... or
2658 if (vnnmap
->size
!= num_lmasters
) {
2659 DEBUG(DEBUG_ERR
, (__location__
" The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
2660 vnnmap
->size
, num_lmasters
));
2661 ctdb_set_culprit(rec
, ctdb
->pnn
);
2662 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
2667 * Verify that all active lmaster nodes in the nodemap also
2668 * exist in the vnnmap
2670 for (j
=0; j
<nodemap
->num
; j
++) {
2671 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
2674 if (! ctdb_node_has_capabilities(rec
->caps
,
2675 nodemap
->nodes
[j
].pnn
,
2676 CTDB_CAP_LMASTER
)) {
2679 if (nodemap
->nodes
[j
].pnn
== pnn
) {
2683 for (i
=0; i
<vnnmap
->size
; i
++) {
2684 if (vnnmap
->map
[i
] == nodemap
->nodes
[j
].pnn
) {
2688 if (i
== vnnmap
->size
) {
2689 D_ERR("Active LMASTER node %u is not in the vnnmap\n",
2690 nodemap
->nodes
[j
].pnn
);
2691 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
2692 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
2698 /* verify that all other nodes have the same vnnmap
2699 and are from the same generation
2701 for (j
=0; j
<nodemap
->num
; j
++) {
2702 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
2705 if (nodemap
->nodes
[j
].pnn
== pnn
) {
2709 ret
= ctdb_ctrl_getvnnmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
2710 mem_ctx
, &remote_vnnmap
);
2712 DEBUG(DEBUG_ERR
, (__location__
" Unable to get vnnmap from remote node %u\n",
2713 nodemap
->nodes
[j
].pnn
));
2717 /* verify the vnnmap generation is the same */
2718 if (vnnmap
->generation
!= remote_vnnmap
->generation
) {
2719 DEBUG(DEBUG_ERR
, (__location__
" Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
2720 nodemap
->nodes
[j
].pnn
, remote_vnnmap
->generation
, vnnmap
->generation
));
2721 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
2722 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
2726 /* verify the vnnmap size is the same */
2727 if (vnnmap
->size
!= remote_vnnmap
->size
) {
2728 DEBUG(DEBUG_ERR
, (__location__
" Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
2729 nodemap
->nodes
[j
].pnn
, remote_vnnmap
->size
, vnnmap
->size
));
2730 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
2731 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
2735 /* verify the vnnmap is the same */
2736 for (i
=0;i
<vnnmap
->size
;i
++) {
2737 if (remote_vnnmap
->map
[i
] != vnnmap
->map
[i
]) {
2738 DEBUG(DEBUG_ERR
, (__location__
" Remote node %u has different vnnmap.\n",
2739 nodemap
->nodes
[j
].pnn
));
2740 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
2741 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
2748 /* FIXME: Add remote public IP checking to ensure that nodes
2749 * have the IP addresses that are allocated to them. */
2751 takeover_run_checks
:
2753 /* If there are IP takeover runs requested or the previous one
2754 * failed then perform one and notify the waiters */
2755 if (!ctdb_op_is_disabled(rec
->takeover_run
) &&
2756 (rec
->reallocate_requests
|| rec
->need_takeover_run
)) {
2757 process_ipreallocate_requests(ctdb
, rec
);
2761 static void recd_sig_term_handler(struct tevent_context
*ev
,
2762 struct tevent_signal
*se
, int signum
,
2763 int count
, void *dont_care
,
2766 struct ctdb_recoverd
*rec
= talloc_get_type_abort(
2767 private_data
, struct ctdb_recoverd
);
2769 DEBUG(DEBUG_ERR
, ("Received SIGTERM, exiting\n"));
2770 ctdb_recovery_unlock(rec
);
2775 * Periodically log elements of the cluster state
2777 * This can be used to confirm a split brain has occurred
2779 static void maybe_log_cluster_state(struct tevent_context
*ev
,
2780 struct tevent_timer
*te
,
2781 struct timeval current_time
,
2784 struct ctdb_recoverd
*rec
= talloc_get_type_abort(
2785 private_data
, struct ctdb_recoverd
);
2786 struct ctdb_context
*ctdb
= rec
->ctdb
;
2787 struct tevent_timer
*tt
;
2789 static struct timeval start_incomplete
= {
2797 unsigned int minutes
;
2798 unsigned int num_connected
;
2800 if (rec
->recmaster
!= ctdb_get_pnn(ctdb
)) {
2804 if (rec
->nodemap
== NULL
) {
2810 for (i
= 0; i
< rec
->nodemap
->num
; i
++) {
2811 struct ctdb_node_and_flags
*n
= &rec
->nodemap
->nodes
[i
];
2813 if (n
->pnn
== ctdb_get_pnn(ctdb
)) {
2816 if ((n
->flags
& NODE_FLAGS_DELETED
) != 0) {
2819 if ((n
->flags
& NODE_FLAGS_DISCONNECTED
) != 0) {
2820 is_complete
= false;
2827 was_complete
= timeval_is_zero(&start_incomplete
);
2830 if (! was_complete
) {
2831 D_WARNING("Cluster complete with master=%u\n",
2833 start_incomplete
= timeval_zero();
2838 /* Cluster is newly incomplete... */
2840 start_incomplete
= current_time
;
2846 * Cluster has been incomplete since previous check, so figure
2847 * out how long (in minutes) and decide whether to log anything
2849 seconds
= timeval_elapsed2(&start_incomplete
, ¤t_time
);
2850 minutes
= (unsigned int)seconds
/ 60;
2851 if (minutes
>= 60) {
2852 /* Over an hour, log every hour */
2853 if (minutes
% 60 != 0) {
2856 } else if (minutes
>= 10) {
2857 /* Over 10 minutes, log every 10 minutes */
2858 if (minutes
% 10 != 0) {
2864 D_WARNING("Cluster incomplete with master=%u, elapsed=%u minutes, "
2871 tt
= tevent_add_timer(ctdb
->ev
,
2873 timeval_current_ofs(60, 0),
2874 maybe_log_cluster_state
,
2877 DBG_WARNING("Failed to set up cluster state timer\n");
2882 the main monitoring loop
2884 static void monitor_cluster(struct ctdb_context
*ctdb
)
2886 struct tevent_signal
*se
;
2887 struct ctdb_recoverd
*rec
;
2889 DEBUG(DEBUG_NOTICE
,("monitor_cluster starting\n"));
2891 rec
= talloc_zero(ctdb
, struct ctdb_recoverd
);
2892 CTDB_NO_MEMORY_FATAL(ctdb
, rec
);
2895 rec
->recmaster
= CTDB_UNKNOWN_PNN
;
2896 rec
->recovery_lock_handle
= NULL
;
2898 rec
->takeover_run
= ctdb_op_init(rec
, "takeover runs");
2899 CTDB_NO_MEMORY_FATAL(ctdb
, rec
->takeover_run
);
2901 rec
->recovery
= ctdb_op_init(rec
, "recoveries");
2902 CTDB_NO_MEMORY_FATAL(ctdb
, rec
->recovery
);
2904 rec
->priority_time
= timeval_current();
2905 rec
->frozen_on_inactive
= false;
2907 se
= tevent_add_signal(ctdb
->ev
, ctdb
, SIGTERM
, 0,
2908 recd_sig_term_handler
, rec
);
2910 DEBUG(DEBUG_ERR
, ("Failed to install SIGTERM handler\n"));
2914 if (ctdb
->recovery_lock
== NULL
) {
2915 struct tevent_timer
*tt
;
2917 tt
= tevent_add_timer(ctdb
->ev
,
2919 timeval_current_ofs(60, 0),
2920 maybe_log_cluster_state
,
2923 DBG_WARNING("Failed to set up cluster state timer\n");
2927 /* register a message port for sending memory dumps */
2928 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_MEM_DUMP
, mem_dump_handler
, rec
);
2930 /* when a node is assigned banning credits */
2931 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_BANNING
,
2932 banning_handler
, rec
);
2934 /* register a message port for recovery elections */
2935 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_ELECTION
, election_handler
, rec
);
2937 /* when nodes are disabled/enabled */
2938 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_SET_NODE_FLAGS
, monitor_handler
, rec
);
2940 /* when we are asked to puch out a flag change */
2941 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_PUSH_NODE_FLAGS
, push_flags_handler
, rec
);
2943 /* register a message port for reloadnodes */
2944 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_RELOAD_NODES
, reload_nodes_handler
, rec
);
2946 /* register a message port for performing a takeover run */
2947 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_TAKEOVER_RUN
, ip_reallocate_handler
, rec
);
2949 /* register a message port for disabling the ip check for a short while */
2950 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_DISABLE_IP_CHECK
, disable_ip_check_handler
, rec
);
2952 /* register a message port for forcing a rebalance of a node next
2954 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_REBALANCE_NODE
, recd_node_rebalance_handler
, rec
);
2956 /* Register a message port for disabling takeover runs */
2957 ctdb_client_set_message_handler(ctdb
,
2958 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
2959 disable_takeover_runs_handler
, rec
);
2961 /* Register a message port for disabling recoveries */
2962 ctdb_client_set_message_handler(ctdb
,
2963 CTDB_SRVID_DISABLE_RECOVERIES
,
2964 disable_recoveries_handler
, rec
);
2967 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
2968 struct timeval start
;
2972 DEBUG(DEBUG_CRIT
,(__location__
2973 " Failed to create temp context\n"));
2977 start
= timeval_current();
2978 main_loop(ctdb
, rec
, mem_ctx
);
2979 talloc_free(mem_ctx
);
2981 /* we only check for recovery once every second */
2982 elapsed
= timeval_elapsed(&start
);
2983 if (elapsed
< ctdb
->tunable
.recover_interval
) {
2984 ctdb_wait_timeout(ctdb
, ctdb
->tunable
.recover_interval
2991 event handler for when the main ctdbd dies
2993 static void ctdb_recoverd_parent(struct tevent_context
*ev
,
2994 struct tevent_fd
*fde
,
2995 uint16_t flags
, void *private_data
)
2997 DEBUG(DEBUG_ALERT
,("recovery daemon parent died - exiting\n"));
3002 called regularly to verify that the recovery daemon is still running
3004 static void ctdb_check_recd(struct tevent_context
*ev
,
3005 struct tevent_timer
*te
,
3006 struct timeval yt
, void *p
)
3008 struct ctdb_context
*ctdb
= talloc_get_type(p
, struct ctdb_context
);
3010 if (ctdb_kill(ctdb
, ctdb
->recoverd_pid
, 0) != 0) {
3011 DEBUG(DEBUG_ERR
,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb
->recoverd_pid
));
3013 tevent_add_timer(ctdb
->ev
, ctdb
, timeval_zero(),
3014 ctdb_restart_recd
, ctdb
);
3019 tevent_add_timer(ctdb
->ev
, ctdb
->recd_ctx
,
3020 timeval_current_ofs(30, 0),
3021 ctdb_check_recd
, ctdb
);
3024 static void recd_sig_child_handler(struct tevent_context
*ev
,
3025 struct tevent_signal
*se
, int signum
,
3026 int count
, void *dont_care
,
3029 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3034 pid
= waitpid(-1, &status
, WNOHANG
);
3036 if (errno
!= ECHILD
) {
3037 DEBUG(DEBUG_ERR
, (__location__
" waitpid() returned error. errno:%s(%d)\n", strerror(errno
),errno
));
3042 DEBUG(DEBUG_DEBUG
, ("RECD SIGCHLD from %d\n", (int)pid
));
3048 startup the recovery daemon as a child of the main ctdb daemon
3050 int ctdb_start_recoverd(struct ctdb_context
*ctdb
)
3053 struct tevent_signal
*se
;
3054 struct tevent_fd
*fde
;
3057 if (pipe(fd
) != 0) {
3061 ctdb
->recoverd_pid
= ctdb_fork(ctdb
);
3062 if (ctdb
->recoverd_pid
== -1) {
3066 if (ctdb
->recoverd_pid
!= 0) {
3067 talloc_free(ctdb
->recd_ctx
);
3068 ctdb
->recd_ctx
= talloc_new(ctdb
);
3069 CTDB_NO_MEMORY(ctdb
, ctdb
->recd_ctx
);
3072 tevent_add_timer(ctdb
->ev
, ctdb
->recd_ctx
,
3073 timeval_current_ofs(30, 0),
3074 ctdb_check_recd
, ctdb
);
3080 srandom(getpid() ^ time(NULL
));
3082 ret
= logging_init(ctdb
, NULL
, NULL
, "ctdb-recoverd");
3087 prctl_set_comment("ctdb_recoverd");
3088 if (switch_from_server_to_client(ctdb
) != 0) {
3089 DEBUG(DEBUG_CRIT
, (__location__
"ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3093 DEBUG(DEBUG_DEBUG
, (__location__
" Created PIPE FD:%d to recovery daemon\n", fd
[0]));
3095 fde
= tevent_add_fd(ctdb
->ev
, ctdb
, fd
[0], TEVENT_FD_READ
,
3096 ctdb_recoverd_parent
, &fd
[0]);
3097 tevent_fd_set_auto_close(fde
);
3099 /* set up a handler to pick up sigchld */
3100 se
= tevent_add_signal(ctdb
->ev
, ctdb
, SIGCHLD
, 0,
3101 recd_sig_child_handler
, ctdb
);
3103 DEBUG(DEBUG_CRIT
,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3107 monitor_cluster(ctdb
);
3109 DEBUG(DEBUG_ALERT
,("ERROR: ctdb_recoverd finished!?\n"));
3114 shutdown the recovery daemon
3116 void ctdb_stop_recoverd(struct ctdb_context
*ctdb
)
3118 if (ctdb
->recoverd_pid
== 0) {
3122 DEBUG(DEBUG_NOTICE
,("Shutting down recovery daemon\n"));
3123 ctdb_kill(ctdb
, ctdb
->recoverd_pid
, SIGTERM
);
3125 TALLOC_FREE(ctdb
->recd_ctx
);
3126 TALLOC_FREE(ctdb
->recd_ping_count
);
3129 static void ctdb_restart_recd(struct tevent_context
*ev
,
3130 struct tevent_timer
*te
,
3131 struct timeval t
, void *private_data
)
3133 struct ctdb_context
*ctdb
= talloc_get_type(private_data
, struct ctdb_context
);
3135 DEBUG(DEBUG_ERR
,("Restarting recovery daemon\n"));
3136 ctdb_stop_recoverd(ctdb
);
3137 ctdb_start_recoverd(ctdb
);