4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
31 #include "lib/tdb_wrap/tdb_wrap.h"
32 #include "lib/util/dlinklist.h"
33 #include "lib/util/debug.h"
34 #include "lib/util/samba_util.h"
35 #include "lib/util/sys_rw.h"
36 #include "lib/util/util_process.h"
38 #include "ctdb_private.h"
39 #include "ctdb_client.h"
41 #include "protocol/protocol_basic.h"
43 #include "common/system_socket.h"
44 #include "common/common.h"
45 #include "common/logging.h"
47 #include "server/ctdb_config.h"
49 #include "ctdb_cluster_mutex.h"
51 /* List of SRVID requests that need to be processed */
53 struct srvid_list
*next
, *prev
;
54 struct ctdb_srvid_message
*request
;
57 struct srvid_requests
{
58 struct srvid_list
*requests
;
61 static void srvid_request_reply(struct ctdb_context
*ctdb
,
62 struct ctdb_srvid_message
*request
,
65 /* Someone that sent srvid==0 does not want a reply */
66 if (request
->srvid
== 0) {
71 if (ctdb_client_send_message(ctdb
, request
->pnn
, request
->srvid
,
73 DEBUG(DEBUG_INFO
,("Sent SRVID reply to %u:%llu\n",
74 (unsigned)request
->pnn
,
75 (unsigned long long)request
->srvid
));
77 DEBUG(DEBUG_ERR
,("Failed to send SRVID reply to %u:%llu\n",
78 (unsigned)request
->pnn
,
79 (unsigned long long)request
->srvid
));
85 static void srvid_requests_reply(struct ctdb_context
*ctdb
,
86 struct srvid_requests
**requests
,
91 if (*requests
== NULL
) {
95 for (r
= (*requests
)->requests
; r
!= NULL
; r
= r
->next
) {
96 srvid_request_reply(ctdb
, r
->request
, result
);
99 /* Free the list structure... */
100 TALLOC_FREE(*requests
);
103 static void srvid_request_add(struct ctdb_context
*ctdb
,
104 struct srvid_requests
**requests
,
105 struct ctdb_srvid_message
*request
)
107 struct srvid_list
*t
;
111 if (*requests
== NULL
) {
112 *requests
= talloc_zero(ctdb
, struct srvid_requests
);
113 if (*requests
== NULL
) {
118 t
= talloc_zero(*requests
, struct srvid_list
);
120 /* If *requests was just allocated above then free it */
121 if ((*requests
)->requests
== NULL
) {
122 TALLOC_FREE(*requests
);
127 t
->request
= (struct ctdb_srvid_message
*)talloc_steal(t
, request
);
128 DLIST_ADD((*requests
)->requests
, t
);
133 /* Failed to add the request to the list. Send a fail. */
134 DEBUG(DEBUG_ERR
, (__location__
135 " Out of memory, failed to queue SRVID request\n"));
137 result
.dsize
= sizeof(ret
);
138 result
.dptr
= (uint8_t *)&ret
;
139 srvid_request_reply(ctdb
, request
, result
);
142 /* An abstraction to allow an operation (takeover runs, recoveries,
143 * ...) to be disabled for a given timeout */
144 struct ctdb_op_state
{
145 struct tevent_timer
*timer
;
150 static struct ctdb_op_state
*ctdb_op_init(TALLOC_CTX
*mem_ctx
, const char *name
)
152 struct ctdb_op_state
*state
= talloc_zero(mem_ctx
, struct ctdb_op_state
);
155 state
->in_progress
= false;
162 static bool ctdb_op_is_disabled(struct ctdb_op_state
*state
)
164 return state
->timer
!= NULL
;
167 static bool ctdb_op_begin(struct ctdb_op_state
*state
)
169 if (ctdb_op_is_disabled(state
)) {
171 ("Unable to begin - %s are disabled\n", state
->name
));
175 state
->in_progress
= true;
179 static bool ctdb_op_end(struct ctdb_op_state
*state
)
181 return state
->in_progress
= false;
184 static bool ctdb_op_is_in_progress(struct ctdb_op_state
*state
)
186 return state
->in_progress
;
189 static void ctdb_op_enable(struct ctdb_op_state
*state
)
191 TALLOC_FREE(state
->timer
);
194 static void ctdb_op_timeout_handler(struct tevent_context
*ev
,
195 struct tevent_timer
*te
,
196 struct timeval yt
, void *p
)
198 struct ctdb_op_state
*state
=
199 talloc_get_type(p
, struct ctdb_op_state
);
201 DEBUG(DEBUG_NOTICE
,("Reenabling %s after timeout\n", state
->name
));
202 ctdb_op_enable(state
);
205 static int ctdb_op_disable(struct ctdb_op_state
*state
,
206 struct tevent_context
*ev
,
210 DEBUG(DEBUG_NOTICE
,("Reenabling %s\n", state
->name
));
211 ctdb_op_enable(state
);
215 if (state
->in_progress
) {
217 ("Unable to disable %s - in progress\n", state
->name
));
221 DEBUG(DEBUG_NOTICE
,("Disabling %s for %u seconds\n",
222 state
->name
, timeout
));
224 /* Clear any old timers */
225 talloc_free(state
->timer
);
227 /* Arrange for the timeout to occur */
228 state
->timer
= tevent_add_timer(ev
, state
,
229 timeval_current_ofs(timeout
, 0),
230 ctdb_op_timeout_handler
, state
);
231 if (state
->timer
== NULL
) {
232 DEBUG(DEBUG_ERR
,(__location__
" Unable to setup timer\n"));
239 struct ctdb_banning_state
{
242 struct timeval last_reported_time
;
245 struct ctdb_cluster_lock_handle
;
248 private state of recovery daemon
250 struct ctdb_recoverd
{
251 struct ctdb_context
*ctdb
;
253 struct tevent_timer
*leader_broadcast_te
;
254 struct tevent_timer
*leader_broadcast_timeout_te
;
256 uint32_t last_culprit_node
;
257 struct ctdb_banning_state
*banning_state
;
258 struct ctdb_node_map_old
*nodemap
;
259 struct timeval priority_time
;
260 bool need_takeover_run
;
263 struct tevent_timer
*send_election_te
;
264 bool election_in_progress
;
265 struct tevent_timer
*election_timeout
;
266 struct srvid_requests
*reallocate_requests
;
267 struct ctdb_op_state
*takeover_run
;
268 struct ctdb_op_state
*recovery
;
269 struct ctdb_iface_list_old
*ifaces
;
270 uint32_t *force_rebalance_nodes
;
271 struct ctdb_node_capabilities
*caps
;
272 bool frozen_on_inactive
;
273 struct ctdb_cluster_lock_handle
*cluster_lock_handle
;
277 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
278 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
280 static void ctdb_restart_recd(struct tevent_context
*ev
,
281 struct tevent_timer
*te
, struct timeval t
,
284 static bool this_node_is_leader(struct ctdb_recoverd
*rec
)
286 return rec
->leader
== rec
->pnn
;
289 static bool this_node_can_be_leader(struct ctdb_recoverd
*rec
)
291 return (rec
->node_flags
& NODE_FLAGS_INACTIVE
) == 0 &&
292 (rec
->ctdb
->capabilities
& CTDB_CAP_RECMASTER
) != 0;
295 static bool node_flags(struct ctdb_recoverd
*rec
, uint32_t pnn
, uint32_t *flags
)
299 for (i
= 0; i
< rec
->nodemap
->num
; i
++) {
300 struct ctdb_node_and_flags
*node
= &rec
->nodemap
->nodes
[i
];
301 if (node
->pnn
== pnn
) {
303 *flags
= node
->flags
;
313 ban a node for a period of time
315 static void ctdb_ban_node(struct ctdb_recoverd
*rec
, uint32_t pnn
)
318 struct ctdb_context
*ctdb
= rec
->ctdb
;
319 uint32_t ban_time
= ctdb
->tunable
.recovery_ban_period
;
320 struct ctdb_ban_state bantime
;
322 if (!ctdb_validate_pnn(ctdb
, pnn
)) {
323 DEBUG(DEBUG_ERR
,("Bad pnn %u in ctdb_ban_node\n", pnn
));
327 DEBUG(DEBUG_NOTICE
,("Banning node %u for %u seconds\n", pnn
, ban_time
));
330 bantime
.time
= ban_time
;
332 ret
= ctdb_ctrl_set_ban(ctdb
, CONTROL_TIMEOUT(), pnn
, &bantime
);
334 DEBUG(DEBUG_ERR
,(__location__
" Failed to ban node %d\n", pnn
));
340 enum monitor_result
{ MONITOR_OK
, MONITOR_RECOVERY_NEEDED
, MONITOR_ELECTION_NEEDED
, MONITOR_FAILED
};
344 remember the trouble maker
346 static void ctdb_set_culprit_count(struct ctdb_recoverd
*rec
,
350 struct ctdb_context
*ctdb
= talloc_get_type_abort(
351 rec
->ctdb
, struct ctdb_context
);
352 struct ctdb_banning_state
*ban_state
= NULL
;
356 ok
= node_flags(rec
, culprit
, NULL
);
358 DBG_WARNING("Unknown culprit node %"PRIu32
"\n", culprit
);
362 /* If we are banned or stopped, do not set other nodes as culprits */
363 if (rec
->node_flags
& NODE_FLAGS_INACTIVE
) {
364 D_WARNING("This node is INACTIVE, cannot set culprit node %d\n",
369 if (rec
->banning_state
== NULL
) {
374 len
= talloc_array_length(rec
->banning_state
);
376 for (i
= 0 ; i
< len
; i
++) {
377 if (rec
->banning_state
[i
].pnn
== culprit
) {
378 ban_state
= &rec
->banning_state
[i
];
384 /* Not found, so extend (or allocate new) array */
385 if (ban_state
== NULL
) {
386 struct ctdb_banning_state
*t
;
390 * talloc_realloc() handles the corner case where
391 * rec->banning_state is NULL
393 t
= talloc_realloc(rec
,
395 struct ctdb_banning_state
,
398 DBG_WARNING("Memory allocation error\n");
401 rec
->banning_state
= t
;
403 /* New element is always at the end - initialise it... */
404 ban_state
= &rec
->banning_state
[len
- 1];
405 *ban_state
= (struct ctdb_banning_state
) {
409 } else if (ban_state
->count
> 0 &&
410 timeval_elapsed(&ban_state
->last_reported_time
) >
411 ctdb
->tunable
.recovery_grace_period
) {
413 * Forgive old transgressions beyond the tunable time-limit
415 ban_state
->count
= 0;
418 ban_state
->count
+= count
;
419 ban_state
->last_reported_time
= timeval_current();
420 rec
->last_culprit_node
= culprit
;
423 static void ban_counts_reset(struct ctdb_recoverd
*rec
)
425 D_NOTICE("Resetting ban count to 0 for all nodes\n");
426 TALLOC_FREE(rec
->banning_state
);
430 remember the trouble maker
432 static void ctdb_set_culprit(struct ctdb_recoverd
*rec
, uint32_t culprit
)
434 ctdb_set_culprit_count(rec
, culprit
, 1);
438 Retrieve capabilities from all connected nodes
440 static int update_capabilities(struct ctdb_recoverd
*rec
,
441 struct ctdb_node_map_old
*nodemap
)
445 struct ctdb_node_capabilities
*caps
;
446 struct ctdb_context
*ctdb
= rec
->ctdb
;
448 tmp_ctx
= talloc_new(rec
);
449 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
451 caps
= ctdb_get_capabilities(ctdb
, tmp_ctx
,
452 CONTROL_TIMEOUT(), nodemap
);
456 (__location__
" Failed to get node capabilities\n"));
457 talloc_free(tmp_ctx
);
461 capp
= ctdb_get_node_capabilities(caps
, rec
->pnn
);
465 " Capabilities don't include current node.\n"));
466 talloc_free(tmp_ctx
);
469 ctdb
->capabilities
= *capp
;
471 TALLOC_FREE(rec
->caps
);
472 rec
->caps
= talloc_steal(rec
, caps
);
474 talloc_free(tmp_ctx
);
479 change recovery mode on all nodes
481 static int set_recovery_mode(struct ctdb_context
*ctdb
,
482 struct ctdb_recoverd
*rec
,
483 struct ctdb_node_map_old
*nodemap
,
490 tmp_ctx
= talloc_new(ctdb
);
491 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
493 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
495 data
.dsize
= sizeof(uint32_t);
496 data
.dptr
= (unsigned char *)&rec_mode
;
498 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_SET_RECMODE
,
504 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode. Recovery failed.\n"));
505 talloc_free(tmp_ctx
);
509 talloc_free(tmp_ctx
);
514 * Update flags on all connected nodes
516 static int update_flags_on_all_nodes(struct ctdb_recoverd
*rec
,
520 struct ctdb_context
*ctdb
= rec
->ctdb
;
521 struct timeval timeout
= CONTROL_TIMEOUT();
523 struct ctdb_node_map_old
*nodemap
=NULL
;
524 struct ctdb_node_flag_change c
;
525 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
530 nodemap
= rec
->nodemap
;
532 for (i
= 0; i
< nodemap
->num
; i
++) {
533 if (pnn
== nodemap
->nodes
[i
].pnn
) {
537 if (i
>= nodemap
->num
) {
538 DBG_ERR("Nodemap does not contain node %d\n", pnn
);
539 talloc_free(tmp_ctx
);
544 c
.old_flags
= nodemap
->nodes
[i
].flags
;
547 data
.dsize
= sizeof(c
);
548 data
.dptr
= (unsigned char *)&c
;
550 /* send the flags update to all connected nodes */
551 nodes
= list_of_connected_nodes(ctdb
, nodemap
, tmp_ctx
, true);
553 ret
= ctdb_client_async_control(ctdb
,
554 CTDB_CONTROL_MODIFY_FLAGS
,
564 DBG_ERR("Unable to update flags on remote nodes\n");
565 talloc_free(tmp_ctx
);
569 talloc_free(tmp_ctx
);
573 static bool _cluster_lock_lock(struct ctdb_recoverd
*rec
);
574 static bool cluster_lock_held(struct ctdb_recoverd
*rec
);
576 static bool cluster_lock_enabled(struct ctdb_recoverd
*rec
)
578 return rec
->ctdb
->recovery_lock
!= NULL
;
581 static bool cluster_lock_take(struct ctdb_recoverd
*rec
)
583 struct ctdb_context
*ctdb
= rec
->ctdb
;
586 if (!cluster_lock_enabled(rec
)) {
590 if (cluster_lock_held(rec
)) {
591 D_NOTICE("Already holding cluster lock\n");
595 D_NOTICE("Attempting to take cluster lock (%s)\n", ctdb
->recovery_lock
);
596 have_lock
= _cluster_lock_lock(rec
);
601 D_NOTICE("Cluster lock taken successfully\n");
606 called when ctdb_wait_timeout should finish
608 static void ctdb_wait_handler(struct tevent_context
*ev
,
609 struct tevent_timer
*te
,
610 struct timeval yt
, void *p
)
612 uint32_t *timed_out
= (uint32_t *)p
;
617 wait for a given number of seconds
619 static void ctdb_wait_timeout(struct ctdb_context
*ctdb
, double secs
)
621 uint32_t timed_out
= 0;
622 uint32_t usecs
= (secs
- (uint32_t)secs
) * 1000000;
623 tevent_add_timer(ctdb
->ev
, ctdb
, timeval_current_ofs(secs
, usecs
),
624 ctdb_wait_handler
, &timed_out
);
626 tevent_loop_once(ctdb
->ev
);
631 * Broadcast cluster leader
634 static int leader_broadcast_send(struct ctdb_recoverd
*rec
, uint32_t pnn
)
636 struct ctdb_context
*ctdb
= rec
->ctdb
;
640 data
.dptr
= (uint8_t *)&pnn
;
641 data
.dsize
= sizeof(pnn
);
643 ret
= ctdb_client_send_message(ctdb
,
644 CTDB_BROADCAST_CONNECTED
,
650 static int leader_broadcast_loop(struct ctdb_recoverd
*rec
);
651 static void cluster_lock_release(struct ctdb_recoverd
*rec
);
653 /* This runs continuously but only sends the broadcast when leader */
654 static void leader_broadcast_loop_handler(struct tevent_context
*ev
,
655 struct tevent_timer
*te
,
656 struct timeval current_time
,
659 struct ctdb_recoverd
*rec
= talloc_get_type_abort(
660 private_data
, struct ctdb_recoverd
);
663 if (!this_node_can_be_leader(rec
)) {
664 if (this_node_is_leader(rec
)) {
665 rec
->leader
= CTDB_UNKNOWN_PNN
;
667 if (cluster_lock_enabled(rec
) && cluster_lock_held(rec
)) {
668 cluster_lock_release(rec
);
673 if (!this_node_is_leader(rec
)) {
677 if (rec
->election_in_progress
) {
681 ret
= leader_broadcast_send(rec
, rec
->leader
);
683 DBG_WARNING("Failed to send leader broadcast\n");
687 ret
= leader_broadcast_loop(rec
);
689 D_WARNING("Failed to set up leader broadcast\n");
693 static int leader_broadcast_loop(struct ctdb_recoverd
*rec
)
695 struct ctdb_context
*ctdb
= rec
->ctdb
;
697 TALLOC_FREE(rec
->leader_broadcast_te
);
698 rec
->leader_broadcast_te
=
699 tevent_add_timer(ctdb
->ev
,
701 timeval_current_ofs(1, 0),
702 leader_broadcast_loop_handler
,
704 if (rec
->leader_broadcast_te
== NULL
) {
711 static bool leader_broadcast_loop_active(struct ctdb_recoverd
*rec
)
713 return rec
->leader_broadcast_te
!= NULL
;
717 called when an election times out (ends)
719 static void ctdb_election_timeout(struct tevent_context
*ev
,
720 struct tevent_timer
*te
,
721 struct timeval t
, void *p
)
723 struct ctdb_recoverd
*rec
= talloc_get_type(p
, struct ctdb_recoverd
);
726 rec
->election_in_progress
= false;
727 rec
->election_timeout
= NULL
;
730 D_WARNING("Election period ended, leader=%u\n", rec
->leader
);
732 if (!this_node_is_leader(rec
)) {
736 ok
= cluster_lock_take(rec
);
738 D_ERR("Unable to get cluster lock, banning node\n");
739 ctdb_ban_node(rec
, rec
->pnn
);
745 wait for an election to finish. It finished election_timeout seconds after
746 the last election packet is received
748 static void ctdb_wait_election(struct ctdb_recoverd
*rec
)
750 struct ctdb_context
*ctdb
= rec
->ctdb
;
751 while (rec
->election_in_progress
) {
752 tevent_loop_once(ctdb
->ev
);
757 * Update local flags from all remote connected nodes and push out
758 * flags changes to all nodes. This is only run by the leader.
760 static int update_flags(struct ctdb_recoverd
*rec
,
761 struct ctdb_node_map_old
*nodemap
,
762 struct ctdb_node_map_old
**remote_nodemaps
)
765 struct ctdb_context
*ctdb
= rec
->ctdb
;
766 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
768 /* Check flags from remote nodes */
769 for (j
=0; j
<nodemap
->num
; j
++) {
770 struct ctdb_node_map_old
*remote_nodemap
=NULL
;
771 uint32_t local_flags
= nodemap
->nodes
[j
].flags
;
772 uint32_t remote_pnn
= nodemap
->nodes
[j
].pnn
;
773 uint32_t remote_flags
;
777 if (local_flags
& NODE_FLAGS_DISCONNECTED
) {
780 if (remote_pnn
== rec
->pnn
) {
782 * No remote nodemap for this node since this
783 * is the local nodemap. However, still need
784 * to check this against the remote nodes and
785 * push it if they are out-of-date.
787 goto compare_remotes
;
790 remote_nodemap
= remote_nodemaps
[j
];
791 remote_flags
= remote_nodemap
->nodes
[j
].flags
;
793 if (local_flags
!= remote_flags
) {
795 * Update the local copy of the flags in the
798 D_NOTICE("Remote node %u had flags 0x%x, "
799 "local had 0x%x - updating local\n",
803 nodemap
->nodes
[j
].flags
= remote_flags
;
804 local_flags
= remote_flags
;
809 for (i
= 0; i
< nodemap
->num
; i
++) {
813 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
) {
816 if (nodemap
->nodes
[i
].pnn
== rec
->pnn
) {
820 remote_nodemap
= remote_nodemaps
[i
];
821 remote_flags
= remote_nodemap
->nodes
[j
].flags
;
823 if (local_flags
!= remote_flags
) {
831 D_NOTICE("Pushing updated flags for node %u (0x%x)\n",
834 ret
= update_flags_on_all_nodes(rec
, remote_pnn
, local_flags
);
836 DBG_ERR("Unable to update flags on remote nodes\n");
837 talloc_free(mem_ctx
);
841 talloc_free(mem_ctx
);
846 /* Create a new random generation id.
847 The generation id can not be the INVALID_GENERATION id
849 static uint32_t new_generation(void)
854 generation
= random();
856 if (generation
!= INVALID_GENERATION
) {
864 static bool cluster_lock_held(struct ctdb_recoverd
*rec
)
866 return (rec
->cluster_lock_handle
!= NULL
);
869 struct ctdb_cluster_lock_handle
{
873 struct ctdb_cluster_mutex_handle
*h
;
874 struct ctdb_recoverd
*rec
;
877 static void take_cluster_lock_handler(char status
,
881 struct ctdb_cluster_lock_handle
*s
=
882 (struct ctdb_cluster_lock_handle
*) private_data
;
884 s
->locked
= (status
== '0') ;
887 * If unsuccessful then ensure the process has exited and that
888 * the file descriptor event handler has been cancelled
896 s
->latency
= latency
;
900 D_ERR("Unable to take cluster lock - contention\n");
904 D_ERR("Unable to take cluster lock - timeout\n");
908 D_ERR("Unable to take cluster lock - unknown error\n");
914 static void force_election(struct ctdb_recoverd
*rec
);
916 static void lost_cluster_lock_handler(void *private_data
)
918 struct ctdb_recoverd
*rec
= talloc_get_type_abort(
919 private_data
, struct ctdb_recoverd
);
921 D_ERR("Cluster lock helper terminated\n");
922 TALLOC_FREE(rec
->cluster_lock_handle
);
924 if (this_node_can_be_leader(rec
)) {
929 static bool _cluster_lock_lock(struct ctdb_recoverd
*rec
)
931 struct ctdb_context
*ctdb
= rec
->ctdb
;
932 struct ctdb_cluster_mutex_handle
*h
;
933 struct ctdb_cluster_lock_handle
*s
;
935 s
= talloc_zero(rec
, struct ctdb_cluster_lock_handle
);
937 DBG_ERR("Memory allocation error\n");
943 h
= ctdb_cluster_mutex(s
,
947 take_cluster_lock_handler
,
949 lost_cluster_lock_handler
,
956 rec
->cluster_lock_handle
= s
;
960 tevent_loop_once(ctdb
->ev
);
964 TALLOC_FREE(rec
->cluster_lock_handle
);
968 ctdb_ctrl_report_recd_lock_latency(ctdb
,
975 static void cluster_lock_release(struct ctdb_recoverd
*rec
)
977 if (rec
->cluster_lock_handle
== NULL
) {
981 if (! rec
->cluster_lock_handle
->done
) {
983 * Taking of cluster lock still in progress. Free
984 * the cluster mutex handle to release it but leave
985 * the cluster lock handle in place to allow taking
986 * of the lock to fail.
988 D_NOTICE("Cancelling cluster lock\n");
989 TALLOC_FREE(rec
->cluster_lock_handle
->h
);
990 rec
->cluster_lock_handle
->done
= true;
991 rec
->cluster_lock_handle
->locked
= false;
995 D_NOTICE("Releasing cluster lock\n");
996 TALLOC_FREE(rec
->cluster_lock_handle
);
999 static void ban_misbehaving_nodes(struct ctdb_recoverd
*rec
, bool *self_ban
)
1001 size_t len
= talloc_array_length(rec
->banning_state
);
1006 for (i
= 0; i
< len
; i
++) {
1007 struct ctdb_banning_state
*ban_state
= &rec
->banning_state
[i
];
1009 if (ban_state
->count
< 2 * rec
->nodemap
->num
) {
1013 D_NOTICE("Node %u reached %u banning credits\n",
1016 ctdb_ban_node(rec
, ban_state
->pnn
);
1017 ban_state
->count
= 0;
1019 /* Banning ourself? */
1020 if (ban_state
->pnn
== rec
->pnn
) {
1026 struct helper_state
{
1033 static void helper_handler(struct tevent_context
*ev
,
1034 struct tevent_fd
*fde
,
1035 uint16_t flags
, void *private_data
)
1037 struct helper_state
*state
= talloc_get_type_abort(
1038 private_data
, struct helper_state
);
1041 ret
= sys_read(state
->fd
[0], &state
->result
, sizeof(state
->result
));
1042 if (ret
!= sizeof(state
->result
)) {
1043 state
->result
= EPIPE
;
1049 static int helper_run(struct ctdb_recoverd
*rec
, TALLOC_CTX
*mem_ctx
,
1050 const char *prog
, const char *arg
, const char *type
)
1052 struct helper_state
*state
;
1053 struct tevent_fd
*fde
;
1057 state
= talloc_zero(mem_ctx
, struct helper_state
);
1058 if (state
== NULL
) {
1059 DEBUG(DEBUG_ERR
, (__location__
" memory error\n"));
1065 ret
= pipe(state
->fd
);
1068 ("Failed to create pipe for %s helper\n", type
));
1072 set_close_on_exec(state
->fd
[0]);
1075 args
= talloc_array(state
, const char *, nargs
);
1077 DEBUG(DEBUG_ERR
, (__location__
" memory error\n"));
1081 args
[0] = talloc_asprintf(args
, "%d", state
->fd
[1]);
1082 if (args
[0] == NULL
) {
1083 DEBUG(DEBUG_ERR
, (__location__
" memory error\n"));
1086 args
[1] = rec
->ctdb
->daemon
.name
;
1090 if (args
[2] == NULL
) {
1094 state
->pid
= ctdb_vfork_exec(state
, rec
->ctdb
, prog
, nargs
, args
);
1095 if (state
->pid
== -1) {
1097 ("Failed to create child for %s helper\n", type
));
1101 close(state
->fd
[1]);
1104 rec
->helper_pid
= state
->pid
;
1105 state
->done
= false;
1107 fde
= tevent_add_fd(rec
->ctdb
->ev
, state
, state
->fd
[0],
1108 TEVENT_FD_READ
, helper_handler
, state
);
1112 tevent_fd_set_auto_close(fde
);
1114 while (!state
->done
) {
1115 tevent_loop_once(rec
->ctdb
->ev
);
1117 if (!this_node_is_leader(rec
)) {
1118 D_ERR("Leader changed to %u, aborting %s\n",
1126 close(state
->fd
[0]);
1129 if (state
->result
!= 0) {
1133 rec
->helper_pid
= -1;
1134 ctdb_kill(rec
->ctdb
, state
->pid
, SIGKILL
);
1139 if (state
->fd
[0] != -1) {
1140 close(state
->fd
[0]);
1142 if (state
->fd
[1] != -1) {
1143 close(state
->fd
[1]);
1145 rec
->helper_pid
= -1;
1146 if (state
->pid
!= -1) {
1147 ctdb_kill(rec
->ctdb
, state
->pid
, SIGKILL
);
1154 static int ctdb_takeover(struct ctdb_recoverd
*rec
,
1155 uint32_t *force_rebalance_nodes
)
1157 static char prog
[PATH_MAX
+1] = "";
1162 if (!ctdb_set_helper("takeover_helper", prog
, sizeof(prog
),
1163 "CTDB_TAKEOVER_HELPER", CTDB_HELPER_BINDIR
,
1164 "ctdb_takeover_helper")) {
1165 ctdb_die(rec
->ctdb
, "Unable to set takeover helper\n");
1169 for (i
= 0; i
< talloc_array_length(force_rebalance_nodes
); i
++) {
1170 uint32_t pnn
= force_rebalance_nodes
[i
];
1172 arg
= talloc_asprintf(rec
, "%u", pnn
);
1174 arg
= talloc_asprintf_append(arg
, ",%u", pnn
);
1177 DEBUG(DEBUG_ERR
, (__location__
" memory error\n"));
1182 if (ctdb_config
.failover_disabled
) {
1183 ret
= setenv("CTDB_DISABLE_IP_FAILOVER", "1", 1);
1185 D_ERR("Failed to set CTDB_DISABLE_IP_FAILOVER variable\n");
1190 return helper_run(rec
, rec
, prog
, arg
, "takeover");
1193 static bool do_takeover_run(struct ctdb_recoverd
*rec
,
1194 struct ctdb_node_map_old
*nodemap
)
1196 uint32_t *nodes
= NULL
;
1197 struct ctdb_disable_message dtr
;
1200 uint32_t *rebalance_nodes
= rec
->force_rebalance_nodes
;
1204 DEBUG(DEBUG_NOTICE
, ("Takeover run starting\n"));
1206 if (ctdb_op_is_in_progress(rec
->takeover_run
)) {
1207 DEBUG(DEBUG_ERR
, (__location__
1208 " takeover run already in progress \n"));
1213 if (!ctdb_op_begin(rec
->takeover_run
)) {
1218 /* Disable IP checks (takeover runs, really) on other nodes
1219 * while doing this takeover run. This will stop those other
1220 * nodes from triggering takeover runs when think they should
1221 * be hosting an IP but it isn't yet on an interface. Don't
1222 * wait for replies since a failure here might cause some
1223 * noise in the logs but will not actually cause a problem.
1226 dtr
.srvid
= 0; /* No reply */
1229 data
.dptr
= (uint8_t*)&dtr
;
1230 data
.dsize
= sizeof(dtr
);
1232 nodes
= list_of_connected_nodes(rec
->ctdb
, nodemap
, rec
, false);
1234 /* Disable for 60 seconds. This can be a tunable later if
1238 for (i
= 0; i
< talloc_array_length(nodes
); i
++) {
1239 if (ctdb_client_send_message(rec
->ctdb
, nodes
[i
],
1240 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
1242 DEBUG(DEBUG_INFO
,("Failed to disable takeover runs\n"));
1246 ret
= ctdb_takeover(rec
, rec
->force_rebalance_nodes
);
1248 /* Re-enable takeover runs and IP checks on other nodes */
1250 for (i
= 0; i
< talloc_array_length(nodes
); i
++) {
1251 if (ctdb_client_send_message(rec
->ctdb
, nodes
[i
],
1252 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
1254 DEBUG(DEBUG_INFO
,("Failed to re-enable takeover runs\n"));
1259 DEBUG(DEBUG_ERR
, ("ctdb_takeover_run() failed\n"));
1265 /* Takeover run was successful so clear force rebalance targets */
1266 if (rebalance_nodes
== rec
->force_rebalance_nodes
) {
1267 TALLOC_FREE(rec
->force_rebalance_nodes
);
1269 DEBUG(DEBUG_WARNING
,
1270 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1273 rec
->need_takeover_run
= !ok
;
1275 ctdb_op_end(rec
->takeover_run
);
1277 DEBUG(DEBUG_NOTICE
, ("Takeover run %s\n", ok
? "completed successfully" : "unsuccessful"));
1281 static int db_recovery_parallel(struct ctdb_recoverd
*rec
, TALLOC_CTX
*mem_ctx
)
1283 static char prog
[PATH_MAX
+1] = "";
1286 if (!ctdb_set_helper("recovery_helper", prog
, sizeof(prog
),
1287 "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR
,
1288 "ctdb_recovery_helper")) {
1289 ctdb_die(rec
->ctdb
, "Unable to set recovery helper\n");
1292 arg
= talloc_asprintf(mem_ctx
, "%u", new_generation());
1294 DEBUG(DEBUG_ERR
, (__location__
" memory error\n"));
1298 setenv("CTDB_DBDIR_STATE", rec
->ctdb
->db_directory_state
, 1);
1300 return helper_run(rec
, mem_ctx
, prog
, arg
, "recovery");
1304 * Main recovery function, only run by leader
1306 static int do_recovery(struct ctdb_recoverd
*rec
, TALLOC_CTX
*mem_ctx
)
1308 struct ctdb_context
*ctdb
= rec
->ctdb
;
1309 struct ctdb_node_map_old
*nodemap
= rec
->nodemap
;
1314 DBG_NOTICE("Starting do_recovery\n");
1316 /* Check if the current node is still the leader. It's possible that
1317 * re-election has changed the leader.
1319 if (!this_node_is_leader(rec
)) {
1320 D_NOTICE("Leader changed to %" PRIu32
", aborting recovery\n",
1325 /* if recovery fails, force it again */
1326 rec
->need_recovery
= true;
1328 if (!ctdb_op_begin(rec
->recovery
)) {
1332 if (rec
->election_in_progress
) {
1333 /* an election is in progress */
1334 DBG_ERR("do_recovery called while election in progress - try "
1339 ban_misbehaving_nodes(rec
, &self_ban
);
1341 DBG_NOTICE("This node was banned, aborting recovery\n");
1345 if (cluster_lock_enabled(rec
) && !cluster_lock_held(rec
)) {
1346 /* Leader can change in ban_misbehaving_nodes() */
1347 if (!this_node_is_leader(rec
)) {
1348 D_NOTICE("Leader changed to %" PRIu32
1349 ", aborting recovery\n",
1351 rec
->need_recovery
= false;
1355 D_ERR("Cluster lock not held - abort recovery, ban node\n");
1356 ctdb_ban_node(rec
, rec
->pnn
);
1360 DBG_NOTICE("Recovery initiated due to problem with node %" PRIu32
"\n",
1361 rec
->last_culprit_node
);
1363 /* Retrieve capabilities from all connected nodes */
1364 ret
= update_capabilities(rec
, nodemap
);
1366 DBG_ERR("Unable to update node capabilities.\n");
1371 update all nodes to have the same flags that we have
1373 for (i
=0;i
<nodemap
->num
;i
++) {
1374 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
) {
1378 ret
= update_flags_on_all_nodes(rec
,
1379 nodemap
->nodes
[i
].pnn
,
1380 nodemap
->nodes
[i
].flags
);
1382 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_INACTIVE
) {
1383 DBG_WARNING("Unable to update flags on "
1384 "inactive node %d\n",
1387 DBG_ERR("Unable to update flags on all nodes "
1395 DBG_NOTICE("Recovery - updated flags\n");
1397 ret
= db_recovery_parallel(rec
, mem_ctx
);
1402 do_takeover_run(rec
, nodemap
);
1404 /* send a message to all clients telling them that the cluster
1405 has been reconfigured */
1406 ret
= ctdb_client_send_message(ctdb
, CTDB_BROADCAST_CONNECTED
,
1407 CTDB_SRVID_RECONFIGURE
, tdb_null
);
1409 DBG_ERR("Failed to send reconfigure message\n");
1413 DBG_NOTICE("Recovery complete\n");
1415 rec
->need_recovery
= false;
1416 ctdb_op_end(rec
->recovery
);
1419 * Completed a full recovery so forgive any past transgressions
1421 ban_counts_reset(rec
);
1423 /* We just finished a recovery successfully.
1424 We now wait for rerecovery_timeout before we allow
1425 another recovery to take place.
1427 D_NOTICE("Just finished a recovery. New recoveries will now be "
1428 "suppressed for the rerecovery timeout (%" PRIu32
1430 ctdb
->tunable
.rerecovery_timeout
);
1431 ctdb_op_disable(rec
->recovery
, ctdb
->ev
,
1432 ctdb
->tunable
.rerecovery_timeout
);
1436 ctdb_op_end(rec
->recovery
);
1442 elections are won by first checking the number of connected nodes, then
1443 the priority time, then the pnn
1445 struct election_message
{
1446 uint32_t num_connected
;
1447 struct timeval priority_time
;
1449 uint32_t node_flags
;
1453 form this nodes election data
1455 static void ctdb_election_data(struct ctdb_recoverd
*rec
, struct election_message
*em
)
1459 struct ctdb_node_map_old
*nodemap
;
1460 struct ctdb_context
*ctdb
= rec
->ctdb
;
1466 em
->priority_time
= rec
->priority_time
;
1468 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, rec
, &nodemap
);
1470 DEBUG(DEBUG_ERR
,(__location__
" unable to get node map\n"));
1474 ok
= node_flags(rec
, rec
->pnn
, &rec
->node_flags
);
1476 DBG_ERR("Unable to get node flags for this node\n");
1479 em
->node_flags
= rec
->node_flags
;
1481 for (i
=0;i
<nodemap
->num
;i
++) {
1482 if (!(nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
)) {
1483 em
->num_connected
++;
1487 if (!this_node_can_be_leader(rec
)) {
1488 /* Try to lose... */
1489 em
->num_connected
= 0;
1490 em
->priority_time
= timeval_current();
1493 talloc_free(nodemap
);
1497 see if the given election data wins
1499 static bool ctdb_election_win(struct ctdb_recoverd
*rec
, struct election_message
*em
)
1501 struct election_message myem
;
1504 ctdb_election_data(rec
, &myem
);
1506 if (!this_node_can_be_leader(rec
)) {
1510 /* Automatically win if other node is banned or stopped */
1511 if (em
->node_flags
& NODE_FLAGS_INACTIVE
) {
1515 /* then the longest running node */
1517 cmp
= timeval_compare(&em
->priority_time
, &myem
.priority_time
);
1521 cmp
= (int)myem
.pnn
- (int)em
->pnn
;
1528 send out an election request
1530 static int send_election_request(struct ctdb_recoverd
*rec
)
1532 TDB_DATA election_data
;
1533 struct election_message emsg
;
1535 struct ctdb_context
*ctdb
= rec
->ctdb
;
1537 srvid
= CTDB_SRVID_ELECTION
;
1539 ctdb_election_data(rec
, &emsg
);
1541 election_data
.dsize
= sizeof(struct election_message
);
1542 election_data
.dptr
= (unsigned char *)&emsg
;
1545 /* Assume this node will win the election, set leader accordingly */
1546 rec
->leader
= rec
->pnn
;
1548 /* send an election message to all active nodes */
1549 DEBUG(DEBUG_INFO
,(__location__
" Send election request to all active nodes\n"));
1550 return ctdb_client_send_message(ctdb
, CTDB_BROADCAST_ALL
, srvid
, election_data
);
1554 we think we are winning the election - send a broadcast election request
1556 static void election_send_request(struct tevent_context
*ev
,
1557 struct tevent_timer
*te
,
1558 struct timeval t
, void *p
)
1560 struct ctdb_recoverd
*rec
= talloc_get_type(p
, struct ctdb_recoverd
);
1563 ret
= send_election_request(rec
);
1565 DEBUG(DEBUG_ERR
,("Failed to send election request!\n"));
1568 TALLOC_FREE(rec
->send_election_te
);
1572 handler for memory dumps
1574 static void mem_dump_handler(uint64_t srvid
, TDB_DATA data
, void *private_data
)
1576 struct ctdb_recoverd
*rec
= talloc_get_type(
1577 private_data
, struct ctdb_recoverd
);
1578 struct ctdb_context
*ctdb
= rec
->ctdb
;
1579 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
1582 struct ctdb_srvid_message
*rd
;
1584 if (data
.dsize
!= sizeof(struct ctdb_srvid_message
)) {
1585 DEBUG(DEBUG_ERR
, (__location__
" Wrong size of return address.\n"));
1586 talloc_free(tmp_ctx
);
1589 rd
= (struct ctdb_srvid_message
*)data
.dptr
;
1591 dump
= talloc_zero(tmp_ctx
, TDB_DATA
);
1593 DEBUG(DEBUG_ERR
, (__location__
" Failed to allocate memory for memdump\n"));
1594 talloc_free(tmp_ctx
);
1597 ret
= ctdb_dump_memory(ctdb
, dump
);
1599 DEBUG(DEBUG_ERR
, (__location__
" ctdb_dump_memory() failed\n"));
1600 talloc_free(tmp_ctx
);
1604 DBG_ERR("recovery daemon memory dump\n");
1606 ret
= ctdb_client_send_message(ctdb
, rd
->pnn
, rd
->srvid
, *dump
);
1608 DEBUG(DEBUG_ERR
,("Failed to send rd memdump reply message\n"));
1609 talloc_free(tmp_ctx
);
1613 talloc_free(tmp_ctx
);
1617 handler for reload_nodes
1619 static void reload_nodes_handler(uint64_t srvid
, TDB_DATA data
,
1622 struct ctdb_recoverd
*rec
= talloc_get_type(
1623 private_data
, struct ctdb_recoverd
);
1625 DEBUG(DEBUG_ERR
, (__location__
" Reload nodes file from recovery daemon\n"));
1627 ctdb_load_nodes_file(rec
->ctdb
);
1631 static void recd_node_rebalance_handler(uint64_t srvid
, TDB_DATA data
,
1634 struct ctdb_recoverd
*rec
= talloc_get_type(
1635 private_data
, struct ctdb_recoverd
);
1636 struct ctdb_context
*ctdb
= rec
->ctdb
;
1641 if (!this_node_is_leader(rec
)) {
1645 if (data
.dsize
!= sizeof(uint32_t)) {
1646 DEBUG(DEBUG_ERR
,(__location__
" Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data
.dsize
, sizeof(uint32_t)));
1650 pnn
= *(uint32_t *)&data
.dptr
[0];
1652 DEBUG(DEBUG_NOTICE
,("Setting up rebalance of IPs to node %u\n", pnn
));
1654 /* Copy any existing list of nodes. There's probably some
1655 * sort of realloc variant that will do this but we need to
1656 * make sure that freeing the old array also cancels the timer
1657 * event for the timeout... not sure if realloc will do that.
1659 len
= (rec
->force_rebalance_nodes
!= NULL
) ?
1660 talloc_array_length(rec
->force_rebalance_nodes
) :
1663 /* This allows duplicates to be added but they don't cause
1664 * harm. A call to add a duplicate PNN arguably means that
1665 * the timeout should be reset, so this is the simplest
1668 t
= talloc_zero_array(rec
, uint32_t, len
+1);
1669 CTDB_NO_MEMORY_VOID(ctdb
, t
);
1671 memcpy(t
, rec
->force_rebalance_nodes
, sizeof(uint32_t) * len
);
1675 talloc_free(rec
->force_rebalance_nodes
);
1677 rec
->force_rebalance_nodes
= t
;
1682 static void srvid_disable_and_reply(struct ctdb_recoverd
*rec
,
1684 struct ctdb_op_state
*op_state
)
1686 struct ctdb_context
*ctdb
= rec
->ctdb
;
1687 struct ctdb_disable_message
*r
;
1692 /* Validate input data */
1693 if (data
.dsize
!= sizeof(struct ctdb_disable_message
)) {
1694 DEBUG(DEBUG_ERR
,(__location__
" Wrong size for data :%lu "
1695 "expecting %lu\n", (long unsigned)data
.dsize
,
1696 (long unsigned)sizeof(struct ctdb_srvid_message
)));
1699 if (data
.dptr
== NULL
) {
1700 DEBUG(DEBUG_ERR
,(__location__
" No data received\n"));
1704 r
= (struct ctdb_disable_message
*)data
.dptr
;
1705 timeout
= r
->timeout
;
1707 ret
= ctdb_op_disable(op_state
, ctdb
->ev
, timeout
);
1712 /* Returning our PNN tells the caller that we succeeded */
1715 result
.dsize
= sizeof(int32_t);
1716 result
.dptr
= (uint8_t *)&ret
;
1717 srvid_request_reply(ctdb
, (struct ctdb_srvid_message
*)r
, result
);
1720 static void disable_takeover_runs_handler(uint64_t srvid
, TDB_DATA data
,
1723 struct ctdb_recoverd
*rec
= talloc_get_type(
1724 private_data
, struct ctdb_recoverd
);
1726 srvid_disable_and_reply(rec
, data
, rec
->takeover_run
);
1729 /* Backward compatibility for this SRVID */
1730 static void disable_ip_check_handler(uint64_t srvid
, TDB_DATA data
,
1733 struct ctdb_recoverd
*rec
= talloc_get_type(
1734 private_data
, struct ctdb_recoverd
);
1737 if (data
.dsize
!= sizeof(uint32_t)) {
1738 DEBUG(DEBUG_ERR
,(__location__
" Wrong size for data :%lu "
1739 "expecting %lu\n", (long unsigned)data
.dsize
,
1740 (long unsigned)sizeof(uint32_t)));
1743 if (data
.dptr
== NULL
) {
1744 DEBUG(DEBUG_ERR
,(__location__
" No data received\n"));
1748 timeout
= *((uint32_t *)data
.dptr
);
1750 ctdb_op_disable(rec
->takeover_run
, rec
->ctdb
->ev
, timeout
);
1753 static void disable_recoveries_handler(uint64_t srvid
, TDB_DATA data
,
1756 struct ctdb_recoverd
*rec
= talloc_get_type(
1757 private_data
, struct ctdb_recoverd
);
1759 srvid_disable_and_reply(rec
, data
, rec
->recovery
);
1763 handler for ip reallocate, just add it to the list of requests and
1764 handle this later in the monitor_cluster loop so we do not recurse
1765 with other requests to takeover_run()
1767 static void ip_reallocate_handler(uint64_t srvid
, TDB_DATA data
,
1770 struct ctdb_srvid_message
*request
;
1771 struct ctdb_recoverd
*rec
= talloc_get_type(
1772 private_data
, struct ctdb_recoverd
);
1774 if (data
.dsize
!= sizeof(struct ctdb_srvid_message
)) {
1775 DEBUG(DEBUG_ERR
, (__location__
" Wrong size of return address.\n"));
1779 request
= (struct ctdb_srvid_message
*)data
.dptr
;
1781 srvid_request_add(rec
->ctdb
, &rec
->reallocate_requests
, request
);
1784 static void process_ipreallocate_requests(struct ctdb_context
*ctdb
,
1785 struct ctdb_recoverd
*rec
)
1789 struct srvid_requests
*current
;
1791 /* Only process requests that are currently pending. More
1792 * might come in while the takeover run is in progress and
1793 * they will need to be processed later since they might
1794 * be in response flag changes.
1796 current
= rec
->reallocate_requests
;
1797 rec
->reallocate_requests
= NULL
;
1799 if (do_takeover_run(rec
, rec
->nodemap
)) {
1805 result
.dsize
= sizeof(int32_t);
1806 result
.dptr
= (uint8_t *)&ret
;
1808 srvid_requests_reply(ctdb
, ¤t
, result
);
1812 * handler for assigning banning credits
1814 static void banning_handler(uint64_t srvid
, TDB_DATA data
, void *private_data
)
1816 struct ctdb_recoverd
*rec
= talloc_get_type(
1817 private_data
, struct ctdb_recoverd
);
1820 /* Ignore if we are not leader */
1821 if (!this_node_is_leader(rec
)) {
1825 if (data
.dsize
!= sizeof(uint32_t)) {
1826 DEBUG(DEBUG_ERR
, (__location__
"invalid data size %zu\n",
1831 ban_pnn
= *(uint32_t *)data
.dptr
;
1833 ctdb_set_culprit_count(rec
, ban_pnn
, rec
->nodemap
->num
);
1837 * Handler for leader elections
1839 static void election_handler(uint64_t srvid
, TDB_DATA data
, void *private_data
)
1841 struct ctdb_recoverd
*rec
= talloc_get_type(
1842 private_data
, struct ctdb_recoverd
);
1843 struct ctdb_context
*ctdb
= rec
->ctdb
;
1844 struct election_message
*em
= (struct election_message
*)data
.dptr
;
1846 /* Ignore election packets from ourself */
1847 if (rec
->pnn
== em
->pnn
) {
1851 /* we got an election packet - update the timeout for the election */
1852 talloc_free(rec
->election_timeout
);
1853 rec
->election_in_progress
= true;
1854 rec
->election_timeout
= tevent_add_timer(
1857 timeval_current_ofs(0, 500000) :
1858 timeval_current_ofs(ctdb
->tunable
.election_timeout
, 0),
1859 ctdb_election_timeout
, rec
);
1861 /* someone called an election. check their election data
1862 and if we disagree and we would rather be the elected node,
1863 send a new election message to all other nodes
1865 if (ctdb_election_win(rec
, em
)) {
1866 if (!rec
->send_election_te
) {
1867 rec
->send_election_te
= tevent_add_timer(
1869 timeval_current_ofs(0, 500000),
1870 election_send_request
, rec
);
1876 TALLOC_FREE(rec
->send_election_te
);
1878 /* Release the cluster lock file */
1879 if (cluster_lock_held(rec
)) {
1880 cluster_lock_release(rec
);
1883 /* Set leader to the winner of this round */
1884 rec
->leader
= em
->pnn
;
1889 static void cluster_lock_election(struct ctdb_recoverd
*rec
)
1893 if (!this_node_can_be_leader(rec
)) {
1894 if (cluster_lock_held(rec
)) {
1895 cluster_lock_release(rec
);
1901 * Don't need to unconditionally release the lock and then
1902 * attempt to retake it. This provides stability.
1904 if (cluster_lock_held(rec
)) {
1908 rec
->leader
= CTDB_UNKNOWN_PNN
;
1910 ok
= cluster_lock_take(rec
);
1912 rec
->leader
= rec
->pnn
;
1913 D_WARNING("Took cluster lock, leader=%"PRIu32
"\n", rec
->leader
);
1917 rec
->election_in_progress
= false;
1921 force the start of the election process
1923 static void force_election(struct ctdb_recoverd
*rec
)
1926 struct ctdb_context
*ctdb
= rec
->ctdb
;
1928 D_ERR("Start election\n");
1930 /* set all nodes to recovery mode to stop all internode traffic */
1931 ret
= set_recovery_mode(ctdb
, rec
, rec
->nodemap
, CTDB_RECOVERY_ACTIVE
);
1933 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode to active on cluster\n"));
1937 rec
->election_in_progress
= true;
1938 /* Let other nodes know that an election is underway */
1939 leader_broadcast_send(rec
, CTDB_UNKNOWN_PNN
);
1941 if (cluster_lock_enabled(rec
)) {
1942 cluster_lock_election(rec
);
1946 talloc_free(rec
->election_timeout
);
1947 rec
->election_timeout
= tevent_add_timer(
1950 timeval_current_ofs(0, 500000) :
1951 timeval_current_ofs(ctdb
->tunable
.election_timeout
, 0),
1952 ctdb_election_timeout
, rec
);
1954 ret
= send_election_request(rec
);
1956 DBG_ERR("Failed to initiate leader election\n");
1960 /* wait for a few seconds to collect all responses */
1961 ctdb_wait_election(rec
);
1965 static void srvid_not_implemented(uint64_t srvid
,
1972 case CTDB_SRVID_SET_NODE_FLAGS
:
1973 s
= "CTDB_SRVID_SET_NODE_FLAGS";
1979 D_WARNING("SRVID %s (0x%" PRIx64
") is obsolete\n", s
, srvid
);
1983 handler for when we need to push out flag changes to all other nodes
1985 static void push_flags_handler(uint64_t srvid
, TDB_DATA data
,
1988 struct ctdb_recoverd
*rec
= talloc_get_type(
1989 private_data
, struct ctdb_recoverd
);
1990 struct ctdb_context
*ctdb
= rec
->ctdb
;
1992 struct ctdb_node_flag_change
*c
= (struct ctdb_node_flag_change
*)data
.dptr
;
1993 struct ctdb_node_map_old
*nodemap
=NULL
;
1994 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
1997 /* read the node flags from the leader */
1998 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), rec
->leader
,
2001 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from node %u\n", c
->pnn
));
2002 talloc_free(tmp_ctx
);
2005 if (c
->pnn
>= nodemap
->num
) {
2006 DBG_ERR("Nodemap from leader does not contain node %d\n",
2008 talloc_free(tmp_ctx
);
2012 /* send the flags update to all connected nodes */
2013 nodes
= list_of_connected_nodes(ctdb
, nodemap
, tmp_ctx
, true);
2015 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_MODIFY_FLAGS
,
2016 nodes
, 0, CONTROL_TIMEOUT(),
2020 DEBUG(DEBUG_ERR
, (__location__
" ctdb_control to modify node flags failed\n"));
2022 talloc_free(tmp_ctx
);
2026 talloc_free(tmp_ctx
);
2029 static void leader_broadcast_timeout_handler(struct tevent_context
*ev
,
2030 struct tevent_timer
*te
,
2031 struct timeval current_time
,
2034 struct ctdb_recoverd
*rec
= talloc_get_type_abort(
2035 private_data
, struct ctdb_recoverd
);
2037 rec
->leader_broadcast_timeout_te
= NULL
;
2039 D_NOTICE("Leader broadcast timeout\n");
2041 force_election(rec
);
2044 static void leader_broadcast_timeout_cancel(struct ctdb_recoverd
*rec
)
2046 TALLOC_FREE(rec
->leader_broadcast_timeout_te
);
2049 static int leader_broadcast_timeout_start(struct ctdb_recoverd
*rec
)
2051 struct ctdb_context
*ctdb
= rec
->ctdb
;
2054 * This should not be necessary. However, there will be
2055 * interactions with election code here. It will want to
2056 * cancel and restart the timer around potentially long
2059 leader_broadcast_timeout_cancel(rec
);
2061 rec
->leader_broadcast_timeout_te
=
2065 timeval_current_ofs(ctdb_config
.leader_timeout
, 0),
2066 leader_broadcast_timeout_handler
,
2068 if (rec
->leader_broadcast_timeout_te
== NULL
) {
2069 D_ERR("Unable to start leader broadcast timeout\n");
2076 static bool leader_broadcast_timeout_active(struct ctdb_recoverd
*rec
)
2078 return rec
->leader_broadcast_timeout_te
!= NULL
;
2081 static void leader_handler(uint64_t srvid
, TDB_DATA data
, void *private_data
)
2083 struct ctdb_recoverd
*rec
= talloc_get_type_abort(
2084 private_data
, struct ctdb_recoverd
);
2089 ret
= ctdb_uint32_pull(data
.dptr
, data
.dsize
, &pnn
, &npull
);
2091 DBG_WARNING("Unable to parse leader broadcast, ret=%d\n", ret
);
2095 leader_broadcast_timeout_cancel(rec
);
2097 if (pnn
== rec
->leader
) {
2101 if (pnn
== CTDB_UNKNOWN_PNN
) {
2102 bool was_election_in_progress
= rec
->election_in_progress
;
2105 * Leader broadcast timeout was cancelled above - stop
2106 * main loop from restarting it until election is
2109 rec
->election_in_progress
= true;
2112 * This is the only notification for a cluster lock
2113 * election, so handle it here...
2115 if (cluster_lock_enabled(rec
) && !was_election_in_progress
) {
2116 cluster_lock_election(rec
);
2122 D_NOTICE("Received leader broadcast, leader=%"PRIu32
"\n", pnn
);
2126 leader_broadcast_timeout_start(rec
);
2129 struct verify_recmode_normal_data
{
2131 enum monitor_result status
;
2134 static void verify_recmode_normal_callback(struct ctdb_client_control_state
*state
)
2136 struct verify_recmode_normal_data
*rmdata
= talloc_get_type(state
->async
.private_data
, struct verify_recmode_normal_data
);
2139 /* one more node has responded with recmode data*/
2142 /* if we failed to get the recmode, then return an error and let
2143 the main loop try again.
2145 if (state
->state
!= CTDB_CONTROL_DONE
) {
2146 if (rmdata
->status
== MONITOR_OK
) {
2147 rmdata
->status
= MONITOR_FAILED
;
2152 /* if we got a response, then the recmode will be stored in the
2155 if (state
->status
!= CTDB_RECOVERY_NORMAL
) {
2156 DEBUG(DEBUG_NOTICE
, ("Node:%u was in recovery mode. Start recovery process\n", state
->c
->hdr
.destnode
));
2157 rmdata
->status
= MONITOR_RECOVERY_NEEDED
;
2164 /* verify that all nodes are in normal recovery mode */
2165 static enum monitor_result
verify_recmode(struct ctdb_context
*ctdb
, struct ctdb_node_map_old
*nodemap
)
2167 struct verify_recmode_normal_data
*rmdata
;
2168 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
2169 struct ctdb_client_control_state
*state
;
2170 enum monitor_result status
;
2173 rmdata
= talloc(mem_ctx
, struct verify_recmode_normal_data
);
2174 CTDB_NO_MEMORY_FATAL(ctdb
, rmdata
);
2176 rmdata
->status
= MONITOR_OK
;
2178 /* loop over all active nodes and send an async getrecmode call to
2180 for (j
=0; j
<nodemap
->num
; j
++) {
2181 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
2184 state
= ctdb_ctrl_getrecmode_send(ctdb
, mem_ctx
,
2186 nodemap
->nodes
[j
].pnn
);
2187 if (state
== NULL
) {
2188 /* we failed to send the control, treat this as
2189 an error and try again next iteration
2191 DEBUG(DEBUG_ERR
,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2192 talloc_free(mem_ctx
);
2193 return MONITOR_FAILED
;
2196 /* set up the callback functions */
2197 state
->async
.fn
= verify_recmode_normal_callback
;
2198 state
->async
.private_data
= rmdata
;
2200 /* one more control to wait for to complete */
2205 /* now wait for up to the maximum number of seconds allowed
2206 or until all nodes we expect a response from has replied
2208 while (rmdata
->count
> 0) {
2209 tevent_loop_once(ctdb
->ev
);
2212 status
= rmdata
->status
;
2213 talloc_free(mem_ctx
);
2218 static bool interfaces_have_changed(struct ctdb_context
*ctdb
,
2219 struct ctdb_recoverd
*rec
)
2221 struct ctdb_iface_list_old
*ifaces
= NULL
;
2222 TALLOC_CTX
*mem_ctx
;
2225 mem_ctx
= talloc_new(NULL
);
2227 /* Read the interfaces from the local node */
2228 if (ctdb_ctrl_get_ifaces(ctdb
, CONTROL_TIMEOUT(),
2229 CTDB_CURRENT_NODE
, mem_ctx
, &ifaces
) != 0) {
2230 D_ERR("Unable to get interfaces from local node %u\n", rec
->pnn
);
2231 /* We could return an error. However, this will be
2232 * rare so we'll decide that the interfaces have
2233 * actually changed, just in case.
2235 talloc_free(mem_ctx
);
2240 /* We haven't been here before so things have changed */
2241 DEBUG(DEBUG_NOTICE
, ("Initial interface fetched\n"));
2243 } else if (rec
->ifaces
->num
!= ifaces
->num
) {
2244 /* Number of interfaces has changed */
2245 DEBUG(DEBUG_NOTICE
, ("Interface count changed from %d to %d\n",
2246 rec
->ifaces
->num
, ifaces
->num
));
2249 /* See if interface names or link states have changed */
2251 for (i
= 0; i
< rec
->ifaces
->num
; i
++) {
2252 struct ctdb_iface
* iface
= &rec
->ifaces
->ifaces
[i
];
2253 if (strcmp(iface
->name
, ifaces
->ifaces
[i
].name
) != 0) {
2255 ("Interface in slot %d changed: %s => %s\n",
2256 i
, iface
->name
, ifaces
->ifaces
[i
].name
));
2260 if (iface
->link_state
!= ifaces
->ifaces
[i
].link_state
) {
2262 ("Interface %s changed state: %d => %d\n",
2263 iface
->name
, iface
->link_state
,
2264 ifaces
->ifaces
[i
].link_state
));
2271 talloc_free(rec
->ifaces
);
2272 rec
->ifaces
= talloc_steal(rec
, ifaces
);
2274 talloc_free(mem_ctx
);
2278 /* Check that the local allocation of public IP addresses is correct
2279 * and do some house-keeping */
2280 static int verify_local_ip_allocation(struct ctdb_recoverd
*rec
)
2282 TALLOC_CTX
*mem_ctx
= talloc_new(NULL
);
2283 struct ctdb_context
*ctdb
= rec
->ctdb
;
2286 bool need_takeover_run
= false;
2287 struct ctdb_public_ip_list_old
*ips
= NULL
;
2289 /* If we are not the leader then do some housekeeping */
2290 if (!this_node_is_leader(rec
)) {
2291 /* Ignore any IP reallocate requests - only leader
2294 TALLOC_FREE(rec
->reallocate_requests
);
2295 /* Clear any nodes that should be force rebalanced in
2296 * the next takeover run. If the leader has changed
2297 * then we don't want to process these some time in
2300 TALLOC_FREE(rec
->force_rebalance_nodes
);
2303 /* Return early if disabled... */
2304 if (ctdb_config
.failover_disabled
||
2305 ctdb_op_is_disabled(rec
->takeover_run
)) {
2306 talloc_free(mem_ctx
);
2310 if (interfaces_have_changed(ctdb
, rec
)) {
2311 need_takeover_run
= true;
2314 /* If there are unhosted IPs but this node can host them then
2315 * trigger an IP reallocation */
2317 /* Read *available* IPs from local node */
2318 ret
= ctdb_ctrl_get_public_ips_flags(
2319 ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, mem_ctx
,
2320 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE
, &ips
);
2322 DEBUG(DEBUG_ERR
, ("Unable to retrieve available public IPs\n"));
2323 talloc_free(mem_ctx
);
2327 for (j
=0; j
<ips
->num
; j
++) {
2328 if (ips
->ips
[j
].pnn
== CTDB_UNKNOWN_PNN
&&
2329 rec
->nodemap
->nodes
[rec
->pnn
].flags
== 0) {
2330 DEBUG(DEBUG_WARNING
,
2331 ("Unassigned IP %s can be served by this node\n",
2332 ctdb_addr_to_str(&ips
->ips
[j
].addr
)));
2333 need_takeover_run
= true;
2339 if (!ctdb
->do_checkpublicip
) {
2343 /* Validate the IP addresses that this node has on network
2344 * interfaces. If there is an inconsistency between reality
2345 * and the state expected by CTDB then try to fix it by
2346 * triggering an IP reallocation or releasing extraneous IP
2349 /* Read *known* IPs from local node */
2350 ret
= ctdb_ctrl_get_public_ips_flags(
2351 ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, mem_ctx
, 0, &ips
);
2353 DEBUG(DEBUG_ERR
, ("Unable to retrieve known public IPs\n"));
2354 talloc_free(mem_ctx
);
2358 for (j
=0; j
<ips
->num
; j
++) {
2359 if (ips
->ips
[j
].pnn
== rec
->pnn
) {
2360 if (!ctdb_sys_have_ip(&ips
->ips
[j
].addr
)) {
2362 ("Assigned IP %s not on an interface\n",
2363 ctdb_addr_to_str(&ips
->ips
[j
].addr
)));
2364 need_takeover_run
= true;
2367 if (ctdb_sys_have_ip(&ips
->ips
[j
].addr
)) {
2369 ("IP %s incorrectly on an interface\n",
2370 ctdb_addr_to_str(&ips
->ips
[j
].addr
)));
2371 need_takeover_run
= true;
2377 if (need_takeover_run
) {
2378 struct ctdb_srvid_message rd
;
2381 DEBUG(DEBUG_NOTICE
,("Trigger takeoverrun\n"));
2386 data
.dptr
= (uint8_t *)&rd
;
2387 data
.dsize
= sizeof(rd
);
2389 ret
= ctdb_client_send_message(ctdb
,
2390 CTDB_BROADCAST_CONNECTED
,
2391 CTDB_SRVID_TAKEOVER_RUN
,
2394 D_ERR("Failed to send takeover run request\n");
2397 talloc_free(mem_ctx
);
/* Callback state for the async remote-nodemap fetch. */
struct remote_nodemaps_state {
	struct ctdb_node_map_old **remote_nodemaps;	/* indexed by nodemap slot */
	struct ctdb_recoverd *rec;
};
2407 static void async_getnodemap_callback(struct ctdb_context
*ctdb
,
2411 void *callback_data
)
2413 struct remote_nodemaps_state
*state
=
2414 (struct remote_nodemaps_state
*)callback_data
;
2415 struct ctdb_node_map_old
**remote_nodemaps
= state
->remote_nodemaps
;
2416 struct ctdb_node_map_old
*nodemap
= state
->rec
->nodemap
;
2419 for (i
= 0; i
< nodemap
->num
; i
++) {
2420 if (nodemap
->nodes
[i
].pnn
== node_pnn
) {
2425 if (i
>= nodemap
->num
) {
2426 DBG_ERR("Invalid PNN %"PRIu32
"\n", node_pnn
);
2430 remote_nodemaps
[i
] = (struct ctdb_node_map_old
*)talloc_steal(
2431 remote_nodemaps
, outdata
.dptr
);
2435 static void async_getnodemap_error(struct ctdb_context
*ctdb
,
2439 void *callback_data
)
2441 struct remote_nodemaps_state
*state
=
2442 (struct remote_nodemaps_state
*)callback_data
;
2443 struct ctdb_recoverd
*rec
= state
->rec
;
2445 DBG_ERR("Failed to retrieve nodemap from node %u\n", node_pnn
);
2446 ctdb_set_culprit(rec
, node_pnn
);
2449 static int get_remote_nodemaps(struct ctdb_recoverd
*rec
,
2450 TALLOC_CTX
*mem_ctx
,
2451 struct ctdb_node_map_old
***remote_nodemaps
)
2453 struct ctdb_context
*ctdb
= rec
->ctdb
;
2454 struct ctdb_node_map_old
**t
;
2456 struct remote_nodemaps_state state
;
2459 t
= talloc_zero_array(mem_ctx
,
2460 struct ctdb_node_map_old
*,
2463 DBG_ERR("Memory allocation error\n");
2467 nodes
= list_of_connected_nodes(ctdb
, rec
->nodemap
, mem_ctx
, false);
2469 state
.remote_nodemaps
= t
;
2472 ret
= ctdb_client_async_control(ctdb
,
2473 CTDB_CONTROL_GET_NODEMAP
,
2479 async_getnodemap_callback
,
2480 async_getnodemap_error
,
2489 *remote_nodemaps
= t
;
2493 static void main_loop(struct ctdb_context
*ctdb
, struct ctdb_recoverd
*rec
,
2494 TALLOC_CTX
*mem_ctx
)
2496 struct ctdb_node_map_old
*nodemap
=NULL
;
2497 struct ctdb_node_map_old
**remote_nodemaps
=NULL
;
2498 struct ctdb_vnn_map
*vnnmap
=NULL
;
2499 struct ctdb_vnn_map
*remote_vnnmap
=NULL
;
2500 uint32_t num_lmasters
;
2501 int32_t debug_level
;
2507 /* verify that the main daemon is still running */
2508 if (ctdb_kill(ctdb
, ctdb
->ctdbd_pid
, 0) != 0) {
2509 DEBUG(DEBUG_CRIT
,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2513 /* ping the local daemon to tell it we are alive */
2514 ctdb_ctrl_recd_ping(ctdb
);
2516 if (rec
->election_in_progress
) {
2517 /* an election is in progress */
2522 * Start leader broadcasts if they are not active (1st time
2523 * through main loop? Memory allocation error?)
2525 if (!leader_broadcast_loop_active(rec
)) {
2526 ret
= leader_broadcast_loop(rec
);
2528 D_ERR("Failed to set up leader broadcast\n");
2529 ctdb_set_culprit(rec
, rec
->pnn
);
2533 * Similar for leader broadcast timeouts. These can also have
2534 * been stopped by another node receiving a leader broadcast
2535 * timeout and transmitting an "unknown leader broadcast".
2536 * Note that this should never be done during an election - at
2537 * the moment there is nothing between here and the above
2538 * election-in-progress check that can process an election
2539 * result (i.e. no event loop).
2541 if (!leader_broadcast_timeout_active(rec
)) {
2542 ret
= leader_broadcast_timeout_start(rec
);
2544 ctdb_set_culprit(rec
, rec
->pnn
);
2549 /* read the debug level from the parent and update locally */
2550 ret
= ctdb_ctrl_get_debuglevel(ctdb
, CTDB_CURRENT_NODE
, &debug_level
);
2552 DEBUG(DEBUG_ERR
, (__location__
" Failed to read debuglevel from parent\n"));
2555 debuglevel_set(debug_level
);
2557 /* get relevant tunables */
2558 ret
= ctdb_ctrl_get_all_tunables(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, &ctdb
->tunable
);
2560 DEBUG(DEBUG_ERR
,("Failed to get tunables - retrying\n"));
2565 ret
= ctdb_ctrl_get_runstate(ctdb
, CONTROL_TIMEOUT(),
2566 CTDB_CURRENT_NODE
, &ctdb
->runstate
);
2568 DEBUG(DEBUG_ERR
, ("Failed to get runstate - retrying\n"));
2573 ret
= ctdb_ctrl_getnodemap(ctdb
,
2579 DBG_ERR("Unable to get nodemap from node %"PRIu32
"\n", rec
->pnn
);
2582 talloc_free(rec
->nodemap
);
2583 rec
->nodemap
= nodemap
;
2585 /* remember our own node flags */
2586 rec
->node_flags
= nodemap
->nodes
[rec
->pnn
].flags
;
2588 ban_misbehaving_nodes(rec
, &self_ban
);
2590 DEBUG(DEBUG_NOTICE
, ("This node was banned, restart main_loop\n"));
2594 ret
= ctdb_ctrl_getrecmode(ctdb
, mem_ctx
, CONTROL_TIMEOUT(),
2595 CTDB_CURRENT_NODE
, &ctdb
->recovery_mode
);
2597 D_ERR("Failed to read recmode from local node\n");
2601 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
2602 also frozen and that the recmode is set to active.
2604 if (rec
->node_flags
& NODE_FLAGS_INACTIVE
) {
2605 /* If this node has become inactive then we want to
2606 * reduce the chances of it taking over the leader
2607 * role when it becomes active again. This
2608 * helps to stabilise the leader role so that
2609 * it stays on the most stable node.
2611 rec
->priority_time
= timeval_current();
2613 if (ctdb
->recovery_mode
== CTDB_RECOVERY_NORMAL
) {
2614 DEBUG(DEBUG_ERR
,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
2616 ret
= ctdb_ctrl_setrecmode(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, CTDB_RECOVERY_ACTIVE
);
2618 DEBUG(DEBUG_ERR
,(__location__
" Failed to activate recovery mode in STOPPED or BANNED state\n"));
2623 if (! rec
->frozen_on_inactive
) {
2624 ret
= ctdb_ctrl_freeze(ctdb
, CONTROL_TIMEOUT(),
2628 (__location__
" Failed to freeze node "
2629 "in STOPPED or BANNED state\n"));
2633 rec
->frozen_on_inactive
= true;
2636 /* If this node is stopped or banned then it is not the recovery
2637 * master, so don't do anything. This prevents stopped or banned
2638 * node from starting election and sending unnecessary controls.
2643 rec
->frozen_on_inactive
= false;
2645 /* Retrieve capabilities from all connected nodes */
2646 ret
= update_capabilities(rec
, nodemap
);
2648 DEBUG(DEBUG_ERR
, (__location__
" Unable to update node capabilities.\n"));
2652 if (ctdb
->recovery_mode
== CTDB_RECOVERY_NORMAL
) {
2653 /* Check if an IP takeover run is needed and trigger one if
2655 verify_local_ip_allocation(rec
);
2658 /* If this node is not the leader then skip recovery checks */
2659 if (!this_node_is_leader(rec
)) {
2664 /* Get the nodemaps for all connected remote nodes */
2665 ret
= get_remote_nodemaps(rec
, mem_ctx
, &remote_nodemaps
);
2667 DBG_ERR("Failed to read remote nodemaps\n");
2671 /* Ensure our local and remote flags are correct */
2672 ret
= update_flags(rec
, nodemap
, remote_nodemaps
);
2674 D_ERR("Unable to update flags\n");
2678 if (ctdb
->num_nodes
!= nodemap
->num
) {
2679 DEBUG(DEBUG_ERR
, (__location__
" ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb
->num_nodes
, nodemap
->num
));
2680 ctdb_load_nodes_file(ctdb
);
2684 /* get the vnnmap */
2685 ret
= ctdb_ctrl_getvnnmap(ctdb
,
2691 DBG_ERR("Unable to get vnnmap from node %u\n", rec
->pnn
);
2695 if (rec
->need_recovery
) {
2696 /* a previous recovery didn't finish */
2697 do_recovery(rec
, mem_ctx
);
2701 /* verify that all active nodes are in normal mode
2702 and not in recovery mode
2704 switch (verify_recmode(ctdb
, nodemap
)) {
2705 case MONITOR_RECOVERY_NEEDED
:
2706 do_recovery(rec
, mem_ctx
);
2708 case MONITOR_FAILED
:
2710 case MONITOR_ELECTION_NEEDED
:
2711 /* can not happen */
2716 if (cluster_lock_enabled(rec
)) {
2717 /* We must already hold the cluster lock */
2718 if (!cluster_lock_held(rec
)) {
2719 D_ERR("Failed cluster lock sanity check\n");
2720 ctdb_set_culprit(rec
, rec
->pnn
);
2721 do_recovery(rec
, mem_ctx
);
2727 /* If recoveries are disabled then there is no use doing any
2728 * nodemap or flags checks. Recoveries might be disabled due
2729 * to "reloadnodes", so doing these checks might cause an
2730 * unnecessary recovery. */
2731 if (ctdb_op_is_disabled(rec
->recovery
)) {
2732 goto takeover_run_checks
;
2735 /* verify that all other nodes have the same nodemap as we have
2737 for (j
=0; j
<nodemap
->num
; j
++) {
2738 if (nodemap
->nodes
[j
].pnn
== rec
->pnn
) {
2741 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
2745 /* if the nodes disagree on how many nodes there are
2746 then this is a good reason to try recovery
2748 if (remote_nodemaps
[j
]->num
!= nodemap
->num
) {
2749 DEBUG(DEBUG_ERR
, (__location__
" Remote node:%u has different node count. %u vs %u of the local node\n",
2750 nodemap
->nodes
[j
].pnn
, remote_nodemaps
[j
]->num
, nodemap
->num
));
2751 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
2752 do_recovery(rec
, mem_ctx
);
2756 /* if the nodes disagree on which nodes exist and are
2757 active, then that is also a good reason to do recovery
2759 for (i
=0;i
<nodemap
->num
;i
++) {
2760 if (remote_nodemaps
[j
]->nodes
[i
].pnn
!= nodemap
->nodes
[i
].pnn
) {
2761 DEBUG(DEBUG_ERR
, (__location__
" Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
2762 nodemap
->nodes
[j
].pnn
, i
,
2763 remote_nodemaps
[j
]->nodes
[i
].pnn
, nodemap
->nodes
[i
].pnn
));
2764 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
2765 do_recovery(rec
, mem_ctx
);
2771 /* count how many active nodes there are */
2773 for (i
=0; i
<nodemap
->num
; i
++) {
2774 if (!(nodemap
->nodes
[i
].flags
& NODE_FLAGS_INACTIVE
)) {
2775 if (ctdb_node_has_capabilities(rec
->caps
,
2776 ctdb
->nodes
[i
]->pnn
,
2777 CTDB_CAP_LMASTER
)) {
2784 /* There must be the same number of lmasters in the vnn map as
2785 * there are active nodes with the lmaster capability... or
2788 if (vnnmap
->size
!= num_lmasters
) {
2789 DEBUG(DEBUG_ERR
, (__location__
" The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
2790 vnnmap
->size
, num_lmasters
));
2791 ctdb_set_culprit(rec
, rec
->pnn
);
2792 do_recovery(rec
, mem_ctx
);
2797 * Verify that all active lmaster nodes in the nodemap also
2798 * exist in the vnnmap
2800 for (j
=0; j
<nodemap
->num
; j
++) {
2801 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
2804 if (! ctdb_node_has_capabilities(rec
->caps
,
2805 nodemap
->nodes
[j
].pnn
,
2806 CTDB_CAP_LMASTER
)) {
2809 if (nodemap
->nodes
[j
].pnn
== rec
->pnn
) {
2813 for (i
=0; i
<vnnmap
->size
; i
++) {
2814 if (vnnmap
->map
[i
] == nodemap
->nodes
[j
].pnn
) {
2818 if (i
== vnnmap
->size
) {
2819 D_ERR("Active LMASTER node %u is not in the vnnmap\n",
2820 nodemap
->nodes
[j
].pnn
);
2821 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
2822 do_recovery(rec
, mem_ctx
);
2828 /* verify that all other nodes have the same vnnmap
2829 and are from the same generation
2831 for (j
=0; j
<nodemap
->num
; j
++) {
2832 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
2835 if (nodemap
->nodes
[j
].pnn
== rec
->pnn
) {
2839 ret
= ctdb_ctrl_getvnnmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
2840 mem_ctx
, &remote_vnnmap
);
2842 DEBUG(DEBUG_ERR
, (__location__
" Unable to get vnnmap from remote node %u\n",
2843 nodemap
->nodes
[j
].pnn
));
2847 /* verify the vnnmap generation is the same */
2848 if (vnnmap
->generation
!= remote_vnnmap
->generation
) {
2849 DEBUG(DEBUG_ERR
, (__location__
" Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
2850 nodemap
->nodes
[j
].pnn
, remote_vnnmap
->generation
, vnnmap
->generation
));
2851 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
2852 do_recovery(rec
, mem_ctx
);
2856 /* verify the vnnmap size is the same */
2857 if (vnnmap
->size
!= remote_vnnmap
->size
) {
2858 DEBUG(DEBUG_ERR
, (__location__
" Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
2859 nodemap
->nodes
[j
].pnn
, remote_vnnmap
->size
, vnnmap
->size
));
2860 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
2861 do_recovery(rec
, mem_ctx
);
2865 /* verify the vnnmap is the same */
2866 for (i
=0;i
<vnnmap
->size
;i
++) {
2867 if (remote_vnnmap
->map
[i
] != vnnmap
->map
[i
]) {
2868 DEBUG(DEBUG_ERR
, (__location__
" Remote node %u has different vnnmap.\n",
2869 nodemap
->nodes
[j
].pnn
));
2870 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
2871 do_recovery(rec
, mem_ctx
);
2877 /* FIXME: Add remote public IP checking to ensure that nodes
2878 * have the IP addresses that are allocated to them. */
2880 takeover_run_checks
:
2882 /* If there are IP takeover runs requested or the previous one
2883 * failed then perform one and notify the waiters */
2884 if (!ctdb_op_is_disabled(rec
->takeover_run
) &&
2885 (rec
->reallocate_requests
|| rec
->need_takeover_run
)) {
2886 process_ipreallocate_requests(ctdb
, rec
);
/*
 * SIGTERM handler for the recovery daemon (installed via
 * tevent_add_signal() in monitor_cluster()).
 *
 * Logs the signal and releases the cluster lock held by this recoverd
 * via cluster_lock_release().  Presumably the daemon exits right after
 * the visible code — TODO confirm; the extracted text below is missing
 * several original lines (gaps at 2893-2894, 2897, 2900+), including
 * the final parameter, braces and any exit() call.
 */
2890 static void recd_sig_term_handler(struct tevent_context
*ev
,
2891 struct tevent_signal
*se
, int signum
,
2892 int count
, void *dont_care
,
/* recover the ctdb_recoverd state from the opaque private_data pointer;
 * talloc_get_type_abort() aborts on a type mismatch */
2895 struct ctdb_recoverd
*rec
= talloc_get_type_abort(
2896 private_data
, struct ctdb_recoverd
);
2898 DEBUG(DEBUG_ERR
, ("Received SIGTERM, exiting\n"));
/* drop the cluster (recovery) lock so another node can take over */
2899 cluster_lock_release(rec
);
/*
 * Periodically log elements of the cluster state (runs only on the
 * leader node; see this_node_is_leader() check below).
 *
 * This can be used to confirm a split brain has occurred: each
 * partition's leader logs how long the cluster has been "incomplete"
 * (i.e. some non-deleted remote node is disconnected).  Logging is
 * rate-limited: every 10 minutes once incomplete for >=10 minutes,
 * every hour once incomplete for >=60 minutes.
 *
 * NOTE(review): the extraction dropped many original lines (gaps in
 * the embedded numbering, e.g. 2911-2912, 2917-2925, 2930-2932, ...),
 * so early-return bodies, the static initializer contents and the
 * timer re-arm arguments are incomplete here.
 */
2904 * Periodically log elements of the cluster state
2906 * This can be used to confirm a split brain has occurred
2908 static void maybe_log_cluster_state(struct tevent_context
*ev
,
2909 struct tevent_timer
*te
,
2910 struct timeval current_time
,
/* private_data carries the struct ctdb_recoverd; abort on type mismatch */
2913 struct ctdb_recoverd
*rec
= talloc_get_type_abort(
2914 private_data
, struct ctdb_recoverd
);
2915 struct ctdb_context
*ctdb
= rec
->ctdb
;
2916 struct tevent_timer
*tt
;
/* static: remembers, across timer firings, when the cluster first
 * became incomplete; timeval_zero() means "currently complete" */
2918 static struct timeval start_incomplete
= {
2926 unsigned int minutes
;
2927 unsigned int num_connected
;
/* only the leader logs cluster state */
2929 if (!this_node_is_leader(rec
)) {
/* no nodemap yet — nothing to report (early return, body truncated) */
2933 if (rec
->nodemap
== NULL
) {
/* scan the nodemap: skip self and deleted nodes; any disconnected
 * node makes the cluster incomplete */
2939 for (i
= 0; i
< rec
->nodemap
->num
; i
++) {
2940 struct ctdb_node_and_flags
*n
= &rec
->nodemap
->nodes
[i
];
2942 if (n
->pnn
== rec
->pnn
) {
2945 if ((n
->flags
& NODE_FLAGS_DELETED
) != 0) {
2948 if ((n
->flags
& NODE_FLAGS_DISCONNECTED
) != 0) {
2949 is_complete
= false;
/* was_complete: true iff we were complete at the previous check
 * (start_incomplete still zero) */
2956 was_complete
= timeval_is_zero(&start_incomplete
);
2959 if (! was_complete
) {
/* transition incomplete -> complete: log once and reset the clock */
2960 D_WARNING("Cluster complete with leader=%u\n",
2962 start_incomplete
= timeval_zero();
2967 /* Cluster is newly incomplete... */
2969 start_incomplete
= current_time
;
2975 * Cluster has been incomplete since previous check, so figure
2976 * out how long (in minutes) and decide whether to log anything
2978 seconds
= timeval_elapsed2(&start_incomplete
/* NOTE(review): "¤t_time" below is mojibake — the original argument
 * is "&current_time" ("&curren" was decoded as the HTML entity for the
 * currency sign).  Fix the encoding; left byte-identical here. */
, ¤t_time
);
2979 minutes
= (unsigned int)seconds
/ 60;
2980 if (minutes
>= 60) {
2981 /* Over an hour, log every hour */
2982 if (minutes
% 60 != 0) {
2985 } else if (minutes
>= 10) {
2986 /* Over 10 minutes, log every 10 minutes */
2987 if (minutes
% 10 != 0) {
2993 D_WARNING("Cluster incomplete with leader=%u, elapsed=%u minutes, "
/* re-arm: fire again in 60 seconds */
3000 tt
= tevent_add_timer(ctdb
->ev
,
3002 timeval_current_ofs(60, 0),
3003 maybe_log_cluster_state
,
3006 DBG_WARNING("Failed to set up cluster state timer\n");
/*
 * SIGHUP hook: forward SIGHUP to the running recovery helper process
 * (rec->helper_pid), if one exists, so it can also act on the signal
 * (registered via logging_setup_sighup_handler() in monitor_cluster(),
 * presumably for log re-opening — TODO confirm against that API).
 */
3010 static void recd_sighup_hook(void *private_data
)
3012 struct ctdb_recoverd
*rec
= talloc_get_type_abort(
3013 private_data
, struct ctdb_recoverd
);
/* helper_pid is initialised to -1 (see monitor_cluster()); only
 * forward the signal when a helper is actually running */
3015 if (rec
->helper_pid
> 0) {
3016 kill(rec
->helper_pid
, SIGHUP
);
/*
 * The main monitoring loop of the recovery daemon.
 *
 * Sets up the recoverd state (struct ctdb_recoverd), installs SIGHUP
 * and SIGTERM handlers, registers all CTDB_SRVID_* message handlers,
 * then repeatedly calls main_loop() — throttled so that it runs at
 * most once per tunable recover_interval.  Never returns under normal
 * operation (the loop tail is truncated in this extraction; many
 * original lines are missing, visible as gaps in the embedded
 * numbering, e.g. error-handling returns at 3050-3057, 3060-3064,
 * the srvid argument lines 3091-3092, 3120-3124, and the loop
 * construct around 3127-3134).
 */
3021 the main monitoring loop
3023 static void monitor_cluster(struct ctdb_context
*ctdb
)
3025 struct tevent_signal
*se
;
3026 struct ctdb_recoverd
*rec
;
3029 DEBUG(DEBUG_NOTICE
,("monitor_cluster starting\n"));
/* allocate the recoverd state as a talloc child of ctdb;
 * CTDB_NO_MEMORY_FATAL aborts on allocation failure */
3031 rec
= talloc_zero(ctdb
, struct ctdb_recoverd
);
3032 CTDB_NO_MEMORY_FATAL(ctdb
, rec
);
/* initial state: leader unknown, no cluster lock held, no helper */
3035 rec
->leader
= CTDB_UNKNOWN_PNN
;
3036 rec
->pnn
= ctdb_get_pnn(ctdb
);
3037 rec
->cluster_lock_handle
= NULL
;
3038 rec
->helper_pid
= -1;
/* operation-state trackers used to disable/enable takeover runs and
 * recoveries via srvid messages */
3040 rec
->takeover_run
= ctdb_op_init(rec
, "takeover runs");
3041 CTDB_NO_MEMORY_FATAL(ctdb
, rec
->takeover_run
);
3043 rec
->recovery
= ctdb_op_init(rec
, "recoveries");
3044 CTDB_NO_MEMORY_FATAL(ctdb
, rec
->recovery
);
3046 rec
->priority_time
= timeval_current();
3047 rec
->frozen_on_inactive
= false;
/* install SIGHUP handling (see recd_sighup_hook); error path truncated */
3049 status
= logging_setup_sighup_handler(rec
->ctdb
->ev
,
3054 D_ERR("Failed to install SIGHUP handler\n");
/* install SIGTERM handler so we release the cluster lock on shutdown */
3058 se
= tevent_add_signal(ctdb
->ev
, ctdb
, SIGTERM
, 0,
3059 recd_sig_term_handler
, rec
);
3061 DEBUG(DEBUG_ERR
, ("Failed to install SIGTERM handler\n"));
/* without a cluster lock, periodically log cluster state so a split
 * brain can be diagnosed from the logs (see maybe_log_cluster_state) */
3065 if (!cluster_lock_enabled(rec
)) {
3066 struct tevent_timer
*tt
;
3068 tt
= tevent_add_timer(ctdb
->ev
,
3070 timeval_current_ofs(60, 0),
3071 maybe_log_cluster_state
,
3074 DBG_WARNING("Failed to set up cluster state timer\n");
3078 /* register a message port for sending memory dumps */
3079 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_MEM_DUMP
, mem_dump_handler
, rec
);
3081 /* when a node is assigned banning credits */
3082 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_BANNING
,
3083 banning_handler
, rec
);
3085 /* register a message port for recovery elections */
3086 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_ELECTION
, election_handler
, rec
);
/* CTDB_SRVID_SET_NODE_FLAGS is accepted but deliberately unimplemented */
3088 ctdb_client_set_message_handler(ctdb
,
3089 CTDB_SRVID_SET_NODE_FLAGS
,
3090 srvid_not_implemented
,
3093 /* when we are asked to push out a flag change */
3094 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_PUSH_NODE_FLAGS
, push_flags_handler
, rec
);
3096 /* register a message port for reloadnodes */
3097 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_RELOAD_NODES
, reload_nodes_handler
, rec
);
3099 /* register a message port for performing a takeover run */
3100 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_TAKEOVER_RUN
, ip_reallocate_handler
, rec
);
3102 /* register a message port for disabling the ip check for a short while */
3103 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_DISABLE_IP_CHECK
, disable_ip_check_handler
, rec
);
3105 /* register a message port for forcing a rebalance of a node next
3107 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_REBALANCE_NODE
, recd_node_rebalance_handler
, rec
);
3109 /* Register a message port for disabling takeover runs */
3110 ctdb_client_set_message_handler(ctdb
,
3111 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
3112 disable_takeover_runs_handler
, rec
);
3114 /* Register a message port for disabling recoveries */
3115 ctdb_client_set_message_handler(ctdb
,
3116 CTDB_SRVID_DISABLE_RECOVERIES
,
3117 disable_recoveries_handler
, rec
);
/* further handler registration; srvid/handler argument lines were
 * dropped by the extraction (original lines 3120-3124 missing) */
3119 ctdb_client_set_message_handler(ctdb
,
/* main loop body: fresh temp talloc context per iteration so that
 * everything allocated inside main_loop() is freed afterwards */
3125 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
3126 struct timeval start
;
3130 DEBUG(DEBUG_CRIT
,(__location__
3131 " Failed to create temp context\n"));
3135 start
= timeval_current();
3136 main_loop(ctdb
, rec
, mem_ctx
);
3137 talloc_free(mem_ctx
);
3139 /* we only check for recovery once every second */
3140 elapsed
= timeval_elapsed(&start
);
/* if main_loop() finished early, sleep out the remainder of
 * recover_interval before the next iteration */
3141 if (elapsed
< ctdb
->tunable
.recover_interval
) {
3142 ctdb_wait_timeout(ctdb
, ctdb
->tunable
.recover_interval
/*
 * tevent fd handler fired when the pipe from the parent ctdbd becomes
 * readable — which happens when the parent dies and its end of the
 * pipe is closed (see the pipe()/tevent_add_fd() setup in
 * ctdb_start_recoverd()).  Logs an alert; presumably exits afterwards
 * — TODO confirm, the function tail is truncated in this extraction.
 */
3149 event handler for when the main ctdbd dies
3151 static void ctdb_recoverd_parent(struct tevent_context
*ev
,
3152 struct tevent_fd
*fde
,
3153 uint16_t flags
, void *private_data
)
3155 DEBUG(DEBUG_ALERT
,("recovery daemon parent died - exiting\n"));
/*
 * Timer callback in the MAIN ctdbd daemon (not in recoverd): verifies
 * every 30 seconds that the recovery daemon child is still alive by
 * sending it signal 0.  If the child is gone, schedules an immediate
 * restart via ctdb_restart_recd(), then re-arms itself on
 * ctdb->recd_ctx so it is cancelled when that context is freed
 * (see ctdb_stop_recoverd()).
 */
3160 called regularly to verify that the recovery daemon is still running
3162 static void ctdb_check_recd(struct tevent_context
*ev
,
3163 struct tevent_timer
*te
,
3164 struct timeval yt
, void *p
)
3166 struct ctdb_context
*ctdb
= talloc_get_type(p
, struct ctdb_context
);
/* ctdb_kill(..., 0): existence probe, no signal actually delivered */
3168 if (ctdb_kill(ctdb
, ctdb
->recoverd_pid
, 0) != 0) {
3169 DEBUG(DEBUG_ERR
,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb
->recoverd_pid
));
/* timeval_zero(): run the restart callback as soon as possible */
3171 tevent_add_timer(ctdb
->ev
, ctdb
, timeval_zero(),
3172 ctdb_restart_recd
, ctdb
);
/* re-arm the 30 second liveness check */
3177 tevent_add_timer(ctdb
->ev
, ctdb
->recd_ctx
,
3178 timeval_current_ofs(30, 0),
3179 ctdb_check_recd
, ctdb
);
/*
 * SIGCHLD handler for the recovery daemon: reap exited child
 * processes with waitpid(-1, ..., WNOHANG) so no zombies accumulate.
 * The original evidently loops until no more children are pending
 * (loop construct truncated here — note the numbering gaps around
 * 3188-3191 and 3196-3199).  ECHILD ("no children") is expected and
 * not treated as an error.
 */
3182 static void recd_sig_child_handler(struct tevent_context
*ev
,
3183 struct tevent_signal
*se
, int signum
,
3184 int count
, void *dont_care
,
3187 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
/* non-blocking reap: returns immediately if no child has exited */
3192 pid
= waitpid(-1, &status
, WNOHANG
);
3194 if (errno
!= ECHILD
) {
3195 DEBUG(DEBUG_ERR
, (__location__
" waitpid() returned error. errno:%s(%d)\n", strerror(errno
),errno
));
3200 DEBUG(DEBUG_DEBUG
, ("RECD SIGCHLD from %d\n", (int)pid
));
/*
 * Start the recovery daemon as a child of the main ctdb daemon.
 *
 * Parent path (recoverd_pid != 0): creates ctdb->recd_ctx and arms the
 * 30-second ctdb_check_recd() liveness timer on it, then returns.
 *
 * Child path (recoverd_pid == 0): re-seeds the PRNG, re-initialises
 * logging, switches from server to client mode, watches the pipe fd
 * for parent death (ctdb_recoverd_parent), installs SIGCHLD handling,
 * and enters monitor_cluster() — which is not expected to return.
 *
 * Returns int; the actual return statements are missing from this
 * extraction (numbering gaps, e.g. 3216-3218, 3221-3223, 3233-3236),
 * as are several error-path bodies.
 */
3206 startup the recovery daemon as a child of the main ctdb daemon
3208 int ctdb_start_recoverd(struct ctdb_context
*ctdb
)
3211 struct tevent_signal
*se
;
3212 struct tevent_fd
*fde
;
/* pipe used purely for parent-death detection: the child selects on
 * fd[0]; it becomes readable (EOF) when the parent exits */
3215 if (pipe(fd
) != 0) {
3219 ctdb
->recoverd_pid
= ctdb_fork(ctdb
);
3220 if (ctdb
->recoverd_pid
== -1) {
/* ---- parent process ---- */
3224 if (ctdb
->recoverd_pid
!= 0) {
/* fresh talloc context for recoverd-related timers; freeing it
 * (ctdb_stop_recoverd) cancels the liveness check */
3225 talloc_free(ctdb
->recd_ctx
);
3226 ctdb
->recd_ctx
= talloc_new(ctdb
);
3227 CTDB_NO_MEMORY(ctdb
, ctdb
->recd_ctx
);
3230 tevent_add_timer(ctdb
->ev
, ctdb
->recd_ctx
,
3231 timeval_current_ofs(30, 0),
3232 ctdb_check_recd
, ctdb
);
/* ---- child process (the recovery daemon itself) ---- */
/* decouple the child's PRNG sequence from the parent's */
3238 srandom(getpid() ^ time(NULL
));
3240 ret
= logging_init(ctdb
, NULL
, NULL
, "ctdb-recoverd");
/* set process title for ps(1) visibility */
3245 prctl_set_comment("ctdb_recoverd");
3246 if (switch_from_server_to_client(ctdb
) != 0) {
3247 DEBUG(DEBUG_CRIT
, (__location__
"ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3251 DEBUG(DEBUG_DEBUG
, (__location__
" Created PIPE FD:%d to recovery daemon\n", fd
[0]));
/* watch the read end of the pipe; readable => parent died */
3253 fde
= tevent_add_fd(ctdb
->ev
, ctdb
, fd
[0], TEVENT_FD_READ
,
3254 ctdb_recoverd_parent
, &fd
[0]);
3255 tevent_fd_set_auto_close(fde
);
3257 /* set up a handler to pick up sigchld */
3258 se
= tevent_add_signal(ctdb
->ev
, ctdb
, SIGCHLD
, 0,
3259 recd_sig_child_handler
, ctdb
);
3261 DEBUG(DEBUG_CRIT
,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
/* not expected to return */
3265 monitor_cluster(ctdb
);
3267 DEBUG(DEBUG_ALERT
,("ERROR: ctdb_recoverd finished!?\n"));
/*
 * Shut down the recovery daemon from the main ctdbd process.
 *
 * No-op if no recoverd child was ever started (recoverd_pid == 0).
 * Otherwise sends SIGTERM to the child (its recd_sig_term_handler
 * releases the cluster lock) and frees recd_ctx — which also cancels
 * the ctdb_check_recd() liveness timer parented on it, so the daemon
 * is not immediately restarted — plus the recd_ping_count state.
 */
3272 shutdown the recovery daemon
3274 void ctdb_stop_recoverd(struct ctdb_context
*ctdb
)
3276 if (ctdb
->recoverd_pid
== 0) {
3280 DEBUG(DEBUG_NOTICE
,("Shutting down recovery daemon\n"));
3281 ctdb_kill(ctdb
, ctdb
->recoverd_pid
, SIGTERM
);
/* TALLOC_FREE also NULLs the pointers, preventing stale reuse */
3283 TALLOC_FREE(ctdb
->recd_ctx
);
3284 TALLOC_FREE(ctdb
->recd_ping_count
);
/*
 * Timer callback (scheduled by ctdb_check_recd() with timeval_zero()
 * when the recoverd child is found dead): restart the recovery daemon
 * by stopping any remains of the old one and starting a fresh child.
 * The int result of ctdb_start_recoverd() is not checked here —
 * apparently intentional, since ctdb_check_recd() will notice a
 * failed start on its next pass; TODO confirm.
 */
3287 static void ctdb_restart_recd(struct tevent_context
*ev
,
3288 struct tevent_timer
*te
,
3289 struct timeval t
, void *private_data
)
3291 struct ctdb_context
*ctdb
= talloc_get_type(private_data
, struct ctdb_context
);
3293 DEBUG(DEBUG_ERR
,("Restarting recovery daemon\n"));
3294 ctdb_stop_recoverd(ctdb
);
3295 ctdb_start_recoverd(ctdb
);