s3:idmap_ad: add support for ADS_AUTH_SASL_{STARTTLS,LDAPS}
[Samba.git] / ctdb / server / ctdb_recoverd.c
blob0bcc4d33f8dd11d17e1f44cb536d307c49e3f95a
1 /*
2 ctdb recovery daemon
4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
20 #include "replace.h"
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
26 #include <popt.h>
27 #include <talloc.h>
28 #include <tevent.h>
29 #include <tdb.h>
31 #include "lib/tdb_wrap/tdb_wrap.h"
32 #include "lib/util/dlinklist.h"
33 #include "lib/util/debug.h"
34 #include "lib/util/samba_util.h"
35 #include "lib/util/sys_rw.h"
36 #include "lib/util/util_process.h"
38 #include "ctdb_private.h"
39 #include "ctdb_client.h"
41 #include "protocol/protocol_basic.h"
43 #include "common/system_socket.h"
44 #include "common/common.h"
45 #include "common/logging.h"
47 #include "server/ctdb_config.h"
49 #include "ctdb_cluster_mutex.h"
51 /* List of SRVID requests that need to be processed */
52 struct srvid_list {
53 struct srvid_list *next, *prev;
54 struct ctdb_srvid_message *request;
57 struct srvid_requests {
58 struct srvid_list *requests;
61 static void srvid_request_reply(struct ctdb_context *ctdb,
62 struct ctdb_srvid_message *request,
63 TDB_DATA result)
65 /* Someone that sent srvid==0 does not want a reply */
66 if (request->srvid == 0) {
67 talloc_free(request);
68 return;
71 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
72 result) == 0) {
73 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
74 (unsigned)request->pnn,
75 (unsigned long long)request->srvid));
76 } else {
77 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
78 (unsigned)request->pnn,
79 (unsigned long long)request->srvid));
82 talloc_free(request);
85 static void srvid_requests_reply(struct ctdb_context *ctdb,
86 struct srvid_requests **requests,
87 TDB_DATA result)
89 struct srvid_list *r;
91 if (*requests == NULL) {
92 return;
95 for (r = (*requests)->requests; r != NULL; r = r->next) {
96 srvid_request_reply(ctdb, r->request, result);
99 /* Free the list structure... */
100 TALLOC_FREE(*requests);
103 static void srvid_request_add(struct ctdb_context *ctdb,
104 struct srvid_requests **requests,
105 struct ctdb_srvid_message *request)
107 struct srvid_list *t;
108 int32_t ret;
109 TDB_DATA result;
111 if (*requests == NULL) {
112 *requests = talloc_zero(ctdb, struct srvid_requests);
113 if (*requests == NULL) {
114 goto nomem;
118 t = talloc_zero(*requests, struct srvid_list);
119 if (t == NULL) {
120 /* If *requests was just allocated above then free it */
121 if ((*requests)->requests == NULL) {
122 TALLOC_FREE(*requests);
124 goto nomem;
127 t->request = (struct ctdb_srvid_message *)talloc_steal(t, request);
128 DLIST_ADD((*requests)->requests, t);
130 return;
132 nomem:
133 /* Failed to add the request to the list. Send a fail. */
134 DEBUG(DEBUG_ERR, (__location__
135 " Out of memory, failed to queue SRVID request\n"));
136 ret = -ENOMEM;
137 result.dsize = sizeof(ret);
138 result.dptr = (uint8_t *)&ret;
139 srvid_request_reply(ctdb, request, result);
142 /* An abstraction to allow an operation (takeover runs, recoveries,
143 * ...) to be disabled for a given timeout */
144 struct ctdb_op_state {
145 struct tevent_timer *timer;
146 bool in_progress;
147 const char *name;
150 static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
152 struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
154 if (state != NULL) {
155 state->in_progress = false;
156 state->name = name;
159 return state;
162 static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
164 return state->timer != NULL;
167 static bool ctdb_op_begin(struct ctdb_op_state *state)
169 if (ctdb_op_is_disabled(state)) {
170 DEBUG(DEBUG_NOTICE,
171 ("Unable to begin - %s are disabled\n", state->name));
172 return false;
175 state->in_progress = true;
176 return true;
179 static bool ctdb_op_end(struct ctdb_op_state *state)
181 return state->in_progress = false;
184 static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
186 return state->in_progress;
189 static void ctdb_op_enable(struct ctdb_op_state *state)
191 TALLOC_FREE(state->timer);
194 static void ctdb_op_timeout_handler(struct tevent_context *ev,
195 struct tevent_timer *te,
196 struct timeval yt, void *p)
198 struct ctdb_op_state *state =
199 talloc_get_type(p, struct ctdb_op_state);
201 DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
202 ctdb_op_enable(state);
205 static int ctdb_op_disable(struct ctdb_op_state *state,
206 struct tevent_context *ev,
207 uint32_t timeout)
209 if (timeout == 0) {
210 DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
211 ctdb_op_enable(state);
212 return 0;
215 if (state->in_progress) {
216 DEBUG(DEBUG_ERR,
217 ("Unable to disable %s - in progress\n", state->name));
218 return -EAGAIN;
221 DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
222 state->name, timeout));
224 /* Clear any old timers */
225 talloc_free(state->timer);
227 /* Arrange for the timeout to occur */
228 state->timer = tevent_add_timer(ev, state,
229 timeval_current_ofs(timeout, 0),
230 ctdb_op_timeout_handler, state);
231 if (state->timer == NULL) {
232 DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
233 return -ENOMEM;
236 return 0;
/* Per-node banning-credit accounting */
struct ctdb_banning_state {
	uint32_t pnn;
	uint32_t count;
	struct timeval last_reported_time;
};

struct ctdb_cluster_lock_handle;

/*
  private state of recovery daemon
 */
struct ctdb_recoverd {
	struct ctdb_context *ctdb;
	uint32_t leader;		/* PNN of current cluster leader */
	struct tevent_timer *leader_broadcast_te;
	struct tevent_timer *leader_broadcast_timeout_te;
	uint32_t pnn;			/* this node's PNN */
	uint32_t last_culprit_node;
	struct ctdb_banning_state *banning_state;
	struct ctdb_node_map_old *nodemap;
	struct timeval priority_time;
	bool need_takeover_run;
	bool need_recovery;
	uint32_t node_flags;
	struct tevent_timer *send_election_te;
	bool election_in_progress;
	struct tevent_timer *election_timeout;
	struct srvid_requests *reallocate_requests;
	struct ctdb_op_state *takeover_run;
	struct ctdb_op_state *recovery;
	struct ctdb_iface_list_old *ifaces;
	uint32_t *force_rebalance_nodes;
	struct ctdb_node_capabilities *caps;
	bool frozen_on_inactive;
	struct ctdb_cluster_lock_handle *cluster_lock_handle;
	pid_t helper_pid;
};

#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
#define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)

static void ctdb_restart_recd(struct tevent_context *ev,
			      struct tevent_timer *te, struct timeval t,
			      void *private_data);
284 static bool this_node_is_leader(struct ctdb_recoverd *rec)
286 return rec->leader == rec->pnn;
289 static bool this_node_can_be_leader(struct ctdb_recoverd *rec)
291 return (rec->node_flags & NODE_FLAGS_INACTIVE) == 0 &&
292 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) != 0;
295 static bool node_flags(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t *flags)
297 size_t i;
299 for (i = 0; i < rec->nodemap->num; i++) {
300 struct ctdb_node_and_flags *node = &rec->nodemap->nodes[i];
301 if (node->pnn == pnn) {
302 if (flags != NULL) {
303 *flags = node->flags;
305 return true;
309 return false;
313 ban a node for a period of time
315 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn)
317 int ret;
318 struct ctdb_context *ctdb = rec->ctdb;
319 uint32_t ban_time = ctdb->tunable.recovery_ban_period;
320 struct ctdb_ban_state bantime;
322 if (!ctdb_validate_pnn(ctdb, pnn)) {
323 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
324 return;
327 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
329 bantime.pnn = pnn;
330 bantime.time = ban_time;
332 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
333 if (ret != 0) {
334 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
335 return;
340 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
344 remember the trouble maker
346 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec,
347 uint32_t culprit,
348 uint32_t count)
350 struct ctdb_context *ctdb = talloc_get_type_abort(
351 rec->ctdb, struct ctdb_context);
352 struct ctdb_banning_state *ban_state = NULL;
353 size_t len;
354 bool ok;
356 ok = node_flags(rec, culprit, NULL);
357 if (!ok) {
358 DBG_WARNING("Unknown culprit node %"PRIu32"\n", culprit);
359 return;
362 /* If we are banned or stopped, do not set other nodes as culprits */
363 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
364 D_WARNING("This node is INACTIVE, cannot set culprit node %d\n",
365 culprit);
366 return;
369 if (rec->banning_state == NULL) {
370 len = 0;
371 } else {
372 size_t i;
374 len = talloc_array_length(rec->banning_state);
376 for (i = 0 ; i < len; i++) {
377 if (rec->banning_state[i].pnn == culprit) {
378 ban_state= &rec->banning_state[i];
379 break;
384 /* Not found, so extend (or allocate new) array */
385 if (ban_state == NULL) {
386 struct ctdb_banning_state *t;
388 len += 1;
390 * talloc_realloc() handles the corner case where
391 * rec->banning_state is NULL
393 t = talloc_realloc(rec,
394 rec->banning_state,
395 struct ctdb_banning_state,
396 len);
397 if (t == NULL) {
398 DBG_WARNING("Memory allocation error\n");
399 return;
401 rec->banning_state = t;
403 /* New element is always at the end - initialise it... */
404 ban_state = &rec->banning_state[len - 1];
405 *ban_state = (struct ctdb_banning_state) {
406 .pnn = culprit,
407 .count = 0,
409 } else if (ban_state->count > 0 &&
410 timeval_elapsed(&ban_state->last_reported_time) >
411 ctdb->tunable.recovery_grace_period) {
413 * Forgive old transgressions beyond the tunable time-limit
415 ban_state->count = 0;
418 ban_state->count += count;
419 ban_state->last_reported_time = timeval_current();
420 rec->last_culprit_node = culprit;
423 static void ban_counts_reset(struct ctdb_recoverd *rec)
425 D_NOTICE("Resetting ban count to 0 for all nodes\n");
426 TALLOC_FREE(rec->banning_state);
430 remember the trouble maker
432 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
434 ctdb_set_culprit_count(rec, culprit, 1);
438 Retrieve capabilities from all connected nodes
440 static int update_capabilities(struct ctdb_recoverd *rec,
441 struct ctdb_node_map_old *nodemap)
443 uint32_t *capp;
444 TALLOC_CTX *tmp_ctx;
445 struct ctdb_node_capabilities *caps;
446 struct ctdb_context *ctdb = rec->ctdb;
448 tmp_ctx = talloc_new(rec);
449 CTDB_NO_MEMORY(ctdb, tmp_ctx);
451 caps = ctdb_get_capabilities(ctdb, tmp_ctx,
452 CONTROL_TIMEOUT(), nodemap);
454 if (caps == NULL) {
455 DEBUG(DEBUG_ERR,
456 (__location__ " Failed to get node capabilities\n"));
457 talloc_free(tmp_ctx);
458 return -1;
461 capp = ctdb_get_node_capabilities(caps, rec->pnn);
462 if (capp == NULL) {
463 DEBUG(DEBUG_ERR,
464 (__location__
465 " Capabilities don't include current node.\n"));
466 talloc_free(tmp_ctx);
467 return -1;
469 ctdb->capabilities = *capp;
471 TALLOC_FREE(rec->caps);
472 rec->caps = talloc_steal(rec, caps);
474 talloc_free(tmp_ctx);
475 return 0;
479 change recovery mode on all nodes
481 static int set_recovery_mode(struct ctdb_context *ctdb,
482 struct ctdb_recoverd *rec,
483 struct ctdb_node_map_old *nodemap,
484 uint32_t rec_mode)
486 TDB_DATA data;
487 uint32_t *nodes;
488 TALLOC_CTX *tmp_ctx;
490 tmp_ctx = talloc_new(ctdb);
491 CTDB_NO_MEMORY(ctdb, tmp_ctx);
493 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
495 data.dsize = sizeof(uint32_t);
496 data.dptr = (unsigned char *)&rec_mode;
498 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
499 nodes, 0,
500 CONTROL_TIMEOUT(),
501 false, data,
502 NULL, NULL,
503 NULL) != 0) {
504 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
505 talloc_free(tmp_ctx);
506 return -1;
509 talloc_free(tmp_ctx);
510 return 0;
514 * Update flags on all connected nodes
516 static int update_flags_on_all_nodes(struct ctdb_recoverd *rec,
517 uint32_t pnn,
518 uint32_t flags)
520 struct ctdb_context *ctdb = rec->ctdb;
521 struct timeval timeout = CONTROL_TIMEOUT();
522 TDB_DATA data;
523 struct ctdb_node_map_old *nodemap=NULL;
524 struct ctdb_node_flag_change c;
525 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
526 uint32_t *nodes;
527 uint32_t i;
528 int ret;
530 nodemap = rec->nodemap;
532 for (i = 0; i < nodemap->num; i++) {
533 if (pnn == nodemap->nodes[i].pnn) {
534 break;
537 if (i >= nodemap->num) {
538 DBG_ERR("Nodemap does not contain node %d\n", pnn);
539 talloc_free(tmp_ctx);
540 return -1;
543 c.pnn = pnn;
544 c.old_flags = nodemap->nodes[i].flags;
545 c.new_flags = flags;
547 data.dsize = sizeof(c);
548 data.dptr = (unsigned char *)&c;
550 /* send the flags update to all connected nodes */
551 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
553 ret = ctdb_client_async_control(ctdb,
554 CTDB_CONTROL_MODIFY_FLAGS,
555 nodes,
557 timeout,
558 false,
559 data,
560 NULL,
561 NULL,
562 NULL);
563 if (ret != 0) {
564 DBG_ERR("Unable to update flags on remote nodes\n");
565 talloc_free(tmp_ctx);
566 return -1;
569 talloc_free(tmp_ctx);
570 return 0;
573 static bool _cluster_lock_lock(struct ctdb_recoverd *rec);
574 static bool cluster_lock_held(struct ctdb_recoverd *rec);
576 static bool cluster_lock_enabled(struct ctdb_recoverd *rec)
578 return rec->ctdb->recovery_lock != NULL;
581 static bool cluster_lock_take(struct ctdb_recoverd *rec)
583 struct ctdb_context *ctdb = rec->ctdb;
584 bool have_lock;
586 if (!cluster_lock_enabled(rec)) {
587 return true;
590 if (cluster_lock_held(rec)) {
591 D_NOTICE("Already holding cluster lock\n");
592 return true;
595 D_NOTICE("Attempting to take cluster lock (%s)\n", ctdb->recovery_lock);
596 have_lock = _cluster_lock_lock(rec);
597 if (!have_lock) {
598 return false;
601 D_NOTICE("Cluster lock taken successfully\n");
602 return true;
606 called when ctdb_wait_timeout should finish
608 static void ctdb_wait_handler(struct tevent_context *ev,
609 struct tevent_timer *te,
610 struct timeval yt, void *p)
612 uint32_t *timed_out = (uint32_t *)p;
613 (*timed_out) = 1;
617 wait for a given number of seconds
619 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
621 uint32_t timed_out = 0;
622 uint32_t usecs = (secs - (uint32_t)secs) * 1000000;
623 tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
624 ctdb_wait_handler, &timed_out);
625 while (!timed_out) {
626 tevent_loop_once(ctdb->ev);
631 * Broadcast cluster leader
634 static int leader_broadcast_send(struct ctdb_recoverd *rec, uint32_t pnn)
636 struct ctdb_context *ctdb = rec->ctdb;
637 TDB_DATA data;
638 int ret;
640 data.dptr = (uint8_t *)&pnn;
641 data.dsize = sizeof(pnn);
643 ret = ctdb_client_send_message(ctdb,
644 CTDB_BROADCAST_CONNECTED,
645 CTDB_SRVID_LEADER,
646 data);
647 return ret;
650 static int leader_broadcast_loop(struct ctdb_recoverd *rec);
651 static void cluster_lock_release(struct ctdb_recoverd *rec);
653 /* This runs continuously but only sends the broadcast when leader */
654 static void leader_broadcast_loop_handler(struct tevent_context *ev,
655 struct tevent_timer *te,
656 struct timeval current_time,
657 void *private_data)
659 struct ctdb_recoverd *rec = talloc_get_type_abort(
660 private_data, struct ctdb_recoverd);
661 int ret;
663 if (!this_node_can_be_leader(rec)) {
664 if (this_node_is_leader(rec)) {
665 rec->leader = CTDB_UNKNOWN_PNN;
667 if (cluster_lock_enabled(rec) && cluster_lock_held(rec)) {
668 cluster_lock_release(rec);
670 goto done;
673 if (!this_node_is_leader(rec)) {
674 goto done;
677 if (rec->election_in_progress) {
678 goto done;
681 ret = leader_broadcast_send(rec, rec->leader);
682 if (ret != 0) {
683 DBG_WARNING("Failed to send leader broadcast\n");
686 done:
687 ret = leader_broadcast_loop(rec);
688 if (ret != 0) {
689 D_WARNING("Failed to set up leader broadcast\n");
693 static int leader_broadcast_loop(struct ctdb_recoverd *rec)
695 struct ctdb_context *ctdb = rec->ctdb;
697 TALLOC_FREE(rec->leader_broadcast_te);
698 rec->leader_broadcast_te =
699 tevent_add_timer(ctdb->ev,
700 rec,
701 timeval_current_ofs(1, 0),
702 leader_broadcast_loop_handler,
703 rec);
704 if (rec->leader_broadcast_te == NULL) {
705 return ENOMEM;
708 return 0;
711 static bool leader_broadcast_loop_active(struct ctdb_recoverd *rec)
713 return rec->leader_broadcast_te != NULL;
717 called when an election times out (ends)
719 static void ctdb_election_timeout(struct tevent_context *ev,
720 struct tevent_timer *te,
721 struct timeval t, void *p)
723 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
724 bool ok;
726 rec->election_in_progress = false;
727 rec->election_timeout = NULL;
728 fast_start = false;
730 D_WARNING("Election period ended, leader=%u\n", rec->leader);
732 if (!this_node_is_leader(rec)) {
733 return;
736 ok = cluster_lock_take(rec);
737 if (!ok) {
738 D_ERR("Unable to get cluster lock, banning node\n");
739 ctdb_ban_node(rec, rec->pnn);
745 wait for an election to finish. It finished election_timeout seconds after
746 the last election packet is received
748 static void ctdb_wait_election(struct ctdb_recoverd *rec)
750 struct ctdb_context *ctdb = rec->ctdb;
751 while (rec->election_in_progress) {
752 tevent_loop_once(ctdb->ev);
757 * Update local flags from all remote connected nodes and push out
758 * flags changes to all nodes. This is only run by the leader.
760 static int update_flags(struct ctdb_recoverd *rec,
761 struct ctdb_node_map_old *nodemap,
762 struct ctdb_node_map_old **remote_nodemaps)
764 unsigned int j;
765 struct ctdb_context *ctdb = rec->ctdb;
766 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
768 /* Check flags from remote nodes */
769 for (j=0; j<nodemap->num; j++) {
770 struct ctdb_node_map_old *remote_nodemap=NULL;
771 uint32_t local_flags = nodemap->nodes[j].flags;
772 uint32_t remote_pnn = nodemap->nodes[j].pnn;
773 uint32_t remote_flags;
774 unsigned int i;
775 int ret;
777 if (local_flags & NODE_FLAGS_DISCONNECTED) {
778 continue;
780 if (remote_pnn == rec->pnn) {
782 * No remote nodemap for this node since this
783 * is the local nodemap. However, still need
784 * to check this against the remote nodes and
785 * push it if they are out-of-date.
787 goto compare_remotes;
790 remote_nodemap = remote_nodemaps[j];
791 remote_flags = remote_nodemap->nodes[j].flags;
793 if (local_flags != remote_flags) {
795 * Update the local copy of the flags in the
796 * recovery daemon.
798 D_NOTICE("Remote node %u had flags 0x%x, "
799 "local had 0x%x - updating local\n",
800 remote_pnn,
801 remote_flags,
802 local_flags);
803 nodemap->nodes[j].flags = remote_flags;
804 local_flags = remote_flags;
805 goto push;
808 compare_remotes:
809 for (i = 0; i < nodemap->num; i++) {
810 if (i == j) {
811 continue;
813 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
814 continue;
816 if (nodemap->nodes[i].pnn == rec->pnn) {
817 continue;
820 remote_nodemap = remote_nodemaps[i];
821 remote_flags = remote_nodemap->nodes[j].flags;
823 if (local_flags != remote_flags) {
824 goto push;
828 continue;
830 push:
831 D_NOTICE("Pushing updated flags for node %u (0x%x)\n",
832 remote_pnn,
833 local_flags);
834 ret = update_flags_on_all_nodes(rec, remote_pnn, local_flags);
835 if (ret != 0) {
836 DBG_ERR("Unable to update flags on remote nodes\n");
837 talloc_free(mem_ctx);
838 return -1;
841 talloc_free(mem_ctx);
842 return 0;
846 /* Create a new random generation id.
847 The generation id can not be the INVALID_GENERATION id
849 static uint32_t new_generation(void)
851 uint32_t generation;
853 while (1) {
854 generation = random();
856 if (generation != INVALID_GENERATION) {
857 break;
861 return generation;
864 static bool cluster_lock_held(struct ctdb_recoverd *rec)
866 return (rec->cluster_lock_handle != NULL);
869 struct ctdb_cluster_lock_handle {
870 bool done;
871 bool locked;
872 double latency;
873 struct ctdb_cluster_mutex_handle *h;
874 struct ctdb_recoverd *rec;
877 static void take_cluster_lock_handler(char status,
878 double latency,
879 void *private_data)
881 struct ctdb_cluster_lock_handle *s =
882 (struct ctdb_cluster_lock_handle *) private_data;
884 s->locked = (status == '0') ;
887 * If unsuccessful then ensure the process has exited and that
888 * the file descriptor event handler has been cancelled
890 if (! s->locked) {
891 TALLOC_FREE(s->h);
894 switch (status) {
895 case '0':
896 s->latency = latency;
897 break;
899 case '1':
900 D_ERR("Unable to take cluster lock - contention\n");
901 break;
903 case '2':
904 D_ERR("Unable to take cluster lock - timeout\n");
905 break;
907 default:
908 D_ERR("Unable to take cluster lock - unknown error\n");
911 s->done = true;
914 static void force_election(struct ctdb_recoverd *rec);
916 static void lost_cluster_lock_handler(void *private_data)
918 struct ctdb_recoverd *rec = talloc_get_type_abort(
919 private_data, struct ctdb_recoverd);
921 D_ERR("Cluster lock helper terminated\n");
922 TALLOC_FREE(rec->cluster_lock_handle);
924 if (this_node_can_be_leader(rec)) {
925 force_election(rec);
929 static bool _cluster_lock_lock(struct ctdb_recoverd *rec)
931 struct ctdb_context *ctdb = rec->ctdb;
932 struct ctdb_cluster_mutex_handle *h;
933 struct ctdb_cluster_lock_handle *s;
935 s = talloc_zero(rec, struct ctdb_cluster_lock_handle);
936 if (s == NULL) {
937 DBG_ERR("Memory allocation error\n");
938 return false;
941 s->rec = rec;
943 h = ctdb_cluster_mutex(s,
944 ctdb,
945 ctdb->recovery_lock,
946 120,
947 take_cluster_lock_handler,
949 lost_cluster_lock_handler,
950 rec);
951 if (h == NULL) {
952 talloc_free(s);
953 return false;
956 rec->cluster_lock_handle = s;
957 s->h = h;
959 while (! s->done) {
960 tevent_loop_once(ctdb->ev);
963 if (! s->locked) {
964 TALLOC_FREE(rec->cluster_lock_handle);
965 return false;
968 ctdb_ctrl_report_recd_lock_latency(ctdb,
969 CONTROL_TIMEOUT(),
970 s->latency);
972 return true;
975 static void cluster_lock_release(struct ctdb_recoverd *rec)
977 if (rec->cluster_lock_handle == NULL) {
978 return;
981 if (! rec->cluster_lock_handle->done) {
983 * Taking of cluster lock still in progress. Free
984 * the cluster mutex handle to release it but leave
985 * the cluster lock handle in place to allow taking
986 * of the lock to fail.
988 D_NOTICE("Cancelling cluster lock\n");
989 TALLOC_FREE(rec->cluster_lock_handle->h);
990 rec->cluster_lock_handle->done = true;
991 rec->cluster_lock_handle->locked = false;
992 return;
995 D_NOTICE("Releasing cluster lock\n");
996 TALLOC_FREE(rec->cluster_lock_handle);
999 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
1001 size_t len = talloc_array_length(rec->banning_state);
1002 size_t i;
1005 *self_ban = false;
1006 for (i = 0; i < len; i++) {
1007 struct ctdb_banning_state *ban_state = &rec->banning_state[i];
1009 if (ban_state->count < 2 * rec->nodemap->num) {
1010 continue;
1013 D_NOTICE("Node %u reached %u banning credits\n",
1014 ban_state->pnn,
1015 ban_state->count);
1016 ctdb_ban_node(rec, ban_state->pnn);
1017 ban_state->count = 0;
1019 /* Banning ourself? */
1020 if (ban_state->pnn == rec->pnn) {
1021 *self_ban = true;
1026 struct helper_state {
1027 int fd[2];
1028 pid_t pid;
1029 int result;
1030 bool done;
1033 static void helper_handler(struct tevent_context *ev,
1034 struct tevent_fd *fde,
1035 uint16_t flags, void *private_data)
1037 struct helper_state *state = talloc_get_type_abort(
1038 private_data, struct helper_state);
1039 int ret;
1041 ret = sys_read(state->fd[0], &state->result, sizeof(state->result));
1042 if (ret != sizeof(state->result)) {
1043 state->result = EPIPE;
1046 state->done = true;
1049 static int helper_run(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
1050 const char *prog, const char *arg, const char *type)
1052 struct helper_state *state;
1053 struct tevent_fd *fde;
1054 const char **args;
1055 int nargs, ret;
1057 state = talloc_zero(mem_ctx, struct helper_state);
1058 if (state == NULL) {
1059 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1060 return -1;
1063 state->pid = -1;
1065 ret = pipe(state->fd);
1066 if (ret != 0) {
1067 DEBUG(DEBUG_ERR,
1068 ("Failed to create pipe for %s helper\n", type));
1069 goto fail;
1072 set_close_on_exec(state->fd[0]);
1074 nargs = 4;
1075 args = talloc_array(state, const char *, nargs);
1076 if (args == NULL) {
1077 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1078 goto fail;
1081 args[0] = talloc_asprintf(args, "%d", state->fd[1]);
1082 if (args[0] == NULL) {
1083 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1084 goto fail;
1086 args[1] = rec->ctdb->daemon.name;
1087 args[2] = arg;
1088 args[3] = NULL;
1090 if (args[2] == NULL) {
1091 nargs = 3;
1094 state->pid = ctdb_vfork_exec(state, rec->ctdb, prog, nargs, args);
1095 if (state->pid == -1) {
1096 DEBUG(DEBUG_ERR,
1097 ("Failed to create child for %s helper\n", type));
1098 goto fail;
1101 close(state->fd[1]);
1102 state->fd[1] = -1;
1104 rec->helper_pid = state->pid;
1105 state->done = false;
1107 fde = tevent_add_fd(rec->ctdb->ev, state, state->fd[0],
1108 TEVENT_FD_READ, helper_handler, state);
1109 if (fde == NULL) {
1110 goto fail;
1112 tevent_fd_set_auto_close(fde);
1114 while (!state->done) {
1115 tevent_loop_once(rec->ctdb->ev);
1117 if (!this_node_is_leader(rec)) {
1118 D_ERR("Leader changed to %u, aborting %s\n",
1119 rec->leader,
1120 type);
1121 state->result = 1;
1122 break;
1126 close(state->fd[0]);
1127 state->fd[0] = -1;
1129 if (state->result != 0) {
1130 goto fail;
1133 rec->helper_pid = -1;
1134 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
1135 talloc_free(state);
1136 return 0;
1138 fail:
1139 if (state->fd[0] != -1) {
1140 close(state->fd[0]);
1142 if (state->fd[1] != -1) {
1143 close(state->fd[1]);
1145 rec->helper_pid = -1;
1146 if (state->pid != -1) {
1147 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
1149 talloc_free(state);
1150 return -1;
1154 static int ctdb_takeover(struct ctdb_recoverd *rec,
1155 uint32_t *force_rebalance_nodes)
1157 static char prog[PATH_MAX+1] = "";
1158 char *arg;
1159 unsigned int i;
1160 int ret;
1162 if (!ctdb_set_helper("takeover_helper", prog, sizeof(prog),
1163 "CTDB_TAKEOVER_HELPER", CTDB_HELPER_BINDIR,
1164 "ctdb_takeover_helper")) {
1165 ctdb_die(rec->ctdb, "Unable to set takeover helper\n");
1168 arg = NULL;
1169 for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1170 uint32_t pnn = force_rebalance_nodes[i];
1171 if (arg == NULL) {
1172 arg = talloc_asprintf(rec, "%u", pnn);
1173 } else {
1174 arg = talloc_asprintf_append(arg, ",%u", pnn);
1176 if (arg == NULL) {
1177 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1178 return -1;
1182 if (ctdb_config.failover_disabled) {
1183 ret = setenv("CTDB_DISABLE_IP_FAILOVER", "1", 1);
1184 if (ret != 0) {
1185 D_ERR("Failed to set CTDB_DISABLE_IP_FAILOVER variable\n");
1186 return -1;
1190 return helper_run(rec, rec, prog, arg, "takeover");
1193 static bool do_takeover_run(struct ctdb_recoverd *rec,
1194 struct ctdb_node_map_old *nodemap)
1196 uint32_t *nodes = NULL;
1197 struct ctdb_disable_message dtr;
1198 TDB_DATA data;
1199 size_t i;
1200 uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
1201 int ret;
1202 bool ok;
1204 DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
1206 if (ctdb_op_is_in_progress(rec->takeover_run)) {
1207 DEBUG(DEBUG_ERR, (__location__
1208 " takeover run already in progress \n"));
1209 ok = false;
1210 goto done;
1213 if (!ctdb_op_begin(rec->takeover_run)) {
1214 ok = false;
1215 goto done;
1218 /* Disable IP checks (takeover runs, really) on other nodes
1219 * while doing this takeover run. This will stop those other
1220 * nodes from triggering takeover runs when think they should
1221 * be hosting an IP but it isn't yet on an interface. Don't
1222 * wait for replies since a failure here might cause some
1223 * noise in the logs but will not actually cause a problem.
1225 ZERO_STRUCT(dtr);
1226 dtr.srvid = 0; /* No reply */
1227 dtr.pnn = -1;
1229 data.dptr = (uint8_t*)&dtr;
1230 data.dsize = sizeof(dtr);
1232 nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
1234 /* Disable for 60 seconds. This can be a tunable later if
1235 * necessary.
1237 dtr.timeout = 60;
1238 for (i = 0; i < talloc_array_length(nodes); i++) {
1239 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1240 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1241 data) != 0) {
1242 DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
1246 ret = ctdb_takeover(rec, rec->force_rebalance_nodes);
1248 /* Re-enable takeover runs and IP checks on other nodes */
1249 dtr.timeout = 0;
1250 for (i = 0; i < talloc_array_length(nodes); i++) {
1251 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1252 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1253 data) != 0) {
1254 DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n"));
1258 if (ret != 0) {
1259 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
1260 ok = false;
1261 goto done;
1264 ok = true;
1265 /* Takeover run was successful so clear force rebalance targets */
1266 if (rebalance_nodes == rec->force_rebalance_nodes) {
1267 TALLOC_FREE(rec->force_rebalance_nodes);
1268 } else {
1269 DEBUG(DEBUG_WARNING,
1270 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1272 done:
1273 rec->need_takeover_run = !ok;
1274 talloc_free(nodes);
1275 ctdb_op_end(rec->takeover_run);
1277 DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
1278 return ok;
1281 static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
1283 static char prog[PATH_MAX+1] = "";
1284 const char *arg;
1286 if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog),
1287 "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
1288 "ctdb_recovery_helper")) {
1289 ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
1292 arg = talloc_asprintf(mem_ctx, "%u", new_generation());
1293 if (arg == NULL) {
1294 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1295 return -1;
1298 setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1);
1300 return helper_run(rec, mem_ctx, prog, arg, "recovery");
1304 * Main recovery function, only run by leader
1306 static int do_recovery(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
1308 struct ctdb_context *ctdb = rec->ctdb;
1309 struct ctdb_node_map_old *nodemap = rec->nodemap;
1310 unsigned int i;
1311 int ret;
1312 bool self_ban;
1314 DBG_NOTICE("Starting do_recovery\n");
1316 /* Check if the current node is still the leader. It's possible that
1317 * re-election has changed the leader.
1319 if (!this_node_is_leader(rec)) {
1320 D_NOTICE("Leader changed to %" PRIu32 ", aborting recovery\n",
1321 rec->leader);
1322 return -1;
1325 /* if recovery fails, force it again */
1326 rec->need_recovery = true;
1328 if (!ctdb_op_begin(rec->recovery)) {
1329 return -1;
1332 if (rec->election_in_progress) {
1333 /* an election is in progress */
1334 DBG_ERR("do_recovery called while election in progress - try "
1335 "again later\n");
1336 goto fail;
1339 ban_misbehaving_nodes(rec, &self_ban);
1340 if (self_ban) {
1341 DBG_NOTICE("This node was banned, aborting recovery\n");
1342 goto fail;
1345 if (cluster_lock_enabled(rec) && !cluster_lock_held(rec)) {
1346 /* Leader can change in ban_misbehaving_nodes() */
1347 if (!this_node_is_leader(rec)) {
1348 D_NOTICE("Leader changed to %" PRIu32
1349 ", aborting recovery\n",
1350 rec->leader);
1351 rec->need_recovery = false;
1352 goto fail;
1355 D_ERR("Cluster lock not held - abort recovery, ban node\n");
1356 ctdb_ban_node(rec, rec->pnn);
1357 goto fail;
1360 DBG_NOTICE("Recovery initiated due to problem with node %" PRIu32 "\n",
1361 rec->last_culprit_node);
1363 /* Retrieve capabilities from all connected nodes */
1364 ret = update_capabilities(rec, nodemap);
1365 if (ret!=0) {
1366 DBG_ERR("Unable to update node capabilities.\n");
1367 return -1;
1371 update all nodes to have the same flags that we have
1373 for (i=0;i<nodemap->num;i++) {
1374 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1375 continue;
1378 ret = update_flags_on_all_nodes(rec,
1379 nodemap->nodes[i].pnn,
1380 nodemap->nodes[i].flags);
1381 if (ret != 0) {
1382 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1383 DBG_WARNING("Unable to update flags on "
1384 "inactive node %d\n",
1386 } else {
1387 DBG_ERR("Unable to update flags on all nodes "
1388 "for node %d\n",
1390 return -1;
1395 DBG_NOTICE("Recovery - updated flags\n");
1397 ret = db_recovery_parallel(rec, mem_ctx);
1398 if (ret != 0) {
1399 goto fail;
1402 do_takeover_run(rec, nodemap);
1404 /* send a message to all clients telling them that the cluster
1405 has been reconfigured */
1406 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
1407 CTDB_SRVID_RECONFIGURE, tdb_null);
1408 if (ret != 0) {
1409 DBG_ERR("Failed to send reconfigure message\n");
1410 goto fail;
1413 DBG_NOTICE("Recovery complete\n");
1415 rec->need_recovery = false;
1416 ctdb_op_end(rec->recovery);
1419 * Completed a full recovery so forgive any past transgressions
1421 ban_counts_reset(rec);
1423 /* We just finished a recovery successfully.
1424 We now wait for rerecovery_timeout before we allow
1425 another recovery to take place.
1427 D_NOTICE("Just finished a recovery. New recoveries will now be "
1428 "suppressed for the rerecovery timeout (%" PRIu32
1429 " seconds)\n",
1430 ctdb->tunable.rerecovery_timeout);
1431 ctdb_op_disable(rec->recovery, ctdb->ev,
1432 ctdb->tunable.rerecovery_timeout);
1433 return 0;
1435 fail:
1436 ctdb_op_end(rec->recovery);
1437 return -1;
/*
 * Election vote payload broadcast on CTDB_SRVID_ELECTION.
 *
 * Elections are won by first checking the number of connected nodes,
 * then the priority time, then the pnn.
 */
struct election_message {
	uint32_t num_connected;		/* connected-node count of the voter */
	struct timeval priority_time;	/* when the recovery daemon started */
	uint32_t pnn;			/* voter's node number */
	uint32_t node_flags;		/* voter's node flags */
};
1453 form this nodes election data
1455 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1457 unsigned int i;
1458 int ret;
1459 struct ctdb_node_map_old *nodemap;
1460 struct ctdb_context *ctdb = rec->ctdb;
1461 bool ok;
1463 ZERO_STRUCTP(em);
1465 em->pnn = rec->pnn;
1466 em->priority_time = rec->priority_time;
1468 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1469 if (ret != 0) {
1470 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
1471 return;
1474 ok = node_flags(rec, rec->pnn, &rec->node_flags);
1475 if (!ok) {
1476 DBG_ERR("Unable to get node flags for this node\n");
1477 return;
1479 em->node_flags = rec->node_flags;
1481 for (i=0;i<nodemap->num;i++) {
1482 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1483 em->num_connected++;
1487 if (!this_node_can_be_leader(rec)) {
1488 /* Try to lose... */
1489 em->num_connected = 0;
1490 em->priority_time = timeval_current();
1493 talloc_free(nodemap);
1497 see if the given election data wins
1499 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1501 struct election_message myem;
1502 int cmp = 0;
1504 ctdb_election_data(rec, &myem);
1506 if (!this_node_can_be_leader(rec)) {
1507 return false;
1510 /* Automatically win if other node is banned or stopped */
1511 if (em->node_flags & NODE_FLAGS_INACTIVE) {
1512 return true;
1515 /* then the longest running node */
1516 if (cmp == 0) {
1517 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
1520 if (cmp == 0) {
1521 cmp = (int)myem.pnn - (int)em->pnn;
1524 return cmp > 0;
1528 send out an election request
1530 static int send_election_request(struct ctdb_recoverd *rec)
1532 TDB_DATA election_data;
1533 struct election_message emsg;
1534 uint64_t srvid;
1535 struct ctdb_context *ctdb = rec->ctdb;
1537 srvid = CTDB_SRVID_ELECTION;
1539 ctdb_election_data(rec, &emsg);
1541 election_data.dsize = sizeof(struct election_message);
1542 election_data.dptr = (unsigned char *)&emsg;
1545 /* Assume this node will win the election, set leader accordingly */
1546 rec->leader = rec->pnn;
1548 /* send an election message to all active nodes */
1549 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1550 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1554 we think we are winning the election - send a broadcast election request
1556 static void election_send_request(struct tevent_context *ev,
1557 struct tevent_timer *te,
1558 struct timeval t, void *p)
1560 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1561 int ret;
1563 ret = send_election_request(rec);
1564 if (ret != 0) {
1565 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
1568 TALLOC_FREE(rec->send_election_te);
1572 handler for memory dumps
1574 static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1576 struct ctdb_recoverd *rec = talloc_get_type(
1577 private_data, struct ctdb_recoverd);
1578 struct ctdb_context *ctdb = rec->ctdb;
1579 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1580 TDB_DATA *dump;
1581 int ret;
1582 struct ctdb_srvid_message *rd;
1584 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1585 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1586 talloc_free(tmp_ctx);
1587 return;
1589 rd = (struct ctdb_srvid_message *)data.dptr;
1591 dump = talloc_zero(tmp_ctx, TDB_DATA);
1592 if (dump == NULL) {
1593 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1594 talloc_free(tmp_ctx);
1595 return;
1597 ret = ctdb_dump_memory(ctdb, dump);
1598 if (ret != 0) {
1599 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1600 talloc_free(tmp_ctx);
1601 return;
1604 DBG_ERR("recovery daemon memory dump\n");
1606 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1607 if (ret != 0) {
1608 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1609 talloc_free(tmp_ctx);
1610 return;
1613 talloc_free(tmp_ctx);
1617 handler for reload_nodes
1619 static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
1620 void *private_data)
1622 struct ctdb_recoverd *rec = talloc_get_type(
1623 private_data, struct ctdb_recoverd);
1625 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1627 ctdb_load_nodes_file(rec->ctdb);
1631 static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
1632 void *private_data)
1634 struct ctdb_recoverd *rec = talloc_get_type(
1635 private_data, struct ctdb_recoverd);
1636 struct ctdb_context *ctdb = rec->ctdb;
1637 uint32_t pnn;
1638 uint32_t *t;
1639 int len;
1641 if (!this_node_is_leader(rec)) {
1642 return;
1645 if (data.dsize != sizeof(uint32_t)) {
1646 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
1647 return;
1650 pnn = *(uint32_t *)&data.dptr[0];
1652 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
1654 /* Copy any existing list of nodes. There's probably some
1655 * sort of realloc variant that will do this but we need to
1656 * make sure that freeing the old array also cancels the timer
1657 * event for the timeout... not sure if realloc will do that.
1659 len = (rec->force_rebalance_nodes != NULL) ?
1660 talloc_array_length(rec->force_rebalance_nodes) :
1663 /* This allows duplicates to be added but they don't cause
1664 * harm. A call to add a duplicate PNN arguably means that
1665 * the timeout should be reset, so this is the simplest
1666 * solution.
1668 t = talloc_zero_array(rec, uint32_t, len+1);
1669 CTDB_NO_MEMORY_VOID(ctdb, t);
1670 if (len > 0) {
1671 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
1673 t[len] = pnn;
1675 talloc_free(rec->force_rebalance_nodes);
1677 rec->force_rebalance_nodes = t;
1682 static void srvid_disable_and_reply(struct ctdb_recoverd *rec,
1683 TDB_DATA data,
1684 struct ctdb_op_state *op_state)
1686 struct ctdb_context *ctdb = rec->ctdb;
1687 struct ctdb_disable_message *r;
1688 uint32_t timeout;
1689 TDB_DATA result;
1690 int32_t ret = 0;
1692 /* Validate input data */
1693 if (data.dsize != sizeof(struct ctdb_disable_message)) {
1694 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1695 "expecting %lu\n", (long unsigned)data.dsize,
1696 (long unsigned)sizeof(struct ctdb_srvid_message)));
1697 return;
1699 if (data.dptr == NULL) {
1700 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1701 return;
1704 r = (struct ctdb_disable_message *)data.dptr;
1705 timeout = r->timeout;
1707 ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
1708 if (ret != 0) {
1709 goto done;
1712 /* Returning our PNN tells the caller that we succeeded */
1713 ret = rec->pnn;
1714 done:
1715 result.dsize = sizeof(int32_t);
1716 result.dptr = (uint8_t *)&ret;
1717 srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
1720 static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
1721 void *private_data)
1723 struct ctdb_recoverd *rec = talloc_get_type(
1724 private_data, struct ctdb_recoverd);
1726 srvid_disable_and_reply(rec, data, rec->takeover_run);
1729 /* Backward compatibility for this SRVID */
1730 static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
1731 void *private_data)
1733 struct ctdb_recoverd *rec = talloc_get_type(
1734 private_data, struct ctdb_recoverd);
1735 uint32_t timeout;
1737 if (data.dsize != sizeof(uint32_t)) {
1738 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1739 "expecting %lu\n", (long unsigned)data.dsize,
1740 (long unsigned)sizeof(uint32_t)));
1741 return;
1743 if (data.dptr == NULL) {
1744 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1745 return;
1748 timeout = *((uint32_t *)data.dptr);
1750 ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout);
1753 static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
1754 void *private_data)
1756 struct ctdb_recoverd *rec = talloc_get_type(
1757 private_data, struct ctdb_recoverd);
1759 srvid_disable_and_reply(rec, data, rec->recovery);
1763 handler for ip reallocate, just add it to the list of requests and
1764 handle this later in the monitor_cluster loop so we do not recurse
1765 with other requests to takeover_run()
1767 static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
1768 void *private_data)
1770 struct ctdb_srvid_message *request;
1771 struct ctdb_recoverd *rec = talloc_get_type(
1772 private_data, struct ctdb_recoverd);
1774 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1775 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1776 return;
1779 request = (struct ctdb_srvid_message *)data.dptr;
1781 srvid_request_add(rec->ctdb, &rec->reallocate_requests, request);
1784 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
1785 struct ctdb_recoverd *rec)
1787 TDB_DATA result;
1788 int32_t ret;
1789 struct srvid_requests *current;
1791 /* Only process requests that are currently pending. More
1792 * might come in while the takeover run is in progress and
1793 * they will need to be processed later since they might
1794 * be in response flag changes.
1796 current = rec->reallocate_requests;
1797 rec->reallocate_requests = NULL;
1799 if (do_takeover_run(rec, rec->nodemap)) {
1800 ret = rec->pnn;
1801 } else {
1802 ret = -1;
1805 result.dsize = sizeof(int32_t);
1806 result.dptr = (uint8_t *)&ret;
1808 srvid_requests_reply(ctdb, &current, result);
1812 * handler for assigning banning credits
1814 static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1816 struct ctdb_recoverd *rec = talloc_get_type(
1817 private_data, struct ctdb_recoverd);
1818 uint32_t ban_pnn;
1820 /* Ignore if we are not leader */
1821 if (!this_node_is_leader(rec)) {
1822 return;
1825 if (data.dsize != sizeof(uint32_t)) {
1826 DEBUG(DEBUG_ERR, (__location__ "invalid data size %zu\n",
1827 data.dsize));
1828 return;
1831 ban_pnn = *(uint32_t *)data.dptr;
1833 ctdb_set_culprit_count(rec, ban_pnn, rec->nodemap->num);
1837 * Handler for leader elections
1839 static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1841 struct ctdb_recoverd *rec = talloc_get_type(
1842 private_data, struct ctdb_recoverd);
1843 struct ctdb_context *ctdb = rec->ctdb;
1844 struct election_message *em = (struct election_message *)data.dptr;
1846 /* Ignore election packets from ourself */
1847 if (rec->pnn == em->pnn) {
1848 return;
1851 /* we got an election packet - update the timeout for the election */
1852 talloc_free(rec->election_timeout);
1853 rec->election_in_progress = true;
1854 rec->election_timeout = tevent_add_timer(
1855 ctdb->ev, ctdb,
1856 fast_start ?
1857 timeval_current_ofs(0, 500000) :
1858 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1859 ctdb_election_timeout, rec);
1861 /* someone called an election. check their election data
1862 and if we disagree and we would rather be the elected node,
1863 send a new election message to all other nodes
1865 if (ctdb_election_win(rec, em)) {
1866 if (!rec->send_election_te) {
1867 rec->send_election_te = tevent_add_timer(
1868 ctdb->ev, rec,
1869 timeval_current_ofs(0, 500000),
1870 election_send_request, rec);
1872 return;
1875 /* we didn't win */
1876 TALLOC_FREE(rec->send_election_te);
1878 /* Release the cluster lock file */
1879 if (cluster_lock_held(rec)) {
1880 cluster_lock_release(rec);
1883 /* Set leader to the winner of this round */
1884 rec->leader = em->pnn;
1886 return;
1889 static void cluster_lock_election(struct ctdb_recoverd *rec)
1891 bool ok;
1893 if (!this_node_can_be_leader(rec)) {
1894 if (cluster_lock_held(rec)) {
1895 cluster_lock_release(rec);
1897 goto done;
1901 * Don't need to unconditionally release the lock and then
1902 * attempt to retake it. This provides stability.
1904 if (cluster_lock_held(rec)) {
1905 goto done;
1908 rec->leader = CTDB_UNKNOWN_PNN;
1910 ok = cluster_lock_take(rec);
1911 if (ok) {
1912 rec->leader = rec->pnn;
1913 D_WARNING("Took cluster lock, leader=%"PRIu32"\n", rec->leader);
1916 done:
1917 rec->election_in_progress = false;
1921 force the start of the election process
1923 static void force_election(struct ctdb_recoverd *rec)
1925 int ret;
1926 struct ctdb_context *ctdb = rec->ctdb;
1928 D_ERR("Start election\n");
1930 /* set all nodes to recovery mode to stop all internode traffic */
1931 ret = set_recovery_mode(ctdb, rec, rec->nodemap, CTDB_RECOVERY_ACTIVE);
1932 if (ret != 0) {
1933 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1934 return;
1937 rec->election_in_progress = true;
1938 /* Let other nodes know that an election is underway */
1939 leader_broadcast_send(rec, CTDB_UNKNOWN_PNN);
1941 if (cluster_lock_enabled(rec)) {
1942 cluster_lock_election(rec);
1943 return;
1946 talloc_free(rec->election_timeout);
1947 rec->election_timeout = tevent_add_timer(
1948 ctdb->ev, ctdb,
1949 fast_start ?
1950 timeval_current_ofs(0, 500000) :
1951 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1952 ctdb_election_timeout, rec);
1954 ret = send_election_request(rec);
1955 if (ret!=0) {
1956 DBG_ERR("Failed to initiate leader election\n");
1957 return;
1960 /* wait for a few seconds to collect all responses */
1961 ctdb_wait_election(rec);
1965 static void srvid_not_implemented(uint64_t srvid,
1966 TDB_DATA data,
1967 void *private_data)
1969 const char *s;
1971 switch (srvid) {
1972 case CTDB_SRVID_SET_NODE_FLAGS:
1973 s = "CTDB_SRVID_SET_NODE_FLAGS";
1974 break;
1975 default:
1976 s = "UNKNOWN";
1979 D_WARNING("SRVID %s (0x%" PRIx64 ") is obsolete\n", s, srvid);
1983 handler for when we need to push out flag changes to all other nodes
1985 static void push_flags_handler(uint64_t srvid, TDB_DATA data,
1986 void *private_data)
1988 struct ctdb_recoverd *rec = talloc_get_type(
1989 private_data, struct ctdb_recoverd);
1990 struct ctdb_context *ctdb = rec->ctdb;
1991 int ret;
1992 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1993 struct ctdb_node_map_old *nodemap=NULL;
1994 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1995 uint32_t *nodes;
1997 /* read the node flags from the leader */
1998 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->leader,
1999 tmp_ctx, &nodemap);
2000 if (ret != 0) {
2001 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2002 talloc_free(tmp_ctx);
2003 return;
2005 if (c->pnn >= nodemap->num) {
2006 DBG_ERR("Nodemap from leader does not contain node %d\n",
2007 c->pnn);
2008 talloc_free(tmp_ctx);
2009 return;
2012 /* send the flags update to all connected nodes */
2013 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2015 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2016 nodes, 0, CONTROL_TIMEOUT(),
2017 false, data,
2018 NULL, NULL,
2019 NULL) != 0) {
2020 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2022 talloc_free(tmp_ctx);
2023 return;
2026 talloc_free(tmp_ctx);
2029 static void leader_broadcast_timeout_handler(struct tevent_context *ev,
2030 struct tevent_timer *te,
2031 struct timeval current_time,
2032 void *private_data)
2034 struct ctdb_recoverd *rec = talloc_get_type_abort(
2035 private_data, struct ctdb_recoverd);
2037 rec->leader_broadcast_timeout_te = NULL;
2039 D_NOTICE("Leader broadcast timeout\n");
2041 force_election(rec);
2044 static void leader_broadcast_timeout_cancel(struct ctdb_recoverd *rec)
2046 TALLOC_FREE(rec->leader_broadcast_timeout_te);
2049 static int leader_broadcast_timeout_start(struct ctdb_recoverd *rec)
2051 struct ctdb_context *ctdb = rec->ctdb;
2054 * This should not be necessary. However, there will be
2055 * interactions with election code here. It will want to
2056 * cancel and restart the timer around potentially long
2057 * elections.
2059 leader_broadcast_timeout_cancel(rec);
2061 rec->leader_broadcast_timeout_te =
2062 tevent_add_timer(
2063 ctdb->ev,
2064 rec,
2065 timeval_current_ofs(ctdb_config.leader_timeout, 0),
2066 leader_broadcast_timeout_handler,
2067 rec);
2068 if (rec->leader_broadcast_timeout_te == NULL) {
2069 D_ERR("Unable to start leader broadcast timeout\n");
2070 return ENOMEM;
2073 return 0;
2076 static bool leader_broadcast_timeout_active(struct ctdb_recoverd *rec)
2078 return rec->leader_broadcast_timeout_te != NULL;
2081 static void leader_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2083 struct ctdb_recoverd *rec = talloc_get_type_abort(
2084 private_data, struct ctdb_recoverd);
2085 uint32_t pnn;
2086 size_t npull;
2087 int ret;
2089 ret = ctdb_uint32_pull(data.dptr, data.dsize, &pnn, &npull);
2090 if (ret != 0) {
2091 DBG_WARNING("Unable to parse leader broadcast, ret=%d\n", ret);
2092 return;
2095 leader_broadcast_timeout_cancel(rec);
2097 if (pnn == rec->leader) {
2098 goto done;
2101 if (pnn == CTDB_UNKNOWN_PNN) {
2102 bool was_election_in_progress = rec->election_in_progress;
2105 * Leader broadcast timeout was cancelled above - stop
2106 * main loop from restarting it until election is
2107 * complete
2109 rec->election_in_progress = true;
2112 * This is the only notification for a cluster lock
2113 * election, so handle it here...
2115 if (cluster_lock_enabled(rec) && !was_election_in_progress) {
2116 cluster_lock_election(rec);
2119 return;
2122 D_NOTICE("Received leader broadcast, leader=%"PRIu32"\n", pnn);
2123 rec->leader = pnn;
2125 done:
2126 leader_broadcast_timeout_start(rec);
2129 struct verify_recmode_normal_data {
2130 uint32_t count;
2131 enum monitor_result status;
2134 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2136 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2139 /* one more node has responded with recmode data*/
2140 rmdata->count--;
2142 /* if we failed to get the recmode, then return an error and let
2143 the main loop try again.
2145 if (state->state != CTDB_CONTROL_DONE) {
2146 if (rmdata->status == MONITOR_OK) {
2147 rmdata->status = MONITOR_FAILED;
2149 return;
2152 /* if we got a response, then the recmode will be stored in the
2153 status field
2155 if (state->status != CTDB_RECOVERY_NORMAL) {
2156 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2157 rmdata->status = MONITOR_RECOVERY_NEEDED;
2160 return;
2164 /* verify that all nodes are in normal recovery mode */
2165 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
2167 struct verify_recmode_normal_data *rmdata;
2168 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2169 struct ctdb_client_control_state *state;
2170 enum monitor_result status;
2171 unsigned int j;
2173 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2174 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2175 rmdata->count = 0;
2176 rmdata->status = MONITOR_OK;
2178 /* loop over all active nodes and send an async getrecmode call to
2179 them*/
2180 for (j=0; j<nodemap->num; j++) {
2181 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2182 continue;
2184 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2185 CONTROL_TIMEOUT(),
2186 nodemap->nodes[j].pnn);
2187 if (state == NULL) {
2188 /* we failed to send the control, treat this as
2189 an error and try again next iteration
2191 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2192 talloc_free(mem_ctx);
2193 return MONITOR_FAILED;
2196 /* set up the callback functions */
2197 state->async.fn = verify_recmode_normal_callback;
2198 state->async.private_data = rmdata;
2200 /* one more control to wait for to complete */
2201 rmdata->count++;
2205 /* now wait for up to the maximum number of seconds allowed
2206 or until all nodes we expect a response from has replied
2208 while (rmdata->count > 0) {
2209 tevent_loop_once(ctdb->ev);
2212 status = rmdata->status;
2213 talloc_free(mem_ctx);
2214 return status;
2218 static bool interfaces_have_changed(struct ctdb_context *ctdb,
2219 struct ctdb_recoverd *rec)
2221 struct ctdb_iface_list_old *ifaces = NULL;
2222 TALLOC_CTX *mem_ctx;
2223 bool ret = false;
2225 mem_ctx = talloc_new(NULL);
2227 /* Read the interfaces from the local node */
2228 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
2229 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
2230 D_ERR("Unable to get interfaces from local node %u\n", rec->pnn);
2231 /* We could return an error. However, this will be
2232 * rare so we'll decide that the interfaces have
2233 * actually changed, just in case.
2235 talloc_free(mem_ctx);
2236 return true;
2239 if (!rec->ifaces) {
2240 /* We haven't been here before so things have changed */
2241 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
2242 ret = true;
2243 } else if (rec->ifaces->num != ifaces->num) {
2244 /* Number of interfaces has changed */
2245 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
2246 rec->ifaces->num, ifaces->num));
2247 ret = true;
2248 } else {
2249 /* See if interface names or link states have changed */
2250 unsigned int i;
2251 for (i = 0; i < rec->ifaces->num; i++) {
2252 struct ctdb_iface * iface = &rec->ifaces->ifaces[i];
2253 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
2254 DEBUG(DEBUG_NOTICE,
2255 ("Interface in slot %d changed: %s => %s\n",
2256 i, iface->name, ifaces->ifaces[i].name));
2257 ret = true;
2258 break;
2260 if (iface->link_state != ifaces->ifaces[i].link_state) {
2261 DEBUG(DEBUG_NOTICE,
2262 ("Interface %s changed state: %d => %d\n",
2263 iface->name, iface->link_state,
2264 ifaces->ifaces[i].link_state));
2265 ret = true;
2266 break;
2271 talloc_free(rec->ifaces);
2272 rec->ifaces = talloc_steal(rec, ifaces);
2274 talloc_free(mem_ctx);
2275 return ret;
2278 /* Check that the local allocation of public IP addresses is correct
2279 * and do some house-keeping */
2280 static int verify_local_ip_allocation(struct ctdb_recoverd *rec)
2282 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2283 struct ctdb_context *ctdb = rec->ctdb;
2284 unsigned int j;
2285 int ret;
2286 bool need_takeover_run = false;
2287 struct ctdb_public_ip_list_old *ips = NULL;
2289 /* If we are not the leader then do some housekeeping */
2290 if (!this_node_is_leader(rec)) {
2291 /* Ignore any IP reallocate requests - only leader
2292 * processes them
2294 TALLOC_FREE(rec->reallocate_requests);
2295 /* Clear any nodes that should be force rebalanced in
2296 * the next takeover run. If the leader has changed
2297 * then we don't want to process these some time in
2298 * the future.
2300 TALLOC_FREE(rec->force_rebalance_nodes);
2303 /* Return early if disabled... */
2304 if (ctdb_config.failover_disabled ||
2305 ctdb_op_is_disabled(rec->takeover_run)) {
2306 talloc_free(mem_ctx);
2307 return 0;
2310 if (interfaces_have_changed(ctdb, rec)) {
2311 need_takeover_run = true;
2314 /* If there are unhosted IPs but this node can host them then
2315 * trigger an IP reallocation */
2317 /* Read *available* IPs from local node */
2318 ret = ctdb_ctrl_get_public_ips_flags(
2319 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx,
2320 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
2321 if (ret != 0) {
2322 DEBUG(DEBUG_ERR, ("Unable to retrieve available public IPs\n"));
2323 talloc_free(mem_ctx);
2324 return -1;
2327 for (j=0; j<ips->num; j++) {
2328 if (ips->ips[j].pnn == CTDB_UNKNOWN_PNN &&
2329 rec->nodemap->nodes[rec->pnn].flags == 0) {
2330 DEBUG(DEBUG_WARNING,
2331 ("Unassigned IP %s can be served by this node\n",
2332 ctdb_addr_to_str(&ips->ips[j].addr)));
2333 need_takeover_run = true;
2337 talloc_free(ips);
2339 if (!ctdb->do_checkpublicip) {
2340 goto done;
2343 /* Validate the IP addresses that this node has on network
2344 * interfaces. If there is an inconsistency between reality
2345 * and the state expected by CTDB then try to fix it by
2346 * triggering an IP reallocation or releasing extraneous IP
2347 * addresses. */
2349 /* Read *known* IPs from local node */
2350 ret = ctdb_ctrl_get_public_ips_flags(
2351 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
2352 if (ret != 0) {
2353 DEBUG(DEBUG_ERR, ("Unable to retrieve known public IPs\n"));
2354 talloc_free(mem_ctx);
2355 return -1;
2358 for (j=0; j<ips->num; j++) {
2359 if (ips->ips[j].pnn == rec->pnn) {
2360 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2361 DEBUG(DEBUG_ERR,
2362 ("Assigned IP %s not on an interface\n",
2363 ctdb_addr_to_str(&ips->ips[j].addr)));
2364 need_takeover_run = true;
2366 } else {
2367 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2368 DEBUG(DEBUG_ERR,
2369 ("IP %s incorrectly on an interface\n",
2370 ctdb_addr_to_str(&ips->ips[j].addr)));
2371 need_takeover_run = true;
2376 done:
2377 if (need_takeover_run) {
2378 struct ctdb_srvid_message rd;
2379 TDB_DATA data;
2381 DEBUG(DEBUG_NOTICE,("Trigger takeoverrun\n"));
2383 ZERO_STRUCT(rd);
2384 rd.pnn = rec->pnn;
2385 rd.srvid = 0;
2386 data.dptr = (uint8_t *)&rd;
2387 data.dsize = sizeof(rd);
2389 ret = ctdb_client_send_message(ctdb,
2390 CTDB_BROADCAST_CONNECTED,
2391 CTDB_SRVID_TAKEOVER_RUN,
2392 data);
2393 if (ret != 0) {
2394 D_ERR("Failed to send takeover run request\n");
2397 talloc_free(mem_ctx);
2398 return 0;
/*
 * Callback state for collecting nodemaps from all connected nodes:
 * the result array (indexed like rec->nodemap->nodes) and the owning
 * recovery daemon context.
 */
struct remote_nodemaps_state {
	struct ctdb_node_map_old **remote_nodemaps;
	struct ctdb_recoverd *rec;
};
2407 static void async_getnodemap_callback(struct ctdb_context *ctdb,
2408 uint32_t node_pnn,
2409 int32_t res,
2410 TDB_DATA outdata,
2411 void *callback_data)
2413 struct remote_nodemaps_state *state =
2414 (struct remote_nodemaps_state *)callback_data;
2415 struct ctdb_node_map_old **remote_nodemaps = state->remote_nodemaps;
2416 struct ctdb_node_map_old *nodemap = state->rec->nodemap;
2417 size_t i;
2419 for (i = 0; i < nodemap->num; i++) {
2420 if (nodemap->nodes[i].pnn == node_pnn) {
2421 break;
2425 if (i >= nodemap->num) {
2426 DBG_ERR("Invalid PNN %"PRIu32"\n", node_pnn);
2427 return;
2430 remote_nodemaps[i] = (struct ctdb_node_map_old *)talloc_steal(
2431 remote_nodemaps, outdata.dptr);
2435 static void async_getnodemap_error(struct ctdb_context *ctdb,
2436 uint32_t node_pnn,
2437 int32_t res,
2438 TDB_DATA outdata,
2439 void *callback_data)
2441 struct remote_nodemaps_state *state =
2442 (struct remote_nodemaps_state *)callback_data;
2443 struct ctdb_recoverd *rec = state->rec;
2445 DBG_ERR("Failed to retrieve nodemap from node %u\n", node_pnn);
2446 ctdb_set_culprit(rec, node_pnn);
2449 static int get_remote_nodemaps(struct ctdb_recoverd *rec,
2450 TALLOC_CTX *mem_ctx,
2451 struct ctdb_node_map_old ***remote_nodemaps)
2453 struct ctdb_context *ctdb = rec->ctdb;
2454 struct ctdb_node_map_old **t;
2455 uint32_t *nodes;
2456 struct remote_nodemaps_state state;
2457 int ret;
2459 t = talloc_zero_array(mem_ctx,
2460 struct ctdb_node_map_old *,
2461 rec->nodemap->num);
2462 if (t == NULL) {
2463 DBG_ERR("Memory allocation error\n");
2464 return -1;
2467 nodes = list_of_connected_nodes(ctdb, rec->nodemap, mem_ctx, false);
2469 state.remote_nodemaps = t;
2470 state.rec = rec;
2472 ret = ctdb_client_async_control(ctdb,
2473 CTDB_CONTROL_GET_NODEMAP,
2474 nodes,
2476 CONTROL_TIMEOUT(),
2477 false,
2478 tdb_null,
2479 async_getnodemap_callback,
2480 async_getnodemap_error,
2481 &state);
2482 talloc_free(nodes);
2484 if (ret != 0) {
2485 talloc_free(t);
2486 return ret;
2489 *remote_nodemaps = t;
2490 return 0;
2493 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
2494 TALLOC_CTX *mem_ctx)
2496 struct ctdb_node_map_old *nodemap=NULL;
2497 struct ctdb_node_map_old **remote_nodemaps=NULL;
2498 struct ctdb_vnn_map *vnnmap=NULL;
2499 struct ctdb_vnn_map *remote_vnnmap=NULL;
2500 uint32_t num_lmasters;
2501 int32_t debug_level;
2502 unsigned int i, j;
2503 int ret;
2504 bool self_ban;
2507 /* verify that the main daemon is still running */
2508 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
2509 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2510 exit(-1);
2513 /* ping the local daemon to tell it we are alive */
2514 ctdb_ctrl_recd_ping(ctdb);
2516 if (rec->election_in_progress) {
2517 /* an election is in progress */
2518 return;
2522 * Start leader broadcasts if they are not active (1st time
2523 * through main loop? Memory allocation error?)
2525 if (!leader_broadcast_loop_active(rec)) {
2526 ret = leader_broadcast_loop(rec);
2527 if (ret != 0) {
2528 D_ERR("Failed to set up leader broadcast\n");
2529 ctdb_set_culprit(rec, rec->pnn);
2533 * Similar for leader broadcast timeouts. These can also have
2534 * been stopped by another node receiving a leader broadcast
2535 * timeout and transmitting an "unknown leader broadcast".
2536 * Note that this should never be done during an election - at
2537 * the moment there is nothing between here and the above
2538 * election-in-progress check that can process an election
2539 * result (i.e. no event loop).
2541 if (!leader_broadcast_timeout_active(rec)) {
2542 ret = leader_broadcast_timeout_start(rec);
2543 if (ret != 0) {
2544 ctdb_set_culprit(rec, rec->pnn);
2549 /* read the debug level from the parent and update locally */
2550 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2551 if (ret !=0) {
2552 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2553 return;
2555 debuglevel_set(debug_level);
2557 /* get relevant tunables */
2558 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2559 if (ret != 0) {
2560 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2561 return;
2564 /* get runstate */
2565 ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
2566 CTDB_CURRENT_NODE, &ctdb->runstate);
2567 if (ret != 0) {
2568 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
2569 return;
2572 /* get nodemap */
2573 ret = ctdb_ctrl_getnodemap(ctdb,
2574 CONTROL_TIMEOUT(),
2575 rec->pnn,
2576 rec,
2577 &nodemap);
2578 if (ret != 0) {
2579 DBG_ERR("Unable to get nodemap from node %"PRIu32"\n", rec->pnn);
2580 return;
2582 talloc_free(rec->nodemap);
2583 rec->nodemap = nodemap;
2585 /* remember our own node flags */
2586 rec->node_flags = nodemap->nodes[rec->pnn].flags;
2588 ban_misbehaving_nodes(rec, &self_ban);
2589 if (self_ban) {
2590 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
2591 return;
2594 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2595 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2596 if (ret != 0) {
2597 D_ERR("Failed to read recmode from local node\n");
2598 return;
2601 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
2602 also frozen and that the recmode is set to active.
2604 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
2605 /* If this node has become inactive then we want to
2606 * reduce the chances of it taking over the leader
2607 * role when it becomes active again. This
2608 * helps to stabilise the leader role so that
2609 * it stays on the most stable node.
2611 rec->priority_time = timeval_current();
2613 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2614 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
2616 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2617 if (ret != 0) {
2618 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
2620 return;
2623 if (! rec->frozen_on_inactive) {
2624 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(),
2625 CTDB_CURRENT_NODE);
2626 if (ret != 0) {
2627 DEBUG(DEBUG_ERR,
2628 (__location__ " Failed to freeze node "
2629 "in STOPPED or BANNED state\n"));
2630 return;
2633 rec->frozen_on_inactive = true;
2636 /* If this node is stopped or banned then it is not the recovery
2637 * master, so don't do anything. This prevents stopped or banned
2638 * node from starting election and sending unnecessary controls.
2640 return;
2643 rec->frozen_on_inactive = false;
2645 /* Retrieve capabilities from all connected nodes */
2646 ret = update_capabilities(rec, nodemap);
2647 if (ret != 0) {
2648 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2649 return;
2652 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2653 /* Check if an IP takeover run is needed and trigger one if
2654 * necessary */
2655 verify_local_ip_allocation(rec);
2658 /* If this node is not the leader then skip recovery checks */
2659 if (!this_node_is_leader(rec)) {
2660 return;
2664 /* Get the nodemaps for all connected remote nodes */
2665 ret = get_remote_nodemaps(rec, mem_ctx, &remote_nodemaps);
2666 if (ret != 0) {
2667 DBG_ERR("Failed to read remote nodemaps\n");
2668 return;
2671 /* Ensure our local and remote flags are correct */
2672 ret = update_flags(rec, nodemap, remote_nodemaps);
2673 if (ret != 0) {
2674 D_ERR("Unable to update flags\n");
2675 return;
2678 if (ctdb->num_nodes != nodemap->num) {
2679 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
2680 ctdb_load_nodes_file(ctdb);
2681 return;
2684 /* get the vnnmap */
2685 ret = ctdb_ctrl_getvnnmap(ctdb,
2686 CONTROL_TIMEOUT(),
2687 rec->pnn,
2688 mem_ctx,
2689 &vnnmap);
2690 if (ret != 0) {
2691 DBG_ERR("Unable to get vnnmap from node %u\n", rec->pnn);
2692 return;
2695 if (rec->need_recovery) {
2696 /* a previous recovery didn't finish */
2697 do_recovery(rec, mem_ctx);
2698 return;
2701 /* verify that all active nodes are in normal mode
2702 and not in recovery mode
2704 switch (verify_recmode(ctdb, nodemap)) {
2705 case MONITOR_RECOVERY_NEEDED:
2706 do_recovery(rec, mem_ctx);
2707 return;
2708 case MONITOR_FAILED:
2709 return;
2710 case MONITOR_ELECTION_NEEDED:
2711 /* can not happen */
2712 case MONITOR_OK:
2713 break;
2716 if (cluster_lock_enabled(rec)) {
2717 /* We must already hold the cluster lock */
2718 if (!cluster_lock_held(rec)) {
2719 D_ERR("Failed cluster lock sanity check\n");
2720 ctdb_set_culprit(rec, rec->pnn);
2721 do_recovery(rec, mem_ctx);
2722 return;
2727 /* If recoveries are disabled then there is no use doing any
2728 * nodemap or flags checks. Recoveries might be disabled due
2729 * to "reloadnodes", so doing these checks might cause an
2730 * unnecessary recovery. */
2731 if (ctdb_op_is_disabled(rec->recovery)) {
2732 goto takeover_run_checks;
2735 /* verify that all other nodes have the same nodemap as we have
2737 for (j=0; j<nodemap->num; j++) {
2738 if (nodemap->nodes[j].pnn == rec->pnn) {
2739 continue;
2741 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2742 continue;
2745 /* if the nodes disagree on how many nodes there are
2746 then this is a good reason to try recovery
2748 if (remote_nodemaps[j]->num != nodemap->num) {
2749 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
2750 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
2751 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2752 do_recovery(rec, mem_ctx);
2753 return;
2756 /* if the nodes disagree on which nodes exist and are
2757 active, then that is also a good reason to do recovery
2759 for (i=0;i<nodemap->num;i++) {
2760 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
2761 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
2762 nodemap->nodes[j].pnn, i,
2763 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
2764 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2765 do_recovery(rec, mem_ctx);
2766 return;
2771 /* count how many active nodes there are */
2772 num_lmasters = 0;
2773 for (i=0; i<nodemap->num; i++) {
2774 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2775 if (ctdb_node_has_capabilities(rec->caps,
2776 ctdb->nodes[i]->pnn,
2777 CTDB_CAP_LMASTER)) {
2778 num_lmasters++;
2784 /* There must be the same number of lmasters in the vnn map as
2785 * there are active nodes with the lmaster capability... or
2786 * do a recovery.
2788 if (vnnmap->size != num_lmasters) {
2789 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
2790 vnnmap->size, num_lmasters));
2791 ctdb_set_culprit(rec, rec->pnn);
2792 do_recovery(rec, mem_ctx);
2793 return;
2797 * Verify that all active lmaster nodes in the nodemap also
2798 * exist in the vnnmap
2800 for (j=0; j<nodemap->num; j++) {
2801 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2802 continue;
2804 if (! ctdb_node_has_capabilities(rec->caps,
2805 nodemap->nodes[j].pnn,
2806 CTDB_CAP_LMASTER)) {
2807 continue;
2809 if (nodemap->nodes[j].pnn == rec->pnn) {
2810 continue;
2813 for (i=0; i<vnnmap->size; i++) {
2814 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
2815 break;
2818 if (i == vnnmap->size) {
2819 D_ERR("Active LMASTER node %u is not in the vnnmap\n",
2820 nodemap->nodes[j].pnn);
2821 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2822 do_recovery(rec, mem_ctx);
2823 return;
2828 /* verify that all other nodes have the same vnnmap
2829 and are from the same generation
2831 for (j=0; j<nodemap->num; j++) {
2832 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2833 continue;
2835 if (nodemap->nodes[j].pnn == rec->pnn) {
2836 continue;
2839 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2840 mem_ctx, &remote_vnnmap);
2841 if (ret != 0) {
2842 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
2843 nodemap->nodes[j].pnn));
2844 return;
2847 /* verify the vnnmap generation is the same */
2848 if (vnnmap->generation != remote_vnnmap->generation) {
2849 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
2850 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
2851 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2852 do_recovery(rec, mem_ctx);
2853 return;
2856 /* verify the vnnmap size is the same */
2857 if (vnnmap->size != remote_vnnmap->size) {
2858 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
2859 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
2860 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2861 do_recovery(rec, mem_ctx);
2862 return;
2865 /* verify the vnnmap is the same */
2866 for (i=0;i<vnnmap->size;i++) {
2867 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
2868 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
2869 nodemap->nodes[j].pnn));
2870 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2871 do_recovery(rec, mem_ctx);
2872 return;
2877 /* FIXME: Add remote public IP checking to ensure that nodes
2878 * have the IP addresses that are allocated to them. */
2880 takeover_run_checks:
2882 /* If there are IP takeover runs requested or the previous one
2883 * failed then perform one and notify the waiters */
2884 if (!ctdb_op_is_disabled(rec->takeover_run) &&
2885 (rec->reallocate_requests || rec->need_takeover_run)) {
2886 process_ipreallocate_requests(ctdb, rec);
2890 static void recd_sig_term_handler(struct tevent_context *ev,
2891 struct tevent_signal *se, int signum,
2892 int count, void *dont_care,
2893 void *private_data)
2895 struct ctdb_recoverd *rec = talloc_get_type_abort(
2896 private_data, struct ctdb_recoverd);
2898 DEBUG(DEBUG_ERR, ("Received SIGTERM, exiting\n"));
2899 cluster_lock_release(rec);
2900 exit(0);
2904 * Periodically log elements of the cluster state
2906 * This can be used to confirm a split brain has occurred
2908 static void maybe_log_cluster_state(struct tevent_context *ev,
2909 struct tevent_timer *te,
2910 struct timeval current_time,
2911 void *private_data)
2913 struct ctdb_recoverd *rec = talloc_get_type_abort(
2914 private_data, struct ctdb_recoverd);
2915 struct ctdb_context *ctdb = rec->ctdb;
2916 struct tevent_timer *tt;
2918 static struct timeval start_incomplete = {
2919 .tv_sec = 0,
2922 bool is_complete;
2923 bool was_complete;
2924 unsigned int i;
2925 double seconds;
2926 unsigned int minutes;
2927 unsigned int num_connected;
2929 if (!this_node_is_leader(rec)) {
2930 goto done;
2933 if (rec->nodemap == NULL) {
2934 goto done;
2937 is_complete = true;
2938 num_connected = 0;
2939 for (i = 0; i < rec->nodemap->num; i++) {
2940 struct ctdb_node_and_flags *n = &rec->nodemap->nodes[i];
2942 if (n->pnn == rec->pnn) {
2943 continue;
2945 if ((n->flags & NODE_FLAGS_DELETED) != 0) {
2946 continue;
2948 if ((n->flags & NODE_FLAGS_DISCONNECTED) != 0) {
2949 is_complete = false;
2950 continue;
2953 num_connected++;
2956 was_complete = timeval_is_zero(&start_incomplete);
2958 if (is_complete) {
2959 if (! was_complete) {
2960 D_WARNING("Cluster complete with leader=%u\n",
2961 rec->leader);
2962 start_incomplete = timeval_zero();
2964 goto done;
2967 /* Cluster is newly incomplete... */
2968 if (was_complete) {
2969 start_incomplete = current_time;
2970 minutes = 0;
2971 goto log;
2975 * Cluster has been incomplete since previous check, so figure
2976 * out how long (in minutes) and decide whether to log anything
2978 seconds = timeval_elapsed2(&start_incomplete, &current_time);
2979 minutes = (unsigned int)seconds / 60;
2980 if (minutes >= 60) {
2981 /* Over an hour, log every hour */
2982 if (minutes % 60 != 0) {
2983 goto done;
2985 } else if (minutes >= 10) {
2986 /* Over 10 minutes, log every 10 minutes */
2987 if (minutes % 10 != 0) {
2988 goto done;
2992 log:
2993 D_WARNING("Cluster incomplete with leader=%u, elapsed=%u minutes, "
2994 "connected=%u\n",
2995 rec->leader,
2996 minutes,
2997 num_connected);
2999 done:
3000 tt = tevent_add_timer(ctdb->ev,
3001 rec,
3002 timeval_current_ofs(60, 0),
3003 maybe_log_cluster_state,
3004 rec);
3005 if (tt == NULL) {
3006 DBG_WARNING("Failed to set up cluster state timer\n");
3010 static void recd_sighup_hook(void *private_data)
3012 struct ctdb_recoverd *rec = talloc_get_type_abort(
3013 private_data, struct ctdb_recoverd);
3015 if (rec->helper_pid > 0) {
3016 kill(rec->helper_pid, SIGHUP);
3021 the main monitoring loop
3023 static void monitor_cluster(struct ctdb_context *ctdb)
3025 struct tevent_signal *se;
3026 struct ctdb_recoverd *rec;
3027 bool status;
3029 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
3031 rec = talloc_zero(ctdb, struct ctdb_recoverd);
3032 CTDB_NO_MEMORY_FATAL(ctdb, rec);
3034 rec->ctdb = ctdb;
3035 rec->leader = CTDB_UNKNOWN_PNN;
3036 rec->pnn = ctdb_get_pnn(ctdb);
3037 rec->cluster_lock_handle = NULL;
3038 rec->helper_pid = -1;
3040 rec->takeover_run = ctdb_op_init(rec, "takeover runs");
3041 CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);
3043 rec->recovery = ctdb_op_init(rec, "recoveries");
3044 CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);
3046 rec->priority_time = timeval_current();
3047 rec->frozen_on_inactive = false;
3049 status = logging_setup_sighup_handler(rec->ctdb->ev,
3050 rec,
3051 recd_sighup_hook,
3052 rec);
3053 if (!status) {
3054 D_ERR("Failed to install SIGHUP handler\n");
3055 exit(1);
3058 se = tevent_add_signal(ctdb->ev, ctdb, SIGTERM, 0,
3059 recd_sig_term_handler, rec);
3060 if (se == NULL) {
3061 DEBUG(DEBUG_ERR, ("Failed to install SIGTERM handler\n"));
3062 exit(1);
3065 if (!cluster_lock_enabled(rec)) {
3066 struct tevent_timer *tt;
3068 tt = tevent_add_timer(ctdb->ev,
3069 rec,
3070 timeval_current_ofs(60, 0),
3071 maybe_log_cluster_state,
3072 rec);
3073 if (tt == NULL) {
3074 DBG_WARNING("Failed to set up cluster state timer\n");
3078 /* register a message port for sending memory dumps */
3079 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
3081 /* when a node is assigned banning credits */
3082 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING,
3083 banning_handler, rec);
3085 /* register a message port for recovery elections */
3086 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec);
3088 ctdb_client_set_message_handler(ctdb,
3089 CTDB_SRVID_SET_NODE_FLAGS,
3090 srvid_not_implemented,
3091 rec);
3093 /* when we are asked to puch out a flag change */
3094 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
3096 /* register a message port for reloadnodes */
3097 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
3099 /* register a message port for performing a takeover run */
3100 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
3102 /* register a message port for disabling the ip check for a short while */
3103 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
3105 /* register a message port for forcing a rebalance of a node next
3106 reallocation */
3107 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
3109 /* Register a message port for disabling takeover runs */
3110 ctdb_client_set_message_handler(ctdb,
3111 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
3112 disable_takeover_runs_handler, rec);
3114 /* Register a message port for disabling recoveries */
3115 ctdb_client_set_message_handler(ctdb,
3116 CTDB_SRVID_DISABLE_RECOVERIES,
3117 disable_recoveries_handler, rec);
3119 ctdb_client_set_message_handler(ctdb,
3120 CTDB_SRVID_LEADER,
3121 leader_handler,
3122 rec);
3124 for (;;) {
3125 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3126 struct timeval start;
3127 double elapsed;
3129 if (!mem_ctx) {
3130 DEBUG(DEBUG_CRIT,(__location__
3131 " Failed to create temp context\n"));
3132 exit(-1);
3135 start = timeval_current();
3136 main_loop(ctdb, rec, mem_ctx);
3137 talloc_free(mem_ctx);
3139 /* we only check for recovery once every second */
3140 elapsed = timeval_elapsed(&start);
3141 if (elapsed < ctdb->tunable.recover_interval) {
3142 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
3143 - elapsed);
3149 event handler for when the main ctdbd dies
3151 static void ctdb_recoverd_parent(struct tevent_context *ev,
3152 struct tevent_fd *fde,
3153 uint16_t flags, void *private_data)
3155 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3156 _exit(1);
3160 called regularly to verify that the recovery daemon is still running
3162 static void ctdb_check_recd(struct tevent_context *ev,
3163 struct tevent_timer *te,
3164 struct timeval yt, void *p)
3166 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
3168 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
3169 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
3171 tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
3172 ctdb_restart_recd, ctdb);
3174 return;
3177 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3178 timeval_current_ofs(30, 0),
3179 ctdb_check_recd, ctdb);
3182 static void recd_sig_child_handler(struct tevent_context *ev,
3183 struct tevent_signal *se, int signum,
3184 int count, void *dont_care,
3185 void *private_data)
3187 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3188 int status;
3189 pid_t pid = -1;
3191 while (pid != 0) {
3192 pid = waitpid(-1, &status, WNOHANG);
3193 if (pid == -1) {
3194 if (errno != ECHILD) {
3195 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3197 return;
3199 if (pid > 0) {
3200 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3206 startup the recovery daemon as a child of the main ctdb daemon
3208 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3210 int fd[2];
3211 struct tevent_signal *se;
3212 struct tevent_fd *fde;
3213 int ret;
3215 if (pipe(fd) != 0) {
3216 return -1;
3219 ctdb->recoverd_pid = ctdb_fork(ctdb);
3220 if (ctdb->recoverd_pid == -1) {
3221 return -1;
3224 if (ctdb->recoverd_pid != 0) {
3225 talloc_free(ctdb->recd_ctx);
3226 ctdb->recd_ctx = talloc_new(ctdb);
3227 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
3229 close(fd[0]);
3230 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3231 timeval_current_ofs(30, 0),
3232 ctdb_check_recd, ctdb);
3233 return 0;
3236 close(fd[1]);
3238 srandom(getpid() ^ time(NULL));
3240 ret = logging_init(ctdb, NULL, NULL, "ctdb-recoverd");
3241 if (ret != 0) {
3242 return -1;
3245 prctl_set_comment("ctdb_recoverd");
3246 if (switch_from_server_to_client(ctdb) != 0) {
3247 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3248 exit(1);
3251 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
3253 fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
3254 ctdb_recoverd_parent, &fd[0]);
3255 tevent_fd_set_auto_close(fde);
3257 /* set up a handler to pick up sigchld */
3258 se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
3259 recd_sig_child_handler, ctdb);
3260 if (se == NULL) {
3261 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3262 exit(1);
3265 monitor_cluster(ctdb);
3267 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3268 return -1;
3272 shutdown the recovery daemon
3274 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3276 if (ctdb->recoverd_pid == 0) {
3277 return;
3280 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3281 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
3283 TALLOC_FREE(ctdb->recd_ctx);
3284 TALLOC_FREE(ctdb->recd_ping_count);
3287 static void ctdb_restart_recd(struct tevent_context *ev,
3288 struct tevent_timer *te,
3289 struct timeval t, void *private_data)
3291 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3293 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
3294 ctdb_stop_recoverd(ctdb);
3295 ctdb_start_recoverd(ctdb);