ctdb-recoverd: Avoid dereferencing NULL rec->nodemap
ctdb/server/ctdb_recoverd.c
1 /*
2 ctdb recovery daemon
4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
20 #include "replace.h"
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
26 #include <popt.h>
27 #include <talloc.h>
28 #include <tevent.h>
29 #include <tdb.h>
31 #include "lib/tdb_wrap/tdb_wrap.h"
32 #include "lib/util/dlinklist.h"
33 #include "lib/util/debug.h"
34 #include "lib/util/samba_util.h"
35 #include "lib/util/sys_rw.h"
36 #include "lib/util/util_process.h"
38 #include "ctdb_private.h"
39 #include "ctdb_client.h"
41 #include "common/system_socket.h"
42 #include "common/common.h"
43 #include "common/logging.h"
45 #include "server/ctdb_config.h"
47 #include "ctdb_cluster_mutex.h"
49 /* List of SRVID requests that need to be processed */
50 struct srvid_list {
51 struct srvid_list *next, *prev;
52 struct ctdb_srvid_message *request;
55 struct srvid_requests {
56 struct srvid_list *requests;
59 static void srvid_request_reply(struct ctdb_context *ctdb,
60 struct ctdb_srvid_message *request,
61 TDB_DATA result)
63 /* Someone that sent srvid==0 does not want a reply */
64 if (request->srvid == 0) {
65 talloc_free(request);
66 return;
69 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
70 result) == 0) {
71 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
72 (unsigned)request->pnn,
73 (unsigned long long)request->srvid));
74 } else {
75 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
76 (unsigned)request->pnn,
77 (unsigned long long)request->srvid));
80 talloc_free(request);
83 static void srvid_requests_reply(struct ctdb_context *ctdb,
84 struct srvid_requests **requests,
85 TDB_DATA result)
87 struct srvid_list *r;
89 if (*requests == NULL) {
90 return;
93 for (r = (*requests)->requests; r != NULL; r = r->next) {
94 srvid_request_reply(ctdb, r->request, result);
97 /* Free the list structure... */
98 TALLOC_FREE(*requests);
101 static void srvid_request_add(struct ctdb_context *ctdb,
102 struct srvid_requests **requests,
103 struct ctdb_srvid_message *request)
105 struct srvid_list *t;
106 int32_t ret;
107 TDB_DATA result;
109 if (*requests == NULL) {
110 *requests = talloc_zero(ctdb, struct srvid_requests);
111 if (*requests == NULL) {
112 goto nomem;
116 t = talloc_zero(*requests, struct srvid_list);
117 if (t == NULL) {
118 /* If *requests was just allocated above then free it */
119 if ((*requests)->requests == NULL) {
120 TALLOC_FREE(*requests);
122 goto nomem;
125 t->request = (struct ctdb_srvid_message *)talloc_steal(t, request);
126 DLIST_ADD((*requests)->requests, t);
128 return;
130 nomem:
131 /* Failed to add the request to the list. Send a fail. */
132 DEBUG(DEBUG_ERR, (__location__
133 " Out of memory, failed to queue SRVID request\n"));
134 ret = -ENOMEM;
135 result.dsize = sizeof(ret);
136 result.dptr = (uint8_t *)&ret;
137 srvid_request_reply(ctdb, request, result);
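/* Note: SRVID replies in this file carry a single int32 as the TDB_DATA
 * payload - the responding node's PNN on success or a negative errno on
 * failure.  A request sent with srvid == 0 is fire-and-forget and never
 * receives a reply (see srvid_request_reply above). */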
140 /* An abstraction to allow an operation (takeover runs, recoveries,
141 * ...) to be disabled for a given timeout */
142 struct ctdb_op_state {
143 struct tevent_timer *timer;
144 bool in_progress;
145 const char *name;
148 static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
150 struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
152 if (state != NULL) {
153 state->in_progress = false;
154 state->name = name;
157 return state;
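/* An operation is considered disabled while its re-enable timer is pending;
 * the timer is armed by ctdb_op_disable() and cleared by ctdb_op_enable(). */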
160 static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
162 return state->timer != NULL;
165 static bool ctdb_op_begin(struct ctdb_op_state *state)
167 if (ctdb_op_is_disabled(state)) {
168 DEBUG(DEBUG_NOTICE,
169 ("Unable to begin - %s are disabled\n", state->name));
170 return false;
173 state->in_progress = true;
174 return true;
177 static bool ctdb_op_end(struct ctdb_op_state *state)
179 return state->in_progress = false;
182 static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
184 return state->in_progress;
187 static void ctdb_op_enable(struct ctdb_op_state *state)
189 TALLOC_FREE(state->timer);
192 static void ctdb_op_timeout_handler(struct tevent_context *ev,
193 struct tevent_timer *te,
194 struct timeval yt, void *p)
196 struct ctdb_op_state *state =
197 talloc_get_type(p, struct ctdb_op_state);
199 DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
200 ctdb_op_enable(state);
203 static int ctdb_op_disable(struct ctdb_op_state *state,
204 struct tevent_context *ev,
205 uint32_t timeout)
207 if (timeout == 0) {
208 DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
209 ctdb_op_enable(state);
210 return 0;
213 if (state->in_progress) {
214 DEBUG(DEBUG_ERR,
215 ("Unable to disable %s - in progress\n", state->name));
216 return -EAGAIN;
219 DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
220 state->name, timeout));
222 /* Clear any old timers */
223 talloc_free(state->timer);
225 /* Arrange for the timeout to occur */
226 state->timer = tevent_add_timer(ev, state,
227 timeval_current_ofs(timeout, 0),
228 ctdb_op_timeout_handler, state);
229 if (state->timer == NULL) {
230 DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
231 return -ENOMEM;
234 return 0;
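/* Typical flow (sketch): a SRVID handler such as srvid_disable_and_reply()
 * calls ctdb_op_disable(state, ev, timeout) to suspend takeover runs or
 * recoveries; while the timer is pending ctdb_op_begin() refuses to start the
 * operation, and ctdb_op_timeout_handler() re-enables it when it expires. */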
237 struct ctdb_banning_state {
238 uint32_t count;
239 struct timeval last_reported_time;
242 struct ctdb_recovery_lock_handle;
245 private state of recovery daemon
247 struct ctdb_recoverd {
248 struct ctdb_context *ctdb;
249 uint32_t recmaster;
250 uint32_t last_culprit_node;
251 struct ctdb_node_map_old *nodemap;
252 struct timeval priority_time;
253 bool need_takeover_run;
254 bool need_recovery;
255 uint32_t node_flags;
256 struct tevent_timer *send_election_te;
257 struct tevent_timer *election_timeout;
258 struct srvid_requests *reallocate_requests;
259 struct ctdb_op_state *takeover_run;
260 struct ctdb_op_state *recovery;
261 struct ctdb_iface_list_old *ifaces;
262 uint32_t *force_rebalance_nodes;
263 struct ctdb_node_capabilities *caps;
264 bool frozen_on_inactive;
265 struct ctdb_recovery_lock_handle *recovery_lock_handle;
268 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
269 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
271 static void ctdb_restart_recd(struct tevent_context *ev,
272 struct tevent_timer *te, struct timeval t,
273 void *private_data);
276 ban a node for a period of time
278 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
280 int ret;
281 struct ctdb_context *ctdb = rec->ctdb;
282 struct ctdb_ban_state bantime;
284 if (!ctdb_validate_pnn(ctdb, pnn)) {
285 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
286 return;
289 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
291 bantime.pnn = pnn;
292 bantime.time = ban_time;
294 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
295 if (ret != 0) {
296 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
297 return;
302 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
306 remember the troublemaker
308 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
310 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
311 struct ctdb_banning_state *ban_state;
313 if (culprit > ctdb->num_nodes) {
314 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
315 return;
318 /* If we are banned or stopped, do not set other nodes as culprits */
319 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
320 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
321 return;
324 if (ctdb->nodes[culprit]->ban_state == NULL) {
325 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
326 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
330 ban_state = ctdb->nodes[culprit]->ban_state;
331 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
332 /* this was the first time in a long while this node
333 misbehaved so we will forgive any old transgressions.
335 ban_state->count = 0;
338 ban_state->count += count;
339 ban_state->last_reported_time = timeval_current();
340 rec->last_culprit_node = culprit;
344 remember the troublemaker
346 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
348 ctdb_set_culprit_count(rec, culprit, 1);
352 Retrieve capabilities from all connected nodes
354 static int update_capabilities(struct ctdb_recoverd *rec,
355 struct ctdb_node_map_old *nodemap)
357 uint32_t *capp;
358 TALLOC_CTX *tmp_ctx;
359 struct ctdb_node_capabilities *caps;
360 struct ctdb_context *ctdb = rec->ctdb;
362 tmp_ctx = talloc_new(rec);
363 CTDB_NO_MEMORY(ctdb, tmp_ctx);
365 caps = ctdb_get_capabilities(ctdb, tmp_ctx,
366 CONTROL_TIMEOUT(), nodemap);
368 if (caps == NULL) {
369 DEBUG(DEBUG_ERR,
370 (__location__ " Failed to get node capabilities\n"));
371 talloc_free(tmp_ctx);
372 return -1;
375 capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
376 if (capp == NULL) {
377 DEBUG(DEBUG_ERR,
378 (__location__
379 " Capabilities don't include current node.\n"));
380 talloc_free(tmp_ctx);
381 return -1;
383 ctdb->capabilities = *capp;
385 TALLOC_FREE(rec->caps);
386 rec->caps = talloc_steal(rec, caps);
388 talloc_free(tmp_ctx);
389 return 0;
393 change recovery mode on all nodes
395 static int set_recovery_mode(struct ctdb_context *ctdb,
396 struct ctdb_recoverd *rec,
397 struct ctdb_node_map_old *nodemap,
398 uint32_t rec_mode)
400 TDB_DATA data;
401 uint32_t *nodes;
402 TALLOC_CTX *tmp_ctx;
404 tmp_ctx = talloc_new(ctdb);
405 CTDB_NO_MEMORY(ctdb, tmp_ctx);
407 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
409 data.dsize = sizeof(uint32_t);
410 data.dptr = (unsigned char *)&rec_mode;
412 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
413 nodes, 0,
414 CONTROL_TIMEOUT(),
415 false, data,
416 NULL, NULL,
417 NULL) != 0) {
418 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
419 talloc_free(tmp_ctx);
420 return -1;
423 talloc_free(tmp_ctx);
424 return 0;
428 update flags on all active nodes
430 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap, uint32_t pnn, uint32_t flags)
432 int ret;
434 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
435 if (ret != 0) {
436 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
437 return -1;
440 return 0;
444 called when ctdb_wait_timeout should finish
446 static void ctdb_wait_handler(struct tevent_context *ev,
447 struct tevent_timer *te,
448 struct timeval yt, void *p)
450 uint32_t *timed_out = (uint32_t *)p;
451 (*timed_out) = 1;
455 wait for a given number of seconds
457 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
459 uint32_t timed_out = 0;
460 time_t usecs = (secs - (time_t)secs) * 1000000;
461 tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
462 ctdb_wait_handler, &timed_out);
463 while (!timed_out) {
464 tevent_loop_once(ctdb->ev);
469 called when an election times out (ends)
471 static void ctdb_election_timeout(struct tevent_context *ev,
472 struct tevent_timer *te,
473 struct timeval t, void *p)
475 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
476 rec->election_timeout = NULL;
477 fast_start = false;
479 D_WARNING("Election period ended, master=%u\n", rec->recmaster);
484 wait for an election to finish. It finished election_timeout seconds after
485 the last election packet is received
487 static void ctdb_wait_election(struct ctdb_recoverd *rec)
489 struct ctdb_context *ctdb = rec->ctdb;
490 while (rec->election_timeout) {
491 tevent_loop_once(ctdb->ev);
496 Update our local flags from all remote connected nodes.
497 This is only run when we are, or believe we are, the recovery master
499 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
501 unsigned int j;
502 struct ctdb_context *ctdb = rec->ctdb;
503 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
505 /* get the nodemap for all active remote nodes and verify
506 they are the same as for this node
508 for (j=0; j<nodemap->num; j++) {
509 struct ctdb_node_map_old *remote_nodemap=NULL;
510 int ret;
512 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
513 continue;
515 if (nodemap->nodes[j].pnn == ctdb->pnn) {
516 continue;
519 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
520 mem_ctx, &remote_nodemap);
521 if (ret != 0) {
522 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
523 nodemap->nodes[j].pnn));
524 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
525 talloc_free(mem_ctx);
526 return -1;
528 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
529 /* We should tell our daemon about this so it
530 updates its flags or else we will log the same
531 message again in the next iteration of recovery.
532 Since we are the recovery master we can just as
533 well update the flags on all nodes.
535 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
536 if (ret != 0) {
537 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
538 return -1;
541 /* Update our local copy of the flags in the recovery
542 daemon.
544 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
545 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
546 nodemap->nodes[j].flags));
547 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
549 talloc_free(remote_nodemap);
551 talloc_free(mem_ctx);
552 return 0;
556 /* Create a new random generation id.
557 The generation id can not be the INVALID_GENERATION id
559 static uint32_t new_generation(void)
561 uint32_t generation;
563 while (1) {
564 generation = random();
566 if (generation != INVALID_GENERATION) {
567 break;
571 return generation;
574 static bool ctdb_recovery_have_lock(struct ctdb_recoverd *rec)
576 return (rec->recovery_lock_handle != NULL);
579 struct ctdb_recovery_lock_handle {
580 bool done;
581 bool locked;
582 double latency;
583 struct ctdb_cluster_mutex_handle *h;
584 struct ctdb_recoverd *rec;
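/* Callback from the cluster mutex helper.  The single-character status codes
 * handled below are: '0' = lock taken, '1' = contention, '2' = timeout; any
 * other value is treated as an unknown error and this node bans itself. */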
587 static void take_reclock_handler(char status,
588 double latency,
589 void *private_data)
591 struct ctdb_recovery_lock_handle *s =
592 (struct ctdb_recovery_lock_handle *) private_data;
594 s->locked = (status == '0') ;
597 * If unsuccessful then ensure the process has exited and that
598 * the file descriptor event handler has been cancelled
600 if (! s->locked) {
601 TALLOC_FREE(s->h);
604 switch (status) {
605 case '0':
606 s->latency = latency;
607 break;
609 case '1':
610 D_ERR("Unable to take recovery lock - contention\n");
611 break;
613 case '2':
614 D_ERR("Unable to take recovery lock - timeout\n");
615 break;
617 default:
618 D_ERR("Unable to take recovery lock - unknown error\n");
621 struct ctdb_recoverd *rec = s->rec;
622 struct ctdb_context *ctdb = rec->ctdb;
623 uint32_t pnn = ctdb_get_pnn(ctdb);
625 D_ERR("Banning this node\n");
626 ctdb_ban_node(rec,
627 pnn,
628 ctdb->tunable.recovery_ban_period);
632 s->done = true;
635 static void force_election(struct ctdb_recoverd *rec,
636 uint32_t pnn,
637 struct ctdb_node_map_old *nodemap);
639 static void lost_reclock_handler(void *private_data)
641 struct ctdb_recoverd *rec = talloc_get_type_abort(
642 private_data, struct ctdb_recoverd);
644 D_ERR("Recovery lock helper terminated, triggering an election\n");
645 TALLOC_FREE(rec->recovery_lock_handle);
647 force_election(rec, ctdb_get_pnn(rec->ctdb), rec->nodemap);
650 static bool ctdb_recovery_lock(struct ctdb_recoverd *rec)
652 struct ctdb_context *ctdb = rec->ctdb;
653 struct ctdb_cluster_mutex_handle *h;
654 struct ctdb_recovery_lock_handle *s;
656 s = talloc_zero(rec, struct ctdb_recovery_lock_handle);
657 if (s == NULL) {
658 DBG_ERR("Memory allocation error\n");
659 return false;
662 s->rec = rec;
664 h = ctdb_cluster_mutex(s,
665 ctdb,
666 ctdb->recovery_lock,
667 120,
668 take_reclock_handler,
669 s,
670 lost_reclock_handler,
671 rec);
672 if (h == NULL) {
673 talloc_free(s);
674 return false;
677 rec->recovery_lock_handle = s;
678 s->h = h;
680 while (! s->done) {
681 tevent_loop_once(ctdb->ev);
684 if (! s->locked) {
685 TALLOC_FREE(rec->recovery_lock_handle);
686 return false;
689 ctdb_ctrl_report_recd_lock_latency(ctdb,
690 CONTROL_TIMEOUT(),
691 s->latency);
693 return true;
696 static void ctdb_recovery_unlock(struct ctdb_recoverd *rec)
698 if (rec->recovery_lock_handle == NULL) {
699 return;
702 if (! rec->recovery_lock_handle->done) {
704 * Taking of recovery lock still in progress. Free
705 * the cluster mutex handle to release it but leave
706 * the recovery lock handle in place to allow taking
707 * of the lock to fail.
709 D_NOTICE("Cancelling recovery lock\n");
710 TALLOC_FREE(rec->recovery_lock_handle->h);
711 rec->recovery_lock_handle->done = true;
712 rec->recovery_lock_handle->locked = false;
713 return;
716 D_NOTICE("Releasing recovery lock\n");
717 TALLOC_FREE(rec->recovery_lock_handle);
720 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
722 struct ctdb_context *ctdb = rec->ctdb;
723 unsigned int i;
724 struct ctdb_banning_state *ban_state;
726 *self_ban = false;
727 for (i=0; i<ctdb->num_nodes; i++) {
728 if (ctdb->nodes[i]->ban_state == NULL) {
729 continue;
731 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
732 if (ban_state->count < 2*ctdb->num_nodes) {
733 continue;
736 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
737 ctdb->nodes[i]->pnn, ban_state->count,
738 ctdb->tunable.recovery_ban_period));
739 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
740 ban_state->count = 0;
742 /* Banning ourself? */
743 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
744 *self_ban = true;
749 struct helper_state {
750 int fd[2];
751 pid_t pid;
752 int result;
753 bool done;
756 static void helper_handler(struct tevent_context *ev,
757 struct tevent_fd *fde,
758 uint16_t flags, void *private_data)
760 struct helper_state *state = talloc_get_type_abort(
761 private_data, struct helper_state);
762 int ret;
764 ret = sys_read(state->fd[0], &state->result, sizeof(state->result));
765 if (ret != sizeof(state->result)) {
766 state->result = EPIPE;
769 state->done = true;
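/* Run an external helper (takeover or recovery).  The helper is passed the
 * write end of a pipe as its first argument and reports a single int result
 * back over that pipe.  helper_run() loops the event system until the result
 * arrives, or aborts the run if the recovery master changes meanwhile. */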
772 static int helper_run(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
773 const char *prog, const char *arg, const char *type)
775 struct helper_state *state;
776 struct tevent_fd *fde;
777 const char **args;
778 int nargs, ret;
779 uint32_t recmaster = rec->recmaster;
781 state = talloc_zero(mem_ctx, struct helper_state);
782 if (state == NULL) {
783 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
784 return -1;
787 state->pid = -1;
789 ret = pipe(state->fd);
790 if (ret != 0) {
791 DEBUG(DEBUG_ERR,
792 ("Failed to create pipe for %s helper\n", type));
793 goto fail;
796 set_close_on_exec(state->fd[0]);
798 nargs = 4;
799 args = talloc_array(state, const char *, nargs);
800 if (args == NULL) {
801 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
802 goto fail;
805 args[0] = talloc_asprintf(args, "%d", state->fd[1]);
806 if (args[0] == NULL) {
807 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
808 goto fail;
810 args[1] = rec->ctdb->daemon.name;
811 args[2] = arg;
812 args[3] = NULL;
814 if (args[2] == NULL) {
815 nargs = 3;
818 state->pid = ctdb_vfork_exec(state, rec->ctdb, prog, nargs, args);
819 if (state->pid == -1) {
820 DEBUG(DEBUG_ERR,
821 ("Failed to create child for %s helper\n", type));
822 goto fail;
825 close(state->fd[1]);
826 state->fd[1] = -1;
828 state->done = false;
830 fde = tevent_add_fd(rec->ctdb->ev, state, state->fd[0],
831 TEVENT_FD_READ, helper_handler, state);
832 if (fde == NULL) {
833 goto fail;
835 tevent_fd_set_auto_close(fde);
837 while (!state->done) {
838 tevent_loop_once(rec->ctdb->ev);
840 /* If recmaster changes, we have lost election */
841 if (recmaster != rec->recmaster) {
842 D_ERR("Recmaster changed to %u, aborting %s\n",
843 rec->recmaster, type);
844 state->result = 1;
845 break;
849 close(state->fd[0]);
850 state->fd[0] = -1;
852 if (state->result != 0) {
853 goto fail;
856 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
857 talloc_free(state);
858 return 0;
860 fail:
861 if (state->fd[0] != -1) {
862 close(state->fd[0]);
864 if (state->fd[1] != -1) {
865 close(state->fd[1]);
867 if (state->pid != -1) {
868 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
870 talloc_free(state);
871 return -1;
875 static int ctdb_takeover(struct ctdb_recoverd *rec,
876 uint32_t *force_rebalance_nodes)
878 static char prog[PATH_MAX+1] = "";
879 char *arg;
880 unsigned int i;
881 int ret;
883 if (!ctdb_set_helper("takeover_helper", prog, sizeof(prog),
884 "CTDB_TAKEOVER_HELPER", CTDB_HELPER_BINDIR,
885 "ctdb_takeover_helper")) {
886 ctdb_die(rec->ctdb, "Unable to set takeover helper\n");
889 arg = NULL;
890 for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
891 uint32_t pnn = force_rebalance_nodes[i];
892 if (arg == NULL) {
893 arg = talloc_asprintf(rec, "%u", pnn);
894 } else {
895 arg = talloc_asprintf_append(arg, ",%u", pnn);
897 if (arg == NULL) {
898 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
899 return -1;
903 if (ctdb_config.failover_disabled) {
904 ret = setenv("CTDB_DISABLE_IP_FAILOVER", "1", 1);
905 if (ret != 0) {
906 D_ERR("Failed to set CTDB_DISABLE_IP_FAILOVER variable\n");
907 return -1;
911 return helper_run(rec, rec, prog, arg, "takeover");
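/* Perform a takeover run: takeover runs are temporarily disabled on the other
 * connected nodes (for 60 seconds), the takeover helper is executed, and then
 * takeover runs are re-enabled.  On success any force-rebalance targets are
 * cleared; on failure rec->need_takeover_run stays set so the run is retried. */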
914 static bool do_takeover_run(struct ctdb_recoverd *rec,
915 struct ctdb_node_map_old *nodemap)
917 uint32_t *nodes = NULL;
918 struct ctdb_disable_message dtr;
919 TDB_DATA data;
920 size_t i;
921 uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
922 int ret;
923 bool ok;
925 DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
927 if (ctdb_op_is_in_progress(rec->takeover_run)) {
928 DEBUG(DEBUG_ERR, (__location__
929 " takeover run already in progress \n"));
930 ok = false;
931 goto done;
934 if (!ctdb_op_begin(rec->takeover_run)) {
935 ok = false;
936 goto done;
939 /* Disable IP checks (takeover runs, really) on other nodes
940 * while doing this takeover run. This will stop those other
941 nodes from triggering takeover runs when they think they should
942 * be hosting an IP but it isn't yet on an interface. Don't
943 * wait for replies since a failure here might cause some
944 * noise in the logs but will not actually cause a problem.
946 ZERO_STRUCT(dtr);
947 dtr.srvid = 0; /* No reply */
948 dtr.pnn = -1;
950 data.dptr = (uint8_t*)&dtr;
951 data.dsize = sizeof(dtr);
953 nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
955 /* Disable for 60 seconds. This can be a tunable later if
956 * necessary.
958 dtr.timeout = 60;
959 for (i = 0; i < talloc_array_length(nodes); i++) {
960 if (ctdb_client_send_message(rec->ctdb, nodes[i],
961 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
962 data) != 0) {
963 DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
967 ret = ctdb_takeover(rec, rec->force_rebalance_nodes);
969 /* Reenable takeover runs and IP checks on other nodes */
970 dtr.timeout = 0;
971 for (i = 0; i < talloc_array_length(nodes); i++) {
972 if (ctdb_client_send_message(rec->ctdb, nodes[i],
973 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
974 data) != 0) {
975 DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n"));
979 if (ret != 0) {
980 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
981 ok = false;
982 goto done;
985 ok = true;
986 /* Takeover run was successful so clear force rebalance targets */
987 if (rebalance_nodes == rec->force_rebalance_nodes) {
988 TALLOC_FREE(rec->force_rebalance_nodes);
989 } else {
990 DEBUG(DEBUG_WARNING,
991 ("Rebalance target nodes changed during takeover run - not clearing\n"));
993 done:
994 rec->need_takeover_run = !ok;
995 talloc_free(nodes);
996 ctdb_op_end(rec->takeover_run);
998 DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
999 return ok;
1002 static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
1004 static char prog[PATH_MAX+1] = "";
1005 const char *arg;
1007 if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog),
1008 "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
1009 "ctdb_recovery_helper")) {
1010 ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
1013 arg = talloc_asprintf(mem_ctx, "%u", new_generation());
1014 if (arg == NULL) {
1015 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1016 return -1;
1019 setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1);
1021 return helper_run(rec, mem_ctx, prog, arg, "recovery");
1025 we are the recmaster, and recovery is needed - start a recovery run
1027 static int do_recovery(struct ctdb_recoverd *rec,
1028 TALLOC_CTX *mem_ctx, uint32_t pnn,
1029 struct ctdb_node_map_old *nodemap, struct ctdb_vnn_map *vnnmap)
1031 struct ctdb_context *ctdb = rec->ctdb;
1032 unsigned int i;
1033 int ret;
1034 bool self_ban;
1036 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1038 /* Check if the current node is still the recmaster. It's possible that
1039 * re-election has changed the recmaster.
1041 if (pnn != rec->recmaster) {
1042 DEBUG(DEBUG_NOTICE,
1043 ("Recovery master changed to %u, aborting recovery\n",
1044 rec->recmaster));
1045 return -1;
1048 /* if recovery fails, force it again */
1049 rec->need_recovery = true;
1051 if (!ctdb_op_begin(rec->recovery)) {
1052 return -1;
1055 if (rec->election_timeout) {
1056 /* an election is in progress */
1057 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
1058 goto fail;
1061 ban_misbehaving_nodes(rec, &self_ban);
1062 if (self_ban) {
1063 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
1064 goto fail;
1067 if (ctdb->recovery_lock != NULL) {
1068 if (ctdb_recovery_have_lock(rec)) {
1069 D_NOTICE("Already holding recovery lock\n");
1070 } else {
1071 bool ok;
1073 D_NOTICE("Attempting to take recovery lock (%s)\n",
1074 ctdb->recovery_lock);
1076 ok = ctdb_recovery_lock(rec);
1077 if (! ok) {
1078 D_ERR("Unable to take recovery lock\n");
1080 if (pnn != rec->recmaster) {
1081 D_NOTICE("Recovery master changed to %u,"
1082 " aborting recovery\n",
1083 rec->recmaster);
1084 rec->need_recovery = false;
1085 goto fail;
1088 if (ctdb->runstate ==
1089 CTDB_RUNSTATE_FIRST_RECOVERY) {
1091 * First recovery? Perhaps
1092 * current node does not yet
1093 * know who the recmaster is.
1095 D_ERR("Retrying recovery\n");
1096 goto fail;
1099 D_ERR("Abort recovery, "
1100 "ban this node for %u seconds\n",
1101 ctdb->tunable.recovery_ban_period);
1102 ctdb_ban_node(rec,
1103 pnn,
1104 ctdb->tunable.recovery_ban_period);
1105 goto fail;
1107 D_NOTICE("Recovery lock taken successfully\n");
1111 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1113 /* Retrieve capabilities from all connected nodes */
1114 ret = update_capabilities(rec, nodemap);
1115 if (ret!=0) {
1116 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1117 return -1;
1121 update all nodes to have the same flags that we have
1123 for (i=0;i<nodemap->num;i++) {
1124 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1125 continue;
1128 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1129 if (ret != 0) {
1130 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1131 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
1132 } else {
1133 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1134 return -1;
1139 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1141 ret = db_recovery_parallel(rec, mem_ctx);
1142 if (ret != 0) {
1143 goto fail;
1146 do_takeover_run(rec, nodemap);
1148 /* send a message to all clients telling them that the cluster
1149 has been reconfigured */
1150 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
1151 CTDB_SRVID_RECONFIGURE, tdb_null);
1152 if (ret != 0) {
1153 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
1154 goto fail;
1157 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1159 rec->need_recovery = false;
1160 ctdb_op_end(rec->recovery);
1162 /* we managed to complete a full recovery, make sure to forgive
1163 any past sins by the nodes that could now participate in the
1164 recovery.
1166 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1167 for (i=0;i<nodemap->num;i++) {
1168 struct ctdb_banning_state *ban_state;
1170 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1171 continue;
1174 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1175 if (ban_state == NULL) {
1176 continue;
1179 ban_state->count = 0;
1182 /* We just finished a recovery successfully.
1183 We now wait for rerecovery_timeout before we allow
1184 another recovery to take place.
1186 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
1187 ctdb_op_disable(rec->recovery, ctdb->ev,
1188 ctdb->tunable.rerecovery_timeout);
1189 return 0;
1191 fail:
1192 ctdb_op_end(rec->recovery);
1193 return -1;
1198 elections are won by first checking the number of connected nodes, then
1199 the priority time, then the pnn
1201 struct election_message {
1202 uint32_t num_connected;
1203 struct timeval priority_time;
1204 uint32_t pnn;
1205 uint32_t node_flags;
1209 form this nodes election data
1211 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1213 unsigned int i;
1214 int ret;
1215 struct ctdb_node_map_old *nodemap;
1216 struct ctdb_context *ctdb = rec->ctdb;
1218 ZERO_STRUCTP(em);
1220 em->pnn = rec->ctdb->pnn;
1221 em->priority_time = rec->priority_time;
1223 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1224 if (ret != 0) {
1225 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
1226 return;
1229 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1230 em->node_flags = rec->node_flags;
1232 for (i=0;i<nodemap->num;i++) {
1233 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1234 em->num_connected++;
1238 /* we shouldn't try to win this election if we can't be a recmaster */
1239 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1240 em->num_connected = 0;
1241 em->priority_time = timeval_current();
1244 talloc_free(nodemap);
1248 see if the given election data wins
1250 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1252 struct election_message myem;
1253 int cmp = 0;
1255 ctdb_election_data(rec, &myem);
1257 /* we can't win if we don't have the recmaster capability */
1258 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1259 return false;
1262 /* we can't win if we are banned */
1263 if (rec->node_flags & NODE_FLAGS_BANNED) {
1264 return false;
1267 /* we can't win if we are stopped */
1268 if (rec->node_flags & NODE_FLAGS_STOPPED) {
1269 return false;
1272 /* we will automatically win if the other node is banned */
1273 if (em->node_flags & NODE_FLAGS_BANNED) {
1274 return true;
1277 /* we will automatically win if the other node is stopped */
1278 if (em->node_flags & NODE_FLAGS_STOPPED) {
1279 return true;
1282 /* then the longest running node */
1283 if (cmp == 0) {
1284 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
1287 if (cmp == 0) {
1288 cmp = (int)myem.pnn - (int)em->pnn;
1291 return cmp > 0;
1295 send out an election request
1297 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
1299 int ret;
1300 TDB_DATA election_data;
1301 struct election_message emsg;
1302 uint64_t srvid;
1303 struct ctdb_context *ctdb = rec->ctdb;
1305 srvid = CTDB_SRVID_ELECTION;
1307 ctdb_election_data(rec, &emsg);
1309 election_data.dsize = sizeof(struct election_message);
1310 election_data.dptr = (unsigned char *)&emsg;
1313 /* first we assume we will win the election and set
1314 recoverymaster to be ourself on the current node
1316 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1317 CTDB_CURRENT_NODE, pnn);
1318 if (ret != 0) {
1319 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster\n"));
1320 return -1;
1322 rec->recmaster = pnn;
1324 /* send an election message to all active nodes */
1325 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1326 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1330 we think we are winning the election - send a broadcast election request
1332 static void election_send_request(struct tevent_context *ev,
1333 struct tevent_timer *te,
1334 struct timeval t, void *p)
1336 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1337 int ret;
1339 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
1340 if (ret != 0) {
1341 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
1344 TALLOC_FREE(rec->send_election_te);
1348 handler for memory dumps
1350 static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1352 struct ctdb_recoverd *rec = talloc_get_type(
1353 private_data, struct ctdb_recoverd);
1354 struct ctdb_context *ctdb = rec->ctdb;
1355 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1356 TDB_DATA *dump;
1357 int ret;
1358 struct ctdb_srvid_message *rd;
1360 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1361 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1362 talloc_free(tmp_ctx);
1363 return;
1365 rd = (struct ctdb_srvid_message *)data.dptr;
1367 dump = talloc_zero(tmp_ctx, TDB_DATA);
1368 if (dump == NULL) {
1369 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1370 talloc_free(tmp_ctx);
1371 return;
1373 ret = ctdb_dump_memory(ctdb, dump);
1374 if (ret != 0) {
1375 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1376 talloc_free(tmp_ctx);
1377 return;
1380 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
1382 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1383 if (ret != 0) {
1384 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1385 talloc_free(tmp_ctx);
1386 return;
1389 talloc_free(tmp_ctx);
1393 handler for reload_nodes
1395 static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
1396 void *private_data)
1398 struct ctdb_recoverd *rec = talloc_get_type(
1399 private_data, struct ctdb_recoverd);
1401 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1403 ctdb_load_nodes_file(rec->ctdb);
1407 static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
1408 void *private_data)
1410 struct ctdb_recoverd *rec = talloc_get_type(
1411 private_data, struct ctdb_recoverd);
1412 struct ctdb_context *ctdb = rec->ctdb;
1413 uint32_t pnn;
1414 uint32_t *t;
1415 int len;
1417 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
1418 return;
1421 if (data.dsize != sizeof(uint32_t)) {
1422 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
1423 return;
1426 pnn = *(uint32_t *)&data.dptr[0];
1428 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
1430 /* Copy any existing list of nodes. There's probably some
1431 * sort of realloc variant that will do this but we need to
1432 * make sure that freeing the old array also cancels the timer
1433 * event for the timeout... not sure if realloc will do that.
1435 len = (rec->force_rebalance_nodes != NULL) ?
1436 talloc_array_length(rec->force_rebalance_nodes) :
1437 0;
1439 /* This allows duplicates to be added but they don't cause
1440 * harm. A call to add a duplicate PNN arguably means that
1441 * the timeout should be reset, so this is the simplest
1442 * solution.
1444 t = talloc_zero_array(rec, uint32_t, len+1);
1445 CTDB_NO_MEMORY_VOID(ctdb, t);
1446 if (len > 0) {
1447 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
1449 t[len] = pnn;
1451 talloc_free(rec->force_rebalance_nodes);
1453 rec->force_rebalance_nodes = t;
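/* Common handler for the "disable takeover runs" and "disable recoveries"
 * SRVID messages: validate the ctdb_disable_message, disable the operation
 * for the requested timeout and reply with this node's PNN on success or a
 * negative error code on failure. */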
1458 static void srvid_disable_and_reply(struct ctdb_context *ctdb,
1459 TDB_DATA data,
1460 struct ctdb_op_state *op_state)
1462 struct ctdb_disable_message *r;
1463 uint32_t timeout;
1464 TDB_DATA result;
1465 int32_t ret = 0;
1467 /* Validate input data */
1468 if (data.dsize != sizeof(struct ctdb_disable_message)) {
1469 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1470 "expecting %lu\n", (long unsigned)data.dsize,
1471 (long unsigned)sizeof(struct ctdb_disable_message)));
1472 return;
1474 if (data.dptr == NULL) {
1475 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1476 return;
1479 r = (struct ctdb_disable_message *)data.dptr;
1480 timeout = r->timeout;
1482 ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
1483 if (ret != 0) {
1484 goto done;
1487 /* Returning our PNN tells the caller that we succeeded */
1488 ret = ctdb_get_pnn(ctdb);
1489 done:
1490 result.dsize = sizeof(int32_t);
1491 result.dptr = (uint8_t *)&ret;
1492 srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
1495 static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
1496 void *private_data)
1498 struct ctdb_recoverd *rec = talloc_get_type(
1499 private_data, struct ctdb_recoverd);
1501 srvid_disable_and_reply(rec->ctdb, data, rec->takeover_run);
1504 /* Backward compatibility for this SRVID */
1505 static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
1506 void *private_data)
1508 struct ctdb_recoverd *rec = talloc_get_type(
1509 private_data, struct ctdb_recoverd);
1510 uint32_t timeout;
1512 if (data.dsize != sizeof(uint32_t)) {
1513 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1514 "expecting %lu\n", (long unsigned)data.dsize,
1515 (long unsigned)sizeof(uint32_t)));
1516 return;
1518 if (data.dptr == NULL) {
1519 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1520 return;
1523 timeout = *((uint32_t *)data.dptr);
1525 ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout);
1528 static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
1529 void *private_data)
1531 struct ctdb_recoverd *rec = talloc_get_type(
1532 private_data, struct ctdb_recoverd);
1534 srvid_disable_and_reply(rec->ctdb, data, rec->recovery);
1538 handler for ip reallocate, just add it to the list of requests and
1539 handle this later in the monitor_cluster loop so we do not recurse
1540 with other requests to takeover_run()
1542 static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
1543 void *private_data)
1545 struct ctdb_srvid_message *request;
1546 struct ctdb_recoverd *rec = talloc_get_type(
1547 private_data, struct ctdb_recoverd);
1549 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1550 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1551 return;
1554 request = (struct ctdb_srvid_message *)data.dptr;
1556 srvid_request_add(rec->ctdb, &rec->reallocate_requests, request);
1559 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
1560 struct ctdb_recoverd *rec)
1562 TDB_DATA result;
1563 int32_t ret;
1564 struct srvid_requests *current;
1566 /* Only process requests that are currently pending. More
1567 * might come in while the takeover run is in progress and
1568 * they will need to be processed later since they might
1569 be in response to flag changes.
1571 current = rec->reallocate_requests;
1572 rec->reallocate_requests = NULL;
1574 if (do_takeover_run(rec, rec->nodemap)) {
1575 ret = ctdb_get_pnn(ctdb);
1576 } else {
1577 ret = -1;
1580 result.dsize = sizeof(int32_t);
1581 result.dptr = (uint8_t *)&ret;
1583 srvid_requests_reply(ctdb, &current, result);
1587 * handler for assigning banning credits
1589 static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1591 struct ctdb_recoverd *rec = talloc_get_type(
1592 private_data, struct ctdb_recoverd);
1593 uint32_t ban_pnn;
1595 /* Ignore if we are not recmaster */
1596 if (rec->ctdb->pnn != rec->recmaster) {
1597 return;
1600 if (data.dsize != sizeof(uint32_t)) {
1601 DEBUG(DEBUG_ERR, (__location__ "invalid data size %zu\n",
1602 data.dsize));
1603 return;
1606 ban_pnn = *(uint32_t *)data.dptr;
1608 ctdb_set_culprit_count(rec, ban_pnn, rec->nodemap->num);
1612 handler for recovery master elections
1614 static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1616 struct ctdb_recoverd *rec = talloc_get_type(
1617 private_data, struct ctdb_recoverd);
1618 struct ctdb_context *ctdb = rec->ctdb;
1619 int ret;
1620 struct election_message *em = (struct election_message *)data.dptr;
1622 /* Ignore election packets from ourself */
1623 if (ctdb->pnn == em->pnn) {
1624 return;
1627 /* we got an election packet - update the timeout for the election */
1628 talloc_free(rec->election_timeout);
1629 rec->election_timeout = tevent_add_timer(
1630 ctdb->ev, ctdb,
1631 fast_start ?
1632 timeval_current_ofs(0, 500000) :
1633 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1634 ctdb_election_timeout, rec);
1636 /* someone called an election. check their election data
1637 and if we disagree and we would rather be the elected node,
1638 send a new election message to all other nodes
1640 if (ctdb_election_win(rec, em)) {
1641 if (!rec->send_election_te) {
1642 rec->send_election_te = tevent_add_timer(
1643 ctdb->ev, rec,
1644 timeval_current_ofs(0, 500000),
1645 election_send_request, rec);
1647 return;
1650 /* we didn't win */
1651 TALLOC_FREE(rec->send_election_te);
1653 /* Release the recovery lock file */
1654 if (ctdb_recovery_have_lock(rec)) {
1655 ctdb_recovery_unlock(rec);
1658 /* ok, let that guy become recmaster then */
1659 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1660 CTDB_CURRENT_NODE, em->pnn);
1661 if (ret != 0) {
1662 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster"));
1663 return;
1665 rec->recmaster = em->pnn;
1667 return;
1672 force the start of the election process
1674 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
1675 struct ctdb_node_map_old *nodemap)
1677 int ret;
1678 struct ctdb_context *ctdb = rec->ctdb;
1680 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
1682 /* set all nodes to recovery mode to stop all internode traffic */
1683 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1684 if (ret != 0) {
1685 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1686 return;
1689 talloc_free(rec->election_timeout);
1690 rec->election_timeout = tevent_add_timer(
1691 ctdb->ev, ctdb,
1692 fast_start ?
1693 timeval_current_ofs(0, 500000) :
1694 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1695 ctdb_election_timeout, rec);
1697 ret = send_election_request(rec, pnn);
1698 if (ret!=0) {
1699 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
1700 return;
1703 /* wait for a few seconds to collect all responses */
1704 ctdb_wait_election(rec);
1710 handler for when a node changes its flags
1712 static void monitor_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1714 struct ctdb_recoverd *rec = talloc_get_type(
1715 private_data, struct ctdb_recoverd);
1716 struct ctdb_context *ctdb = rec->ctdb;
1717 int ret;
1718 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1719 struct ctdb_node_map_old *nodemap=NULL;
1720 TALLOC_CTX *tmp_ctx;
1721 unsigned int i;
1723 if (data.dsize != sizeof(*c)) {
1724 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
1725 return;
1728 tmp_ctx = talloc_new(ctdb);
1729 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
1731 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1732 if (ret != 0) {
1733 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
1734 talloc_free(tmp_ctx);
1735 return;
1739 for (i=0;i<nodemap->num;i++) {
1740 if (nodemap->nodes[i].pnn == c->pnn) break;
1743 if (i == nodemap->num) {
1744 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existent node %u\n", c->pnn));
1745 talloc_free(tmp_ctx);
1746 return;
1749 if (c->old_flags != c->new_flags) {
1750 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
1753 nodemap->nodes[i].flags = c->new_flags;
1755 talloc_free(tmp_ctx);
1759 handler for when we need to push out flag changes to all other nodes
1761 static void push_flags_handler(uint64_t srvid, TDB_DATA data,
1762 void *private_data)
1764 struct ctdb_recoverd *rec = talloc_get_type(
1765 private_data, struct ctdb_recoverd);
1766 struct ctdb_context *ctdb = rec->ctdb;
1767 int ret;
1768 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1769 struct ctdb_node_map_old *nodemap=NULL;
1770 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1771 uint32_t *nodes;
1773 /* read the node flags from the recmaster */
1774 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
1775 tmp_ctx, &nodemap);
1776 if (ret != 0) {
1777 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
1778 talloc_free(tmp_ctx);
1779 return;
1781 if (c->pnn >= nodemap->num) {
1782 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
1783 talloc_free(tmp_ctx);
1784 return;
1787 /* send the flags update to all connected nodes */
1788 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
1790 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
1791 nodes, 0, CONTROL_TIMEOUT(),
1792 false, data,
1793 NULL, NULL,
1794 NULL) != 0) {
1795 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
1797 talloc_free(tmp_ctx);
1798 return;
1801 talloc_free(tmp_ctx);
1805 struct verify_recmode_normal_data {
1806 uint32_t count;
1807 enum monitor_result status;
1810 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
1812 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
1815 /* one more node has responded with recmode data*/
1816 rmdata->count--;
1818 /* if we failed to get the recmode, then return an error and let
1819 the main loop try again.
1821 if (state->state != CTDB_CONTROL_DONE) {
1822 if (rmdata->status == MONITOR_OK) {
1823 rmdata->status = MONITOR_FAILED;
1825 return;
1828 /* if we got a response, then the recmode will be stored in the
1829 status field
1831 if (state->status != CTDB_RECOVERY_NORMAL) {
1832 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
1833 rmdata->status = MONITOR_RECOVERY_NEEDED;
1836 return;
1840 /* verify that all nodes are in normal recovery mode */
1841 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
1843 struct verify_recmode_normal_data *rmdata;
1844 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1845 struct ctdb_client_control_state *state;
1846 enum monitor_result status;
1847 unsigned int j;
1849 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
1850 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
1851 rmdata->count = 0;
1852 rmdata->status = MONITOR_OK;
1854 /* loop over all active nodes and send an async getrecmode call to
1855 them*/
1856 for (j=0; j<nodemap->num; j++) {
1857 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1858 continue;
1860 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
1861 CONTROL_TIMEOUT(),
1862 nodemap->nodes[j].pnn);
1863 if (state == NULL) {
1864 /* we failed to send the control, treat this as
1865 an error and try again next iteration
1867 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
1868 talloc_free(mem_ctx);
1869 return MONITOR_FAILED;
1872 /* set up the callback functions */
1873 state->async.fn = verify_recmode_normal_callback;
1874 state->async.private_data = rmdata;
1876 /* one more control to wait for to complete */
1877 rmdata->count++;
1881 /* now wait for up to the maximum number of seconds allowed
1882 or until all nodes we expect a response from have replied
1884 while (rmdata->count > 0) {
1885 tevent_loop_once(ctdb->ev);
1888 status = rmdata->status;
1889 talloc_free(mem_ctx);
1890 return status;
1894 struct verify_recmaster_data {
1895 struct ctdb_recoverd *rec;
1896 uint32_t count;
1897 uint32_t pnn;
1898 enum monitor_result status;
1901 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
1903 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
1906 /* one more node has responded with recmaster data*/
1907 rmdata->count--;
1909 /* if we failed to get the recmaster, then return an error and let
1910 the main loop try again.
1912 if (state->state != CTDB_CONTROL_DONE) {
1913 if (rmdata->status == MONITOR_OK) {
1914 rmdata->status = MONITOR_FAILED;
1916 return;
1919 /* if we got a response, then the recmaster will be stored in the
1920 status field
1922 if ((uint32_t)state->status != rmdata->pnn) {
1923 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
1924 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
1925 rmdata->status = MONITOR_ELECTION_NEEDED;
1928 return;
1932 /* verify that all nodes agree that we are the recmaster */
1933 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, uint32_t pnn)
1935 struct ctdb_context *ctdb = rec->ctdb;
1936 struct verify_recmaster_data *rmdata;
1937 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1938 struct ctdb_client_control_state *state;
1939 enum monitor_result status;
1940 unsigned int j;
1942 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
1943 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
1944 rmdata->rec = rec;
1945 rmdata->count = 0;
1946 rmdata->pnn = pnn;
1947 rmdata->status = MONITOR_OK;
1949 /* loop over all active nodes and send an async getrecmaster call to
1950 them*/
1951 for (j=0; j<nodemap->num; j++) {
1952 if (nodemap->nodes[j].pnn == rec->recmaster) {
1953 continue;
1955 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1956 continue;
1958 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
1959 CONTROL_TIMEOUT(),
1960 nodemap->nodes[j].pnn);
1961 if (state == NULL) {
1962 /* we failed to send the control, treat this as
1963 an error and try again next iteration
1965 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
1966 talloc_free(mem_ctx);
1967 return MONITOR_FAILED;
1970 /* set up the callback functions */
1971 state->async.fn = verify_recmaster_callback;
1972 state->async.private_data = rmdata;
1974 /* one more control to wait for to complete */
1975 rmdata->count++;
1979 /* now wait for up to the maximum number of seconds allowed
1980 or until all nodes we expect a response from have replied
1982 while (rmdata->count > 0) {
1983 tevent_loop_once(ctdb->ev);
1986 status = rmdata->status;
1987 talloc_free(mem_ctx);
1988 return status;
1991 static bool interfaces_have_changed(struct ctdb_context *ctdb,
1992 struct ctdb_recoverd *rec)
1994 struct ctdb_iface_list_old *ifaces = NULL;
1995 TALLOC_CTX *mem_ctx;
1996 bool ret = false;
1998 mem_ctx = talloc_new(NULL);
2000 /* Read the interfaces from the local node */
2001 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
2002 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
2003 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
2004 /* We could return an error. However, this will be
2005 * rare so we'll decide that the interfaces have
2006 * actually changed, just in case.
2008 talloc_free(mem_ctx);
2009 return true;
2012 if (!rec->ifaces) {
2013 /* We haven't been here before so things have changed */
2014 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
2015 ret = true;
2016 } else if (rec->ifaces->num != ifaces->num) {
2017 /* Number of interfaces has changed */
2018 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
2019 rec->ifaces->num, ifaces->num));
2020 ret = true;
2021 } else {
2022 /* See if interface names or link states have changed */
2023 unsigned int i;
2024 for (i = 0; i < rec->ifaces->num; i++) {
2025 struct ctdb_iface * iface = &rec->ifaces->ifaces[i];
2026 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
2027 DEBUG(DEBUG_NOTICE,
2028 ("Interface in slot %d changed: %s => %s\n",
2029 i, iface->name, ifaces->ifaces[i].name));
2030 ret = true;
2031 break;
2033 if (iface->link_state != ifaces->ifaces[i].link_state) {
2034 DEBUG(DEBUG_NOTICE,
2035 ("Interface %s changed state: %d => %d\n",
2036 iface->name, iface->link_state,
2037 ifaces->ifaces[i].link_state));
2038 ret = true;
2039 break;
2044 talloc_free(rec->ifaces);
2045 rec->ifaces = talloc_steal(rec, ifaces);
2047 talloc_free(mem_ctx);
2048 return ret;
2051 /* Check that the local allocation of public IP addresses is correct
2052 * and do some house-keeping */
2053 static int verify_local_ip_allocation(struct ctdb_context *ctdb,
2054 struct ctdb_recoverd *rec,
2055 uint32_t pnn,
2056 struct ctdb_node_map_old *nodemap)
2058 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2059 unsigned int j;
2060 int ret;
2061 bool need_takeover_run = false;
2062 struct ctdb_public_ip_list_old *ips = NULL;
2064 /* If we are not the recmaster then do some housekeeping */
2065 if (rec->recmaster != pnn) {
2066 /* Ignore any IP reallocate requests - only recmaster
2067 * processes them
2069 TALLOC_FREE(rec->reallocate_requests);
2070 /* Clear any nodes that should be force rebalanced in
2071 * the next takeover run. If the recovery master role
2072 * has moved then we don't want to process these some
2073 * time in the future.
2075 TALLOC_FREE(rec->force_rebalance_nodes);
2078 /* Return early if disabled... */
2079 if (ctdb_config.failover_disabled ||
2080 ctdb_op_is_disabled(rec->takeover_run)) {
2081 return 0;
2084 if (interfaces_have_changed(ctdb, rec)) {
2085 need_takeover_run = true;
2088 /* If there are unhosted IPs but this node can host them then
2089 * trigger an IP reallocation */
2091 /* Read *available* IPs from local node */
2092 ret = ctdb_ctrl_get_public_ips_flags(
2093 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx,
2094 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
2095 if (ret != 0) {
2096 DEBUG(DEBUG_ERR, ("Unable to retrieve available public IPs\n"));
2097 talloc_free(mem_ctx);
2098 return -1;
2101 for (j=0; j<ips->num; j++) {
2102 if (ips->ips[j].pnn == CTDB_UNKNOWN_PNN &&
2103 nodemap->nodes[pnn].flags == 0) {
2104 DEBUG(DEBUG_WARNING,
2105 ("Unassigned IP %s can be served by this node\n",
2106 ctdb_addr_to_str(&ips->ips[j].addr)));
2107 need_takeover_run = true;
2111 talloc_free(ips);
2113 if (!ctdb->do_checkpublicip) {
2114 goto done;
2117 /* Validate the IP addresses that this node has on network
2118 * interfaces. If there is an inconsistency between reality
2119 * and the state expected by CTDB then try to fix it by
2120 * triggering an IP reallocation or releasing extraneous IP
2121 * addresses. */
2123 /* Read *known* IPs from local node */
2124 ret = ctdb_ctrl_get_public_ips_flags(
2125 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
2126 if (ret != 0) {
2127 DEBUG(DEBUG_ERR, ("Unable to retrieve known public IPs\n"));
2128 talloc_free(mem_ctx);
2129 return -1;
2132 for (j=0; j<ips->num; j++) {
2133 if (ips->ips[j].pnn == pnn) {
2134 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2135 DEBUG(DEBUG_ERR,
2136 ("Assigned IP %s not on an interface\n",
2137 ctdb_addr_to_str(&ips->ips[j].addr)));
2138 need_takeover_run = true;
2140 } else {
2141 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2142 DEBUG(DEBUG_ERR,
2143 ("IP %s incorrectly on an interface\n",
2144 ctdb_addr_to_str(&ips->ips[j].addr)));
2145 need_takeover_run = true;
2150 done:
2151 if (need_takeover_run) {
2152 struct ctdb_srvid_message rd;
2153 TDB_DATA data;
2155 DEBUG(DEBUG_NOTICE,("Trigger takeoverrun\n"));
2157 ZERO_STRUCT(rd);
2158 rd.pnn = ctdb->pnn;
2159 rd.srvid = 0;
2160 data.dptr = (uint8_t *)&rd;
2161 data.dsize = sizeof(rd);
2163 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2164 if (ret != 0) {
2165 DEBUG(DEBUG_ERR,
2166 ("Failed to send takeover run request\n"));
2169 talloc_free(mem_ctx);
2170 return 0;
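/*
 * Illustrative sketch of the message used in the "done:" block above to ask
 * the recovery master for a takeover run.  request_takeover_run() is a
 * placeholder wrapper; the types and calls it uses all appear in this file.
 */
#if 0
static int request_takeover_run(struct ctdb_context *ctdb, uint32_t recmaster)
{
	struct ctdb_srvid_message rd;
	TDB_DATA data;

	ZERO_STRUCT(rd);
	rd.pnn = ctdb->pnn;	/* tell the recmaster who is asking */
	rd.srvid = 0;

	data.dptr = (uint8_t *)&rd;
	data.dsize = sizeof(rd);

	return ctdb_client_send_message(ctdb, recmaster,
					CTDB_SRVID_TAKEOVER_RUN, data);
}
#endif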
2174 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2176 struct ctdb_node_map_old **remote_nodemaps = callback_data;
2178 if (node_pnn >= ctdb->num_nodes) {
2179 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2180 return;
2183 remote_nodemaps[node_pnn] = (struct ctdb_node_map_old *)talloc_steal(remote_nodemaps, outdata.dptr);
2187 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2188 struct ctdb_node_map_old *nodemap,
2189 struct ctdb_node_map_old **remote_nodemaps)
2191 uint32_t *nodes;
2193 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2194 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2195 nodes, 0,
2196 CONTROL_TIMEOUT(), false, tdb_null,
2197 async_getnodemap_callback,
2198 NULL,
2199 remote_nodemaps) != 0) {
2200 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
2202 return -1;
2205 return 0;
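/*
 * Illustrative sketch of how a caller drives get_remote_nodemaps(): allocate
 * one slot per node, leave each slot NULL, and let the async callback fill in
 * the replies that arrive.  collect_remote_nodemaps() is a placeholder
 * wrapper; main_loop() below performs the same steps inline.
 */
#if 0
static struct ctdb_node_map_old **collect_remote_nodemaps(
	struct ctdb_context *ctdb,
	TALLOC_CTX *mem_ctx,
	struct ctdb_node_map_old *nodemap)
{
	struct ctdb_node_map_old **maps;
	unsigned int i;

	maps = talloc_array(mem_ctx, struct ctdb_node_map_old *, nodemap->num);
	if (maps == NULL) {
		return NULL;
	}
	for (i = 0; i < nodemap->num; i++) {
		maps[i] = NULL;	/* stays NULL for nodes that do not reply */
	}
	if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, maps) != 0) {
		return NULL;	/* caller treats this as "retry later" */
	}
	return maps;
}
#endif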
2208 static bool validate_recovery_master(struct ctdb_recoverd *rec,
2209 TALLOC_CTX *mem_ctx)
2211 struct ctdb_context *ctdb = rec->ctdb;
2212 uint32_t pnn = ctdb_get_pnn(ctdb);
2213 struct ctdb_node_map_old *nodemap = rec->nodemap;
2214 struct ctdb_node_map_old *recmaster_nodemap = NULL;
2215 int ret;
2217 /* When the recovery daemon is started, recmaster is set to
2218 * "unknown" so it knows to start an election.
2220 if (rec->recmaster == CTDB_UNKNOWN_PNN) {
2221 DEBUG(DEBUG_NOTICE,
2222 ("Initial recovery master set - forcing election\n"));
2223 force_election(rec, pnn, nodemap);
2224 return false;
2228 * If the current recmaster does not have CTDB_CAP_RECMASTER,
2229 * but we have, then force an election and try to become the new
2230 * recmaster.
2232 if (!ctdb_node_has_capabilities(rec->caps,
2233 rec->recmaster,
2234 CTDB_CAP_RECMASTER) &&
2235 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
2236 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
2237 DEBUG(DEBUG_ERR,
2238 (" Current recmaster node %u does not have CAP_RECMASTER,"
2239 " but we (node %u) have - force an election\n",
2240 rec->recmaster, pnn));
2241 force_election(rec, pnn, nodemap);
2242 return false;
2245 /* Verify that the master node has not been deleted. This
2246 * should not happen because a node should always be shut down
2247 * before being deleted, causing a new master to be elected
2248 * before now. However, if something strange has happened
2249 * then checking here will ensure we don't index beyond the
2250 * end of the nodemap array. */
2251 if (rec->recmaster >= nodemap->num) {
2252 DEBUG(DEBUG_ERR,
2253 ("Recmaster node %u has been deleted. Force election\n",
2254 rec->recmaster));
2255 force_election(rec, pnn, nodemap);
2256 return false;
2259 /* if recovery master is disconnected/deleted we must elect a new recmaster */
2260 if (nodemap->nodes[rec->recmaster].flags &
2261 (NODE_FLAGS_DISCONNECTED|NODE_FLAGS_DELETED)) {
2262 DEBUG(DEBUG_NOTICE,
2263 ("Recmaster node %u is disconnected/deleted. Force election\n",
2264 rec->recmaster));
2265 force_election(rec, pnn, nodemap);
2266 return false;
2269 /* get nodemap from the recovery master to check if it is inactive */
2270 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2271 mem_ctx, &recmaster_nodemap);
2272 if (ret != 0) {
2273 DEBUG(DEBUG_ERR,
2274 (__location__
2275 " Unable to get nodemap from recovery master %u\n",
2276 rec->recmaster));
2277 /* No election, just error */
2278 return false;
2282 if ((recmaster_nodemap->nodes[rec->recmaster].flags & NODE_FLAGS_INACTIVE) &&
2283 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
2284 DEBUG(DEBUG_NOTICE,
2285 ("Recmaster node %u is inactive. Force election\n",
2286 rec->recmaster));
2288 * update our nodemap to carry the recmaster's notion of
2289 * its own flags, so that we don't keep freezing the
2290 * inactive recmaster node...
2292 nodemap->nodes[rec->recmaster].flags =
2293 recmaster_nodemap->nodes[rec->recmaster].flags;
2294 force_election(rec, pnn, nodemap);
2295 return false;
2298 return true;
2301 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
2302 TALLOC_CTX *mem_ctx)
2304 uint32_t pnn;
2305 struct ctdb_node_map_old *nodemap=NULL;
2306 struct ctdb_node_map_old **remote_nodemaps=NULL;
2307 struct ctdb_vnn_map *vnnmap=NULL;
2308 struct ctdb_vnn_map *remote_vnnmap=NULL;
2309 uint32_t num_lmasters;
2310 int32_t debug_level;
2311 unsigned int i, j;
2312 int ret;
2313 bool self_ban;
2316 /* verify that the main daemon is still running */
2317 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
2318 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2319 exit(-1);
2322 /* ping the local daemon to tell it we are alive */
2323 ctdb_ctrl_recd_ping(ctdb);
2325 if (rec->election_timeout) {
2326 /* an election is in progress */
2327 return;
2330 /* read the debug level from the parent and update locally */
2331 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2332 if (ret !=0) {
2333 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2334 return;
2336 debuglevel_set(debug_level);
2338 /* get relevant tunables */
2339 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2340 if (ret != 0) {
2341 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2342 return;
2345 /* get runstate */
2346 ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
2347 CTDB_CURRENT_NODE, &ctdb->runstate);
2348 if (ret != 0) {
2349 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
2350 return;
2353 pnn = ctdb_get_pnn(ctdb);
2355 /* get nodemap */
2356 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &nodemap);
2357 if (ret != 0) {
2358 DBG_ERR("Unable to get nodemap from node %"PRIu32"\n", pnn);
2359 return;
2361 talloc_free(rec->nodemap);
2362 rec->nodemap = nodemap;
2364 /* remember our own node flags */
2365 rec->node_flags = nodemap->nodes[pnn].flags;
2367 ban_misbehaving_nodes(rec, &self_ban);
2368 if (self_ban) {
2369 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
2370 return;
2373 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2374 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2375 if (ret != 0) {
2376 D_ERR("Failed to read recmode from local node\n");
2377 return;
2380 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
2381 also frozen and that the recmode is set to active.
2383 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
2384 /* If this node has become inactive then we want to
2385 * reduce the chances of it taking over the recovery
2386 * master role when it becomes active again. This
2387 * helps to stabilise the recovery master role so that
2388 * it stays on the most stable node.
2390 rec->priority_time = timeval_current();
2392 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2393 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
2395 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2396 if (ret != 0) {
2397 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
2399 return;
2402 if (! rec->frozen_on_inactive) {
2403 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(),
2404 CTDB_CURRENT_NODE);
2405 if (ret != 0) {
2406 DEBUG(DEBUG_ERR,
2407 (__location__ " Failed to freeze node "
2408 "in STOPPED or BANNED state\n"));
2409 return;
2412 rec->frozen_on_inactive = true;
2415 /* If this node is stopped or banned then it is not the recovery
2416 * master, so don't do anything. This prevents a stopped or banned
2417 * node from starting an election and sending unnecessary controls.
2419 return;
2422 rec->frozen_on_inactive = false;
2424 /* Retrieve capabilities from all connected nodes */
2425 ret = update_capabilities(rec, nodemap);
2426 if (ret != 0) {
2427 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2428 return;
2431 if (! validate_recovery_master(rec, mem_ctx)) {
2432 return;
2435 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2436 /* Check if an IP takeover run is needed and trigger one if
2437 * necessary */
2438 verify_local_ip_allocation(ctdb, rec, pnn, nodemap);
2441 /* if we are not the recmaster then we do not need to check
2442 if recovery is needed
2444 if (pnn != rec->recmaster) {
2445 return;
2449 /* ensure our local copies of flags are right */
2450 ret = update_local_flags(rec, nodemap);
2451 if (ret != 0) {
2452 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
2453 return;
2456 if (ctdb->num_nodes != nodemap->num) {
2457 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
2458 ctdb_load_nodes_file(ctdb);
2459 return;
2462 /* verify that all active nodes agree that we are the recmaster */
2463 switch (verify_recmaster(rec, nodemap, pnn)) {
2464 case MONITOR_RECOVERY_NEEDED:
2465 /* can not happen */
2466 return;
2467 case MONITOR_ELECTION_NEEDED:
2468 force_election(rec, pnn, nodemap);
2469 return;
2470 case MONITOR_OK:
2471 break;
2472 case MONITOR_FAILED:
2473 return;
2477 /* get the vnnmap */
2478 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2479 if (ret != 0) {
2480 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2481 return;
2484 if (rec->need_recovery) {
2485 /* a previous recovery didn't finish */
2486 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2487 return;
2490 /* verify that all active nodes are in normal mode
2491 and not in recovery mode
2493 switch (verify_recmode(ctdb, nodemap)) {
2494 case MONITOR_RECOVERY_NEEDED:
2495 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2496 return;
2497 case MONITOR_FAILED:
2498 return;
2499 case MONITOR_ELECTION_NEEDED:
2500 /* can not happen */
2501 case MONITOR_OK:
2502 break;
2506 if (ctdb->recovery_lock != NULL) {
2507 /* We must already hold the recovery lock */
2508 if (!ctdb_recovery_have_lock(rec)) {
2509 DEBUG(DEBUG_ERR,("Failed recovery lock sanity check. Force a recovery\n"));
2510 ctdb_set_culprit(rec, ctdb->pnn);
2511 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2512 return;
2517 /* If recoveries are disabled then there is no use doing any
2518 * nodemap or flags checks. Recoveries might be disabled due
2519 * to "reloadnodes", so doing these checks might cause an
2520 * unnecessary recovery. */
2521 if (ctdb_op_is_disabled(rec->recovery)) {
2522 goto takeover_run_checks;
2525 /* get the nodemap for all active remote nodes
2527 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map_old *, nodemap->num);
2528 if (remote_nodemaps == NULL) {
2529 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
2530 return;
2532 for(i=0; i<nodemap->num; i++) {
2533 remote_nodemaps[i] = NULL;
2535 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
2536 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
2537 return;
2540 /* verify that all other nodes have the same nodemap as we have
2542 for (j=0; j<nodemap->num; j++) {
2543 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2544 continue;
2547 if (remote_nodemaps[j] == NULL) {
2548 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
2549 ctdb_set_culprit(rec, j);
2551 return;
2554 /* if the nodes disagree on how many nodes there are
2555 then this is a good reason to try recovery
2557 if (remote_nodemaps[j]->num != nodemap->num) {
2558 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
2559 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
2560 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2561 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2562 return;
2565 /* if the nodes disagree on which nodes exist and are
2566 active, then that is also a good reason to do recovery
2568 for (i=0;i<nodemap->num;i++) {
2569 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
2570 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
2571 nodemap->nodes[j].pnn, i,
2572 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
2573 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2574 do_recovery(rec, mem_ctx, pnn, nodemap,
2575 vnnmap);
2576 return;
2582 * Update node flags obtained from each active node. This ensures we have
2583 * up-to-date information for all the nodes.
2585 for (j=0; j<nodemap->num; j++) {
2586 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2587 continue;
2589 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
2592 for (j=0; j<nodemap->num; j++) {
2593 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2594 continue;
2597 /* verify the flags are consistent
2599 for (i=0; i<nodemap->num; i++) {
2600 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2601 continue;
2604 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
2605 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
2606 nodemap->nodes[j].pnn,
2607 nodemap->nodes[i].pnn,
2608 remote_nodemaps[j]->nodes[i].flags,
2609 nodemap->nodes[i].flags));
2610 if (i == j) {
2611 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
2612 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
2613 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2614 do_recovery(rec, mem_ctx, pnn, nodemap,
2615 vnnmap);
2616 return;
2617 } else {
2618 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
2619 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
2620 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2621 do_recovery(rec, mem_ctx, pnn, nodemap,
2622 vnnmap);
2623 return;
2630 /* count how many active nodes with the lmaster capability there are */
2631 num_lmasters = 0;
2632 for (i=0; i<nodemap->num; i++) {
2633 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2634 if (ctdb_node_has_capabilities(rec->caps,
2635 ctdb->nodes[i]->pnn,
2636 CTDB_CAP_LMASTER)) {
2637 num_lmasters++;
2643 /* There must be the same number of lmasters in the vnn map as
2644 * there are active nodes with the lmaster capability... or
2645 * do a recovery.
2647 if (vnnmap->size != num_lmasters) {
2648 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
2649 vnnmap->size, num_lmasters));
2650 ctdb_set_culprit(rec, ctdb->pnn);
2651 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2652 return;
2656 * Verify that all active lmaster nodes in the nodemap also
2657 * exist in the vnnmap
2659 for (j=0; j<nodemap->num; j++) {
2660 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2661 continue;
2663 if (! ctdb_node_has_capabilities(rec->caps,
2664 nodemap->nodes[j].pnn,
2665 CTDB_CAP_LMASTER)) {
2666 continue;
2668 if (nodemap->nodes[j].pnn == pnn) {
2669 continue;
2672 for (i=0; i<vnnmap->size; i++) {
2673 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
2674 break;
2677 if (i == vnnmap->size) {
2678 D_ERR("Active LMASTER node %u is not in the vnnmap\n",
2679 nodemap->nodes[j].pnn);
2680 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2681 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2682 return;
2687 /* verify that all other nodes have the same vnnmap
2688 and are from the same generation
2690 for (j=0; j<nodemap->num; j++) {
2691 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2692 continue;
2694 if (nodemap->nodes[j].pnn == pnn) {
2695 continue;
2698 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2699 mem_ctx, &remote_vnnmap);
2700 if (ret != 0) {
2701 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
2702 nodemap->nodes[j].pnn));
2703 return;
2706 /* verify the vnnmap generation is the same */
2707 if (vnnmap->generation != remote_vnnmap->generation) {
2708 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
2709 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
2710 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2711 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2712 return;
2715 /* verify the vnnmap size is the same */
2716 if (vnnmap->size != remote_vnnmap->size) {
2717 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
2718 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
2719 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2720 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2721 return;
2724 /* verify the vnnmap is the same */
2725 for (i=0;i<vnnmap->size;i++) {
2726 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
2727 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
2728 nodemap->nodes[j].pnn));
2729 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2730 do_recovery(rec, mem_ctx, pnn, nodemap,
2731 vnnmap);
2732 return;
2737 /* FIXME: Add remote public IP checking to ensure that nodes
2738 * have the IP addresses that are allocated to them. */
2740 takeover_run_checks:
2742 /* If there are IP takeover runs requested or the previous one
2743 * failed then perform one and notify the waiters */
2744 if (!ctdb_op_is_disabled(rec->takeover_run) &&
2745 (rec->reallocate_requests || rec->need_takeover_run)) {
2746 process_ipreallocate_requests(ctdb, rec);
2750 static void recd_sig_term_handler(struct tevent_context *ev,
2751 struct tevent_signal *se, int signum,
2752 int count, void *dont_care,
2753 void *private_data)
2755 struct ctdb_recoverd *rec = talloc_get_type_abort(
2756 private_data, struct ctdb_recoverd);
2758 DEBUG(DEBUG_ERR, ("Received SIGTERM, exiting\n"));
2759 ctdb_recovery_unlock(rec);
2760 exit(0);
2764 * Periodically log elements of the cluster state
2766 * This can be used to confirm a split brain has occurred
2768 static void maybe_log_cluster_state(struct tevent_context *ev,
2769 struct tevent_timer *te,
2770 struct timeval current_time,
2771 void *private_data)
2773 struct ctdb_recoverd *rec = talloc_get_type_abort(
2774 private_data, struct ctdb_recoverd);
2775 struct ctdb_context *ctdb = rec->ctdb;
2776 struct tevent_timer *tt;
2778 static struct timeval start_incomplete = {
2779 .tv_sec = 0,
2782 bool is_complete;
2783 bool was_complete;
2784 unsigned int i;
2785 double seconds;
2786 unsigned int minutes;
2787 unsigned int num_connected;
2789 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
2790 goto done;
2793 if (rec->nodemap == NULL) {
2794 goto done;
2797 is_complete = true;
2798 num_connected = 0;
2799 for (i = 0; i < rec->nodemap->num; i++) {
2800 struct ctdb_node_and_flags *n = &rec->nodemap->nodes[i];
2802 if (n->pnn == ctdb_get_pnn(ctdb)) {
2803 continue;
2805 if ((n->flags & NODE_FLAGS_DELETED) != 0) {
2806 continue;
2808 if ((n->flags & NODE_FLAGS_DISCONNECTED) != 0) {
2809 is_complete = false;
2810 continue;
2813 num_connected++;
2816 was_complete = timeval_is_zero(&start_incomplete);
2818 if (is_complete) {
2819 if (! was_complete) {
2820 D_WARNING("Cluster complete with master=%u\n",
2821 rec->recmaster);
2822 start_incomplete = timeval_zero();
2824 goto done;
2827 /* Cluster is newly incomplete... */
2828 if (was_complete) {
2829 start_incomplete = current_time;
2830 minutes = 0;
2831 goto log;
2835 * Cluster has been incomplete since previous check, so figure
2836 * out how long (in minutes) and decide whether to log anything
2838 seconds = timeval_elapsed2(&start_incomplete, &current_time);
2839 minutes = (unsigned int)seconds / 60;
2840 if (minutes >= 60) {
2841 /* Over an hour, log every hour */
2842 if (minutes % 60 != 0) {
2843 goto done;
2845 } else if (minutes >= 10) {
2846 /* Over 10 minutes, log every 10 minutes */
2847 if (minutes % 10 != 0) {
2848 goto done;
2852 log:
2853 D_WARNING("Cluster incomplete with master=%u, elapsed=%u minutes, "
2854 "connected=%u\n",
2855 rec->recmaster,
2856 minutes,
2857 num_connected);
2859 done:
2860 tt = tevent_add_timer(ctdb->ev,
2861 rec,
2862 timeval_current_ofs(60, 0),
2863 maybe_log_cluster_state,
2864 rec);
2865 if (tt == NULL) {
2866 DBG_WARNING("Failed to set up cluster state timer\n");
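/*
 * Illustrative sketch of the logging back-off used above, in isolation: with
 * the timer firing once a minute, an incomplete cluster is logged on every
 * check for the first 10 minutes, then every 10 minutes, then hourly.
 * should_log_incomplete() is a placeholder name.
 */
#if 0
static bool should_log_incomplete(unsigned int minutes)
{
	if (minutes >= 60) {
		return (minutes % 60) == 0;	/* hourly after an hour */
	}
	if (minutes >= 10) {
		return (minutes % 10) == 0;	/* every 10 minutes after 10 */
	}
	return true;				/* every check before that */
}
#endif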
2871 the main monitoring loop
2873 static void monitor_cluster(struct ctdb_context *ctdb)
2875 struct tevent_signal *se;
2876 struct ctdb_recoverd *rec;
2878 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
2880 rec = talloc_zero(ctdb, struct ctdb_recoverd);
2881 CTDB_NO_MEMORY_FATAL(ctdb, rec);
2883 rec->ctdb = ctdb;
2884 rec->recmaster = CTDB_UNKNOWN_PNN;
2885 rec->recovery_lock_handle = NULL;
2887 rec->takeover_run = ctdb_op_init(rec, "takeover runs");
2888 CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);
2890 rec->recovery = ctdb_op_init(rec, "recoveries");
2891 CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);
2893 rec->priority_time = timeval_current();
2894 rec->frozen_on_inactive = false;
2896 se = tevent_add_signal(ctdb->ev, ctdb, SIGTERM, 0,
2897 recd_sig_term_handler, rec);
2898 if (se == NULL) {
2899 DEBUG(DEBUG_ERR, ("Failed to install SIGTERM handler\n"));
2900 exit(1);
2903 if (ctdb->recovery_lock == NULL) {
2904 struct tevent_timer *tt;
2906 tt = tevent_add_timer(ctdb->ev,
2907 rec,
2908 timeval_current_ofs(60, 0),
2909 maybe_log_cluster_state,
2910 rec);
2911 if (tt == NULL) {
2912 DBG_WARNING("Failed to set up cluster state timer\n");
2916 /* register a message port for sending memory dumps */
2917 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
2919 /* when a node is assigned banning credits */
2920 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING,
2921 banning_handler, rec);
2923 /* register a message port for recovery elections */
2924 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec);
2926 /* when nodes are disabled/enabled */
2927 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
2929 /* when we are asked to push out a flag change */
2930 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
2932 /* register a message port for reloadnodes */
2933 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
2935 /* register a message port for performing a takeover run */
2936 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
2938 /* register a message port for disabling the ip check for a short while */
2939 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
2941 /* register a message port for forcing a rebalance of a node at the
2942 next reallocation */
2943 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
2945 /* Register a message port for disabling takeover runs */
2946 ctdb_client_set_message_handler(ctdb,
2947 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
2948 disable_takeover_runs_handler, rec);
2950 /* Register a message port for disabling recoveries */
2951 ctdb_client_set_message_handler(ctdb,
2952 CTDB_SRVID_DISABLE_RECOVERIES,
2953 disable_recoveries_handler, rec);
2955 for (;;) {
2956 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2957 struct timeval start;
2958 double elapsed;
2960 if (!mem_ctx) {
2961 DEBUG(DEBUG_CRIT,(__location__
2962 " Failed to create temp context\n"));
2963 exit(-1);
2966 start = timeval_current();
2967 main_loop(ctdb, rec, mem_ctx);
2968 talloc_free(mem_ctx);
2970 /* we only check for recovery once every second */
2971 elapsed = timeval_elapsed(&start);
2972 if (elapsed < ctdb->tunable.recover_interval) {
2973 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
2974 - elapsed);
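/*
 * Illustrative sketch of the pacing above: measure how long one pass took and
 * sleep off the remainder so passes are at least one interval apart.
 * paced_loop(), do_one_pass() and interval are placeholders.
 */
#if 0
static void paced_loop(struct ctdb_context *ctdb, double interval)
{
	for (;;) {
		struct timeval start = timeval_current();
		double elapsed;

		do_one_pass();			/* placeholder work */

		elapsed = timeval_elapsed(&start);
		if (elapsed < interval) {
			ctdb_wait_timeout(ctdb, interval - elapsed);
		}
	}
}
#endif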
2980 event handler for when the main ctdbd dies
2982 static void ctdb_recoverd_parent(struct tevent_context *ev,
2983 struct tevent_fd *fde,
2984 uint16_t flags, void *private_data)
2986 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
2987 _exit(1);
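/*
 * Illustrative sketch of how this handler gets triggered: the parent keeps
 * the write end of a pipe open and never writes to it, while the child
 * watches the read end.  When the parent exits, the read end hits EOF,
 * becomes readable and the handler fires.  ctdb_start_recoverd() below sets
 * this up with ctdb_fork(); the plain fork() here is a placeholder.
 */
#if 0
	int fd[2];

	if (pipe(fd) != 0) {
		return -1;
	}
	if (fork() != 0) {
		/* parent: keep the write end open for the child's lifetime */
		close(fd[0]);
	} else {
		/* child: watch the read end for EOF */
		struct tevent_fd *fde;

		close(fd[1]);
		fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
				    ctdb_recoverd_parent, &fd[0]);
		tevent_fd_set_auto_close(fde);
	}
#endif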
2991 called regularly to verify that the recovery daemon is still running
2993 static void ctdb_check_recd(struct tevent_context *ev,
2994 struct tevent_timer *te,
2995 struct timeval yt, void *p)
2997 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
2999 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
3000 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
3002 tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
3003 ctdb_restart_recd, ctdb);
3005 return;
3008 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3009 timeval_current_ofs(30, 0),
3010 ctdb_check_recd, ctdb);
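/*
 * Illustrative sketch of the liveness probe this watchdog relies on: signal 0
 * performs the existence and permission checks without delivering anything.
 * pid_is_alive() is a placeholder; ctdb_kill() is assumed to wrap kill() in
 * the same way.
 */
#if 0
static bool pid_is_alive(pid_t pid)
{
	if (kill(pid, 0) == 0) {
		return true;		/* process exists and is signallable */
	}
	return errno == EPERM;		/* exists but belongs to someone else */
}
#endif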
3013 static void recd_sig_child_handler(struct tevent_context *ev,
3014 struct tevent_signal *se, int signum,
3015 int count, void *dont_care,
3016 void *private_data)
3018 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3019 int status;
3020 pid_t pid = -1;
3022 while (pid != 0) {
3023 pid = waitpid(-1, &status, WNOHANG);
3024 if (pid == -1) {
3025 if (errno != ECHILD) {
3026 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3028 return;
3030 if (pid > 0) {
3031 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
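/*
 * Illustrative sketch of the non-blocking reap loop above, in isolation:
 * with WNOHANG, waitpid() returns 0 when children exist but none have exited
 * yet, -1 (ECHILD) when there are no children left, and a pid when one was
 * reaped.  reap_children() is a placeholder name.
 */
#if 0
static void reap_children(void)
{
	for (;;) {
		int status;
		pid_t pid = waitpid(-1, &status, WNOHANG);

		if (pid == 0 || pid == -1) {
			break;	/* nothing (more) to reap right now */
		}
		/* pid > 0: one child reaped; loop again for any others */
	}
}
#endif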
3037 start up the recovery daemon as a child of the main ctdb daemon
3039 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3041 int fd[2];
3042 struct tevent_signal *se;
3043 struct tevent_fd *fde;
3044 int ret;
3046 if (pipe(fd) != 0) {
3047 return -1;
3050 ctdb->recoverd_pid = ctdb_fork(ctdb);
3051 if (ctdb->recoverd_pid == -1) {
3052 return -1;
3055 if (ctdb->recoverd_pid != 0) {
3056 talloc_free(ctdb->recd_ctx);
3057 ctdb->recd_ctx = talloc_new(ctdb);
3058 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
3060 close(fd[0]);
3061 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3062 timeval_current_ofs(30, 0),
3063 ctdb_check_recd, ctdb);
3064 return 0;
3067 close(fd[1]);
3069 srandom(getpid() ^ time(NULL));
3071 ret = logging_init(ctdb, NULL, NULL, "ctdb-recoverd");
3072 if (ret != 0) {
3073 return -1;
3076 prctl_set_comment("ctdb_recoverd");
3077 if (switch_from_server_to_client(ctdb) != 0) {
3078 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3079 exit(1);
3082 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
3084 fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
3085 ctdb_recoverd_parent, &fd[0]);
3086 tevent_fd_set_auto_close(fde);
3088 /* set up a handler to pick up sigchld */
3089 se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
3090 recd_sig_child_handler, ctdb);
3091 if (se == NULL) {
3092 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3093 exit(1);
3096 monitor_cluster(ctdb);
3098 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3099 return -1;
3103 shut down the recovery daemon
3105 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3107 if (ctdb->recoverd_pid == 0) {
3108 return;
3111 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3112 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
3114 TALLOC_FREE(ctdb->recd_ctx);
3115 TALLOC_FREE(ctdb->recd_ping_count);
3118 static void ctdb_restart_recd(struct tevent_context *ev,
3119 struct tevent_timer *te,
3120 struct timeval t, void *private_data)
3122 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3124 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
3125 ctdb_stop_recoverd(ctdb);
3126 ctdb_start_recoverd(ctdb);