ctdb-recoverd: Drop unused nodemap argument from update_flags_on_all_nodes()
ctdb/server/ctdb_recoverd.c
1 /*
2 ctdb recovery daemon
4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
20 #include "replace.h"
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
26 #include <popt.h>
27 #include <talloc.h>
28 #include <tevent.h>
29 #include <tdb.h>
31 #include "lib/tdb_wrap/tdb_wrap.h"
32 #include "lib/util/dlinklist.h"
33 #include "lib/util/debug.h"
34 #include "lib/util/samba_util.h"
35 #include "lib/util/sys_rw.h"
36 #include "lib/util/util_process.h"
38 #include "ctdb_private.h"
39 #include "ctdb_client.h"
41 #include "common/system_socket.h"
42 #include "common/common.h"
43 #include "common/logging.h"
45 #include "server/ctdb_config.h"
47 #include "ctdb_cluster_mutex.h"
49 /* List of SRVID requests that need to be processed */
50 struct srvid_list {
51 struct srvid_list *next, *prev;
52 struct ctdb_srvid_message *request;
55 struct srvid_requests {
56 struct srvid_list *requests;
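/*
 * Helpers for queueing SRVID requests (e.g. IP reallocation requests)
 * so they can all be answered later with a single result.  A request
 * sent with srvid == 0 expects no reply and is simply freed.
 */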
59 static void srvid_request_reply(struct ctdb_context *ctdb,
60 struct ctdb_srvid_message *request,
61 TDB_DATA result)
63 /* Someone that sent srvid==0 does not want a reply */
64 if (request->srvid == 0) {
65 talloc_free(request);
66 return;
69 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
70 result) == 0) {
71 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
72 (unsigned)request->pnn,
73 (unsigned long long)request->srvid));
74 } else {
75 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
76 (unsigned)request->pnn,
77 (unsigned long long)request->srvid));
80 talloc_free(request);
83 static void srvid_requests_reply(struct ctdb_context *ctdb,
84 struct srvid_requests **requests,
85 TDB_DATA result)
87 struct srvid_list *r;
89 if (*requests == NULL) {
90 return;
93 for (r = (*requests)->requests; r != NULL; r = r->next) {
94 srvid_request_reply(ctdb, r->request, result);
97 /* Free the list structure... */
98 TALLOC_FREE(*requests);
101 static void srvid_request_add(struct ctdb_context *ctdb,
102 struct srvid_requests **requests,
103 struct ctdb_srvid_message *request)
105 struct srvid_list *t;
106 int32_t ret;
107 TDB_DATA result;
109 if (*requests == NULL) {
110 *requests = talloc_zero(ctdb, struct srvid_requests);
111 if (*requests == NULL) {
112 goto nomem;
116 t = talloc_zero(*requests, struct srvid_list);
117 if (t == NULL) {
118 /* If *requests was just allocated above then free it */
119 if ((*requests)->requests == NULL) {
120 TALLOC_FREE(*requests);
122 goto nomem;
125 t->request = (struct ctdb_srvid_message *)talloc_steal(t, request);
126 DLIST_ADD((*requests)->requests, t);
128 return;
130 nomem:
131 /* Failed to add the request to the list. Send a fail. */
132 DEBUG(DEBUG_ERR, (__location__
133 " Out of memory, failed to queue SRVID request\n"));
134 ret = -ENOMEM;
135 result.dsize = sizeof(ret);
136 result.dptr = (uint8_t *)&ret;
137 srvid_request_reply(ctdb, request, result);
140 /* An abstraction to allow an operation (takeover runs, recoveries,
141 * ...) to be disabled for a given timeout */
142 struct ctdb_op_state {
143 struct tevent_timer *timer;
144 bool in_progress;
145 const char *name;
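/*
 * Typical lifecycle: ctdb_op_init() creates the state, ctdb_op_begin()
 * and ctdb_op_end() bracket one run of the operation, and
 * ctdb_op_disable() arms a timer that keeps ctdb_op_is_disabled()
 * returning true until the timeout re-enables the operation.
 */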
148 static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
150 struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
152 if (state != NULL) {
153 state->in_progress = false;
154 state->name = name;
157 return state;
160 static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
162 return state->timer != NULL;
165 static bool ctdb_op_begin(struct ctdb_op_state *state)
167 if (ctdb_op_is_disabled(state)) {
168 DEBUG(DEBUG_NOTICE,
169 ("Unable to begin - %s are disabled\n", state->name));
170 return false;
173 state->in_progress = true;
174 return true;
177 static bool ctdb_op_end(struct ctdb_op_state *state)
179 return state->in_progress = false;
182 static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
184 return state->in_progress;
187 static void ctdb_op_enable(struct ctdb_op_state *state)
189 TALLOC_FREE(state->timer);
192 static void ctdb_op_timeout_handler(struct tevent_context *ev,
193 struct tevent_timer *te,
194 struct timeval yt, void *p)
196 struct ctdb_op_state *state =
197 talloc_get_type(p, struct ctdb_op_state);
199 DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
200 ctdb_op_enable(state);
203 static int ctdb_op_disable(struct ctdb_op_state *state,
204 struct tevent_context *ev,
205 uint32_t timeout)
207 if (timeout == 0) {
208 DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
209 ctdb_op_enable(state);
210 return 0;
213 if (state->in_progress) {
214 DEBUG(DEBUG_ERR,
215 ("Unable to disable %s - in progress\n", state->name));
216 return -EAGAIN;
219 DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
220 state->name, timeout));
222 /* Clear any old timers */
223 talloc_free(state->timer);
225 /* Arrange for the timeout to occur */
226 state->timer = tevent_add_timer(ev, state,
227 timeval_current_ofs(timeout, 0),
228 ctdb_op_timeout_handler, state);
229 if (state->timer == NULL) {
230 DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
231 return -ENOMEM;
234 return 0;
237 struct ctdb_banning_state {
238 uint32_t count;
239 struct timeval last_reported_time;
242 struct ctdb_recovery_lock_handle;
245 private state of recovery daemon
247 struct ctdb_recoverd {
248 struct ctdb_context *ctdb;
249 uint32_t recmaster;
250 uint32_t last_culprit_node;
251 struct ctdb_node_map_old *nodemap;
252 struct timeval priority_time;
253 bool need_takeover_run;
254 bool need_recovery;
255 uint32_t node_flags;
256 struct tevent_timer *send_election_te;
257 struct tevent_timer *election_timeout;
258 struct srvid_requests *reallocate_requests;
259 struct ctdb_op_state *takeover_run;
260 struct ctdb_op_state *recovery;
261 struct ctdb_iface_list_old *ifaces;
262 uint32_t *force_rebalance_nodes;
263 struct ctdb_node_capabilities *caps;
264 bool frozen_on_inactive;
265 struct ctdb_recovery_lock_handle *recovery_lock_handle;
268 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
269 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
271 static void ctdb_restart_recd(struct tevent_context *ev,
272 struct tevent_timer *te, struct timeval t,
273 void *private_data);
276 ban a node for a period of time
278 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
280 int ret;
281 struct ctdb_context *ctdb = rec->ctdb;
282 struct ctdb_ban_state bantime;
284 if (!ctdb_validate_pnn(ctdb, pnn)) {
285 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
286 return;
289 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
291 bantime.pnn = pnn;
292 bantime.time = ban_time;
294 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
295 if (ret != 0) {
296 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
297 return;
302 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
306 remember the trouble maker
308 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
310 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
311 struct ctdb_banning_state *ban_state;
313 if (culprit >= ctdb->num_nodes) {
314 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
315 return;
318 /* If we are banned or stopped, do not set other nodes as culprits */
319 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
320 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
321 return;
324 if (ctdb->nodes[culprit]->ban_state == NULL) {
325 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
326 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
330 ban_state = ctdb->nodes[culprit]->ban_state;
331 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
332 /* this was the first time in a long while this node
333 misbehaved so we will forgive any old transgressions.
335 ban_state->count = 0;
338 ban_state->count += count;
339 ban_state->last_reported_time = timeval_current();
340 rec->last_culprit_node = culprit;
344 remember the trouble maker
346 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
348 ctdb_set_culprit_count(rec, culprit, 1);
352 Retrieve capabilities from all connected nodes
354 static int update_capabilities(struct ctdb_recoverd *rec,
355 struct ctdb_node_map_old *nodemap)
357 uint32_t *capp;
358 TALLOC_CTX *tmp_ctx;
359 struct ctdb_node_capabilities *caps;
360 struct ctdb_context *ctdb = rec->ctdb;
362 tmp_ctx = talloc_new(rec);
363 CTDB_NO_MEMORY(ctdb, tmp_ctx);
365 caps = ctdb_get_capabilities(ctdb, tmp_ctx,
366 CONTROL_TIMEOUT(), nodemap);
368 if (caps == NULL) {
369 DEBUG(DEBUG_ERR,
370 (__location__ " Failed to get node capabilities\n"));
371 talloc_free(tmp_ctx);
372 return -1;
375 capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
376 if (capp == NULL) {
377 DEBUG(DEBUG_ERR,
378 (__location__
379 " Capabilities don't include current node.\n"));
380 talloc_free(tmp_ctx);
381 return -1;
383 ctdb->capabilities = *capp;
385 TALLOC_FREE(rec->caps);
386 rec->caps = talloc_steal(rec, caps);
388 talloc_free(tmp_ctx);
389 return 0;
393 change recovery mode on all nodes
395 static int set_recovery_mode(struct ctdb_context *ctdb,
396 struct ctdb_recoverd *rec,
397 struct ctdb_node_map_old *nodemap,
398 uint32_t rec_mode)
400 TDB_DATA data;
401 uint32_t *nodes;
402 TALLOC_CTX *tmp_ctx;
404 tmp_ctx = talloc_new(ctdb);
405 CTDB_NO_MEMORY(ctdb, tmp_ctx);
407 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
409 data.dsize = sizeof(uint32_t);
410 data.dptr = (unsigned char *)&rec_mode;
412 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
413 nodes, 0,
414 CONTROL_TIMEOUT(),
415 false, data,
416 NULL, NULL,
417 NULL) != 0) {
418 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
419 talloc_free(tmp_ctx);
420 return -1;
423 talloc_free(tmp_ctx);
424 return 0;
428 * Update flags on all connected nodes
430 static int update_flags_on_all_nodes(struct ctdb_context *ctdb,
431 uint32_t pnn,
432 uint32_t flags)
434 int ret;
436 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
437 if (ret != 0) {
438 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
439 return -1;
442 return 0;
446 called when ctdb_wait_timeout should finish
448 static void ctdb_wait_handler(struct tevent_context *ev,
449 struct tevent_timer *te,
450 struct timeval yt, void *p)
452 uint32_t *timed_out = (uint32_t *)p;
453 (*timed_out) = 1;
457 wait for a given number of seconds
459 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
461 uint32_t timed_out = 0;
462 time_t usecs = (secs - (time_t)secs) * 1000000;
463 tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
464 ctdb_wait_handler, &timed_out);
465 while (!timed_out) {
466 tevent_loop_once(ctdb->ev);
471 called when an election times out (ends)
473 static void ctdb_election_timeout(struct tevent_context *ev,
474 struct tevent_timer *te,
475 struct timeval t, void *p)
477 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
478 rec->election_timeout = NULL;
479 fast_start = false;
481 D_WARNING("Election period ended, master=%u\n", rec->recmaster);
486 wait for an election to finish. It finishes election_timeout seconds after
487 the last election packet is received
489 static void ctdb_wait_election(struct ctdb_recoverd *rec)
491 struct ctdb_context *ctdb = rec->ctdb;
492 while (rec->election_timeout) {
493 tevent_loop_once(ctdb->ev);
498 Update our local flags from all remote connected nodes.
499 This is only run when we are, or we believe we are, the recovery master
501 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
503 unsigned int j;
504 struct ctdb_context *ctdb = rec->ctdb;
505 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
507 /* get the nodemap for all active remote nodes and verify
508 they are the same as for this node
510 for (j=0; j<nodemap->num; j++) {
511 struct ctdb_node_map_old *remote_nodemap=NULL;
512 int ret;
514 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
515 continue;
517 if (nodemap->nodes[j].pnn == ctdb->pnn) {
518 continue;
521 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
522 mem_ctx, &remote_nodemap);
523 if (ret != 0) {
524 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
525 nodemap->nodes[j].pnn));
526 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
527 talloc_free(mem_ctx);
528 return -1;
530 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
531 /* We should tell our daemon about this so it
532 updates its flags or else we will log the same
533 message again in the next iteration of recovery.
534 Since we are the recovery master we can just as
535 well update the flags on all nodes.
537 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
538 if (ret != 0) {
539 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
540 return -1;
543 /* Update our local copy of the flags in the recovery
544 daemon.
546 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
547 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
548 nodemap->nodes[j].flags));
549 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
551 talloc_free(remote_nodemap);
553 talloc_free(mem_ctx);
554 return 0;
558 /* Create a new random generation id.
559 The generation id can not be the INVALID_GENERATION id
561 static uint32_t new_generation(void)
563 uint32_t generation;
565 while (1) {
566 generation = random();
568 if (generation != INVALID_GENERATION) {
569 break;
573 return generation;
576 static bool ctdb_recovery_have_lock(struct ctdb_recoverd *rec)
578 return (rec->recovery_lock_handle != NULL);
581 struct ctdb_recovery_lock_handle {
582 bool done;
583 bool locked;
584 double latency;
585 struct ctdb_cluster_mutex_handle *h;
586 struct ctdb_recoverd *rec;
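/*
 * Callback from the cluster mutex helper.  Status '0' means the
 * recovery lock was taken, '1' means contention and '2' means timeout;
 * an unknown status additionally causes this node to ban itself.
 */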
589 static void take_reclock_handler(char status,
590 double latency,
591 void *private_data)
593 struct ctdb_recovery_lock_handle *s =
594 (struct ctdb_recovery_lock_handle *) private_data;
596 s->locked = (status == '0') ;
599 * If unsuccessful then ensure the process has exited and that
600 * the file descriptor event handler has been cancelled
602 if (! s->locked) {
603 TALLOC_FREE(s->h);
606 switch (status) {
607 case '0':
608 s->latency = latency;
609 break;
611 case '1':
612 D_ERR("Unable to take recovery lock - contention\n");
613 break;
615 case '2':
616 D_ERR("Unable to take recovery lock - timeout\n");
617 break;
619 default:
620 D_ERR("Unable to take recover lock - unknown error\n");
623 struct ctdb_recoverd *rec = s->rec;
624 struct ctdb_context *ctdb = rec->ctdb;
625 uint32_t pnn = ctdb_get_pnn(ctdb);
627 D_ERR("Banning this node\n");
628 ctdb_ban_node(rec,
629 pnn,
630 ctdb->tunable.recovery_ban_period);
634 s->done = true;
637 static void force_election(struct ctdb_recoverd *rec,
638 uint32_t pnn,
639 struct ctdb_node_map_old *nodemap);
641 static void lost_reclock_handler(void *private_data)
643 struct ctdb_recoverd *rec = talloc_get_type_abort(
644 private_data, struct ctdb_recoverd);
646 D_ERR("Recovery lock helper terminated, triggering an election\n");
647 TALLOC_FREE(rec->recovery_lock_handle);
649 force_election(rec, ctdb_get_pnn(rec->ctdb), rec->nodemap);
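/*
 * Attempt to take the recovery lock via the cluster mutex helper.
 * This spins the event loop until the attempt completes and, on
 * success, reports the lock latency to the main daemon.
 */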
652 static bool ctdb_recovery_lock(struct ctdb_recoverd *rec)
654 struct ctdb_context *ctdb = rec->ctdb;
655 struct ctdb_cluster_mutex_handle *h;
656 struct ctdb_recovery_lock_handle *s;
658 s = talloc_zero(rec, struct ctdb_recovery_lock_handle);
659 if (s == NULL) {
660 DBG_ERR("Memory allocation error\n");
661 return false;
664 s->rec = rec;
666 h = ctdb_cluster_mutex(s,
667 ctdb,
668 ctdb->recovery_lock,
669 120,
670 take_reclock_handler,
671 s,
672 lost_reclock_handler,
673 rec);
674 if (h == NULL) {
675 talloc_free(s);
676 return false;
679 rec->recovery_lock_handle = s;
680 s->h = h;
682 while (! s->done) {
683 tevent_loop_once(ctdb->ev);
686 if (! s->locked) {
687 TALLOC_FREE(rec->recovery_lock_handle);
688 return false;
691 ctdb_ctrl_report_recd_lock_latency(ctdb,
692 CONTROL_TIMEOUT(),
693 s->latency);
695 return true;
698 static void ctdb_recovery_unlock(struct ctdb_recoverd *rec)
700 if (rec->recovery_lock_handle == NULL) {
701 return;
704 if (! rec->recovery_lock_handle->done) {
706 * Taking of recovery lock still in progress. Free
707 * the cluster mutex handle to release it but leave
708 * the recovery lock handle in place to allow taking
709 * of the lock to fail.
711 D_NOTICE("Cancelling recovery lock\n");
712 TALLOC_FREE(rec->recovery_lock_handle->h);
713 rec->recovery_lock_handle->done = true;
714 rec->recovery_lock_handle->locked = false;
715 return;
718 D_NOTICE("Releasing recovery lock\n");
719 TALLOC_FREE(rec->recovery_lock_handle);
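/*
 * Ban every node that has accumulated at least 2 * num_nodes banning
 * credits and reset its credit count.  *self_ban is set to true if
 * this node banned itself, so the caller can abort recovery.
 */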
722 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
724 struct ctdb_context *ctdb = rec->ctdb;
725 unsigned int i;
726 struct ctdb_banning_state *ban_state;
728 *self_ban = false;
729 for (i=0; i<ctdb->num_nodes; i++) {
730 if (ctdb->nodes[i]->ban_state == NULL) {
731 continue;
733 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
734 if (ban_state->count < 2*ctdb->num_nodes) {
735 continue;
738 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
739 ctdb->nodes[i]->pnn, ban_state->count,
740 ctdb->tunable.recovery_ban_period));
741 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
742 ban_state->count = 0;
744 /* Banning ourself? */
745 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
746 *self_ban = true;
751 struct helper_state {
752 int fd[2];
753 pid_t pid;
754 int result;
755 bool done;
758 static void helper_handler(struct tevent_context *ev,
759 struct tevent_fd *fde,
760 uint16_t flags, void *private_data)
762 struct helper_state *state = talloc_get_type_abort(
763 private_data, struct helper_state);
764 int ret;
766 ret = sys_read(state->fd[0], &state->result, sizeof(state->result));
767 if (ret != sizeof(state->result)) {
768 state->result = EPIPE;
771 state->done = true;
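/*
 * Fork an external helper and wait for it to write its result to a
 * pipe.  The event loop keeps running while waiting; if the recovery
 * master changes in the meantime the helper run is abandoned and
 * treated as a failure.
 */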
774 static int helper_run(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
775 const char *prog, const char *arg, const char *type)
777 struct helper_state *state;
778 struct tevent_fd *fde;
779 const char **args;
780 int nargs, ret;
781 uint32_t recmaster = rec->recmaster;
783 state = talloc_zero(mem_ctx, struct helper_state);
784 if (state == NULL) {
785 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
786 return -1;
789 state->pid = -1;
791 ret = pipe(state->fd);
792 if (ret != 0) {
793 DEBUG(DEBUG_ERR,
794 ("Failed to create pipe for %s helper\n", type));
795 goto fail;
798 set_close_on_exec(state->fd[0]);
800 nargs = 4;
801 args = talloc_array(state, const char *, nargs);
802 if (args == NULL) {
803 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
804 goto fail;
807 args[0] = talloc_asprintf(args, "%d", state->fd[1]);
808 if (args[0] == NULL) {
809 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
810 goto fail;
812 args[1] = rec->ctdb->daemon.name;
813 args[2] = arg;
814 args[3] = NULL;
816 if (args[2] == NULL) {
817 nargs = 3;
820 state->pid = ctdb_vfork_exec(state, rec->ctdb, prog, nargs, args);
821 if (state->pid == -1) {
822 DEBUG(DEBUG_ERR,
823 ("Failed to create child for %s helper\n", type));
824 goto fail;
827 close(state->fd[1]);
828 state->fd[1] = -1;
830 state->done = false;
832 fde = tevent_add_fd(rec->ctdb->ev, state, state->fd[0],
833 TEVENT_FD_READ, helper_handler, state);
834 if (fde == NULL) {
835 goto fail;
837 tevent_fd_set_auto_close(fde);
839 while (!state->done) {
840 tevent_loop_once(rec->ctdb->ev);
842 /* If recmaster changes, we have lost election */
843 if (recmaster != rec->recmaster) {
844 D_ERR("Recmaster changed to %u, aborting %s\n",
845 rec->recmaster, type);
846 state->result = 1;
847 break;
851 close(state->fd[0]);
852 state->fd[0] = -1;
854 if (state->result != 0) {
855 goto fail;
858 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
859 talloc_free(state);
860 return 0;
862 fail:
863 if (state->fd[0] != -1) {
864 close(state->fd[0]);
866 if (state->fd[1] != -1) {
867 close(state->fd[1]);
869 if (state->pid != -1) {
870 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
872 talloc_free(state);
873 return -1;
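/*
 * Run the takeover helper (ctdb_takeover_helper), passing it an
 * optional comma-separated list of nodes whose IPs should be force
 * rebalanced, and exporting CTDB_DISABLE_IP_FAILOVER when failover is
 * disabled in the configuration.
 */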
877 static int ctdb_takeover(struct ctdb_recoverd *rec,
878 uint32_t *force_rebalance_nodes)
880 static char prog[PATH_MAX+1] = "";
881 char *arg;
882 unsigned int i;
883 int ret;
885 if (!ctdb_set_helper("takeover_helper", prog, sizeof(prog),
886 "CTDB_TAKEOVER_HELPER", CTDB_HELPER_BINDIR,
887 "ctdb_takeover_helper")) {
888 ctdb_die(rec->ctdb, "Unable to set takeover helper\n");
891 arg = NULL;
892 for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
893 uint32_t pnn = force_rebalance_nodes[i];
894 if (arg == NULL) {
895 arg = talloc_asprintf(rec, "%u", pnn);
896 } else {
897 arg = talloc_asprintf_append(arg, ",%u", pnn);
899 if (arg == NULL) {
900 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
901 return -1;
905 if (ctdb_config.failover_disabled) {
906 ret = setenv("CTDB_DISABLE_IP_FAILOVER", "1", 1);
907 if (ret != 0) {
908 D_ERR("Failed to set CTDB_DISABLE_IP_FAILOVER variable\n");
909 return -1;
913 return helper_run(rec, rec, prog, arg, "takeover");
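/*
 * Perform a takeover run (public IP reallocation).  Takeover runs are
 * temporarily disabled on the other connected nodes for the duration
 * of the run and re-enabled afterwards; on success any pending force
 * rebalance targets are cleared.
 */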
916 static bool do_takeover_run(struct ctdb_recoverd *rec,
917 struct ctdb_node_map_old *nodemap)
919 uint32_t *nodes = NULL;
920 struct ctdb_disable_message dtr;
921 TDB_DATA data;
922 size_t i;
923 uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
924 int ret;
925 bool ok;
927 DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
929 if (ctdb_op_is_in_progress(rec->takeover_run)) {
930 DEBUG(DEBUG_ERR, (__location__
931 " takeover run already in progress \n"));
932 ok = false;
933 goto done;
936 if (!ctdb_op_begin(rec->takeover_run)) {
937 ok = false;
938 goto done;
941 /* Disable IP checks (takeover runs, really) on other nodes
942 * while doing this takeover run. This will stop those other
943 nodes from triggering takeover runs when they think they should
944 * be hosting an IP but it isn't yet on an interface. Don't
945 * wait for replies since a failure here might cause some
946 * noise in the logs but will not actually cause a problem.
948 ZERO_STRUCT(dtr);
949 dtr.srvid = 0; /* No reply */
950 dtr.pnn = -1;
952 data.dptr = (uint8_t*)&dtr;
953 data.dsize = sizeof(dtr);
955 nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
957 /* Disable for 60 seconds. This can be a tunable later if
958 * necessary.
960 dtr.timeout = 60;
961 for (i = 0; i < talloc_array_length(nodes); i++) {
962 if (ctdb_client_send_message(rec->ctdb, nodes[i],
963 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
964 data) != 0) {
965 DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
969 ret = ctdb_takeover(rec, rec->force_rebalance_nodes);
971 /* Reenable takeover runs and IP checks on other nodes */
972 dtr.timeout = 0;
973 for (i = 0; i < talloc_array_length(nodes); i++) {
974 if (ctdb_client_send_message(rec->ctdb, nodes[i],
975 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
976 data) != 0) {
977 DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n"));
981 if (ret != 0) {
982 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
983 ok = false;
984 goto done;
987 ok = true;
988 /* Takeover run was successful so clear force rebalance targets */
989 if (rebalance_nodes == rec->force_rebalance_nodes) {
990 TALLOC_FREE(rec->force_rebalance_nodes);
991 } else {
992 DEBUG(DEBUG_WARNING,
993 ("Rebalance target nodes changed during takeover run - not clearing\n"));
995 done:
996 rec->need_takeover_run = !ok;
997 talloc_free(nodes);
998 ctdb_op_end(rec->takeover_run);
1000 DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
1001 return ok;
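/*
 * Run the database recovery helper (ctdb_recovery_helper) with a
 * freshly generated generation id as its argument.
 */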
1004 static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
1006 static char prog[PATH_MAX+1] = "";
1007 const char *arg;
1009 if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog),
1010 "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
1011 "ctdb_recovery_helper")) {
1012 ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
1015 arg = talloc_asprintf(mem_ctx, "%u", new_generation());
1016 if (arg == NULL) {
1017 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1018 return -1;
1021 setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1);
1023 return helper_run(rec, mem_ctx, prog, arg, "recovery");
1027 we are the recmaster, and recovery is needed - start a recovery run
1029 static int do_recovery(struct ctdb_recoverd *rec,
1030 TALLOC_CTX *mem_ctx, uint32_t pnn,
1031 struct ctdb_node_map_old *nodemap, struct ctdb_vnn_map *vnnmap)
1033 struct ctdb_context *ctdb = rec->ctdb;
1034 unsigned int i;
1035 int ret;
1036 bool self_ban;
1038 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1040 /* Check if the current node is still the recmaster. It's possible that
1041 * re-election has changed the recmaster.
1043 if (pnn != rec->recmaster) {
1044 DEBUG(DEBUG_NOTICE,
1045 ("Recovery master changed to %u, aborting recovery\n",
1046 rec->recmaster));
1047 return -1;
1050 /* if recovery fails, force it again */
1051 rec->need_recovery = true;
1053 if (!ctdb_op_begin(rec->recovery)) {
1054 return -1;
1057 if (rec->election_timeout) {
1058 /* an election is in progress */
1059 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
1060 goto fail;
1063 ban_misbehaving_nodes(rec, &self_ban);
1064 if (self_ban) {
1065 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
1066 goto fail;
1069 if (ctdb->recovery_lock != NULL) {
1070 if (ctdb_recovery_have_lock(rec)) {
1071 D_NOTICE("Already holding recovery lock\n");
1072 } else {
1073 bool ok;
1075 D_NOTICE("Attempting to take recovery lock (%s)\n",
1076 ctdb->recovery_lock);
1078 ok = ctdb_recovery_lock(rec);
1079 if (! ok) {
1080 D_ERR("Unable to take recovery lock\n");
1082 if (pnn != rec->recmaster) {
1083 D_NOTICE("Recovery master changed to %u,"
1084 " aborting recovery\n",
1085 rec->recmaster);
1086 rec->need_recovery = false;
1087 goto fail;
1090 if (ctdb->runstate ==
1091 CTDB_RUNSTATE_FIRST_RECOVERY) {
1093 * First recovery? Perhaps
1094 * current node does not yet
1095 * know who the recmaster is.
1097 D_ERR("Retrying recovery\n");
1098 goto fail;
1101 D_ERR("Abort recovery, "
1102 "ban this node for %u seconds\n",
1103 ctdb->tunable.recovery_ban_period);
1104 ctdb_ban_node(rec,
1105 pnn,
1106 ctdb->tunable.recovery_ban_period);
1107 goto fail;
1109 D_NOTICE("Recovery lock taken successfully\n");
1113 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1115 /* Retrieve capabilities from all connected nodes */
1116 ret = update_capabilities(rec, nodemap);
1117 if (ret!=0) {
1118 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1119 return -1;
1123 update all nodes to have the same flags that we have
1125 for (i=0;i<nodemap->num;i++) {
1126 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1127 continue;
1130 ret = update_flags_on_all_nodes(ctdb,
1131 nodemap->nodes[i].pnn,
1132 nodemap->nodes[i].flags);
1133 if (ret != 0) {
1134 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1135 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
1136 } else {
1137 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1138 return -1;
1143 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1145 ret = db_recovery_parallel(rec, mem_ctx);
1146 if (ret != 0) {
1147 goto fail;
1150 do_takeover_run(rec, nodemap);
1152 /* send a message to all clients telling them that the cluster
1153 has been reconfigured */
1154 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
1155 CTDB_SRVID_RECONFIGURE, tdb_null);
1156 if (ret != 0) {
1157 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
1158 goto fail;
1161 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1163 rec->need_recovery = false;
1164 ctdb_op_end(rec->recovery);
1166 /* we managed to complete a full recovery, make sure to forgive
1167 any past sins by the nodes that could now participate in the
1168 recovery.
1170 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1171 for (i=0;i<nodemap->num;i++) {
1172 struct ctdb_banning_state *ban_state;
1174 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1175 continue;
1178 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1179 if (ban_state == NULL) {
1180 continue;
1183 ban_state->count = 0;
1186 /* We just finished a recovery successfully.
1187 We now wait for rerecovery_timeout before we allow
1188 another recovery to take place.
1190 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
1191 ctdb_op_disable(rec->recovery, ctdb->ev,
1192 ctdb->tunable.rerecovery_timeout);
1193 return 0;
1195 fail:
1196 ctdb_op_end(rec->recovery);
1197 return -1;
1202 elections are won by first checking the number of connected nodes, then
1203 the priority time, then the pnn
1205 struct election_message {
1206 uint32_t num_connected;
1207 struct timeval priority_time;
1208 uint32_t pnn;
1209 uint32_t node_flags;
1213 form this node's election data
1215 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1217 unsigned int i;
1218 int ret;
1219 struct ctdb_node_map_old *nodemap;
1220 struct ctdb_context *ctdb = rec->ctdb;
1222 ZERO_STRUCTP(em);
1224 em->pnn = rec->ctdb->pnn;
1225 em->priority_time = rec->priority_time;
1227 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1228 if (ret != 0) {
1229 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
1230 return;
1233 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1234 em->node_flags = rec->node_flags;
1236 for (i=0;i<nodemap->num;i++) {
1237 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1238 em->num_connected++;
1242 /* we shouldn't try to win this election if we can't be a recmaster */
1243 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1244 em->num_connected = 0;
1245 em->priority_time = timeval_current();
1248 talloc_free(nodemap);
1252 see if the given election data wins
1254 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1256 struct election_message myem;
1257 int cmp = 0;
1259 ctdb_election_data(rec, &myem);
1261 /* we can't win if we don't have the recmaster capability */
1262 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1263 return false;
1266 /* we can't win if we are banned */
1267 if (rec->node_flags & NODE_FLAGS_BANNED) {
1268 return false;
1271 /* we can't win if we are stopped */
1272 if (rec->node_flags & NODE_FLAGS_STOPPED) {
1273 return false;
1276 /* we will automatically win if the other node is banned */
1277 if (em->node_flags & NODE_FLAGS_BANNED) {
1278 return true;
1281 /* we will automatically win if the other node is stopped */
1282 if (em->node_flags & NODE_FLAGS_STOPPED) {
1283 return true;
1286 /* then the longest running node */
1287 if (cmp == 0) {
1288 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
1291 if (cmp == 0) {
1292 cmp = (int)myem.pnn - (int)em->pnn;
1295 return cmp > 0;
1299 send out an election request
1301 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
1303 int ret;
1304 TDB_DATA election_data;
1305 struct election_message emsg;
1306 uint64_t srvid;
1307 struct ctdb_context *ctdb = rec->ctdb;
1309 srvid = CTDB_SRVID_ELECTION;
1311 ctdb_election_data(rec, &emsg);
1313 election_data.dsize = sizeof(struct election_message);
1314 election_data.dptr = (unsigned char *)&emsg;
1317 /* first we assume we will win the election and set
1318 recoverymaster to be ourself on the current node
1320 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1321 CTDB_CURRENT_NODE, pnn);
1322 if (ret != 0) {
1323 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster\n"));
1324 return -1;
1326 rec->recmaster = pnn;
1328 /* send an election message to all active nodes */
1329 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1330 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1334 we think we are winning the election - send a broadcast election request
1336 static void election_send_request(struct tevent_context *ev,
1337 struct tevent_timer *te,
1338 struct timeval t, void *p)
1340 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1341 int ret;
1343 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
1344 if (ret != 0) {
1345 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
1348 TALLOC_FREE(rec->send_election_te);
1352 handler for memory dumps
1354 static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1356 struct ctdb_recoverd *rec = talloc_get_type(
1357 private_data, struct ctdb_recoverd);
1358 struct ctdb_context *ctdb = rec->ctdb;
1359 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1360 TDB_DATA *dump;
1361 int ret;
1362 struct ctdb_srvid_message *rd;
1364 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1365 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1366 talloc_free(tmp_ctx);
1367 return;
1369 rd = (struct ctdb_srvid_message *)data.dptr;
1371 dump = talloc_zero(tmp_ctx, TDB_DATA);
1372 if (dump == NULL) {
1373 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1374 talloc_free(tmp_ctx);
1375 return;
1377 ret = ctdb_dump_memory(ctdb, dump);
1378 if (ret != 0) {
1379 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1380 talloc_free(tmp_ctx);
1381 return;
1384 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
1386 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1387 if (ret != 0) {
1388 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1389 talloc_free(tmp_ctx);
1390 return;
1393 talloc_free(tmp_ctx);
1397 handler for reload_nodes
1399 static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
1400 void *private_data)
1402 struct ctdb_recoverd *rec = talloc_get_type(
1403 private_data, struct ctdb_recoverd);
1405 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1407 ctdb_load_nodes_file(rec->ctdb);
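/*
 * Handler for node rebalance requests: remember the requested PNN so
 * that its IPs are force rebalanced in the next takeover run.  Only
 * the recovery master acts on these messages.
 */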
1411 static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
1412 void *private_data)
1414 struct ctdb_recoverd *rec = talloc_get_type(
1415 private_data, struct ctdb_recoverd);
1416 struct ctdb_context *ctdb = rec->ctdb;
1417 uint32_t pnn;
1418 uint32_t *t;
1419 int len;
1421 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
1422 return;
1425 if (data.dsize != sizeof(uint32_t)) {
1426 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
1427 return;
1430 pnn = *(uint32_t *)&data.dptr[0];
1432 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
1434 /* Copy any existing list of nodes. There's probably some
1435 * sort of realloc variant that will do this but we need to
1436 * make sure that freeing the old array also cancels the timer
1437 * event for the timeout... not sure if realloc will do that.
1439 len = (rec->force_rebalance_nodes != NULL) ?
1440 talloc_array_length(rec->force_rebalance_nodes) :
1441 0;
1443 /* This allows duplicates to be added but they don't cause
1444 * harm. A call to add a duplicate PNN arguably means that
1445 * the timeout should be reset, so this is the simplest
1446 * solution.
1448 t = talloc_zero_array(rec, uint32_t, len+1);
1449 CTDB_NO_MEMORY_VOID(ctdb, t);
1450 if (len > 0) {
1451 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
1453 t[len] = pnn;
1455 talloc_free(rec->force_rebalance_nodes);
1457 rec->force_rebalance_nodes = t;
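/*
 * Common helper for "disable takeover runs" and "disable recoveries"
 * messages: disable the operation for the requested timeout and reply
 * with this node's PNN on success, or a negative error code on failure.
 */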
1462 static void srvid_disable_and_reply(struct ctdb_context *ctdb,
1463 TDB_DATA data,
1464 struct ctdb_op_state *op_state)
1466 struct ctdb_disable_message *r;
1467 uint32_t timeout;
1468 TDB_DATA result;
1469 int32_t ret = 0;
1471 /* Validate input data */
1472 if (data.dsize != sizeof(struct ctdb_disable_message)) {
1473 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1474 "expecting %lu\n", (long unsigned)data.dsize,
1475 (long unsigned)sizeof(struct ctdb_disable_message)));
1476 return;
1478 if (data.dptr == NULL) {
1479 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1480 return;
1483 r = (struct ctdb_disable_message *)data.dptr;
1484 timeout = r->timeout;
1486 ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
1487 if (ret != 0) {
1488 goto done;
1491 /* Returning our PNN tells the caller that we succeeded */
1492 ret = ctdb_get_pnn(ctdb);
1493 done:
1494 result.dsize = sizeof(int32_t);
1495 result.dptr = (uint8_t *)&ret;
1496 srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
1499 static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
1500 void *private_data)
1502 struct ctdb_recoverd *rec = talloc_get_type(
1503 private_data, struct ctdb_recoverd);
1505 srvid_disable_and_reply(rec->ctdb, data, rec->takeover_run);
1508 /* Backward compatibility for this SRVID */
1509 static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
1510 void *private_data)
1512 struct ctdb_recoverd *rec = talloc_get_type(
1513 private_data, struct ctdb_recoverd);
1514 uint32_t timeout;
1516 if (data.dsize != sizeof(uint32_t)) {
1517 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1518 "expecting %lu\n", (long unsigned)data.dsize,
1519 (long unsigned)sizeof(uint32_t)));
1520 return;
1522 if (data.dptr == NULL) {
1523 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1524 return;
1527 timeout = *((uint32_t *)data.dptr);
1529 ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout);
1532 static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
1533 void *private_data)
1535 struct ctdb_recoverd *rec = talloc_get_type(
1536 private_data, struct ctdb_recoverd);
1538 srvid_disable_and_reply(rec->ctdb, data, rec->recovery);
1542 handler for ip reallocate, just add it to the list of requests and
1543 handle this later in the monitor_cluster loop so we do not recurse
1544 with other requests to takeover_run()
1546 static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
1547 void *private_data)
1549 struct ctdb_srvid_message *request;
1550 struct ctdb_recoverd *rec = talloc_get_type(
1551 private_data, struct ctdb_recoverd);
1553 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1554 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1555 return;
1558 request = (struct ctdb_srvid_message *)data.dptr;
1560 srvid_request_add(rec->ctdb, &rec->reallocate_requests, request);
1563 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
1564 struct ctdb_recoverd *rec)
1566 TDB_DATA result;
1567 int32_t ret;
1568 struct srvid_requests *current;
1570 /* Only process requests that are currently pending. More
1571 * might come in while the takeover run is in progress and
1572 * they will need to be processed later since they might
1573 be in response to flag changes.
1575 current = rec->reallocate_requests;
1576 rec->reallocate_requests = NULL;
1578 if (do_takeover_run(rec, rec->nodemap)) {
1579 ret = ctdb_get_pnn(ctdb);
1580 } else {
1581 ret = -1;
1584 result.dsize = sizeof(int32_t);
1585 result.dptr = (uint8_t *)&ret;
1587 srvid_requests_reply(ctdb, &current, result);
1591 * handler for assigning banning credits
1593 static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1595 struct ctdb_recoverd *rec = talloc_get_type(
1596 private_data, struct ctdb_recoverd);
1597 uint32_t ban_pnn;
1599 /* Ignore if we are not recmaster */
1600 if (rec->ctdb->pnn != rec->recmaster) {
1601 return;
1604 if (data.dsize != sizeof(uint32_t)) {
1605 DEBUG(DEBUG_ERR, (__location__ "invalid data size %zu\n",
1606 data.dsize));
1607 return;
1610 ban_pnn = *(uint32_t *)data.dptr;
1612 ctdb_set_culprit_count(rec, ban_pnn, rec->nodemap->num);
1616 handler for recovery master elections
1618 static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1620 struct ctdb_recoverd *rec = talloc_get_type(
1621 private_data, struct ctdb_recoverd);
1622 struct ctdb_context *ctdb = rec->ctdb;
1623 int ret;
1624 struct election_message *em = (struct election_message *)data.dptr;
1626 /* Ignore election packets from ourself */
1627 if (ctdb->pnn == em->pnn) {
1628 return;
1631 /* we got an election packet - update the timeout for the election */
1632 talloc_free(rec->election_timeout);
1633 rec->election_timeout = tevent_add_timer(
1634 ctdb->ev, ctdb,
1635 fast_start ?
1636 timeval_current_ofs(0, 500000) :
1637 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1638 ctdb_election_timeout, rec);
1640 /* someone called an election. check their election data
1641 and if we disagree and we would rather be the elected node,
1642 send a new election message to all other nodes
1644 if (ctdb_election_win(rec, em)) {
1645 if (!rec->send_election_te) {
1646 rec->send_election_te = tevent_add_timer(
1647 ctdb->ev, rec,
1648 timeval_current_ofs(0, 500000),
1649 election_send_request, rec);
1651 return;
1654 /* we didn't win */
1655 TALLOC_FREE(rec->send_election_te);
1657 /* Release the recovery lock file */
1658 if (ctdb_recovery_have_lock(rec)) {
1659 ctdb_recovery_unlock(rec);
1662 /* ok, let that guy become recmaster then */
1663 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1664 CTDB_CURRENT_NODE, em->pnn);
1665 if (ret != 0) {
1666 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster"));
1667 return;
1669 rec->recmaster = em->pnn;
1671 return;
1676 force the start of the election process
1678 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
1679 struct ctdb_node_map_old *nodemap)
1681 int ret;
1682 struct ctdb_context *ctdb = rec->ctdb;
1684 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
1686 /* set all nodes to recovery mode to stop all internode traffic */
1687 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1688 if (ret != 0) {
1689 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1690 return;
1693 talloc_free(rec->election_timeout);
1694 rec->election_timeout = tevent_add_timer(
1695 ctdb->ev, ctdb,
1696 fast_start ?
1697 timeval_current_ofs(0, 500000) :
1698 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1699 ctdb_election_timeout, rec);
1701 ret = send_election_request(rec, pnn);
1702 if (ret!=0) {
1703 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
1704 return;
1707 /* wait for a few seconds to collect all responses */
1708 ctdb_wait_election(rec);
1714 handler for when a node changes its flags
1716 static void monitor_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1718 struct ctdb_recoverd *rec = talloc_get_type(
1719 private_data, struct ctdb_recoverd);
1720 struct ctdb_context *ctdb = rec->ctdb;
1721 int ret;
1722 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1723 struct ctdb_node_map_old *nodemap=NULL;
1724 TALLOC_CTX *tmp_ctx;
1725 unsigned int i;
1727 if (data.dsize != sizeof(*c)) {
1728 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
1729 return;
1732 tmp_ctx = talloc_new(ctdb);
1733 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
1735 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1736 if (ret != 0) {
1737 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
1738 talloc_free(tmp_ctx);
1739 return;
1743 for (i=0;i<nodemap->num;i++) {
1744 if (nodemap->nodes[i].pnn == c->pnn) break;
1747 if (i == nodemap->num) {
1748 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
1749 talloc_free(tmp_ctx);
1750 return;
1753 if (c->old_flags != c->new_flags) {
1754 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
1757 nodemap->nodes[i].flags = c->new_flags;
1759 talloc_free(tmp_ctx);
1763 handler for when we need to push out flag changes to all other nodes
1765 static void push_flags_handler(uint64_t srvid, TDB_DATA data,
1766 void *private_data)
1768 struct ctdb_recoverd *rec = talloc_get_type(
1769 private_data, struct ctdb_recoverd);
1770 struct ctdb_context *ctdb = rec->ctdb;
1771 int ret;
1772 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1773 struct ctdb_node_map_old *nodemap=NULL;
1774 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1775 uint32_t *nodes;
1777 /* read the node flags from the recmaster */
1778 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
1779 tmp_ctx, &nodemap);
1780 if (ret != 0) {
1781 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
1782 talloc_free(tmp_ctx);
1783 return;
1785 if (c->pnn >= nodemap->num) {
1786 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
1787 talloc_free(tmp_ctx);
1788 return;
1791 /* send the flags update to all connected nodes */
1792 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
1794 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
1795 nodes, 0, CONTROL_TIMEOUT(),
1796 false, data,
1797 NULL, NULL,
1798 NULL) != 0) {
1799 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
1801 talloc_free(tmp_ctx);
1802 return;
1805 talloc_free(tmp_ctx);
1809 struct verify_recmode_normal_data {
1810 uint32_t count;
1811 enum monitor_result status;
1814 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
1816 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
1819 /* one more node has responded with recmode data*/
1820 rmdata->count--;
1822 /* if we failed to get the recmode, then return an error and let
1823 the main loop try again.
1825 if (state->state != CTDB_CONTROL_DONE) {
1826 if (rmdata->status == MONITOR_OK) {
1827 rmdata->status = MONITOR_FAILED;
1829 return;
1832 /* if we got a response, then the recmode will be stored in the
1833 status field
1835 if (state->status != CTDB_RECOVERY_NORMAL) {
1836 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
1837 rmdata->status = MONITOR_RECOVERY_NEEDED;
1840 return;
1844 /* verify that all nodes are in normal recovery mode */
1845 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
1847 struct verify_recmode_normal_data *rmdata;
1848 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1849 struct ctdb_client_control_state *state;
1850 enum monitor_result status;
1851 unsigned int j;
1853 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
1854 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
1855 rmdata->count = 0;
1856 rmdata->status = MONITOR_OK;
1858 /* loop over all active nodes and send an async getrecmode call to
1859 them*/
1860 for (j=0; j<nodemap->num; j++) {
1861 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1862 continue;
1864 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
1865 CONTROL_TIMEOUT(),
1866 nodemap->nodes[j].pnn);
1867 if (state == NULL) {
1868 /* we failed to send the control, treat this as
1869 an error and try again next iteration
1871 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
1872 talloc_free(mem_ctx);
1873 return MONITOR_FAILED;
1876 /* set up the callback functions */
1877 state->async.fn = verify_recmode_normal_callback;
1878 state->async.private_data = rmdata;
1880 /* one more control to wait for to complete */
1881 rmdata->count++;
1885 /* now wait for up to the maximum number of seconds allowed
1886 or until all nodes we expect a response from have replied
1888 while (rmdata->count > 0) {
1889 tevent_loop_once(ctdb->ev);
1892 status = rmdata->status;
1893 talloc_free(mem_ctx);
1894 return status;
1898 struct verify_recmaster_data {
1899 struct ctdb_recoverd *rec;
1900 uint32_t count;
1901 uint32_t pnn;
1902 enum monitor_result status;
1905 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
1907 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
1910 /* one more node has responded with recmaster data*/
1911 rmdata->count--;
1913 /* if we failed to get the recmaster, then return an error and let
1914 the main loop try again.
1916 if (state->state != CTDB_CONTROL_DONE) {
1917 if (rmdata->status == MONITOR_OK) {
1918 rmdata->status = MONITOR_FAILED;
1920 return;
1923 /* if we got a response, then the recmaster will be stored in the
1924 status field
1926 if ((uint32_t)state->status != rmdata->pnn) {
1927 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
1928 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
1929 rmdata->status = MONITOR_ELECTION_NEEDED;
1932 return;
1936 /* verify that all nodes agree that we are the recmaster */
1937 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, uint32_t pnn)
1939 struct ctdb_context *ctdb = rec->ctdb;
1940 struct verify_recmaster_data *rmdata;
1941 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1942 struct ctdb_client_control_state *state;
1943 enum monitor_result status;
1944 unsigned int j;
1946 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
1947 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
1948 rmdata->rec = rec;
1949 rmdata->count = 0;
1950 rmdata->pnn = pnn;
1951 rmdata->status = MONITOR_OK;
1953 /* loop over all active nodes and send an async getrecmaster call to
1954 them*/
1955 for (j=0; j<nodemap->num; j++) {
1956 if (nodemap->nodes[j].pnn == rec->recmaster) {
1957 continue;
1959 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1960 continue;
1962 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
1963 CONTROL_TIMEOUT(),
1964 nodemap->nodes[j].pnn);
1965 if (state == NULL) {
1966 /* we failed to send the control, treat this as
1967 an error and try again next iteration
1969 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
1970 talloc_free(mem_ctx);
1971 return MONITOR_FAILED;
1974 /* set up the callback functions */
1975 state->async.fn = verify_recmaster_callback;
1976 state->async.private_data = rmdata;
1978 /* one more control to wait for to complete */
1979 rmdata->count++;
1983 /* now wait for up to the maximum number of seconds allowed
1984 or until all nodes we expect a response from have replied
1986 while (rmdata->count > 0) {
1987 tevent_loop_once(ctdb->ev);
1990 status = rmdata->status;
1991 talloc_free(mem_ctx);
1992 return status;
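/*
 * Compare the local interfaces (names and link states) against the set
 * seen on the previous iteration.  Any difference is reported as a
 * change, which callers use to trigger a takeover run.
 */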
1995 static bool interfaces_have_changed(struct ctdb_context *ctdb,
1996 struct ctdb_recoverd *rec)
1998 struct ctdb_iface_list_old *ifaces = NULL;
1999 TALLOC_CTX *mem_ctx;
2000 bool ret = false;
2002 mem_ctx = talloc_new(NULL);
2004 /* Read the interfaces from the local node */
2005 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
2006 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
2007 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
2008 /* We could return an error. However, this will be
2009 * rare so we'll decide that the interfaces have
2010 * actually changed, just in case.
2012 talloc_free(mem_ctx);
2013 return true;
2016 if (!rec->ifaces) {
2017 /* We haven't been here before so things have changed */
2018 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
2019 ret = true;
2020 } else if (rec->ifaces->num != ifaces->num) {
2021 /* Number of interfaces has changed */
2022 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
2023 rec->ifaces->num, ifaces->num));
2024 ret = true;
2025 } else {
2026 /* See if interface names or link states have changed */
2027 unsigned int i;
2028 for (i = 0; i < rec->ifaces->num; i++) {
2029 struct ctdb_iface * iface = &rec->ifaces->ifaces[i];
2030 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
2031 DEBUG(DEBUG_NOTICE,
2032 ("Interface in slot %d changed: %s => %s\n",
2033 i, iface->name, ifaces->ifaces[i].name));
2034 ret = true;
2035 break;
2037 if (iface->link_state != ifaces->ifaces[i].link_state) {
2038 DEBUG(DEBUG_NOTICE,
2039 ("Interface %s changed state: %d => %d\n",
2040 iface->name, iface->link_state,
2041 ifaces->ifaces[i].link_state));
2042 ret = true;
2043 break;
2048 talloc_free(rec->ifaces);
2049 rec->ifaces = talloc_steal(rec, ifaces);
2051 talloc_free(mem_ctx);
2052 return ret;
2055 /* Check that the local allocation of public IP addresses is correct
2056 * and do some house-keeping */
2057 static int verify_local_ip_allocation(struct ctdb_context *ctdb,
2058 struct ctdb_recoverd *rec,
2059 uint32_t pnn,
2060 struct ctdb_node_map_old *nodemap)
2062 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2063 unsigned int j;
2064 int ret;
2065 bool need_takeover_run = false;
2066 struct ctdb_public_ip_list_old *ips = NULL;
2068 /* If we are not the recmaster then do some housekeeping */
2069 if (rec->recmaster != pnn) {
2070 /* Ignore any IP reallocate requests - only recmaster
2071 * processes them
2073 TALLOC_FREE(rec->reallocate_requests);
2074 /* Clear any nodes that should be force rebalanced in
2075 * the next takeover run. If the recovery master role
2076 * has moved then we don't want to process these some
2077 * time in the future.
2079 TALLOC_FREE(rec->force_rebalance_nodes);
2082 /* Return early if disabled... */
2083 if (ctdb_config.failover_disabled ||
2084 ctdb_op_is_disabled(rec->takeover_run)) {
2085 talloc_free(mem_ctx);
2086 return 0;
2089 if (interfaces_have_changed(ctdb, rec)) {
2090 need_takeover_run = true;
2093 /* If there are unhosted IPs but this node can host them then
2094 * trigger an IP reallocation */
2096 /* Read *available* IPs from local node */
2097 ret = ctdb_ctrl_get_public_ips_flags(
2098 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx,
2099 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
2100 if (ret != 0) {
2101 DEBUG(DEBUG_ERR, ("Unable to retrieve available public IPs\n"));
2102 talloc_free(mem_ctx);
2103 return -1;
2106 for (j=0; j<ips->num; j++) {
2107 if (ips->ips[j].pnn == CTDB_UNKNOWN_PNN &&
2108 nodemap->nodes[pnn].flags == 0) {
2109 DEBUG(DEBUG_WARNING,
2110 ("Unassigned IP %s can be served by this node\n",
2111 ctdb_addr_to_str(&ips->ips[j].addr)));
2112 need_takeover_run = true;
2116 talloc_free(ips);
2118 if (!ctdb->do_checkpublicip) {
2119 goto done;
2122 /* Validate the IP addresses that this node has on network
2123 * interfaces. If there is an inconsistency between reality
2124 * and the state expected by CTDB then try to fix it by
2125 * triggering an IP reallocation or releasing extraneous IP
2126 * addresses. */
2128 /* Read *known* IPs from local node */
2129 ret = ctdb_ctrl_get_public_ips_flags(
2130 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
2131 if (ret != 0) {
2132 DEBUG(DEBUG_ERR, ("Unable to retrieve known public IPs\n"));
2133 talloc_free(mem_ctx);
2134 return -1;
2137 for (j=0; j<ips->num; j++) {
2138 if (ips->ips[j].pnn == pnn) {
2139 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2140 DEBUG(DEBUG_ERR,
2141 ("Assigned IP %s not on an interface\n",
2142 ctdb_addr_to_str(&ips->ips[j].addr)));
2143 need_takeover_run = true;
2145 } else {
2146 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2147 DEBUG(DEBUG_ERR,
2148 ("IP %s incorrectly on an interface\n",
2149 ctdb_addr_to_str(&ips->ips[j].addr)));
2150 need_takeover_run = true;
2155 done:
2156 if (need_takeover_run) {
2157 struct ctdb_srvid_message rd;
2158 TDB_DATA data;
2160 DEBUG(DEBUG_NOTICE,("Trigger takeoverrun\n"));
2162 ZERO_STRUCT(rd);
2163 rd.pnn = ctdb->pnn;
2164 rd.srvid = 0;
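/* A zero srvid indicates that no reply to this request is expected */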
2165 data.dptr = (uint8_t *)&rd;
2166 data.dsize = sizeof(rd);
2168 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2169 if (ret != 0) {
2170 DEBUG(DEBUG_ERR,
2171 ("Failed to send takeover run request\n"));
2174 talloc_free(mem_ctx);
2175 return 0;
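/* Store each remote node's nodemap, indexed by PNN, as the asynchronous
 * CTDB_CONTROL_GET_NODEMAP replies arrive.
 */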
2179 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2181 struct ctdb_node_map_old **remote_nodemaps = callback_data;
2183 if (node_pnn >= ctdb->num_nodes) {
2184 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2185 return;
2188 remote_nodemaps[node_pnn] = (struct ctdb_node_map_old *)talloc_steal(remote_nodemaps, outdata.dptr);
2192 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2193 struct ctdb_node_map_old *nodemap,
2194 struct ctdb_node_map_old **remote_nodemaps)
2196 uint32_t *nodes;
2198 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2199 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2200 nodes, 0,
2201 CONTROL_TIMEOUT(), false, tdb_null,
2202 async_getnodemap_callback,
2203 NULL,
2204 remote_nodemaps) != 0) {
2205 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
2207 return -1;
2210 return 0;
2213 static bool validate_recovery_master(struct ctdb_recoverd *rec,
2214 TALLOC_CTX *mem_ctx)
2216 struct ctdb_context *ctdb = rec->ctdb;
2217 uint32_t pnn = ctdb_get_pnn(ctdb);
2218 struct ctdb_node_map_old *nodemap = rec->nodemap;
2219 struct ctdb_node_map_old *recmaster_nodemap = NULL;
2220 int ret;
2222 /* When recovery daemon is started, recmaster is set to
2223 * "unknown" so it knows to start an election.
2225 if (rec->recmaster == CTDB_UNKNOWN_PNN) {
2226 DEBUG(DEBUG_NOTICE,
2227 ("Initial recovery master set - forcing election\n"));
2228 force_election(rec, pnn, nodemap);
2229 return false;
2233 * If the current recmaster does not have CTDB_CAP_RECMASTER,
2234 * but we have, then force an election and try to become the new
2235 * recmaster.
2237 if (!ctdb_node_has_capabilities(rec->caps,
2238 rec->recmaster,
2239 CTDB_CAP_RECMASTER) &&
2240 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
2241 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
2242 DEBUG(DEBUG_ERR,
2243 (" Current recmaster node %u does not have CAP_RECMASTER,"
2244 " but we (node %u) have - force an election\n",
2245 rec->recmaster, pnn));
2246 force_election(rec, pnn, nodemap);
2247 return false;
2250 /* Verify that the master node has not been deleted. This
2251 * should not happen because a node should always be shutdown
2252 * before being deleted, causing a new master to be elected
2253 * before now. However, if something strange has happened
2254 * then checking here will ensure we don't index beyond the
2255 * end of the nodemap array. */
2256 if (rec->recmaster >= nodemap->num) {
2257 DEBUG(DEBUG_ERR,
2258 ("Recmaster node %u has been deleted. Force election\n",
2259 rec->recmaster));
2260 force_election(rec, pnn, nodemap);
2261 return false;
2264 /* if recovery master is disconnected/deleted we must elect a new recmaster */
2265 if (nodemap->nodes[rec->recmaster].flags &
2266 (NODE_FLAGS_DISCONNECTED|NODE_FLAGS_DELETED)) {
2267 DEBUG(DEBUG_NOTICE,
2268 ("Recmaster node %u is disconnected/deleted. Force election\n",
2269 rec->recmaster));
2270 force_election(rec, pnn, nodemap);
2271 return false;
2274 /* get nodemap from the recovery master to check if it is inactive */
2275 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2276 mem_ctx, &recmaster_nodemap);
2277 if (ret != 0) {
2278 DEBUG(DEBUG_ERR,
2279 (__location__
2280 " Unable to get nodemap from recovery master %u\n",
2281 rec->recmaster));
2282 /* No election, just error */
2283 return false;
2287 if ((recmaster_nodemap->nodes[rec->recmaster].flags & NODE_FLAGS_INACTIVE) &&
2288 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
2289 DEBUG(DEBUG_NOTICE,
2290 ("Recmaster node %u is inactive. Force election\n",
2291 rec->recmaster));
2293 * update our nodemap to carry the recmaster's notion of
2294 * its own flags, so that we don't keep freezing the
2295 * inactive recmaster node...
2297 nodemap->nodes[rec->recmaster].flags =
2298 recmaster_nodemap->nodes[rec->recmaster].flags;
2299 force_election(rec, pnn, nodemap);
2300 return false;
2303 return true;
2306 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
2307 TALLOC_CTX *mem_ctx)
2309 uint32_t pnn;
2310 struct ctdb_node_map_old *nodemap=NULL;
2311 struct ctdb_node_map_old **remote_nodemaps=NULL;
2312 struct ctdb_vnn_map *vnnmap=NULL;
2313 struct ctdb_vnn_map *remote_vnnmap=NULL;
2314 uint32_t num_lmasters;
2315 int32_t debug_level;
2316 unsigned int i, j;
2317 int ret;
2318 bool self_ban;
2321 /* verify that the main daemon is still running */
2322 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
2323 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2324 exit(-1);
2327 /* ping the local daemon to tell it we are alive */
2328 ctdb_ctrl_recd_ping(ctdb);
2330 if (rec->election_timeout) {
2331 /* an election is in progress */
2332 return;
2335 /* read the debug level from the parent and update locally */
2336 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2337 if (ret !=0) {
2338 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2339 return;
2341 debuglevel_set(debug_level);
2343 /* get relevant tunables */
2344 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2345 if (ret != 0) {
2346 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2347 return;
2350 /* get runstate */
2351 ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
2352 CTDB_CURRENT_NODE, &ctdb->runstate);
2353 if (ret != 0) {
2354 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
2355 return;
2358 pnn = ctdb_get_pnn(ctdb);
2360 /* get nodemap */
2361 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &nodemap);
2362 if (ret != 0) {
2363 DBG_ERR("Unable to get nodemap from node %"PRIu32"\n", pnn);
2364 return;
2366 talloc_free(rec->nodemap);
2367 rec->nodemap = nodemap;
2369 /* remember our own node flags */
2370 rec->node_flags = nodemap->nodes[pnn].flags;
2372 ban_misbehaving_nodes(rec, &self_ban);
2373 if (self_ban) {
2374 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
2375 return;
2378 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2379 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2380 if (ret != 0) {
2381 D_ERR("Failed to read recmode from local node\n");
2382 return;
2385 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
2386 also frozen and that the recmode is set to active.
2388 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
2389 /* If this node has become inactive then we want to
2390 * reduce the chances of it taking over the recovery
2391 * master role when it becomes active again. This
2392 * helps to stabilise the recovery master role so that
2393 * it stays on the most stable node.
2395 rec->priority_time = timeval_current();
2397 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2398 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
2400 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2401 if (ret != 0) {
2402 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
2404 return;
2407 if (! rec->frozen_on_inactive) {
2408 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(),
2409 CTDB_CURRENT_NODE);
2410 if (ret != 0) {
2411 DEBUG(DEBUG_ERR,
2412 (__location__ " Failed to freeze node "
2413 "in STOPPED or BANNED state\n"));
2414 return;
2417 rec->frozen_on_inactive = true;
2420 /* If this node is stopped or banned then it is not the recovery
2421 * master, so don't do anything. This prevents a stopped or banned
2422 * node from starting an election and sending unnecessary controls.
2424 return;
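/* This node is not stopped or banned, so clear the inactive-freeze marker */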
2427 rec->frozen_on_inactive = false;
2429 /* Retrieve capabilities from all connected nodes */
2430 ret = update_capabilities(rec, nodemap);
2431 if (ret != 0) {
2432 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2433 return;
2436 if (! validate_recovery_master(rec, mem_ctx)) {
2437 return;
2440 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2441 /* Check if an IP takeover run is needed and trigger one if
2442 * necessary */
2443 verify_local_ip_allocation(ctdb, rec, pnn, nodemap);
2446 /* if we are not the recmaster then we do not need to check
2447 if recovery is needed
2449 if (pnn != rec->recmaster) {
2450 return;
2454 /* ensure our local copies of flags are right */
2455 ret = update_local_flags(rec, nodemap);
2456 if (ret != 0) {
2457 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
2458 return;
2461 if (ctdb->num_nodes != nodemap->num) {
2462 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
2463 ctdb_load_nodes_file(ctdb);
2464 return;
2467 /* verify that all active nodes agree that we are the recmaster */
2468 switch (verify_recmaster(rec, nodemap, pnn)) {
2469 case MONITOR_RECOVERY_NEEDED:
2470 /* can not happen */
2471 return;
2472 case MONITOR_ELECTION_NEEDED:
2473 force_election(rec, pnn, nodemap);
2474 return;
2475 case MONITOR_OK:
2476 break;
2477 case MONITOR_FAILED:
2478 return;
2482 /* get the vnnmap */
2483 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2484 if (ret != 0) {
2485 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2486 return;
2489 if (rec->need_recovery) {
2490 /* a previous recovery didn't finish */
2491 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2492 return;
2495 /* verify that all active nodes are in normal mode
2496 and not in recovery mode
2498 switch (verify_recmode(ctdb, nodemap)) {
2499 case MONITOR_RECOVERY_NEEDED:
2500 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2501 return;
2502 case MONITOR_FAILED:
2503 return;
2504 case MONITOR_ELECTION_NEEDED:
2505 /* can not happen */
2506 case MONITOR_OK:
2507 break;
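/* When a recovery lock is configured the recovery master must already
 * hold it; losing it may mean another node has taken over, so force a
 * recovery.
 */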
2511 if (ctdb->recovery_lock != NULL) {
2512 /* We must already hold the recovery lock */
2513 if (!ctdb_recovery_have_lock(rec)) {
2514 DEBUG(DEBUG_ERR,("Failed recovery lock sanity check. Force a recovery\n"));
2515 ctdb_set_culprit(rec, ctdb->pnn);
2516 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2517 return;
2522 /* If recoveries are disabled then there is no use doing any
2523 * nodemap or flags checks. Recoveries might be disabled due
2524 * to "reloadnodes", so doing these checks might cause an
2525 * unnecessary recovery. */
2526 if (ctdb_op_is_disabled(rec->recovery)) {
2527 goto takeover_run_checks;
2530 /* get the nodemap for all active remote nodes
2532 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map_old *, nodemap->num);
2533 if (remote_nodemaps == NULL) {
2534 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
2535 return;
2537 for(i=0; i<nodemap->num; i++) {
2538 remote_nodemaps[i] = NULL;
2540 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
2541 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
2542 return;
2545 /* verify that all other nodes have the same nodemap as we have
2547 for (j=0; j<nodemap->num; j++) {
2548 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2549 continue;
2552 if (remote_nodemaps[j] == NULL) {
2553 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
2554 ctdb_set_culprit(rec, j);
2556 return;
2559 /* if the nodes disagree on how many nodes there are
2560 then this is a good reason to try recovery
2562 if (remote_nodemaps[j]->num != nodemap->num) {
2563 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
2564 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
2565 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2566 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2567 return;
2570 /* if the nodes disagree on which nodes exist and are
2571 active, then that is also a good reason to do recovery
2573 for (i=0;i<nodemap->num;i++) {
2574 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
2575 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
2576 nodemap->nodes[j].pnn, i,
2577 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
2578 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2579 do_recovery(rec, mem_ctx, pnn, nodemap,
2580 vnnmap);
2581 return;
2587 * Update node flags obtained from each active node. This ensures we have
2588 * up-to-date information for all the nodes.
2590 for (j=0; j<nodemap->num; j++) {
2591 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2592 continue;
2594 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
2597 for (j=0; j<nodemap->num; j++) {
2598 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2599 continue;
2602 /* verify the flags are consistent
2604 for (i=0; i<nodemap->num; i++) {
2605 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2606 continue;
2609 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
2610 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
2611 nodemap->nodes[j].pnn,
2612 nodemap->nodes[i].pnn,
2613 remote_nodemaps[j]->nodes[i].flags,
2614 nodemap->nodes[i].flags));
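/* A node is authoritative for its own flags (i == j); for any other
 * node the recovery master's local view is pushed out instead.
 */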
2615 if (i == j) {
2616 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
2617 update_flags_on_all_nodes(
2618 ctdb,
2619 nodemap->nodes[i].pnn,
2620 remote_nodemaps[j]->nodes[i].flags);
2621 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2622 do_recovery(rec, mem_ctx, pnn, nodemap,
2623 vnnmap);
2624 return;
2625 } else {
2626 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
2627 update_flags_on_all_nodes(
2628 ctdb,
2629 nodemap->nodes[i].pnn,
2630 nodemap->nodes[i].flags);
2631 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2632 do_recovery(rec, mem_ctx, pnn, nodemap,
2633 vnnmap);
2634 return;
2641 /* count how many active nodes have the lmaster capability */
2642 num_lmasters = 0;
2643 for (i=0; i<nodemap->num; i++) {
2644 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2645 if (ctdb_node_has_capabilities(rec->caps,
2646 ctdb->nodes[i]->pnn,
2647 CTDB_CAP_LMASTER)) {
2648 num_lmasters++;
2654 /* There must be the same number of lmasters in the vnn map as
2655 * there are active nodes with the lmaster capability... or
2656 * do a recovery.
2658 if (vnnmap->size != num_lmasters) {
2659 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
2660 vnnmap->size, num_lmasters));
2661 ctdb_set_culprit(rec, ctdb->pnn);
2662 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2663 return;
2667 * Verify that all active lmaster nodes in the nodemap also
2668 * exist in the vnnmap
2670 for (j=0; j<nodemap->num; j++) {
2671 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2672 continue;
2674 if (! ctdb_node_has_capabilities(rec->caps,
2675 nodemap->nodes[j].pnn,
2676 CTDB_CAP_LMASTER)) {
2677 continue;
2679 if (nodemap->nodes[j].pnn == pnn) {
2680 continue;
2683 for (i=0; i<vnnmap->size; i++) {
2684 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
2685 break;
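/* The loop ran to completion without a match, so this lmaster is missing from the vnnmap */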
2688 if (i == vnnmap->size) {
2689 D_ERR("Active LMASTER node %u is not in the vnnmap\n",
2690 nodemap->nodes[j].pnn);
2691 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2692 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2693 return;
2698 /* verify that all other nodes have the same vnnmap
2699 and are from the same generation
2701 for (j=0; j<nodemap->num; j++) {
2702 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2703 continue;
2705 if (nodemap->nodes[j].pnn == pnn) {
2706 continue;
2709 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2710 mem_ctx, &remote_vnnmap);
2711 if (ret != 0) {
2712 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
2713 nodemap->nodes[j].pnn));
2714 return;
2717 /* verify the vnnmap generation is the same */
2718 if (vnnmap->generation != remote_vnnmap->generation) {
2719 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
2720 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
2721 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2722 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2723 return;
2726 /* verify the vnnmap size is the same */
2727 if (vnnmap->size != remote_vnnmap->size) {
2728 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
2729 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
2730 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2731 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2732 return;
2735 /* verify the vnnmap is the same */
2736 for (i=0;i<vnnmap->size;i++) {
2737 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
2738 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
2739 nodemap->nodes[j].pnn));
2740 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2741 do_recovery(rec, mem_ctx, pnn, nodemap,
2742 vnnmap);
2743 return;
2748 /* FIXME: Add remote public IP checking to ensure that nodes
2749 * have the IP addresses that are allocated to them. */
2751 takeover_run_checks:
2753 /* If there are IP takeover runs requested or the previous one
2754 * failed then perform one and notify the waiters */
2755 if (!ctdb_op_is_disabled(rec->takeover_run) &&
2756 (rec->reallocate_requests || rec->need_takeover_run)) {
2757 process_ipreallocate_requests(ctdb, rec);
2761 static void recd_sig_term_handler(struct tevent_context *ev,
2762 struct tevent_signal *se, int signum,
2763 int count, void *dont_care,
2764 void *private_data)
2766 struct ctdb_recoverd *rec = talloc_get_type_abort(
2767 private_data, struct ctdb_recoverd);
2769 DEBUG(DEBUG_ERR, ("Received SIGTERM, exiting\n"));
2770 ctdb_recovery_unlock(rec);
2771 exit(0);
2775 * Periodically log elements of the cluster state
2777 * This can be used to confirm a split brain has occurred
2779 static void maybe_log_cluster_state(struct tevent_context *ev,
2780 struct tevent_timer *te,
2781 struct timeval current_time,
2782 void *private_data)
2784 struct ctdb_recoverd *rec = talloc_get_type_abort(
2785 private_data, struct ctdb_recoverd);
2786 struct ctdb_context *ctdb = rec->ctdb;
2787 struct tevent_timer *tt;
2789 static struct timeval start_incomplete = {
2790 .tv_sec = 0,
2793 bool is_complete;
2794 bool was_complete;
2795 unsigned int i;
2796 double seconds;
2797 unsigned int minutes;
2798 unsigned int num_connected;
2800 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
2801 goto done;
2804 if (rec->nodemap == NULL) {
2805 goto done;
2808 is_complete = true;
2809 num_connected = 0;
2810 for (i = 0; i < rec->nodemap->num; i++) {
2811 struct ctdb_node_and_flags *n = &rec->nodemap->nodes[i];
2813 if (n->pnn == ctdb_get_pnn(ctdb)) {
2814 continue;
2816 if ((n->flags & NODE_FLAGS_DELETED) != 0) {
2817 continue;
2819 if ((n->flags & NODE_FLAGS_DISCONNECTED) != 0) {
2820 is_complete = false;
2821 continue;
2824 num_connected++;
2827 was_complete = timeval_is_zero(&start_incomplete);
2829 if (is_complete) {
2830 if (! was_complete) {
2831 D_WARNING("Cluster complete with master=%u\n",
2832 rec->recmaster);
2833 start_incomplete = timeval_zero();
2835 goto done;
2838 /* Cluster is newly incomplete... */
2839 if (was_complete) {
2840 start_incomplete = current_time;
2841 minutes = 0;
2842 goto log;
2846 * Cluster has been incomplete since previous check, so figure
2847 * out how long (in minutes) and decide whether to log anything
2849 seconds = timeval_elapsed2(&start_incomplete, &current_time);
2850 minutes = (unsigned int)seconds / 60;
2851 if (minutes >= 60) {
2852 /* Over an hour, log every hour */
2853 if (minutes % 60 != 0) {
2854 goto done;
2856 } else if (minutes >= 10) {
2857 /* Over 10 minutes, log every 10 minutes */
2858 if (minutes % 10 != 0) {
2859 goto done;
2863 log:
2864 D_WARNING("Cluster incomplete with master=%u, elapsed=%u minutes, "
2865 "connected=%u\n",
2866 rec->recmaster,
2867 minutes,
2868 num_connected);
2870 done:
2871 tt = tevent_add_timer(ctdb->ev,
2872 rec,
2873 timeval_current_ofs(60, 0),
2874 maybe_log_cluster_state,
2875 rec);
2876 if (tt == NULL) {
2877 DBG_WARNING("Failed to set up cluster state timer\n");
2882 the main monitoring loop
2884 static void monitor_cluster(struct ctdb_context *ctdb)
2886 struct tevent_signal *se;
2887 struct ctdb_recoverd *rec;
2889 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
2891 rec = talloc_zero(ctdb, struct ctdb_recoverd);
2892 CTDB_NO_MEMORY_FATAL(ctdb, rec);
2894 rec->ctdb = ctdb;
2895 rec->recmaster = CTDB_UNKNOWN_PNN;
2896 rec->recovery_lock_handle = NULL;
2898 rec->takeover_run = ctdb_op_init(rec, "takeover runs");
2899 CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);
2901 rec->recovery = ctdb_op_init(rec, "recoveries");
2902 CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);
2904 rec->priority_time = timeval_current();
2905 rec->frozen_on_inactive = false;
2907 se = tevent_add_signal(ctdb->ev, ctdb, SIGTERM, 0,
2908 recd_sig_term_handler, rec);
2909 if (se == NULL) {
2910 DEBUG(DEBUG_ERR, ("Failed to install SIGTERM handler\n"));
2911 exit(1);
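/* With no recovery lock configured a split brain is harder to rule out,
 * so periodically log the cluster state to help detect one.
 */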
2914 if (ctdb->recovery_lock == NULL) {
2915 struct tevent_timer *tt;
2917 tt = tevent_add_timer(ctdb->ev,
2918 rec,
2919 timeval_current_ofs(60, 0),
2920 maybe_log_cluster_state,
2921 rec);
2922 if (tt == NULL) {
2923 DBG_WARNING("Failed to set up cluster state timer\n");
2927 /* register a message port for sending memory dumps */
2928 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
2930 /* when a node is assigned banning credits */
2931 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING,
2932 banning_handler, rec);
2934 /* register a message port for recovery elections */
2935 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec);
2937 /* when nodes are disabled/enabled */
2938 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
2940 /* when we are asked to push out a flag change */
2941 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
2943 /* register a message port for reloadnodes */
2944 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
2946 /* register a message port for performing a takeover run */
2947 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
2949 /* register a message port for disabling the ip check for a short while */
2950 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
2952 /* register a message port for forcing a rebalance of a node next
2953 reallocation */
2954 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
2956 /* Register a message port for disabling takeover runs */
2957 ctdb_client_set_message_handler(ctdb,
2958 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
2959 disable_takeover_runs_handler, rec);
2961 /* Register a message port for disabling recoveries */
2962 ctdb_client_set_message_handler(ctdb,
2963 CTDB_SRVID_DISABLE_RECOVERIES,
2964 disable_recoveries_handler, rec);
2966 for (;;) {
2967 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2968 struct timeval start;
2969 double elapsed;
2971 if (!mem_ctx) {
2972 DEBUG(DEBUG_CRIT,(__location__
2973 " Failed to create temp context\n"));
2974 exit(-1);
2977 start = timeval_current();
2978 main_loop(ctdb, rec, mem_ctx);
2979 talloc_free(mem_ctx);
2981 /* we only run main_loop at most once per recovery interval */
2982 elapsed = timeval_elapsed(&start);
2983 if (elapsed < ctdb->tunable.recover_interval) {
2984 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
2985 - elapsed);
2991 event handler for when the main ctdbd dies
2993 static void ctdb_recoverd_parent(struct tevent_context *ev,
2994 struct tevent_fd *fde,
2995 uint16_t flags, void *private_data)
2997 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
2998 _exit(1);
3002 called regularly to verify that the recovery daemon is still running
3004 static void ctdb_check_recd(struct tevent_context *ev,
3005 struct tevent_timer *te,
3006 struct timeval yt, void *p)
3008 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
3010 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
3011 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
3013 tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
3014 ctdb_restart_recd, ctdb);
3016 return;
3019 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3020 timeval_current_ofs(30, 0),
3021 ctdb_check_recd, ctdb);
3024 static void recd_sig_child_handler(struct tevent_context *ev,
3025 struct tevent_signal *se, int signum,
3026 int count, void *dont_care,
3027 void *private_data)
3029 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3030 int status;
3031 pid_t pid = -1;
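/* Reap all exited children without blocking */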
3033 while (pid != 0) {
3034 pid = waitpid(-1, &status, WNOHANG);
3035 if (pid == -1) {
3036 if (errno != ECHILD) {
3037 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3039 return;
3041 if (pid > 0) {
3042 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3048 start up the recovery daemon as a child of the main ctdb daemon
3050 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3052 int fd[2];
3053 struct tevent_signal *se;
3054 struct tevent_fd *fde;
3055 int ret;
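/* The pipe is used for parent-death detection: the main daemon keeps the
 * write end open and never writes, so when it exits the recovery
 * daemon's read end becomes readable (EOF) and ctdb_recoverd_parent()
 * exits the recovery daemon.
 */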
3057 if (pipe(fd) != 0) {
3058 return -1;
3061 ctdb->recoverd_pid = ctdb_fork(ctdb);
3062 if (ctdb->recoverd_pid == -1) {
3063 return -1;
3066 if (ctdb->recoverd_pid != 0) {
3067 talloc_free(ctdb->recd_ctx);
3068 ctdb->recd_ctx = talloc_new(ctdb);
3069 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
3071 close(fd[0]);
3072 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3073 timeval_current_ofs(30, 0),
3074 ctdb_check_recd, ctdb);
3075 return 0;
3078 close(fd[1]);
3080 srandom(getpid() ^ time(NULL));
3082 ret = logging_init(ctdb, NULL, NULL, "ctdb-recoverd");
3083 if (ret != 0) {
3084 return -1;
3087 prctl_set_comment("ctdb_recoverd");
3088 if (switch_from_server_to_client(ctdb) != 0) {
3089 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3090 exit(1);
3093 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
3095 fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
3096 ctdb_recoverd_parent, &fd[0]);
3097 tevent_fd_set_auto_close(fde);
3099 /* set up a handler to pick up sigchld */
3100 se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
3101 recd_sig_child_handler, ctdb);
3102 if (se == NULL) {
3103 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3104 exit(1);
3107 monitor_cluster(ctdb);
3109 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3110 return -1;
3114 shutdown the recovery daemon
3116 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3118 if (ctdb->recoverd_pid == 0) {
3119 return;
3122 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3123 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
3125 TALLOC_FREE(ctdb->recd_ctx);
3126 TALLOC_FREE(ctdb->recd_ping_count);
3129 static void ctdb_restart_recd(struct tevent_context *ev,
3130 struct tevent_timer *te,
3131 struct timeval t, void *private_data)
3133 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3135 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
3136 ctdb_stop_recoverd(ctdb);
3137 ctdb_start_recoverd(ctdb);