ctdb-recoverd: Clean up logging on failure to take recovery lock
[Samba.git] / ctdb / server / ctdb_recoverd.c
blob: ed055bdcdfed26a60262c4efb8294450fe957510
1 /*
2 ctdb recovery daemon
4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
20 #include "replace.h"
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
26 #include <popt.h>
27 #include <talloc.h>
28 #include <tevent.h>
29 #include <tdb.h>
31 #include "lib/tdb_wrap/tdb_wrap.h"
32 #include "lib/util/dlinklist.h"
33 #include "lib/util/debug.h"
34 #include "lib/util/samba_util.h"
35 #include "lib/util/sys_rw.h"
36 #include "lib/util/util_process.h"
38 #include "ctdb_private.h"
39 #include "ctdb_client.h"
41 #include "common/system_socket.h"
42 #include "common/common.h"
43 #include "common/logging.h"
45 #include "server/ctdb_config.h"
47 #include "ctdb_cluster_mutex.h"
49 /* List of SRVID requests that need to be processed */
50 struct srvid_list {
51 struct srvid_list *next, *prev;
52 struct ctdb_srvid_message *request;
55 struct srvid_requests {
56 struct srvid_list *requests;
59 static void srvid_request_reply(struct ctdb_context *ctdb,
60 struct ctdb_srvid_message *request,
61 TDB_DATA result)
63 /* Someone that sent srvid==0 does not want a reply */
64 if (request->srvid == 0) {
65 talloc_free(request);
66 return;
69 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
70 result) == 0) {
71 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
72 (unsigned)request->pnn,
73 (unsigned long long)request->srvid));
74 } else {
75 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
76 (unsigned)request->pnn,
77 (unsigned long long)request->srvid));
80 talloc_free(request);
83 static void srvid_requests_reply(struct ctdb_context *ctdb,
84 struct srvid_requests **requests,
85 TDB_DATA result)
87 struct srvid_list *r;
89 if (*requests == NULL) {
90 return;
93 for (r = (*requests)->requests; r != NULL; r = r->next) {
94 srvid_request_reply(ctdb, r->request, result);
97 /* Free the list structure... */
98 TALLOC_FREE(*requests);
/*
  Queue an SRVID request so it can be answered later (via
  srvid_requests_reply).  Ownership of 'request' is taken.  On
  allocation failure the request is answered immediately with -ENOMEM
  instead of being queued.
 */
static void srvid_request_add(struct ctdb_context *ctdb,
			      struct srvid_requests **requests,
			      struct ctdb_srvid_message *request)
{
	struct srvid_list *t;
	int32_t ret;
	TDB_DATA result;

	/* Lazily create the list container on first use */
	if (*requests == NULL) {
		*requests = talloc_zero(ctdb, struct srvid_requests);
		if (*requests == NULL) {
			goto nomem;
		}
	}

	t = talloc_zero(*requests, struct srvid_list);
	if (t == NULL) {
		/* If *requests was just allocated above then free it */
		if ((*requests)->requests == NULL) {
			TALLOC_FREE(*requests);
		}
		goto nomem;
	}

	t->request = (struct ctdb_srvid_message *)talloc_steal(t, request);
	DLIST_ADD((*requests)->requests, t);

	return;

nomem:
	/* Failed to add the request to the list.  Send a fail. */
	DEBUG(DEBUG_ERR, (__location__
			  " Out of memory, failed to queue SRVID request\n"));
	ret = -ENOMEM;
	result.dsize = sizeof(ret);
	result.dptr = (uint8_t *)&ret;
	srvid_request_reply(ctdb, request, result);
}
/* An abstraction to allow an operation (takeover runs, recoveries,
 * ...) to be disabled for a given timeout */
struct ctdb_op_state {
	struct tevent_timer *timer;	/* non-NULL while disabled; fires to re-enable */
	bool in_progress;		/* operation currently running */
	const char *name;		/* human-readable name used in log messages */
};
148 static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
150 struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
152 if (state != NULL) {
153 state->in_progress = false;
154 state->name = name;
157 return state;
/* The operation is disabled exactly while a re-enable timer is pending */
static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
{
	return state->timer != NULL;
}
/* Mark the operation as running; refused (returns false) while disabled */
static bool ctdb_op_begin(struct ctdb_op_state *state)
{
	if (ctdb_op_is_disabled(state)) {
		DEBUG(DEBUG_NOTICE,
		      ("Unable to begin - %s are disabled\n", state->name));
		return false;
	}

	state->in_progress = true;
	return true;
}
177 static bool ctdb_op_end(struct ctdb_op_state *state)
179 return state->in_progress = false;
/* Query whether the operation is currently running */
static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
{
	return state->in_progress;
}
/* Re-enable the operation by cancelling any pending re-enable timer */
static void ctdb_op_enable(struct ctdb_op_state *state)
{
	TALLOC_FREE(state->timer);
}
/* Timer callback: the disable period has expired, re-enable the operation */
static void ctdb_op_timeout_handler(struct tevent_context *ev,
				    struct tevent_timer *te,
				    struct timeval yt, void *p)
{
	struct ctdb_op_state *state =
		talloc_get_type(p, struct ctdb_op_state);

	DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
	ctdb_op_enable(state);
}
/*
  Disable the operation for 'timeout' seconds, or re-enable it
  immediately when timeout is 0.  Returns 0 on success, -EAGAIN if the
  operation is currently running, -ENOMEM if the re-enable timer
  cannot be created.
 */
static int ctdb_op_disable(struct ctdb_op_state *state,
			   struct tevent_context *ev,
			   uint32_t timeout)
{
	if (timeout == 0) {
		DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
		ctdb_op_enable(state);
		return 0;
	}

	if (state->in_progress) {
		DEBUG(DEBUG_ERR,
		      ("Unable to disable %s - in progress\n", state->name));
		return -EAGAIN;
	}

	DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
			    state->name, timeout));

	/* Clear any old timers */
	talloc_free(state->timer);

	/* Arrange for the timeout to occur */
	state->timer = tevent_add_timer(ev, state,
					timeval_current_ofs(timeout, 0),
					ctdb_op_timeout_handler, state);
	if (state->timer == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
		return -ENOMEM;
	}

	return 0;
}
/* Per-node banning credit accumulator (see ctdb_set_culprit_count) */
struct ctdb_banning_state {
	uint32_t count;				/* accumulated banning credits */
	struct timeval last_reported_time;	/* last time this node misbehaved */
};

/* Opaque here; defined further down in this file */
struct ctdb_recovery_lock_handle;
/*
  private state of recovery daemon
 */
struct ctdb_recoverd {
	struct ctdb_context *ctdb;
	uint32_t recmaster;		/* PNN of current recovery master */
	uint32_t last_culprit_node;	/* last node given banning credits */
	struct ctdb_node_map_old *nodemap;
	struct timeval priority_time;
	bool need_takeover_run;		/* a takeover run is pending/required */
	bool need_recovery;
	uint32_t node_flags;		/* this node's own flags */
	struct tevent_timer *send_election_te;
	struct tevent_timer *election_timeout;	/* non-NULL while an election is running */
	struct srvid_requests *reallocate_requests;
	struct ctdb_op_state *takeover_run;	/* disable/progress state for takeover runs */
	struct ctdb_op_state *recovery;		/* disable/progress state for recoveries */
	struct ctdb_iface_list_old *ifaces;
	uint32_t *force_rebalance_nodes;	/* talloc array of PNNs to rebalance */
	struct ctdb_node_capabilities *caps;	/* cached cluster capabilities */
	bool frozen_on_inactive;
	struct ctdb_recovery_lock_handle *recovery_lock_handle;	/* non-NULL while lock held/being taken */
};
/* Timeouts derived from tunables; both expand to a timeval relative to now */
#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
#define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)

static void ctdb_restart_recd(struct tevent_context *ev,
			      struct tevent_timer *te, struct timeval t,
			      void *private_data);
/*
  ban a node for a period of time
 */
static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
{
	int ret;
	struct ctdb_context *ctdb = rec->ctdb;
	struct ctdb_ban_state bantime;

	if (!ctdb_validate_pnn(ctdb, pnn)) {
		DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
		return;
	}

	DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));

	bantime.pnn = pnn;
	bantime.time = ban_time;

	ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
		return;
	}
}
/* Outcome of a cluster monitoring pass */
enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
306 remember the trouble maker
308 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
310 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
311 struct ctdb_banning_state *ban_state;
313 if (culprit > ctdb->num_nodes) {
314 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
315 return;
318 /* If we are banned or stopped, do not set other nodes as culprits */
319 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
320 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
321 return;
324 if (ctdb->nodes[culprit]->ban_state == NULL) {
325 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
326 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
330 ban_state = ctdb->nodes[culprit]->ban_state;
331 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
332 /* this was the first time in a long while this node
333 misbehaved so we will forgive any old transgressions.
335 ban_state->count = 0;
338 ban_state->count += count;
339 ban_state->last_reported_time = timeval_current();
340 rec->last_culprit_node = culprit;
/*
  remember the trouble maker (single credit convenience wrapper)
 */
static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
{
	ctdb_set_culprit_count(rec, culprit, 1);
}
/*
  Retrieve capabilities from all connected nodes

  Caches the result in rec->caps and this node's own capabilities in
  ctdb->capabilities.  Returns 0 on success, -1 on failure.
 */
static int update_capabilities(struct ctdb_recoverd *rec,
			       struct ctdb_node_map_old *nodemap)
{
	uint32_t *capp;
	TALLOC_CTX *tmp_ctx;
	struct ctdb_node_capabilities *caps;
	struct ctdb_context *ctdb = rec->ctdb;

	tmp_ctx = talloc_new(rec);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	caps = ctdb_get_capabilities(ctdb, tmp_ctx,
				     CONTROL_TIMEOUT(), nodemap);

	if (caps == NULL) {
		DEBUG(DEBUG_ERR,
		      (__location__ " Failed to get node capabilities\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
	if (capp == NULL) {
		DEBUG(DEBUG_ERR,
		      (__location__
		       " Capabilities don't include current node.\n"));
		talloc_free(tmp_ctx);
		return -1;
	}
	/* cache this node's own capabilities */
	ctdb->capabilities = *capp;

	/* replace the previously cached cluster-wide capabilities */
	TALLOC_FREE(rec->caps);
	rec->caps = talloc_steal(rec, caps);

	talloc_free(tmp_ctx);
	return 0;
}
/*
  change recovery mode on all nodes

  Broadcasts CTDB_CONTROL_SET_RECMODE to all active nodes.  Returns 0
  on success, -1 if any node failed.
 */
static int set_recovery_mode(struct ctdb_context *ctdb,
			     struct ctdb_recoverd *rec,
			     struct ctdb_node_map_old *nodemap,
			     uint32_t rec_mode)
{
	TDB_DATA data;
	uint32_t *nodes;
	TALLOC_CTX *tmp_ctx;

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);

	data.dsize = sizeof(uint32_t);
	data.dptr = (unsigned char *)&rec_mode;

	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
				      nodes, 0,
				      CONTROL_TIMEOUT(),
				      false, data,
				      NULL, NULL,
				      NULL) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	talloc_free(tmp_ctx);
	return 0;
}
428 ensure all other nodes have attached to any databases that we have
430 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
431 uint32_t pnn, struct ctdb_dbid_map_old *dbmap, TALLOC_CTX *mem_ctx)
433 int i, j, db, ret;
434 struct ctdb_dbid_map_old *remote_dbmap;
436 /* verify that all other nodes have all our databases */
437 for (j=0; j<nodemap->num; j++) {
438 /* we don't need to ourself ourselves */
439 if (nodemap->nodes[j].pnn == pnn) {
440 continue;
442 /* don't check nodes that are unavailable */
443 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
444 continue;
447 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
448 mem_ctx, &remote_dbmap);
449 if (ret != 0) {
450 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
451 return -1;
454 /* step through all local databases */
455 for (db=0; db<dbmap->num;db++) {
456 const char *name;
459 for (i=0;i<remote_dbmap->num;i++) {
460 if (dbmap->dbs[db].db_id == remote_dbmap->dbs[i].db_id) {
461 break;
464 /* the remote node already have this database */
465 if (i!=remote_dbmap->num) {
466 continue;
468 /* ok so we need to create this database */
469 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
470 dbmap->dbs[db].db_id, mem_ctx,
471 &name);
472 if (ret != 0) {
473 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
474 return -1;
476 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
477 nodemap->nodes[j].pnn,
478 mem_ctx, name,
479 dbmap->dbs[db].flags, NULL);
480 if (ret != 0) {
481 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
482 return -1;
487 return 0;
492 ensure we are attached to any databases that anyone else is attached to
494 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
495 uint32_t pnn, struct ctdb_dbid_map_old **dbmap, TALLOC_CTX *mem_ctx)
497 int i, j, db, ret;
498 struct ctdb_dbid_map_old *remote_dbmap;
500 /* verify that we have all database any other node has */
501 for (j=0; j<nodemap->num; j++) {
502 /* we don't need to ourself ourselves */
503 if (nodemap->nodes[j].pnn == pnn) {
504 continue;
506 /* don't check nodes that are unavailable */
507 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
508 continue;
511 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
512 mem_ctx, &remote_dbmap);
513 if (ret != 0) {
514 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
515 return -1;
518 /* step through all databases on the remote node */
519 for (db=0; db<remote_dbmap->num;db++) {
520 const char *name;
522 for (i=0;i<(*dbmap)->num;i++) {
523 if (remote_dbmap->dbs[db].db_id == (*dbmap)->dbs[i].db_id) {
524 break;
527 /* we already have this db locally */
528 if (i!=(*dbmap)->num) {
529 continue;
531 /* ok so we need to create this database and
532 rebuild dbmap
534 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
535 remote_dbmap->dbs[db].db_id, mem_ctx, &name);
536 if (ret != 0) {
537 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
538 nodemap->nodes[j].pnn));
539 return -1;
541 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn,
542 mem_ctx, name,
543 remote_dbmap->dbs[db].flags, NULL);
544 if (ret != 0) {
545 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
546 return -1;
548 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
549 if (ret != 0) {
550 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
551 return -1;
556 return 0;
/*
  update flags on all active nodes
 */
static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap, uint32_t pnn, uint32_t flags)
{
	int ret;

	ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
		return -1;
	}

	return 0;
}
/*
  called when a vacuum fetch has completed - just free it and do the next one
 */
static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
{
	talloc_free(state);
}
/**
 * Process one element of the vacuum fetch list:
 * Migrate it over to us with the special flag
 * CTDB_CALL_FLAG_VACUUM_MIGRATION.
 *
 * Returns false only when setting up the migration call fails;
 * skipped records (lock contention, missing/short/already-local
 * records) return true so the caller continues with the next record.
 */
static bool vacuum_fetch_process_one(struct ctdb_db_context *ctdb_db,
				     uint32_t pnn,
				     struct ctdb_rec_data_old *r)
{
	struct ctdb_client_call_state *state;
	TDB_DATA data;
	struct ctdb_ltdb_header *hdr;
	struct ctdb_call call;

	ZERO_STRUCT(call);
	call.call_id = CTDB_NULL_FUNC;
	call.flags = CTDB_IMMEDIATE_MIGRATION;
	call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;

	call.key.dptr = &r->data[0];
	call.key.dsize = r->keylen;

	/* ensure we don't block this daemon - just skip a record if we can't get
	   the chainlock */
	if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, call.key) != 0) {
		return true;
	}

	data = tdb_fetch(ctdb_db->ltdb->tdb, call.key);
	if (data.dptr == NULL) {
		tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
		return true;
	}

	/* record too short to even contain a header - skip it */
	if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
		free(data.dptr);
		tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
		return true;
	}

	hdr = (struct ctdb_ltdb_header *)data.dptr;
	if (hdr->dmaster == pnn) {
		/* its already local */
		free(data.dptr);
		tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
		return true;
	}

	free(data.dptr);

	/* the chainlock is held across ctdb_call_send and released
	 * immediately afterwards */
	state = ctdb_call_send(ctdb_db, &call);
	tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
	if (state == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
		return false;
	}
	state->async.fn = vacuum_fetch_callback;
	state->async.private_data = NULL;

	return true;
}
/*
  handler for vacuum fetch

  The message payload is a marshalled buffer of records for a single
  database; each record is migrated to this node in turn via
  vacuum_fetch_process_one().
 */
static void vacuum_fetch_handler(uint64_t srvid, TDB_DATA data,
				 void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(
		private_data, struct ctdb_recoverd);
	struct ctdb_context *ctdb = rec->ctdb;
	struct ctdb_marshall_buffer *recs;
	int ret, i;
	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
	const char *name;
	struct ctdb_dbid_map_old *dbmap=NULL;
	uint8_t db_flags = 0;
	struct ctdb_db_context *ctdb_db;
	struct ctdb_rec_data_old *r;

	recs = (struct ctdb_marshall_buffer *)data.dptr;

	if (recs->count == 0) {
		goto done;
	}

	/* work out if the database is persistent */
	ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
		goto done;
	}

	for (i=0;i<dbmap->num;i++) {
		if (dbmap->dbs[i].db_id == recs->db_id) {
			db_flags = dbmap->dbs[i].flags;
			break;
		}
	}
	if (i == dbmap->num) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
		goto done;
	}

	/* find the name of this database */
	if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
		goto done;
	}

	/* attach to it */
	ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, db_flags);
	if (ctdb_db == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
		goto done;
	}

	/* walk the packed record list; each record is 'length' bytes long */
	r = (struct ctdb_rec_data_old *)&recs->data[0];
	while (recs->count) {
		bool ok;

		ok = vacuum_fetch_process_one(ctdb_db, rec->ctdb->pnn, r);
		if (!ok) {
			break;
		}

		r = (struct ctdb_rec_data_old *)(r->length + (uint8_t *)r);
		recs->count--;
	}

done:
	talloc_free(tmp_ctx);
}
/**
 * handler for database detach
 */
static void detach_database_handler(uint64_t srvid, TDB_DATA data,
				    void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(
		private_data, struct ctdb_recoverd);
	struct ctdb_context *ctdb = rec->ctdb;
	uint32_t db_id;
	struct ctdb_db_context *ctdb_db;

	/* ignore malformed messages */
	if (data.dsize != sizeof(db_id)) {
		return;
	}
	db_id = *(uint32_t *)data.dptr;

	ctdb_db = find_ctdb_db(ctdb, db_id);
	if (ctdb_db == NULL) {
		/* database is not attached */
		return;
	}

	DLIST_REMOVE(ctdb->db_list, ctdb_db);

	DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
			     ctdb_db->db_name));
	talloc_free(ctdb_db);
}
/*
  called when ctdb_wait_timeout should finish
 */
static void ctdb_wait_handler(struct tevent_context *ev,
			      struct tevent_timer *te,
			      struct timeval yt, void *p)
{
	/* flag watched by the ctdb_wait_timeout() loop */
	uint32_t *timed_out = (uint32_t *)p;
	(*timed_out) = 1;
}
/*
  wait for a given number of seconds

  Blocks by spinning the event loop until a one-shot timer fires.
 */
static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
{
	uint32_t timed_out = 0;
	/* fractional part of secs, expressed in microseconds; the whole
	 * seconds are passed (truncated) as the first timeout argument */
	time_t usecs = (secs - (time_t)secs) * 1000000;
	tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
			 ctdb_wait_handler, &timed_out);
	while (!timed_out) {
		tevent_loop_once(ctdb->ev);
	}
}
/*
  called when an election times out (ends)
 */
static void ctdb_election_timeout(struct tevent_context *ev,
				  struct tevent_timer *te,
				  struct timeval t, void *p)
{
	struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
	rec->election_timeout = NULL;
	/* fast_start is a file-scope flag defined elsewhere in this file */
	fast_start = false;

	DEBUG(DEBUG_WARNING,("Election period ended\n"));
}
/*
  wait for an election to finish. It finished election_timeout seconds after
  the last election packet is received
 */
static void ctdb_wait_election(struct ctdb_recoverd *rec)
{
	struct ctdb_context *ctdb = rec->ctdb;
	/* election_timeout is cleared by ctdb_election_timeout() */
	while (rec->election_timeout) {
		tevent_loop_once(ctdb->ev);
	}
}
803 Update our local flags from all remote connected nodes.
804 This is only run when we are or we belive we are the recovery master
806 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
808 int j;
809 struct ctdb_context *ctdb = rec->ctdb;
810 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
812 /* get the nodemap for all active remote nodes and verify
813 they are the same as for this node
815 for (j=0; j<nodemap->num; j++) {
816 struct ctdb_node_map_old *remote_nodemap=NULL;
817 int ret;
819 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
820 continue;
822 if (nodemap->nodes[j].pnn == ctdb->pnn) {
823 continue;
826 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
827 mem_ctx, &remote_nodemap);
828 if (ret != 0) {
829 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
830 nodemap->nodes[j].pnn));
831 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
832 talloc_free(mem_ctx);
833 return -1;
835 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
836 /* We should tell our daemon about this so it
837 updates its flags or else we will log the same
838 message again in the next iteration of recovery.
839 Since we are the recovery master we can just as
840 well update the flags on all nodes.
842 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
843 if (ret != 0) {
844 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
845 return -1;
848 /* Update our local copy of the flags in the recovery
849 daemon.
851 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
852 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
853 nodemap->nodes[j].flags));
854 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
856 talloc_free(remote_nodemap);
858 talloc_free(mem_ctx);
859 return 0;
/* Create a new random generation id.
   The generation id can not be the INVALID_GENERATION id
*/
static uint32_t new_generation(void)
{
	uint32_t generation;

	/* loop until random() produces something other than the
	 * reserved INVALID_GENERATION value */
	while (1) {
		generation = random();

		if (generation != INVALID_GENERATION) {
			break;
		}
	}

	return generation;
}
/* True while a recovery lock handle exists (lock held or being taken) */
static bool ctdb_recovery_have_lock(struct ctdb_recoverd *rec)
{
	return (rec->recovery_lock_handle != NULL);
}
/* State shared between ctdb_recovery_lock() and its callbacks */
struct ctdb_recovery_lock_handle {
	bool done;	/* helper has reported a result (or was cancelled) */
	bool locked;	/* lock was successfully taken */
	double latency;	/* time taken to acquire the lock, on success */
	struct ctdb_cluster_mutex_handle *h;	/* underlying mutex helper */
};
893 static void take_reclock_handler(char status,
894 double latency,
895 void *private_data)
897 struct ctdb_recovery_lock_handle *s =
898 (struct ctdb_recovery_lock_handle *) private_data;
900 s->locked = (status == '0') ;
903 * If unsuccessful then ensure the process has exited and that
904 * the file descriptor event handler has been cancelled
906 if (! s->locked) {
907 TALLOC_FREE(s->h);
910 switch (status) {
911 case '0':
912 s->latency = latency;
913 break;
915 case '1':
916 D_ERR("Unable to take recovery lock - contention\n");
917 break;
919 case '2':
920 D_ERR("Unable to take recovery lock - timeout\n");
921 break;
923 default:
924 D_ERR("Unable to take recover lock - unknown error\n");
927 s->done = true;
static bool ctdb_recovery_lock(struct ctdb_recoverd *rec);

/*
  Called when the cluster mutex helper terminates unexpectedly while
  we believe we hold the recovery lock: drop the stale handle and try
  to retake the lock immediately.
 */
static void lost_reclock_handler(void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type_abort(
		private_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR,
	      ("Recovery lock helper terminated unexpectedly - "
	       "trying to retake recovery lock\n"));
	TALLOC_FREE(rec->recovery_lock_handle);
	if (! ctdb_recovery_lock(rec)) {
		DEBUG(DEBUG_ERR, ("Failed to take recovery lock\n"));
	}
}
/*
  Take the recovery lock, blocking in the event loop until the helper
  reports success or failure.  Returns true iff the lock was taken;
  on success the acquisition latency is reported to the main daemon.
 */
static bool ctdb_recovery_lock(struct ctdb_recoverd *rec)
{
	struct ctdb_context *ctdb = rec->ctdb;
	struct ctdb_cluster_mutex_handle *h;
	struct ctdb_recovery_lock_handle *s;

	s = talloc_zero(rec, struct ctdb_recovery_lock_handle);
	if (s == NULL) {
		DBG_ERR("Memory allocation error\n");
		return false;
	}

	h = ctdb_cluster_mutex(s,
			       ctdb,
			       ctdb->recovery_lock,
			       120, /* NOTE(review): timeout and 's' args restored
				     * from upstream - confirm against original */
			       take_reclock_handler,
			       s,
			       lost_reclock_handler,
			       rec);
	if (h == NULL) {
		talloc_free(s);
		return false;
	}

	rec->recovery_lock_handle = s;
	s->h = h;

	/* block until take_reclock_handler (or cancellation) sets done */
	while (! s->done) {
		tevent_loop_once(ctdb->ev);
	}

	if (! s->locked) {
		TALLOC_FREE(rec->recovery_lock_handle);
		return false;
	}

	ctdb_ctrl_report_recd_lock_latency(ctdb,
					   CONTROL_TIMEOUT(),
					   s->latency);

	return true;
}
/*
  Release the recovery lock, or cancel an in-progress attempt to take
  it.  Safe to call when the lock is not held.
 */
static void ctdb_recovery_unlock(struct ctdb_recoverd *rec)
{
	if (rec->recovery_lock_handle == NULL) {
		return;
	}

	if (! rec->recovery_lock_handle->done) {
		/*
		 * Taking of recovery lock still in progress.  Free
		 * the cluster mutex handle to release it but leave
		 * the recovery lock handle in place to allow taking
		 * of the lock to fail.
		 */
		D_NOTICE("Cancelling recovery lock\n");
		TALLOC_FREE(rec->recovery_lock_handle->h);
		rec->recovery_lock_handle->done = true;
		rec->recovery_lock_handle->locked = false;
		return;
	}

	D_NOTICE("Releasing recovery lock\n");
	TALLOC_FREE(rec->recovery_lock_handle);
}
/*
  Ban any node whose banning credits have reached 2 * num_nodes, then
  reset its credit count.  Sets *self_ban when this node banned itself.
 */
static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
{
	struct ctdb_context *ctdb = rec->ctdb;
	int i;
	struct ctdb_banning_state *ban_state;

	*self_ban = false;
	for (i=0; i<ctdb->num_nodes; i++) {
		if (ctdb->nodes[i]->ban_state == NULL) {
			continue;
		}
		ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
		if (ban_state->count < 2*ctdb->num_nodes) {
			continue;
		}

		DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
			ctdb->nodes[i]->pnn, ban_state->count,
			ctdb->tunable.recovery_ban_period));
		ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
		ban_state->count = 0;

		/* Banning ourself? */
		if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
			*self_ban = true;
		}
	}
}
/* State for running an external helper process (see helper_run) */
struct helper_state {
	int fd[2];	/* pipe: [0] read end (ours), [1] write end (helper's) */
	pid_t pid;	/* helper pid, or -1 if not started */
	int result;	/* result code read back from the helper */
	bool done;	/* helper has reported (or pipe broke) */
};
/* fd event: read the helper's result from the pipe and mark it done */
static void helper_handler(struct tevent_context *ev,
			   struct tevent_fd *fde,
			   uint16_t flags, void *private_data)
{
	struct helper_state *state = talloc_get_type_abort(
		private_data, struct helper_state);
	int ret;

	ret = sys_read(state->fd[0], &state->result, sizeof(state->result));
	if (ret != sizeof(state->result)) {
		/* short read or read error - treat as a broken pipe */
		state->result = EPIPE;
	}

	state->done = true;
}
/*
  Run an external helper program and wait for its result.

  The helper is passed the write end of a pipe as argv[0], the daemon
  socket name as argv[1] and an optional extra argument.  The event
  loop runs until the helper reports a result; if the recovery master
  changes meanwhile the run is aborted.  'type' is only used in log
  messages.  Returns 0 on success, -1 on failure.
 */
static int helper_run(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
		      const char *prog, const char *arg, const char *type)
{
	struct helper_state *state;
	struct tevent_fd *fde;
	const char **args;
	int nargs, ret;
	uint32_t recmaster = rec->recmaster;

	state = talloc_zero(mem_ctx, struct helper_state);
	if (state == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
		return -1;
	}

	state->pid = -1;

	ret = pipe(state->fd);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,
		      ("Failed to create pipe for %s helper\n", type));
		goto fail;
	}

	/* only the helper should inherit the write end */
	set_close_on_exec(state->fd[0]);

	nargs = 4;
	args = talloc_array(state, const char *, nargs);
	if (args == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
		goto fail;
	}

	/* argv[0]: fd number the helper writes its result to */
	args[0] = talloc_asprintf(args, "%d", state->fd[1]);
	if (args[0] == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
		goto fail;
	}
	args[1] = rec->ctdb->daemon.name;
	args[2] = arg;
	args[3] = NULL;

	/* the extra argument is optional */
	if (args[2] == NULL) {
		nargs = 3;
	}

	state->pid = ctdb_vfork_exec(state, rec->ctdb, prog, nargs, args);
	if (state->pid == -1) {
		DEBUG(DEBUG_ERR,
		      ("Failed to create child for %s helper\n", type));
		goto fail;
	}

	/* the write end now belongs to the child */
	close(state->fd[1]);
	state->fd[1] = -1;

	state->done = false;

	fde = tevent_add_fd(rec->ctdb->ev, rec->ctdb, state->fd[0],
			    TEVENT_FD_READ, helper_handler, state);
	if (fde == NULL) {
		goto fail;
	}
	tevent_fd_set_auto_close(fde);

	while (!state->done) {
		tevent_loop_once(rec->ctdb->ev);

		/* If recmaster changes, we have lost election */
		if (recmaster != rec->recmaster) {
			D_ERR("Recmaster changed to %u, aborting %s\n",
			      rec->recmaster, type);
			state->result = 1;
			break;
		}
	}

	close(state->fd[0]);
	state->fd[0] = -1;

	if (state->result != 0) {
		goto fail;
	}

	ctdb_kill(rec->ctdb, state->pid, SIGKILL);
	talloc_free(state);
	return 0;

fail:
	if (state->fd[0] != -1) {
		close(state->fd[0]);
	}
	if (state->fd[1] != -1) {
		close(state->fd[1]);
	}
	if (state->pid != -1) {
		ctdb_kill(rec->ctdb, state->pid, SIGKILL);
	}
	talloc_free(state);
	return -1;
}
/*
  Run the takeover helper to (re)assign public IP addresses.

  force_rebalance_nodes (a talloc array, may be NULL) is passed to the
  helper as a comma-separated PNN list.  Returns the helper's result
  via helper_run(), or -1 on setup failure.
 */
static int ctdb_takeover(struct ctdb_recoverd *rec,
			 uint32_t *force_rebalance_nodes)
{
	static char prog[PATH_MAX+1] = "";
	char *arg;
	int i, ret;

	if (!ctdb_set_helper("takeover_helper", prog, sizeof(prog),
			     "CTDB_TAKEOVER_HELPER", CTDB_HELPER_BINDIR,
			     "ctdb_takeover_helper")) {
		ctdb_die(rec->ctdb, "Unable to set takeover helper\n");
	}

	/* build "pnn,pnn,..." from the rebalance list */
	arg = NULL;
	for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
		uint32_t pnn = force_rebalance_nodes[i];
		if (arg == NULL) {
			arg = talloc_asprintf(rec, "%u", pnn);
		} else {
			arg = talloc_asprintf_append(arg, ",%u", pnn);
		}
		if (arg == NULL) {
			DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
			return -1;
		}
	}

	if (ctdb_config.failover_disabled) {
		ret = setenv("CTDB_DISABLE_IP_FAILOVER", "1", 1);
		if (ret != 0) {
			D_ERR("Failed to set CTDB_DISABLE_IP_FAILOVER variable\n");
			return -1;
		}
	}

	return helper_run(rec, rec, prog, arg, "takeover");
}
/*
  Perform a takeover run: temporarily disable takeover runs on the
  other connected nodes, run the takeover helper, then re-enable them.
  Returns true on success; on failure rec->need_takeover_run is left
  set so the run is retried later.
 */
static bool do_takeover_run(struct ctdb_recoverd *rec,
			    struct ctdb_node_map_old *nodemap)
{
	uint32_t *nodes = NULL;
	struct ctdb_disable_message dtr;
	TDB_DATA data;
	int i;
	uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
	int ret;
	bool ok;

	DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));

	if (ctdb_op_is_in_progress(rec->takeover_run)) {
		DEBUG(DEBUG_ERR, (__location__
				  " takeover run already in progress \n"));
		ok = false;
		goto done;
	}

	if (!ctdb_op_begin(rec->takeover_run)) {
		ok = false;
		goto done;
	}

	/* Disable IP checks (takeover runs, really) on other nodes
	 * while doing this takeover run.  This will stop those other
	 * nodes from triggering takeover runs when think they should
	 * be hosting an IP but it isn't yet on an interface.  Don't
	 * wait for replies since a failure here might cause some
	 * noise in the logs but will not actually cause a problem.
	 */
	ZERO_STRUCT(dtr);
	dtr.srvid = 0; /* No reply */
	dtr.pnn = -1;

	data.dptr = (uint8_t*)&dtr;
	data.dsize = sizeof(dtr);

	nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);

	/* Disable for 60 seconds.  This can be a tunable later if
	 * necessary.
	 */
	dtr.timeout = 60;
	for (i = 0; i < talloc_array_length(nodes); i++) {
		if (ctdb_client_send_message(rec->ctdb, nodes[i],
					     CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
					     data) != 0) {
			DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
		}
	}

	ret = ctdb_takeover(rec, rec->force_rebalance_nodes);

	/* Reenable takeover runs and IP checks on other nodes */
	dtr.timeout = 0;
	for (i = 0; i < talloc_array_length(nodes); i++) {
		if (ctdb_client_send_message(rec->ctdb, nodes[i],
					     CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
					     data) != 0) {
			DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n"));
		}
	}

	if (ret != 0) {
		DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
		ok = false;
		goto done;
	}

	ok = true;
	/* Takeover run was successful so clear force rebalance targets */
	if (rebalance_nodes == rec->force_rebalance_nodes) {
		TALLOC_FREE(rec->force_rebalance_nodes);
	} else {
		DEBUG(DEBUG_WARNING,
		      ("Rebalance target nodes changed during takeover run - not clearing\n"));
	}
done:
	rec->need_takeover_run = !ok;
	talloc_free(nodes);
	ctdb_op_end(rec->takeover_run);

	DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
	return ok;
}
1295 static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
1297 static char prog[PATH_MAX+1] = "";
1298 const char *arg;
1300 if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog),
1301 "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
1302 "ctdb_recovery_helper")) {
1303 ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
1306 arg = talloc_asprintf(mem_ctx, "%u", new_generation());
1307 if (arg == NULL) {
1308 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1309 return -1;
1312 setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1);
1314 return helper_run(rec, mem_ctx, prog, arg, "recovery");
1318 we are the recmaster, and recovery is needed - start a recovery run
1320 static int do_recovery(struct ctdb_recoverd *rec,
1321 TALLOC_CTX *mem_ctx, uint32_t pnn,
1322 struct ctdb_node_map_old *nodemap, struct ctdb_vnn_map *vnnmap)
1324 struct ctdb_context *ctdb = rec->ctdb;
1325 int i, ret;
1326 struct ctdb_dbid_map_old *dbmap;
1327 bool self_ban;
1329 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1331 /* Check if the current node is still the recmaster. It's possible that
1332 * re-election has changed the recmaster.
1334 if (pnn != rec->recmaster) {
1335 DEBUG(DEBUG_NOTICE,
1336 ("Recovery master changed to %u, aborting recovery\n",
1337 rec->recmaster));
1338 return -1;
1341 /* if recovery fails, force it again */
1342 rec->need_recovery = true;
1344 if (!ctdb_op_begin(rec->recovery)) {
1345 return -1;
1348 if (rec->election_timeout) {
1349 /* an election is in progress */
1350 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
1351 goto fail;
1354 ban_misbehaving_nodes(rec, &self_ban);
1355 if (self_ban) {
1356 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
1357 goto fail;
1360 if (ctdb->recovery_lock != NULL) {
1361 if (ctdb_recovery_have_lock(rec)) {
1362 D_NOTICE("Already holding recovery lock\n");
1363 } else {
1364 bool ok;
1366 D_NOTICE("Attempting to take recovery lock (%s)\n",
1367 ctdb->recovery_lock);
1369 ok = ctdb_recovery_lock(rec);
1370 if (! ok) {
1371 D_ERR("Unable to take recovery lock\n");
1373 if (pnn != rec->recmaster) {
1374 D_NOTICE("Recovery master changed to %u,"
1375 " aborting recovery\n",
1376 rec->recmaster);
1377 rec->need_recovery = false;
1378 goto fail;
1381 if (ctdb->runstate ==
1382 CTDB_RUNSTATE_FIRST_RECOVERY) {
1384 * First recovery? Perhaps
1385 * current node does not yet
1386 * know who the recmaster is.
1388 D_ERR("Retrying recovery\n");
1389 goto fail;
1392 D_ERR("Abort recovery, "
1393 "ban this node for %u seconds\n",
1394 ctdb->tunable.recovery_ban_period);
1395 ctdb_ban_node(rec,
1396 pnn,
1397 ctdb->tunable.recovery_ban_period);
1398 goto fail;
1400 D_NOTICE("Recovery lock taken successfully\n");
1404 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1406 /* get a list of all databases */
1407 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1408 if (ret != 0) {
1409 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1410 goto fail;
1413 /* we do the db creation before we set the recovery mode, so the freeze happens
1414 on all databases we will be dealing with. */
1416 /* verify that we have all the databases any other node has */
1417 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1418 if (ret != 0) {
1419 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1420 goto fail;
1423 /* verify that all other nodes have all our databases */
1424 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1425 if (ret != 0) {
1426 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1427 goto fail;
1429 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1432 /* Retrieve capabilities from all connected nodes */
1433 ret = update_capabilities(rec, nodemap);
1434 if (ret!=0) {
1435 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1436 return -1;
1440 update all nodes to have the same flags that we have
1442 for (i=0;i<nodemap->num;i++) {
1443 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1444 continue;
1447 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1448 if (ret != 0) {
1449 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1450 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
1451 } else {
1452 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1453 return -1;
1458 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1460 ret = db_recovery_parallel(rec, mem_ctx);
1461 if (ret != 0) {
1462 goto fail;
1465 do_takeover_run(rec, nodemap);
1467 /* send a message to all clients telling them that the cluster
1468 has been reconfigured */
1469 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
1470 CTDB_SRVID_RECONFIGURE, tdb_null);
1471 if (ret != 0) {
1472 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
1473 goto fail;
1476 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1478 rec->need_recovery = false;
1479 ctdb_op_end(rec->recovery);
1481 /* we managed to complete a full recovery, make sure to forgive
1482 any past sins by the nodes that could now participate in the
1483 recovery.
1485 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1486 for (i=0;i<nodemap->num;i++) {
1487 struct ctdb_banning_state *ban_state;
1489 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1490 continue;
1493 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1494 if (ban_state == NULL) {
1495 continue;
1498 ban_state->count = 0;
1501 /* We just finished a recovery successfully.
1502 We now wait for rerecovery_timeout before we allow
1503 another recovery to take place.
1505 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
1506 ctdb_op_disable(rec->recovery, ctdb->ev,
1507 ctdb->tunable.rerecovery_timeout);
1508 return 0;
1510 fail:
1511 ctdb_op_end(rec->recovery);
1512 return -1;
/*
  elections are won by first checking the number of connected nodes, then
  the priority time, then the pnn
 */
struct election_message {
	uint32_t num_connected;		/* sender's count of connected nodes */
	struct timeval priority_time;	/* sender's recoverd start time (earlier wins) */
	uint32_t pnn;			/* sender's node number, final tie-breaker */
	uint32_t node_flags;		/* sender's flags; BANNED/STOPPED lose outright */
};
/*
  form this nodes election data
 */
static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
{
	int ret, i;
	struct ctdb_node_map_old *nodemap;
	struct ctdb_context *ctdb = rec->ctdb;

	/* Zero first so a failed nodemap fetch still leaves a
	 * well-defined (losing) election message. */
	ZERO_STRUCTP(em);

	em->pnn = rec->ctdb->pnn;
	em->priority_time = rec->priority_time;

	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
		return;
	}

	/* cache our own flags; ctdb_election_win() reads them later */
	rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
	em->node_flags = rec->node_flags;

	for (i=0;i<nodemap->num;i++) {
		if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
			em->num_connected++;
		}
	}

	/* we shouldnt try to win this election if we cant be a recmaster */
	if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
		em->num_connected = 0;
		em->priority_time = timeval_current();
	}

	talloc_free(nodemap);
}
/*
  see if the given election data wins
 */
/* Returns true if this node should win the election against the
 * sender of "em".
 *
 * NOTE(review): the struct election_message comment says the
 * connected-node count is compared first, and ctdb_election_data()
 * zeroes num_connected for non-recmaster-capable nodes, but no
 * num_connected comparison is performed here - confirm whether that
 * criterion was intentionally dropped. */
static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
{
	struct election_message myem;
	int cmp = 0;

	ctdb_election_data(rec, &myem);

	/* we cant win if we don't have the recmaster capability */
	if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
		return false;
	}

	/* we cant win if we are banned */
	if (rec->node_flags & NODE_FLAGS_BANNED) {
		return false;
	}

	/* we cant win if we are stopped */
	if (rec->node_flags & NODE_FLAGS_STOPPED) {
		return false;
	}

	/* we will automatically win if the other node is banned */
	if (em->node_flags & NODE_FLAGS_BANNED) {
		return true;
	}

	/* we will automatically win if the other node is stopped */
	if (em->node_flags & NODE_FLAGS_STOPPED) {
		return true;
	}

	/* then the longest running node */
	if (cmp == 0) {
		cmp = timeval_compare(&em->priority_time, &myem.priority_time);
	}

	/* finally the lowest pnn wins */
	if (cmp == 0) {
		cmp = (int)myem.pnn - (int)em->pnn;
	}

	return cmp > 0;
}
/*
  send out an election request
 */
/* Optimistically claim the recmaster role locally, then broadcast our
 * election data to all nodes.  Returns 0 on success, -1 on failure. */
static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
{
	int ret;
	TDB_DATA election_data;
	struct election_message emsg;
	uint64_t srvid;
	struct ctdb_context *ctdb = rec->ctdb;

	srvid = CTDB_SRVID_ELECTION;

	ctdb_election_data(rec, &emsg);

	election_data.dsize = sizeof(struct election_message);
	election_data.dptr  = (unsigned char *)&emsg;


	/* first we assume we will win the election and set
	   recoverymaster to be ourself on the current node
	 */
	ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
				     CTDB_CURRENT_NODE, pnn);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster\n"));
		return -1;
	}
	rec->recmaster = pnn;

	/* send an election message to all active nodes */
	DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
	return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
}
1648 we think we are winning the election - send a broadcast election request
1650 static void election_send_request(struct tevent_context *ev,
1651 struct tevent_timer *te,
1652 struct timeval t, void *p)
1654 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1655 int ret;
1657 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
1658 if (ret != 0) {
1659 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
1662 TALLOC_FREE(rec->send_election_te);
/*
  handler for memory dumps
 */
/* SRVID handler: dump this daemon's talloc tree and send it back to
 * the (pnn, srvid) return address carried in the request. */
static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(
		private_data, struct ctdb_recoverd);
	struct ctdb_context *ctdb = rec->ctdb;
	/* everything below hangs off this temporary context */
	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
	TDB_DATA *dump;
	int ret;
	struct ctdb_srvid_message *rd;

	/* payload must be exactly a return address */
	if (data.dsize != sizeof(struct ctdb_srvid_message)) {
		DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
		talloc_free(tmp_ctx);
		return;
	}
	rd = (struct ctdb_srvid_message *)data.dptr;

	dump = talloc_zero(tmp_ctx, TDB_DATA);
	if (dump == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
		talloc_free(tmp_ctx);
		return;
	}
	ret = ctdb_dump_memory(ctdb, dump);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
		talloc_free(tmp_ctx);
		return;
	}

	DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));

	/* reply to the requester's return address */
	ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
		talloc_free(tmp_ctx);
		return;
	}

	talloc_free(tmp_ctx);
}
1711 handler for reload_nodes
1713 static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
1714 void *private_data)
1716 struct ctdb_recoverd *rec = talloc_get_type(
1717 private_data, struct ctdb_recoverd);
1719 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1721 ctdb_load_nodes_file(rec->ctdb);
/* SRVID handler: record a node as a forced rebalance target for the
 * next takeover run.  Only the recovery master acts on this. */
static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
					void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(
		private_data, struct ctdb_recoverd);
	struct ctdb_context *ctdb = rec->ctdb;
	uint32_t pnn;
	uint32_t *t;
	int len;

	/* non-recmaster nodes ignore rebalance requests */
	if (rec->recmaster != ctdb_get_pnn(ctdb)) {
		return;
	}

	if (data.dsize != sizeof(uint32_t)) {
		DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
		return;
	}

	pnn = *(uint32_t *)&data.dptr[0];

	DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));

	/* Copy any existing list of nodes.  There's probably some
	 * sort of realloc variant that will do this but we need to
	 * make sure that freeing the old array also cancels the timer
	 * event for the timeout... not sure if realloc will do that.
	 */
	len = (rec->force_rebalance_nodes != NULL) ?
		talloc_array_length(rec->force_rebalance_nodes) :
		0 ;

	/* This allows duplicates to be added but they don't cause
	 * harm.  A call to add a duplicate PNN arguably means that
	 * the timeout should be reset, so this is the simplest
	 * solution.
	 */
	t = talloc_zero_array(rec, uint32_t, len+1);
	CTDB_NO_MEMORY_VOID(ctdb, t);
	if (len > 0) {
		memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
	}
	t[len] = pnn;

	talloc_free(rec->force_rebalance_nodes);

	rec->force_rebalance_nodes = t;
}
1776 static void srvid_disable_and_reply(struct ctdb_context *ctdb,
1777 TDB_DATA data,
1778 struct ctdb_op_state *op_state)
1780 struct ctdb_disable_message *r;
1781 uint32_t timeout;
1782 TDB_DATA result;
1783 int32_t ret = 0;
1785 /* Validate input data */
1786 if (data.dsize != sizeof(struct ctdb_disable_message)) {
1787 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1788 "expecting %lu\n", (long unsigned)data.dsize,
1789 (long unsigned)sizeof(struct ctdb_srvid_message)));
1790 return;
1792 if (data.dptr == NULL) {
1793 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1794 return;
1797 r = (struct ctdb_disable_message *)data.dptr;
1798 timeout = r->timeout;
1800 ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
1801 if (ret != 0) {
1802 goto done;
1805 /* Returning our PNN tells the caller that we succeeded */
1806 ret = ctdb_get_pnn(ctdb);
1807 done:
1808 result.dsize = sizeof(int32_t);
1809 result.dptr = (uint8_t *)&ret;
1810 srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
1813 static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
1814 void *private_data)
1816 struct ctdb_recoverd *rec = talloc_get_type(
1817 private_data, struct ctdb_recoverd);
1819 srvid_disable_and_reply(rec->ctdb, data, rec->takeover_run);
/* Backward compatibility for this SRVID */
/* Older clients send a bare uint32_t timeout instead of a
 * ctdb_disable_message; no reply is sent. */
static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
				     void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(
		private_data, struct ctdb_recoverd);
	uint32_t timeout;

	if (data.dsize != sizeof(uint32_t)) {
		DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
				 "expecting %lu\n", (long unsigned)data.dsize,
				 (long unsigned)sizeof(uint32_t)));
		return;
	}
	if (data.dptr == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
		return;
	}

	timeout = *((uint32_t *)data.dptr);

	/* return value deliberately ignored - legacy path, no reply */
	ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout);
}
1846 static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
1847 void *private_data)
1849 struct ctdb_recoverd *rec = talloc_get_type(
1850 private_data, struct ctdb_recoverd);
1852 srvid_disable_and_reply(rec->ctdb, data, rec->recovery);
/*
  handler for ip reallocate, just add it to the list of requests and
  handle this later in the monitor_cluster loop so we do not recurse
  with other requests to takeover_run()
 */
static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
				  void *private_data)
{
	struct ctdb_srvid_message *request;
	struct ctdb_recoverd *rec = talloc_get_type(
		private_data, struct ctdb_recoverd);

	if (data.dsize != sizeof(struct ctdb_srvid_message)) {
		DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
		return;
	}

	request = (struct ctdb_srvid_message *)data.dptr;

	/* queued; replied to later by process_ipreallocate_requests() */
	srvid_request_add(rec->ctdb, &rec->reallocate_requests, request);
}
/* Run a takeover run for the reallocate requests queued so far and
 * reply to each requester with our PNN (success) or -1 (failure). */
static void process_ipreallocate_requests(struct ctdb_context *ctdb,
					  struct ctdb_recoverd *rec)
{
	TDB_DATA result;
	int32_t ret;
	struct srvid_requests *current;

	/* Only process requests that are currently pending.  More
	 * might come in while the takeover run is in progress and
	 * they will need to be processed later since they might
	 * be in response flag changes.
	 */
	current = rec->reallocate_requests;
	rec->reallocate_requests = NULL;

	if (do_takeover_run(rec, rec->nodemap)) {
		ret = ctdb_get_pnn(ctdb);
	} else {
		ret = -1;
	}

	result.dsize = sizeof(int32_t);
	result.dptr  = (uint8_t *)&ret;

	srvid_requests_reply(ctdb, &current, result);
}
1905 * handler for assigning banning credits
1907 static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1909 struct ctdb_recoverd *rec = talloc_get_type(
1910 private_data, struct ctdb_recoverd);
1911 uint32_t ban_pnn;
1913 /* Ignore if we are not recmaster */
1914 if (rec->ctdb->pnn != rec->recmaster) {
1915 return;
1918 if (data.dsize != sizeof(uint32_t)) {
1919 DEBUG(DEBUG_ERR, (__location__ "invalid data size %zu\n",
1920 data.dsize));
1921 return;
1924 ban_pnn = *(uint32_t *)data.dptr;
1926 ctdb_set_culprit_count(rec, ban_pnn, rec->nodemap->num);
/*
  handler for recovery master elections
 */
/* Compare the sender's election data against our own: if we would win,
 * schedule a (deduplicated) broadcast of our own election request;
 * otherwise release the recovery lock and accept the sender as
 * recmaster. */
static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(
		private_data, struct ctdb_recoverd);
	struct ctdb_context *ctdb = rec->ctdb;
	int ret;
	struct election_message *em = (struct election_message *)data.dptr;

	/* Ignore election packets from ourself */
	if (ctdb->pnn == em->pnn) {
		return;
	}

	/* we got an election packet - update the timeout for the election */
	talloc_free(rec->election_timeout);
	rec->election_timeout = tevent_add_timer(
			ctdb->ev, ctdb,
			fast_start ?
				timeval_current_ofs(0, 500000) :
				timeval_current_ofs(ctdb->tunable.election_timeout, 0),
			ctdb_election_timeout, rec);

	/* someone called an election. check their election data
	   and if we disagree and we would rather be the elected node,
	   send a new election message to all other nodes
	 */
	if (ctdb_election_win(rec, em)) {
		/* only one pending send timer at a time */
		if (!rec->send_election_te) {
			rec->send_election_te = tevent_add_timer(
					ctdb->ev, rec,
					timeval_current_ofs(0, 500000),
					election_send_request, rec);
		}
		return;
	}

	/* we didn't win */
	TALLOC_FREE(rec->send_election_te);

	/* Release the recovery lock file */
	if (ctdb_recovery_have_lock(rec)) {
		ctdb_recovery_unlock(rec);
	}

	/* ok, let that guy become recmaster then */
	ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
				     CTDB_CURRENT_NODE, em->pnn);
	if (ret != 0) {
		/* NOTE(review): message lacks a trailing newline */
		DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster"));
		return;
	}
	rec->recmaster = em->pnn;

	return;
}
/*
  force the start of the election process
 */
/* Push the cluster into recovery mode, arm the election timeout, send
 * our election request, then block until the election settles. */
static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
			   struct ctdb_node_map_old *nodemap)
{
	int ret;
	struct ctdb_context *ctdb = rec->ctdb;

	DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));

	/* set all nodes to recovery mode to stop all internode traffic */
	ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
		return;
	}

	/* replace any previous election timeout */
	talloc_free(rec->election_timeout);
	rec->election_timeout = tevent_add_timer(
			ctdb->ev, ctdb,
			fast_start ?
				timeval_current_ofs(0, 500000) :
				timeval_current_ofs(ctdb->tunable.election_timeout, 0),
			ctdb_election_timeout, rec);

	ret = send_election_request(rec, pnn);
	if (ret!=0) {
		/* NOTE(review): message lacks a trailing newline */
		DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
		return;
	}

	/* wait for a few seconds to collect all responses */
	ctdb_wait_election(rec);
}
/*
  handler for when a node changes its flags
 */
static void monitor_handler(uint64_t srvid, TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(
		private_data, struct ctdb_recoverd);
	struct ctdb_context *ctdb = rec->ctdb;
	int ret;
	struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
	struct ctdb_node_map_old *nodemap=NULL;
	TALLOC_CTX *tmp_ctx;
	int i;

	if (data.dsize != sizeof(*c)) {
		DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
		return;
	}

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);

	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
		talloc_free(tmp_ctx);
		return;
	}

	/* locate the node the change refers to */
	for (i=0;i<nodemap->num;i++) {
		if (nodemap->nodes[i].pnn == c->pnn) break;
	}

	if (i == nodemap->num) {
		DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
		talloc_free(tmp_ctx);
		return;
	}

	if (c->old_flags != c->new_flags) {
		DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
	}

	/* NOTE(review): this writes to the temporary nodemap copy which
	 * is freed immediately below, so the handler's net effect is
	 * only the log message above - confirm intentional. */
	nodemap->nodes[i].flags = c->new_flags;

	talloc_free(tmp_ctx);
}
/*
  handler for when we need to push out flag changes ot all other nodes
 */
static void push_flags_handler(uint64_t srvid, TDB_DATA data,
			       void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(
		private_data, struct ctdb_recoverd);
	struct ctdb_context *ctdb = rec->ctdb;
	int ret;
	struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
	struct ctdb_node_map_old *nodemap=NULL;
	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
	uint32_t *nodes;

	/* read the node flags from the recmaster */
	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
				   tmp_ctx, &nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
		talloc_free(tmp_ctx);
		return;
	}
	/* sanity-check the pnn against the recmaster's nodemap */
	if (c->pnn >= nodemap->num) {
		DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
		talloc_free(tmp_ctx);
		return;
	}

	/* send the flags update to all connected nodes */
	nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);

	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
				      nodes, 0, CONTROL_TIMEOUT(),
				      false, data,
				      NULL, NULL,
				      NULL) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));

		talloc_free(tmp_ctx);
		return;
	}

	talloc_free(tmp_ctx);
}
/* Shared state for verify_recmode() and its async callbacks */
struct verify_recmode_normal_data {
	uint32_t count;			/* outstanding replies still expected */
	enum monitor_result status;	/* aggregated verification result */
};
/* Async callback for one node's getrecmode reply; folds the reply into
 * the shared verify_recmode_normal_data. */
static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
{
	struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);


	/* one more node has responded with recmode data*/
	rmdata->count--;

	/* if we failed to get the recmode, then return an error and let
	   the main loop try again.
	*/
	if (state->state != CTDB_CONTROL_DONE) {
		if (rmdata->status == MONITOR_OK) {
			rmdata->status = MONITOR_FAILED;
		}
		return;
	}

	/* if we got a response, then the recmode will be stored in the
	   status field
	*/
	if (state->status != CTDB_RECOVERY_NORMAL) {
		DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
		rmdata->status = MONITOR_RECOVERY_NEEDED;
	}

	return;
}
/* verify that all nodes are in normal recovery mode */
/* Sends an async getrecmode to every active node and pumps the event
 * loop until all replies arrive.  Returns MONITOR_OK,
 * MONITOR_RECOVERY_NEEDED or MONITOR_FAILED. */
static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
{
	struct verify_recmode_normal_data *rmdata;
	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
	struct ctdb_client_control_state *state;
	enum monitor_result status;
	int j;

	rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
	CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
	rmdata->count  = 0;
	rmdata->status = MONITOR_OK;

	/* loop over all active nodes and send an async getrecmode call to
	   them*/
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
					CONTROL_TIMEOUT(),
					nodemap->nodes[j].pnn);
		if (state == NULL) {
			/* we failed to send the control, treat this as
			   an error and try again next iteration
			*/
			DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
			talloc_free(mem_ctx);
			return MONITOR_FAILED;
		}

		/* set up the callback functions */
		state->async.fn = verify_recmode_normal_callback;
		state->async.private_data = rmdata;

		/* one more control to wait for to complete */
		rmdata->count++;
	}


	/* now wait for up to the maximum number of seconds allowed
	   or until all nodes we expect a response from has replied
	*/
	while (rmdata->count > 0) {
		tevent_loop_once(ctdb->ev);
	}

	status = rmdata->status;
	talloc_free(mem_ctx);
	return status;
}
/* Shared state for verify_recmaster() and its async callbacks */
struct verify_recmaster_data {
	struct ctdb_recoverd *rec;	/* for assigning culprit counts */
	uint32_t count;			/* outstanding replies still expected */
	uint32_t pnn;			/* the recmaster every node should report */
	enum monitor_result status;	/* aggregated verification result */
};
/* Async callback for one node's getrecmaster reply; flags an election
 * as needed if the node disagrees about who the recmaster is. */
static void verify_recmaster_callback(struct ctdb_client_control_state *state)
{
	struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);


	/* one more node has responded with recmaster data*/
	rmdata->count--;

	/* if we failed to get the recmaster, then return an error and let
	   the main loop try again.
	*/
	if (state->state != CTDB_CONTROL_DONE) {
		if (rmdata->status == MONITOR_OK) {
			rmdata->status = MONITOR_FAILED;
		}
		return;
	}

	/* if we got a response, then the recmaster will be stored in the
	   status field
	*/
	if (state->status != rmdata->pnn) {
		DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
		ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
		rmdata->status = MONITOR_ELECTION_NEEDED;
	}

	return;
}
/* verify that all nodes agree that we are the recmaster */
/* Sends an async getrecmaster to every active node except the current
 * recmaster and pumps the event loop until all replies arrive.
 * Returns MONITOR_OK, MONITOR_ELECTION_NEEDED or MONITOR_FAILED. */
static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, uint32_t pnn)
{
	struct ctdb_context *ctdb = rec->ctdb;
	struct verify_recmaster_data *rmdata;
	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
	struct ctdb_client_control_state *state;
	enum monitor_result status;
	int j;

	rmdata = talloc(mem_ctx, struct verify_recmaster_data);
	CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
	rmdata->rec    = rec;
	rmdata->count  = 0;
	rmdata->pnn    = pnn;
	rmdata->status = MONITOR_OK;

	/* loop over all active nodes and send an async getrecmaster call to
	   them*/
	for (j=0; j<nodemap->num; j++) {
		/* skip the recmaster itself - we are asking about it */
		if (nodemap->nodes[j].pnn == rec->recmaster) {
			continue;
		}
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
					CONTROL_TIMEOUT(),
					nodemap->nodes[j].pnn);
		if (state == NULL) {
			/* we failed to send the control, treat this as
			   an error and try again next iteration
			*/
			DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
			talloc_free(mem_ctx);
			return MONITOR_FAILED;
		}

		/* set up the callback functions */
		state->async.fn = verify_recmaster_callback;
		state->async.private_data = rmdata;

		/* one more control to wait for to complete */
		rmdata->count++;
	}


	/* now wait for up to the maximum number of seconds allowed
	   or until all nodes we expect a response from has replied
	*/
	while (rmdata->count > 0) {
		tevent_loop_once(ctdb->ev);
	}

	status = rmdata->status;
	talloc_free(mem_ctx);
	return status;
}
/* Compare the local node's current interface list against the cached
 * copy in rec->ifaces.  Returns true if the set of interfaces, their
 * names or their link states have changed (also true on fetch failure,
 * erring on the side of triggering a takeover run).  Updates the
 * cache as a side effect. */
static bool interfaces_have_changed(struct ctdb_context *ctdb,
				    struct ctdb_recoverd *rec)
{
	struct ctdb_iface_list_old *ifaces = NULL;
	TALLOC_CTX *mem_ctx;
	bool ret = false;

	mem_ctx = talloc_new(NULL);

	/* Read the interfaces from the local node */
	if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
				 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
		DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
		/* We could return an error.  However, this will be
		 * rare so we'll decide that the interfaces have
		 * actually changed, just in case.
		 */
		talloc_free(mem_ctx);
		return true;
	}

	if (!rec->ifaces) {
		/* We haven't been here before so things have changed */
		DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
		ret = true;
	} else if (rec->ifaces->num != ifaces->num) {
		/* Number of interfaces has changed */
		DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
				     rec->ifaces->num, ifaces->num));
		ret = true;
	} else {
		/* See if interface names or link states have changed */
		int i;
		for (i = 0; i < rec->ifaces->num; i++) {
			struct ctdb_iface * iface = &rec->ifaces->ifaces[i];
			if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
				DEBUG(DEBUG_NOTICE,
				      ("Interface in slot %d changed: %s => %s\n",
				       i, iface->name, ifaces->ifaces[i].name));
				ret = true;
				break;
			}
			if (iface->link_state != ifaces->ifaces[i].link_state) {
				DEBUG(DEBUG_NOTICE,
				      ("Interface %s changed state: %d => %d\n",
				       iface->name, iface->link_state,
				       ifaces->ifaces[i].link_state));
				ret = true;
				break;
			}
		}
	}

	/* refresh the cached copy for the next comparison */
	talloc_free(rec->ifaces);
	rec->ifaces = talloc_steal(rec, ifaces);

	talloc_free(mem_ctx);
	return ret;
}
2369 /* Check that the local allocation of public IP addresses is correct
2370 * and do some house-keeping */
2371 static int verify_local_ip_allocation(struct ctdb_context *ctdb,
2372 struct ctdb_recoverd *rec,
2373 uint32_t pnn,
2374 struct ctdb_node_map_old *nodemap)
2376 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2377 int ret, j;
2378 bool need_takeover_run = false;
2379 struct ctdb_public_ip_list_old *ips = NULL;
2381 /* If we are not the recmaster then do some housekeeping */
2382 if (rec->recmaster != pnn) {
2383 /* Ignore any IP reallocate requests - only recmaster
2384 * processes them
2386 TALLOC_FREE(rec->reallocate_requests);
2387 /* Clear any nodes that should be force rebalanced in
2388 * the next takeover run. If the recovery master role
2389 * has moved then we don't want to process these some
2390 * time in the future.
2392 TALLOC_FREE(rec->force_rebalance_nodes);
2395 /* Return early if disabled... */
2396 if (ctdb_config.failover_disabled ||
2397 ctdb_op_is_disabled(rec->takeover_run)) {
2398 return 0;
2401 if (interfaces_have_changed(ctdb, rec)) {
2402 need_takeover_run = true;
2405 /* If there are unhosted IPs but this node can host them then
2406 * trigger an IP reallocation */
2408 /* Read *available* IPs from local node */
2409 ret = ctdb_ctrl_get_public_ips_flags(
2410 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx,
2411 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
2412 if (ret != 0) {
2413 DEBUG(DEBUG_ERR, ("Unable to retrieve available public IPs\n"));
2414 talloc_free(mem_ctx);
2415 return -1;
2418 for (j=0; j<ips->num; j++) {
2419 if (ips->ips[j].pnn == -1 &&
2420 nodemap->nodes[pnn].flags == 0) {
2421 DEBUG(DEBUG_WARNING,
2422 ("Unassigned IP %s can be served by this node\n",
2423 ctdb_addr_to_str(&ips->ips[j].addr)));
2424 need_takeover_run = true;
2428 talloc_free(ips);
2430 if (!ctdb->do_checkpublicip) {
2431 goto done;
2434 /* Validate the IP addresses that this node has on network
2435 * interfaces. If there is an inconsistency between reality
2436 * and the state expected by CTDB then try to fix it by
2437 * triggering an IP reallocation or releasing extraneous IP
2438 * addresses. */
2440 /* Read *known* IPs from local node */
2441 ret = ctdb_ctrl_get_public_ips_flags(
2442 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
2443 if (ret != 0) {
2444 DEBUG(DEBUG_ERR, ("Unable to retrieve known public IPs\n"));
2445 talloc_free(mem_ctx);
2446 return -1;
2449 for (j=0; j<ips->num; j++) {
2450 if (ips->ips[j].pnn == pnn) {
2451 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2452 DEBUG(DEBUG_ERR,
2453 ("Assigned IP %s not on an interface\n",
2454 ctdb_addr_to_str(&ips->ips[j].addr)));
2455 need_takeover_run = true;
2457 } else {
2458 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2459 DEBUG(DEBUG_ERR,
2460 ("IP %s incorrectly on an interface\n",
2461 ctdb_addr_to_str(&ips->ips[j].addr)));
2462 need_takeover_run = true;
2467 done:
2468 if (need_takeover_run) {
2469 struct ctdb_srvid_message rd;
2470 TDB_DATA data;
2472 DEBUG(DEBUG_NOTICE,("Trigger takeoverrun\n"));
2474 ZERO_STRUCT(rd);
2475 rd.pnn = ctdb->pnn;
2476 rd.srvid = 0;
2477 data.dptr = (uint8_t *)&rd;
2478 data.dsize = sizeof(rd);
2480 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2481 if (ret != 0) {
2482 DEBUG(DEBUG_ERR,
2483 ("Failed to send takeover run request\n"));
2486 talloc_free(mem_ctx);
2487 return 0;
/* Async completion callback for CTDB_CONTROL_GET_NODEMAP: stash the
 * returned nodemap in the remote_nodemaps array slot for the replying
 * node.  callback_data is the array passed to get_remote_nodemaps();
 * the outdata buffer is stolen onto that array so it lives as long as
 * the array does. */
static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_node_map_old **remote_nodemaps = callback_data;

	/* Guard against indexing past the array if a reply claims a
	 * pnn outside the configured node range */
	if (node_pnn >= ctdb->num_nodes) {
		DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
		return;
	}

	remote_nodemaps[node_pnn] = (struct ctdb_node_map_old *)talloc_steal(remote_nodemaps, outdata.dptr);
}
/* Fetch the nodemap from every active node (including this one) in
 * parallel.  Results are filled into remote_nodemaps[], indexed by
 * pnn, via async_getnodemap_callback(); slots for nodes that did not
 * reply are left untouched (callers pre-initialise them to NULL).
 * Returns 0 on success, -1 if any control failed. */
static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
			       struct ctdb_node_map_old *nodemap,
			       struct ctdb_node_map_old **remote_nodemaps)
{
	uint32_t *nodes;

	nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
					nodes, 0,
					CONTROL_TIMEOUT(), false, tdb_null,
					async_getnodemap_callback,
					NULL,
					remote_nodemaps) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));

		return -1;
	}

	return 0;
}
/* Sanity-check the currently known recovery master.
 *
 * Returns true only when the recmaster looks healthy and no action is
 * needed.  Returns false after forcing an election when the recmaster
 * is unknown, lacks CTDB_CAP_RECMASTER (while we have it), has been
 * deleted from the nodemap, is disconnected, or reports itself
 * inactive.  Also returns false (without an election) if the
 * recmaster's nodemap cannot be fetched. */
static bool validate_recovery_master(struct ctdb_recoverd *rec,
				     TALLOC_CTX *mem_ctx)
{
	struct ctdb_context *ctdb = rec->ctdb;
	uint32_t pnn = ctdb_get_pnn(ctdb);
	struct ctdb_node_map_old *nodemap = rec->nodemap;
	struct ctdb_node_map_old *recmaster_nodemap = NULL;
	int ret;

	/* When recovery daemon is started, recmaster is set to
	 * "unknown" so it knows to start an election.
	 */
	if (rec->recmaster == CTDB_UNKNOWN_PNN) {
		DEBUG(DEBUG_NOTICE,
		      ("Initial recovery master set - forcing election\n"));
		force_election(rec, pnn, nodemap);
		return false;
	}

	/*
	 * If the current recmaster does not have CTDB_CAP_RECMASTER,
	 * but we have, then force an election and try to become the new
	 * recmaster.
	 */
	if (!ctdb_node_has_capabilities(rec->caps,
					rec->recmaster,
					CTDB_CAP_RECMASTER) &&
	    (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
	    !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
		DEBUG(DEBUG_ERR,
		      (" Current recmaster node %u does not have CAP_RECMASTER,"
		       " but we (node %u) have - force an election\n",
		       rec->recmaster, pnn));
		force_election(rec, pnn, nodemap);
		return false;
	}

	/* Verify that the master node has not been deleted.  This
	 * should not happen because a node should always be shutdown
	 * before being deleted, causing a new master to be elected
	 * before now.  However, if something strange has happened
	 * then checking here will ensure we don't index beyond the
	 * end of the nodemap array. */
	if (rec->recmaster >= nodemap->num) {
		DEBUG(DEBUG_ERR,
		      ("Recmaster node %u has been deleted. Force election\n",
		       rec->recmaster));
		force_election(rec, pnn, nodemap);
		return false;
	}

	/* if recovery master is disconnected/deleted we must elect a new recmaster */
	if (nodemap->nodes[rec->recmaster].flags &
	    (NODE_FLAGS_DISCONNECTED|NODE_FLAGS_DELETED)) {
		DEBUG(DEBUG_NOTICE,
		      ("Recmaster node %u is disconnected/deleted. Force election\n",
		       rec->recmaster));
		force_election(rec, pnn, nodemap);
		return false;
	}

	/* get nodemap from the recovery master to check if it is inactive */
	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
				   mem_ctx, &recmaster_nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,
		      (__location__
		       " Unable to get nodemap from recovery master %u\n",
		       rec->recmaster));
		/* No election, just error */
		return false;
	}

	/* Trust the recmaster's own view of its flags over ours */
	if ((recmaster_nodemap->nodes[rec->recmaster].flags & NODE_FLAGS_INACTIVE) &&
	    (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
		DEBUG(DEBUG_NOTICE,
		      ("Recmaster node %u is inactive. Force election\n",
		       rec->recmaster));
		/*
		 * update our nodemap to carry the recmaster's notion of
		 * its own flags, so that we don't keep freezing the
		 * inactive recmaster node...
		 */
		nodemap->nodes[rec->recmaster].flags =
			recmaster_nodemap->nodes[rec->recmaster].flags;
		force_election(rec, pnn, nodemap);
		return false;
	}

	return true;
}
/* One iteration of the recovery daemon's monitoring logic.
 *
 * Verifies the main ctdbd is alive, refreshes debug level, tunables,
 * runstate and the local nodemap, ensures an inactive (stopped or
 * banned) node is frozen and in recovery mode, and then - only when
 * this node is the recovery master - checks cluster-wide consistency
 * of nodemaps, node flags and the vnnmap, triggering do_recovery() on
 * any mismatch.  Called repeatedly from monitor_cluster(); returning
 * early simply means "try again next interval". */
static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
		      TALLOC_CTX *mem_ctx)
{
	uint32_t pnn;
	struct ctdb_node_map_old *nodemap=NULL;
	struct ctdb_node_map_old **remote_nodemaps=NULL;
	struct ctdb_vnn_map *vnnmap=NULL;
	struct ctdb_vnn_map *remote_vnnmap=NULL;
	uint32_t num_lmasters;
	int32_t debug_level;
	int i, j, ret;
	bool self_ban;


	/* verify that the main daemon is still running */
	if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
		DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
		exit(-1);
	}

	/* ping the local daemon to tell it we are alive */
	ctdb_ctrl_recd_ping(ctdb);

	if (rec->election_timeout) {
		/* an election is in progress */
		return;
	}

	/* read the debug level from the parent and update locally */
	ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
	if (ret !=0) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
		return;
	}
	DEBUGLEVEL = debug_level;

	/* get relevant tunables */
	ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
		return;
	}

	/* get runstate */
	ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
				     CTDB_CURRENT_NODE, &ctdb->runstate);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
		return;
	}

	pnn = ctdb_get_pnn(ctdb);

	/* get nodemap */
	TALLOC_FREE(rec->nodemap);
	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
		return;
	}
	nodemap = rec->nodemap;

	/* remember our own node flags */
	rec->node_flags = nodemap->nodes[pnn].flags;

	ban_misbehaving_nodes(rec, &self_ban);
	if (self_ban) {
		DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
		return;
	}

	ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(),
				   CTDB_CURRENT_NODE, &ctdb->recovery_mode);
	if (ret != 0) {
		D_ERR("Failed to read recmode from local node\n");
		return;
	}

	/* if the local daemon is STOPPED or BANNED, we verify that the databases are
	   also frozen and that the recmode is set to active.
	*/
	if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
		/* If this node has become inactive then we want to
		 * reduce the chances of it taking over the recovery
		 * master role when it becomes active again.  This
		 * helps to stabilise the recovery master role so that
		 * it stays on the most stable node.
		 */
		rec->priority_time = timeval_current();

		if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
			DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));

			ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
			if (ret != 0) {
				DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));

				return;
			}
		}
		if (! rec->frozen_on_inactive) {
			ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(),
					       CTDB_CURRENT_NODE);
			if (ret != 0) {
				DEBUG(DEBUG_ERR,
				      (__location__ " Failed to freeze node "
				       "in STOPPED or BANNED state\n"));
				return;
			}

			rec->frozen_on_inactive = true;
		}

		/* If this node is stopped or banned then it is not the recovery
		 * master, so don't do anything. This prevents stopped or banned
		 * node from starting election and sending unnecessary controls.
		 */
		return;
	}

	rec->frozen_on_inactive = false;

	/* Retrieve capabilities from all connected nodes */
	ret = update_capabilities(rec, nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
		return;
	}

	if (! validate_recovery_master(rec, mem_ctx)) {
		return;
	}

	if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
		/* Check if an IP takeover run is needed and trigger one if
		 * necessary */
		verify_local_ip_allocation(ctdb, rec, pnn, nodemap);
	}

	/* if we are not the recmaster then we do not need to check
	   if recovery is needed
	 */
	if (pnn != rec->recmaster) {
		return;
	}


	/* ensure our local copies of flags are right */
	ret = update_local_flags(rec, nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
		return;
	}

	if (ctdb->num_nodes != nodemap->num) {
		DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
		ctdb_load_nodes_file(ctdb);
		return;
	}

	/* verify that all active nodes agree that we are the recmaster */
	switch (verify_recmaster(rec, nodemap, pnn)) {
	case MONITOR_RECOVERY_NEEDED:
		/* can not happen */
		return;
	case MONITOR_ELECTION_NEEDED:
		force_election(rec, pnn, nodemap);
		return;
	case MONITOR_OK:
		break;
	case MONITOR_FAILED:
		return;
	}


	/* get the vnnmap */
	ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
		return;
	}

	if (rec->need_recovery) {
		/* a previous recovery didn't finish */
		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		return;
	}

	/* verify that all active nodes are in normal mode
	   and not in recovery mode
	*/
	switch (verify_recmode(ctdb, nodemap)) {
	case MONITOR_RECOVERY_NEEDED:
		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		return;
	case MONITOR_FAILED:
		return;
	case MONITOR_ELECTION_NEEDED:
		/* can not happen */
	case MONITOR_OK:
		break;
	}


	if (ctdb->recovery_lock != NULL) {
		/* We must already hold the recovery lock */
		if (!ctdb_recovery_have_lock(rec)) {
			DEBUG(DEBUG_ERR,("Failed recovery lock sanity check.  Force a recovery\n"));
			ctdb_set_culprit(rec, ctdb->pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}
	}


	/* If recoveries are disabled then there is no use doing any
	 * nodemap or flags checks.  Recoveries might be disabled due
	 * to "reloadnodes", so doing these checks might cause an
	 * unnecessary recovery.  */
	if (ctdb_op_is_disabled(rec->recovery)) {
		goto takeover_run_checks;
	}

	/* get the nodemap for all active remote nodes
	 */
	remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map_old *, nodemap->num);
	if (remote_nodemaps == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
		return;
	}
	for(i=0; i<nodemap->num; i++) {
		remote_nodemaps[i] = NULL;
	}
	if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
		return;
	}

	/* verify that all other nodes have the same nodemap as we have
	*/
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		if (remote_nodemaps[j] == NULL) {
			DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
			ctdb_set_culprit(rec, j);

			return;
		}

		/* if the nodes disagree on how many nodes there are
		   then this is a good reason to try recovery
		 */
		if (remote_nodemaps[j]->num != nodemap->num) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
				  nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* if the nodes disagree on which nodes exist and are
		   active, then that is also a good reason to do recovery
		 */
		for (i=0;i<nodemap->num;i++) {
			if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
				DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
					  nodemap->nodes[j].pnn, i,
					  remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
				do_recovery(rec, mem_ctx, pnn, nodemap,
					    vnnmap);
				return;
			}
		}
	}

	/*
	 * Update node flags obtained from each active node. This ensure we have
	 * up-to-date information for all the nodes.
	 */
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
	}

	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		/* verify the flags are consistent
		*/
		for (i=0; i<nodemap->num; i++) {
			if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
				continue;
			}

			if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
				DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
				  nodemap->nodes[j].pnn,
				  nodemap->nodes[i].pnn,
				  remote_nodemaps[j]->nodes[i].flags,
				  nodemap->nodes[i].flags));
				/* A node is authoritative about its own
				 * flags; for other nodes the recmaster's
				 * view wins */
				if (i == j) {
					DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
					update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
					ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
					do_recovery(rec, mem_ctx, pnn, nodemap,
						    vnnmap);
					return;
				} else {
					DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
					update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
					ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
					do_recovery(rec, mem_ctx, pnn, nodemap,
						    vnnmap);
					return;
				}
			}
		}
	}


	/* count how many active nodes there are */
	num_lmasters = 0;
	for (i=0; i<nodemap->num; i++) {
		if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
			if (ctdb_node_has_capabilities(rec->caps,
						       ctdb->nodes[i]->pnn,
						       CTDB_CAP_LMASTER)) {
				num_lmasters++;
			}
		}
	}


	/* There must be the same number of lmasters in the vnn map as
	 * there are active nodes with the lmaster capability...  or
	 * do a recovery.
	 */
	if (vnnmap->size != num_lmasters) {
		DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
			  vnnmap->size, num_lmasters));
		ctdb_set_culprit(rec, ctdb->pnn);
		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		return;
	}

	/* verify that all active nodes in the nodemap also exist in
	   the vnnmap.
	 */
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}

		for (i=0; i<vnnmap->size; i++) {
			if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
				break;
			}
		}
		if (i == vnnmap->size) {
			DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
				  nodemap->nodes[j].pnn));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}
	}


	/* verify that all other nodes have the same vnnmap
	   and are from the same generation
	 */
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}

		ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					  mem_ctx, &remote_vnnmap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
				  nodemap->nodes[j].pnn));
			return;
		}

		/* verify the vnnmap generation is the same */
		if (vnnmap->generation != remote_vnnmap->generation) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
				  nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* verify the vnnmap size is the same */
		if (vnnmap->size != remote_vnnmap->size) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
				  nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* verify the vnnmap is the same */
		for (i=0;i<vnnmap->size;i++) {
			if (remote_vnnmap->map[i] != vnnmap->map[i]) {
				DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
					  nodemap->nodes[j].pnn));
				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
				do_recovery(rec, mem_ctx, pnn, nodemap,
					    vnnmap);
				return;
			}
		}
	}

	/* FIXME: Add remote public IP checking to ensure that nodes
	 * have the IP addresses that are allocated to them. */

takeover_run_checks:

	/* If there are IP takeover runs requested or the previous one
	 * failed then perform one and notify the waiters */
	if (!ctdb_op_is_disabled(rec->takeover_run) &&
	    (rec->reallocate_requests || rec->need_takeover_run)) {
		process_ipreallocate_requests(ctdb, rec);
	}
}
/* SIGTERM handler for the recovery daemon: release the recovery lock
 * (if held) before exiting so another node can take it over promptly. */
static void recd_sig_term_handler(struct tevent_context *ev,
				  struct tevent_signal *se, int signum,
				  int count, void *dont_care,
				  void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type_abort(
		private_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR, ("Received SIGTERM, exiting\n"));
	ctdb_recovery_unlock(rec);
	exit(0);
}
/*
  the main monitoring loop

  Allocates and initialises the ctdb_recoverd state, installs the
  SIGTERM handler and all SRVID message handlers, then runs main_loop()
  forever, sleeping so that iterations are spaced at least
  RecoverInterval apart.  Never returns.
 */
static void monitor_cluster(struct ctdb_context *ctdb)
{
	struct tevent_signal *se;
	struct ctdb_recoverd *rec;

	DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));

	rec = talloc_zero(ctdb, struct ctdb_recoverd);
	CTDB_NO_MEMORY_FATAL(ctdb, rec);

	rec->ctdb = ctdb;
	/* CTDB_UNKNOWN_PNN makes validate_recovery_master() force an
	 * initial election */
	rec->recmaster = CTDB_UNKNOWN_PNN;
	rec->recovery_lock_handle = NULL;

	rec->takeover_run = ctdb_op_init(rec, "takeover runs");
	CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);

	rec->recovery = ctdb_op_init(rec, "recoveries");
	CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);

	rec->priority_time = timeval_current();
	rec->frozen_on_inactive = false;

	se = tevent_add_signal(ctdb->ev, ctdb, SIGTERM, 0,
			       recd_sig_term_handler, rec);
	if (se == NULL) {
		DEBUG(DEBUG_ERR, ("Failed to install SIGTERM handler\n"));
		exit(1);
	}

	/* register a message port for sending memory dumps */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);

	/* when a node is assigned banning credits */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING,
					banning_handler, rec);

	/* register a message port for recovery elections */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec);

	/* when nodes are disabled/enabled */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);

	/* when we are asked to puch out a flag change */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);

	/* register a message port for vacuum fetch */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);

	/* register a message port for reloadnodes  */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);

	/* register a message port for performing a takeover run */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);

	/* register a message port for disabling the ip check for a short while */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);

	/* register a message port for forcing a rebalance of a node next
	   reallocation */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);

	/* Register a message port for disabling takeover runs */
	ctdb_client_set_message_handler(ctdb,
					CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
					disable_takeover_runs_handler, rec);

	/* Register a message port for disabling recoveries */
	ctdb_client_set_message_handler(ctdb,
					CTDB_SRVID_DISABLE_RECOVERIES,
					disable_recoveries_handler, rec);

	/* register a message port for detaching database */
	ctdb_client_set_message_handler(ctdb,
					CTDB_SRVID_DETACH_DATABASE,
					detach_database_handler, rec);

	for (;;) {
		TALLOC_CTX *mem_ctx = talloc_new(ctdb);
		struct timeval start;
		double elapsed;

		if (!mem_ctx) {
			DEBUG(DEBUG_CRIT,(__location__
					  " Failed to create temp context\n"));
			exit(-1);
		}

		start = timeval_current();
		main_loop(ctdb, rec, mem_ctx);
		talloc_free(mem_ctx);

		/* we only check for recovery once every second */
		elapsed = timeval_elapsed(&start);
		if (elapsed < ctdb->tunable.recover_interval) {
			ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
					  - elapsed);
		}
	}
}
/*
  event handler for when the main ctdbd dies

  Fires when the pipe from the parent becomes readable (i.e. the
  parent closed its end by exiting).  _exit() is used to avoid running
  atexit/cleanup handlers in the orphaned child.
 */
static void ctdb_recoverd_parent(struct tevent_context *ev,
				 struct tevent_fd *fde,
				 uint16_t flags, void *private_data)
{
	DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
	_exit(1);
}
/*
  called regularly to verify that the recovery daemon is still running

  Runs in the main ctdbd: if the recoverd child has died, schedule an
  immediate restart via ctdb_restart_recd(); otherwise re-arm this
  check for 30 seconds' time.
 */
static void ctdb_check_recd(struct tevent_context *ev,
			    struct tevent_timer *te,
			    struct timeval yt, void *p)
{
	struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);

	/* signal 0 probes for existence without sending a signal */
	if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
		DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));

		tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
				 ctdb_restart_recd, ctdb);

		return;
	}

	tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
			 timeval_current_ofs(30, 0),
			 ctdb_check_recd, ctdb);
}
/* SIGCHLD handler for the recovery daemon: reap all exited children
 * so they do not linger as zombies.  Loops until waitpid() reports no
 * more pending children (0) or an error. */
static void recd_sig_child_handler(struct tevent_context *ev,
				   struct tevent_signal *se, int signum,
				   int count, void *dont_care,
				   void *private_data)
{
//	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
	int status;
	pid_t pid = -1;

	while (pid != 0) {
		pid = waitpid(-1, &status, WNOHANG);
		if (pid == -1) {
			/* ECHILD just means there are no children left */
			if (errno != ECHILD) {
				DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
			}
			return;
		}
		if (pid > 0) {
			DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
		}
	}
}
3236 startup the recovery daemon as a child of the main ctdb daemon
3238 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3240 int fd[2];
3241 struct tevent_signal *se;
3242 struct tevent_fd *fde;
3243 int ret;
3245 if (pipe(fd) != 0) {
3246 return -1;
3249 ctdb->recoverd_pid = ctdb_fork(ctdb);
3250 if (ctdb->recoverd_pid == -1) {
3251 return -1;
3254 if (ctdb->recoverd_pid != 0) {
3255 talloc_free(ctdb->recd_ctx);
3256 ctdb->recd_ctx = talloc_new(ctdb);
3257 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
3259 close(fd[0]);
3260 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3261 timeval_current_ofs(30, 0),
3262 ctdb_check_recd, ctdb);
3263 return 0;
3266 close(fd[1]);
3268 srandom(getpid() ^ time(NULL));
3270 ret = logging_init(ctdb, NULL, NULL, "ctdb-recoverd");
3271 if (ret != 0) {
3272 return -1;
3275 prctl_set_comment("ctdb_recoverd");
3276 if (switch_from_server_to_client(ctdb) != 0) {
3277 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3278 exit(1);
3281 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
3283 fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
3284 ctdb_recoverd_parent, &fd[0]);
3285 tevent_fd_set_auto_close(fde);
3287 /* set up a handler to pick up sigchld */
3288 se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
3289 recd_sig_child_handler, ctdb);
3290 if (se == NULL) {
3291 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3292 exit(1);
3295 monitor_cluster(ctdb);
3297 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3298 return -1;
/*
  shutdown the recovery daemon

  Sends SIGTERM to the recoverd child (its handler releases the
  recovery lock and exits) and tears down the parent-side monitoring
  state.  A no-op if no recovery daemon was started.
 */
void ctdb_stop_recoverd(struct ctdb_context *ctdb)
{
	if (ctdb->recoverd_pid == 0) {
		return;
	}

	DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
	ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);

	/* Freeing recd_ctx also cancels the ctdb_check_recd timer */
	TALLOC_FREE(ctdb->recd_ctx);
	TALLOC_FREE(ctdb->recd_ping_count);
}
3317 static void ctdb_restart_recd(struct tevent_context *ev,
3318 struct tevent_timer *te,
3319 struct timeval t, void *private_data)
3321 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3323 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
3324 ctdb_stop_recoverd(ctdb);
3325 ctdb_start_recoverd(ctdb);