ctdb-recoverd: Simplify using TALLOC_FREE()
ctdb/server/ctdb_recoverd.c
/*
   ctdb recovery daemon

   Copyright (C) Ronnie Sahlberg 2007

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "includes.h"
#include "system/filesys.h"
#include "system/time.h"
#include "system/network.h"
#include "system/wait.h"
#include "popt.h"
#include "cmdline.h"
#include "../include/ctdb_client.h"
#include "../include/ctdb_private.h"
#include "lib/tdb_wrap/tdb_wrap.h"
#include "lib/util/dlinklist.h"
/* List of SRVID requests that need to be processed */
struct srvid_list {
	struct srvid_list *next, *prev;
	struct srvid_request *request;
};

struct srvid_requests {
	struct srvid_list *requests;
};
static void srvid_request_reply(struct ctdb_context *ctdb,
				struct srvid_request *request,
				TDB_DATA result)
{
	/* Someone that sent srvid==0 does not want a reply */
	if (request->srvid == 0) {
		talloc_free(request);
		return;
	}

	if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
				     result) == 0) {
		DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
				  (unsigned)request->pnn,
				  (unsigned long long)request->srvid));
	} else {
		DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
				 (unsigned)request->pnn,
				 (unsigned long long)request->srvid));
	}

	talloc_free(request);
}
static void srvid_requests_reply(struct ctdb_context *ctdb,
				 struct srvid_requests **requests,
				 TDB_DATA result)
{
	struct srvid_list *r;

	for (r = (*requests)->requests; r != NULL; r = r->next) {
		srvid_request_reply(ctdb, r->request, result);
	}

	/* Free the list structure... */
	TALLOC_FREE(*requests);
}
static void srvid_request_add(struct ctdb_context *ctdb,
			      struct srvid_requests **requests,
			      struct srvid_request *request)
{
	struct srvid_list *t;
	int32_t ret;
	TDB_DATA result;

	if (*requests == NULL) {
		*requests = talloc_zero(ctdb, struct srvid_requests);
		if (*requests == NULL) {
			goto nomem;
		}
	}

	t = talloc_zero(*requests, struct srvid_list);
	if (t == NULL) {
		/* If *requests was just allocated above then free it */
		if ((*requests)->requests == NULL) {
			TALLOC_FREE(*requests);
		}
		goto nomem;
	}

	t->request = (struct srvid_request *)talloc_steal(t, request);
	DLIST_ADD((*requests)->requests, t);

	return;

nomem:
	/* Failed to add the request to the list.  Send a fail. */
	DEBUG(DEBUG_ERR, (__location__
			  " Out of memory, failed to queue SRVID request\n"));
	ret = -ENOMEM;
	result.dsize = sizeof(ret);
	result.dptr = (uint8_t *)&ret;
	srvid_request_reply(ctdb, request, result);
}
/* An abstraction to allow an operation (takeover runs, recoveries,
 * ...) to be disabled for a given timeout */
struct ctdb_op_state {
	struct tevent_timer *timer;
	bool in_progress;
	const char *name;
};

static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
{
	struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);

	if (state != NULL) {
		state->in_progress = false;
		state->name = name;
	}

	return state;
}
static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
{
	return state->timer != NULL;
}

static bool ctdb_op_begin(struct ctdb_op_state *state)
{
	if (ctdb_op_is_disabled(state)) {
		DEBUG(DEBUG_NOTICE,
		      ("Unable to begin - %s are disabled\n", state->name));
		return false;
	}

	state->in_progress = true;
	return true;
}

static bool ctdb_op_end(struct ctdb_op_state *state)
{
	state->in_progress = false;
	return state->in_progress;
}

static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
{
	return state->in_progress;
}
static void ctdb_op_enable(struct ctdb_op_state *state)
{
	TALLOC_FREE(state->timer);
}

static void ctdb_op_timeout_handler(struct event_context *ev,
				    struct timed_event *te,
				    struct timeval yt, void *p)
{
	struct ctdb_op_state *state =
		talloc_get_type(p, struct ctdb_op_state);

	DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
	ctdb_op_enable(state);
}
static int ctdb_op_disable(struct ctdb_op_state *state,
			   struct tevent_context *ev,
			   uint32_t timeout)
{
	if (timeout == 0) {
		DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
		ctdb_op_enable(state);
		return 0;
	}

	if (state->in_progress) {
		DEBUG(DEBUG_ERR,
		      ("Unable to disable %s - in progress\n", state->name));
		return -EAGAIN;
	}

	DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
			    state->name, timeout));

	/* Clear any old timers */
	talloc_free(state->timer);

	/* Arrange for the timeout to occur */
	state->timer = tevent_add_timer(ev, state,
					timeval_current_ofs(timeout, 0),
					ctdb_op_timeout_handler, state);
	if (state->timer == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
		return -ENOMEM;
	}

	return 0;
}
struct ctdb_banning_state {
	uint32_t count;
	struct timeval last_reported_time;
};

/*
  private state of recovery daemon
 */
struct ctdb_recoverd {
	struct ctdb_context *ctdb;
	uint32_t recmaster;
	uint32_t last_culprit_node;
	struct ctdb_node_map *nodemap;
	struct timeval priority_time;
	bool need_takeover_run;
	bool need_recovery;
	uint32_t node_flags;
	struct timed_event *send_election_te;
	struct timed_event *election_timeout;
	struct vacuum_info *vacuum_info;
	struct srvid_requests *reallocate_requests;
	struct ctdb_op_state *takeover_run;
	struct ctdb_op_state *recovery;
	struct ctdb_control_get_ifaces *ifaces;
	uint32_t *force_rebalance_nodes;
	struct ctdb_node_capabilities *caps;
};

#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
#define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)

static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data);
/*
  ban a node for a period of time
 */
static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
{
	int ret;
	struct ctdb_context *ctdb = rec->ctdb;
	struct ctdb_ban_time bantime;

	if (!ctdb_validate_pnn(ctdb, pnn)) {
		DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
		return;
	}

	DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));

	bantime.pnn  = pnn;
	bantime.time = ban_time;

	ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
		return;
	}
}

enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
/*
  remember the troublemaker
 */
static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
{
	struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
	struct ctdb_banning_state *ban_state;

	/* culprit indexes ctdb->nodes[], so anything >= num_nodes is invalid */
	if (culprit >= ctdb->num_nodes) {
		DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
		return;
	}

	/* If we are banned or stopped, do not set other nodes as culprits */
	if (rec->node_flags & NODE_FLAGS_INACTIVE) {
		DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
		return;
	}

	if (ctdb->nodes[culprit]->ban_state == NULL) {
		ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
		CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
	}
	ban_state = ctdb->nodes[culprit]->ban_state;
	if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
		/* this was the first time in a long while this node
		   misbehaved so we will forgive any old transgressions.
		*/
		ban_state->count = 0;
	}

	ban_state->count += count;
	ban_state->last_reported_time = timeval_current();
	rec->last_culprit_node = culprit;
}

/*
  remember the troublemaker
 */
static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
{
	ctdb_set_culprit_count(rec, culprit, 1);
}
/* this callback is called for every node that failed to execute the
   recovered event
*/
static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));

	ctdb_set_culprit(rec, node_pnn);
}

/*
  run the "recovered" eventscript on all nodes
 */
static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, const char *caller)
{
	TALLOC_CTX *tmp_ctx;
	uint32_t *nodes;
	struct ctdb_context *ctdb = rec->ctdb;

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
					nodes, 0,
					CONTROL_TIMEOUT(), false, tdb_null,
					NULL, recovered_fail_callback,
					rec) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));

		talloc_free(tmp_ctx);
		return -1;
	}

	talloc_free(tmp_ctx);
	return 0;
}
/* this callback is called for every node that failed to execute the
   start recovery event
*/
static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));

	ctdb_set_culprit(rec, node_pnn);
}

/*
  run the "startrecovery" eventscript on all nodes
 */
static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
{
	TALLOC_CTX *tmp_ctx;
	uint32_t *nodes;
	struct ctdb_context *ctdb = rec->ctdb;

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
					nodes, 0,
					CONTROL_TIMEOUT(), false, tdb_null,
					NULL,
					startrecovery_fail_callback,
					rec) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	talloc_free(tmp_ctx);
	return 0;
}
/*
  update the node capabilities for all connected nodes
 */
static int update_capabilities(struct ctdb_recoverd *rec,
			       struct ctdb_node_map *nodemap)
{
	uint32_t *capp;
	TALLOC_CTX *tmp_ctx;
	struct ctdb_node_capabilities *caps;
	struct ctdb_context *ctdb = rec->ctdb;

	tmp_ctx = talloc_new(rec);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	caps = ctdb_get_capabilities(ctdb, tmp_ctx,
				     CONTROL_TIMEOUT(), nodemap);

	if (caps == NULL) {
		DEBUG(DEBUG_ERR,
		      (__location__ " Failed to get node capabilities\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
	if (capp == NULL) {
		DEBUG(DEBUG_ERR,
		      (__location__
		       " Capabilities don't include current node.\n"));
		talloc_free(tmp_ctx);
		return -1;
	}
	ctdb->capabilities = *capp;

	TALLOC_FREE(rec->caps);
	rec->caps = talloc_steal(rec, caps);

	talloc_free(tmp_ctx);
	return 0;
}
static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
	ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
}

static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
	ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
}
/*
  change recovery mode on all nodes
 */
static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
{
	TDB_DATA data;
	uint32_t *nodes;
	TALLOC_CTX *tmp_ctx;

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);

	data.dsize = sizeof(uint32_t);
	data.dptr = (unsigned char *)&rec_mode;

	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
					nodes, 0,
					CONTROL_TIMEOUT(),
					false, data,
					NULL, NULL,
					NULL) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	/* freeze all nodes */
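	/* Databases are frozen one priority level at a time; in the
	 * FREEZE control below, the otherwise-unused srvid argument of
	 * ctdb_client_async_control() carries the priority level.
	 */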
	if (rec_mode == CTDB_RECOVERY_ACTIVE) {
		int i;

		for (i=1; i<=NUM_DB_PRIORITIES; i++) {
			if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
						nodes, i,
						CONTROL_TIMEOUT(),
						false, tdb_null,
						NULL,
						set_recmode_fail_callback,
						rec) != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
				talloc_free(tmp_ctx);
				return -1;
			}
		}
	}

	talloc_free(tmp_ctx);
	return 0;
}
/*
  change recovery master on all nodes
 */
static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
{
	TDB_DATA data;
	TALLOC_CTX *tmp_ctx;
	uint32_t *nodes;

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	data.dsize = sizeof(uint32_t);
	data.dptr = (unsigned char *)&pnn;

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
					nodes, 0,
					CONTROL_TIMEOUT(), false, data,
					NULL, NULL,
					NULL) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	talloc_free(tmp_ctx);
	return 0;
}
/* update all remote nodes to use the same db priority that we have.
   This can fail if the remote node has not yet been upgraded to
   support this function, so we always return success and never fail
   a recovery if this call fails.
*/
static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
					      struct ctdb_node_map *nodemap,
					      uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
{
	int db;

	/* step through all local databases */
	for (db=0; db<dbmap->num;db++) {
		struct ctdb_db_priority db_prio;
		int ret;

		db_prio.db_id = dbmap->dbs[db].dbid;
		ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
			continue;
		}

		DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));

		ret = ctdb_ctrl_set_db_priority(ctdb, CONTROL_TIMEOUT(),
						CTDB_CURRENT_NODE, &db_prio);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n",
					 db_prio.db_id));
		}
	}

	return 0;
}
/*
  ensure all other nodes have attached to any databases that we have
 */
static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
					   uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
{
	int i, j, db, ret;
	struct ctdb_dbid_map *remote_dbmap;

	/* verify that all other nodes have all our databases */
	for (j=0; j<nodemap->num; j++) {
		/* we don't need to check ourselves */
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}
		/* don't check nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					 mem_ctx, &remote_dbmap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
			return -1;
		}

		/* step through all local databases */
		for (db=0; db<dbmap->num;db++) {
			const char *name;

			for (i=0;i<remote_dbmap->num;i++) {
				if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
					break;
				}
			}
			/* the remote node already has this database */
			if (i!=remote_dbmap->num) {
				continue;
			}
			/* ok so we need to create this database */
			ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
						  dbmap->dbs[db].dbid, mem_ctx,
						  &name);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
				return -1;
			}
			ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
						 nodemap->nodes[j].pnn,
						 mem_ctx, name,
						 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
				return -1;
			}
		}
	}

	return 0;
}
/*
  ensure we are attached to any databases that anyone else is attached to
 */
static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
					  uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
{
	int i, j, db, ret;
	struct ctdb_dbid_map *remote_dbmap;

	/* verify that we have all databases any other node has */
	for (j=0; j<nodemap->num; j++) {
		/* we don't need to check ourselves */
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}
		/* don't check nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					 mem_ctx, &remote_dbmap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
			return -1;
		}

		/* step through all databases on the remote node */
		for (db=0; db<remote_dbmap->num;db++) {
			const char *name;

			for (i=0;i<(*dbmap)->num;i++) {
				if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
					break;
				}
			}
			/* we already have this db locally */
			if (i!=(*dbmap)->num) {
				continue;
			}
			/* ok so we need to create this database and
			   rebuild dbmap
			 */
			ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
						  remote_dbmap->dbs[db].dbid, mem_ctx, &name);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
					  nodemap->nodes[j].pnn));
				return -1;
			}
			ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
						 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
				return -1;
			}
			ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
				return -1;
			}
		}
	}

	return 0;
}
/*
  pull the remote database contents from one node into the recdb
 */
static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
				    struct tdb_wrap *recdb, uint32_t dbid)
{
	int ret;
	TDB_DATA outdata;
	struct ctdb_marshall_buffer *reply;
	struct ctdb_rec_data *recdata;
	int i;
	TALLOC_CTX *tmp_ctx = talloc_new(recdb);

	ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
			       CONTROL_TIMEOUT(), &outdata);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
		talloc_free(tmp_ctx);
		return -1;
	}

	reply = (struct ctdb_marshall_buffer *)outdata.dptr;

	if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
		DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	recdata = (struct ctdb_rec_data *)&reply->data[0];

	for (i=0;
	     i<reply->count;
	     recdata = (struct ctdb_rec_data *)(recdata->length + (uint8_t *)recdata), i++) {
		TDB_DATA key, data;
		struct ctdb_ltdb_header *hdr;
		TDB_DATA existing;

		key.dptr = &recdata->data[0];
		key.dsize = recdata->keylen;
		data.dptr = &recdata->data[key.dsize];
		data.dsize = recdata->datalen;

		hdr = (struct ctdb_ltdb_header *)data.dptr;

		if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
			DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
			talloc_free(tmp_ctx);
			return -1;
		}

		/* fetch the existing record, if any */
		existing = tdb_fetch(recdb->tdb, key);

		if (existing.dptr != NULL) {
			struct ctdb_ltdb_header header;
			if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
				DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
					 (unsigned)existing.dsize, srcnode));
				free(existing.dptr);
				talloc_free(tmp_ctx);
				return -1;
			}
			header = *(struct ctdb_ltdb_header *)existing.dptr;
			free(existing.dptr);
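			/* Keep the existing copy unless the pulled record
			 * is newer: a strictly higher RSN wins, and an
			 * equal RSN wins only when the existing copy is
			 * not mastered by this node.
			 */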
			if (!(header.rsn < hdr->rsn ||
			      (header.dmaster != ctdb_get_pnn(ctdb) &&
			       header.rsn == hdr->rsn))) {
				continue;
			}
		}

		if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
			DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
			talloc_free(tmp_ctx);
			return -1;
		}
	}

	talloc_free(tmp_ctx);

	return 0;
}
struct pull_seqnum_cbdata {
	int failed;
	uint32_t pnn;
	uint64_t seqnum;
};

static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
	uint64_t seqnum;

	if (cb_data->failed != 0) {
		DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
		return;
	}

	if (res != 0) {
		DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
		cb_data->failed = 1;
		return;
	}

	if (outdata.dsize != sizeof(uint64_t)) {
		DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
		cb_data->failed = -1;
		return;
	}

	seqnum = *((uint64_t *)outdata.dptr);

	if (seqnum > cb_data->seqnum ||
	    (cb_data->pnn == -1 && seqnum == 0)) {
		cb_data->seqnum = seqnum;
		cb_data->pnn = node_pnn;
	}
}

static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);

	DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
	cb_data->failed = 1;
}
static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
				   struct ctdb_recoverd *rec,
				   struct ctdb_node_map *nodemap,
				   struct tdb_wrap *recdb, uint32_t dbid)
{
	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
	uint32_t *nodes;
	TDB_DATA data;
	uint32_t outdata[2];
	struct pull_seqnum_cbdata *cb_data;

	DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));
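
	/* The GET_DB_SEQNUM control takes a 64-bit database id on the
	 * wire, so the 32-bit db id is padded out to two words here.
	 */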
	outdata[0] = dbid;
	outdata[1] = 0;

	data.dsize = sizeof(outdata);
	data.dptr = (uint8_t *)&outdata[0];

	cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
	if (cb_data == NULL) {
		DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	cb_data->failed = 0;
	cb_data->pnn    = -1;
	cb_data->seqnum = 0;

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
				nodes, 0,
				CONTROL_TIMEOUT(), false, data,
				pull_seqnum_cb,
				pull_seqnum_fail_cb,
				cb_data) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));

		talloc_free(tmp_ctx);
		return -1;
	}

	if (cb_data->failed != 0) {
		DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
		talloc_free(tmp_ctx);
		return -1;
	}

	if (cb_data->pnn == -1) {
		DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
		talloc_free(tmp_ctx);
		return -1;
	}

	DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum));

	if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
		DEBUG(DEBUG_ERR, ("Failed to pull highest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
		talloc_free(tmp_ctx);
		return -1;
	}

	talloc_free(tmp_ctx);
	return 0;
}
/*
  pull all the remote database contents into the recdb
 */
static int pull_remote_database(struct ctdb_context *ctdb,
				struct ctdb_recoverd *rec,
				struct ctdb_node_map *nodemap,
				struct tdb_wrap *recdb, uint32_t dbid,
				bool persistent)
{
	int j;
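
	/* A persistent database can be recovered wholesale from the
	 * single node with the highest sequence number instead of
	 * merging records from all nodes; fall through to the
	 * record-by-record merge if that fails.
	 */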
	if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
		int ret;
		ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
		if (ret == 0) {
			return 0;
		}
	}

	/* pull all records from all other nodes across onto this node
	   (this merges based on rsn)
	*/
	for (j=0; j<nodemap->num; j++) {
		/* don't merge from nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
				 nodemap->nodes[j].pnn));
			ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
			return -1;
		}
	}

	return 0;
}
/*
  update flags on all active nodes
 */
static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
{
	int ret;

	ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
		return -1;
	}

	return 0;
}
/*
  ensure all nodes have the same vnnmap we do
 */
static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
				      uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
{
	int j, ret;

	/* push the new vnn map out to all the nodes */
	for (j=0; j<nodemap->num; j++) {
		/* don't push to nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
			return -1;
		}
	}

	return 0;
}
struct vacuum_info {
	struct vacuum_info *next, *prev;
	struct ctdb_recoverd *rec;
	uint32_t srcnode;
	struct ctdb_db_context *ctdb_db;
	struct ctdb_marshall_buffer *recs;
	struct ctdb_rec_data *r;
};

static void vacuum_fetch_next(struct vacuum_info *v);

/*
  called when a vacuum fetch has completed - just free it and do the next one
 */
static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
{
	talloc_free(state);
}
/*
  process the next element from the vacuum list
 */
static void vacuum_fetch_next(struct vacuum_info *v)
{
	struct ctdb_call call;
	struct ctdb_rec_data *r;

	while (v->recs->count) {
		struct ctdb_client_call_state *state;
		TDB_DATA data;
		struct ctdb_ltdb_header *hdr;
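
		/* A CTDB_NULL_FUNC call runs no function; with the
		 * migration flags set its only effect is to migrate
		 * the record onto this node, which is all the
		 * vacuuming fetch needs.
		 */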
		ZERO_STRUCT(call);
		call.call_id = CTDB_NULL_FUNC;
		call.flags = CTDB_IMMEDIATE_MIGRATION;
		call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;

		r = v->r;
		v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
		v->recs->count--;

		call.key.dptr = &r->data[0];
		call.key.dsize = r->keylen;

		/* ensure we don't block this daemon - just skip a record if we can't get
		   the chainlock */
		if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
			continue;
		}

		data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
		if (data.dptr == NULL) {
			tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
			continue;
		}

		if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
			free(data.dptr);
			tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
			continue;
		}

		hdr = (struct ctdb_ltdb_header *)data.dptr;
		if (hdr->dmaster == v->rec->ctdb->pnn) {
			/* it's already local */
			free(data.dptr);
			tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
			continue;
		}

		free(data.dptr);

		state = ctdb_call_send(v->ctdb_db, &call);
		tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
		if (state == NULL) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
			talloc_free(v);
			return;
		}
		state->async.fn = vacuum_fetch_callback;
		state->async.private_data = NULL;
	}

	talloc_free(v);
}
/*
  destroy a vacuum info structure
 */
static int vacuum_info_destructor(struct vacuum_info *v)
{
	DLIST_REMOVE(v->rec->vacuum_info, v);
	return 0;
}
/*
  handler for vacuum fetch
*/
static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
				 TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
	struct ctdb_marshall_buffer *recs;
	int ret, i;
	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
	const char *name;
	struct ctdb_dbid_map *dbmap=NULL;
	bool persistent = false;
	struct ctdb_db_context *ctdb_db;
	struct ctdb_rec_data *r;
	uint32_t srcnode;
	struct vacuum_info *v;

	recs = (struct ctdb_marshall_buffer *)data.dptr;
	r = (struct ctdb_rec_data *)&recs->data[0];

	if (recs->count == 0) {
		talloc_free(tmp_ctx);
		return;
	}
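
	/* The requesting node's pnn is carried in the reqid field of
	 * the first marshalled record.
	 */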
	srcnode = r->reqid;

	for (v=rec->vacuum_info;v;v=v->next) {
		if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
			/* we're already working on records from this node */
			talloc_free(tmp_ctx);
			return;
		}
	}

	/* work out if the database is persistent */
	ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
		talloc_free(tmp_ctx);
		return;
	}

	for (i=0;i<dbmap->num;i++) {
		if (dbmap->dbs[i].dbid == recs->db_id) {
			persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
			break;
		}
	}
	if (i == dbmap->num) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
		talloc_free(tmp_ctx);
		return;
	}

	/* find the name of this database */
	if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
		talloc_free(tmp_ctx);
		return;
	}

	/* attach to it */
	ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
	if (ctdb_db == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
		talloc_free(tmp_ctx);
		return;
	}

	v = talloc_zero(rec, struct vacuum_info);
	if (v == NULL) {
		DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
		talloc_free(tmp_ctx);
		return;
	}

	v->rec = rec;
	v->srcnode = srcnode;
	v->ctdb_db = ctdb_db;
	v->recs = talloc_memdup(v, recs, data.dsize);
	if (v->recs == NULL) {
		DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
		talloc_free(v);
		talloc_free(tmp_ctx);
		return;
	}
	v->r = (struct ctdb_rec_data *)&v->recs->data[0];

	DLIST_ADD(rec->vacuum_info, v);

	talloc_set_destructor(v, vacuum_info_destructor);

	vacuum_fetch_next(v);
	talloc_free(tmp_ctx);
}
/*
 * handler for database detach
 */
static void detach_database_handler(struct ctdb_context *ctdb, uint64_t srvid,
				    TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(private_data,
						    struct ctdb_recoverd);
	uint32_t db_id;
	struct vacuum_info *v, *vnext;
	struct ctdb_db_context *ctdb_db;

	if (data.dsize != sizeof(db_id)) {
		return;
	}
	db_id = *(uint32_t *)data.dptr;

	ctdb_db = find_ctdb_db(ctdb, db_id);
	if (ctdb_db == NULL) {
		/* database is not attached */
		return;
	}

	/* Stop any active vacuum fetch */
	v = rec->vacuum_info;
	while (v != NULL) {
		vnext = v->next;

		if (v->ctdb_db->db_id == db_id) {
			talloc_free(v);
		}
		v = vnext;
	}

	DLIST_REMOVE(ctdb->db_list, ctdb_db);

	DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
			     ctdb_db->db_name));
	talloc_free(ctdb_db);
}
/*
  called when ctdb_wait_timeout should finish
 */
static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
			      struct timeval yt, void *p)
{
	uint32_t *timed_out = (uint32_t *)p;
	(*timed_out) = 1;
}

/*
  wait for a given number of seconds
 */
static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
{
	uint32_t timed_out = 0;
	time_t usecs = (secs - (time_t)secs) * 1000000;
	event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), ctdb_wait_handler, &timed_out);
	while (!timed_out) {
		event_loop_once(ctdb->ev);
	}
}
/*
  called when an election times out (ends)
 */
static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
				  struct timeval t, void *p)
{
	struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
	rec->election_timeout = NULL;
	fast_start = false;

	DEBUG(DEBUG_WARNING,("Election period ended\n"));
}


/*
  wait for an election to finish. It finished election_timeout seconds after
  the last election packet is received
 */
static void ctdb_wait_election(struct ctdb_recoverd *rec)
{
	struct ctdb_context *ctdb = rec->ctdb;
	while (rec->election_timeout) {
		event_loop_once(ctdb->ev);
	}
}
/*
  Update our local flags from all remote connected nodes.
  This is only run when we are, or believe we are, the recovery master.
 */
static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
{
	int j;
	struct ctdb_context *ctdb = rec->ctdb;
	TALLOC_CTX *mem_ctx = talloc_new(ctdb);

	/* get the nodemap for all active remote nodes and verify
	   they are the same as for this node
	 */
	for (j=0; j<nodemap->num; j++) {
		struct ctdb_node_map *remote_nodemap=NULL;
		int ret;

		if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
			continue;
		}
		if (nodemap->nodes[j].pnn == ctdb->pnn) {
			continue;
		}

		ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					   mem_ctx, &remote_nodemap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
				  nodemap->nodes[j].pnn));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			talloc_free(mem_ctx);
			return MONITOR_FAILED;
		}
		if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
			/* We should tell our daemon about this so it
			   updates its flags or else we will log the same
			   message again in the next iteration of recovery.
			   Since we are the recovery master we can just as
			   well update the flags on all nodes.
			*/
			ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
				talloc_free(mem_ctx);
				return -1;
			}

			/* Update our local copy of the flags in the recovery
			   daemon.
			*/
			DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
				 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
				 nodemap->nodes[j].flags));
			nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
		}
		talloc_free(remote_nodemap);
	}
	talloc_free(mem_ctx);
	return MONITOR_OK;
}
/* Create a new random generation id.
   The generation id cannot be the INVALID_GENERATION id
*/
static uint32_t new_generation(void)
{
	uint32_t generation;

	while (1) {
		generation = random();

		if (generation != INVALID_GENERATION) {
			break;
		}
	}

	return generation;
}
1362 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1364 char *name;
1365 struct tdb_wrap *recdb;
1366 unsigned tdb_flags;
1368 /* open up the temporary recovery database */
1369 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1370 ctdb->db_directory_state,
1371 ctdb->pnn);
1372 if (name == NULL) {
1373 return NULL;
1375 unlink(name);
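
	/* Only the recovery daemon ever opens this scratch database,
	 * so tdb locking can safely be disabled; TDB_NOMMAP under
	 * valgrind keeps all accesses visible to the memory checker.
	 */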
	tdb_flags = TDB_NOLOCK;
	if (ctdb->valgrinding) {
		tdb_flags |= TDB_NOMMAP;
	}
	tdb_flags |= (TDB_INCOMPATIBLE_HASH | TDB_DISALLOW_NESTING);

	recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
			      tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
	if (recdb == NULL) {
		DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
	}

	talloc_free(name);

	return recdb;
}
/*
  a traverse function for pulling all relevant records from recdb
 */
struct recdb_data {
	struct ctdb_context *ctdb;
	struct ctdb_marshall_buffer *recdata;
	uint32_t len;
	uint32_t allocated_len;
	bool failed;
	bool persistent;
};
static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
{
	struct recdb_data *params = (struct recdb_data *)p;
	struct ctdb_rec_data *recdata;
	struct ctdb_ltdb_header *hdr;

	/*
	 * skip empty records - but NOT for persistent databases:
	 *
	 * The record-by-record mode of recovery deletes empty records.
	 * For persistent databases, this can lead to data corruption
	 * by deleting records that should be there:
	 *
	 * - Assume the cluster has been running for a while.
	 *
	 * - A record R in a persistent database has been created and
	 *   deleted a couple of times, the last operation being deletion,
	 *   leaving an empty record with a high RSN, say 10.
	 *
	 * - Now a node N is turned off.
	 *
	 * - This leaves the local database copy of D on N with the empty
	 *   copy of R and RSN 10. On all other nodes, the recovery has deleted
	 *   the copy of record R.
	 *
	 * - Now the record is created again while node N is turned off.
	 *   This creates R with RSN = 1 on all nodes except for N.
	 *
	 * - Now node N is turned on again. The following recovery will choose
	 *   the older empty copy of R due to RSN 10 > RSN 1.
	 *
	 * ==> Hence the record is gone after the recovery.
	 *
	 * On databases like Samba's registry, this can damage the higher-level
	 * data structures built from the various tdb-level records.
	 */
	if (!params->persistent && data.dsize <= sizeof(struct ctdb_ltdb_header)) {
		return 0;
	}

	/* update the dmaster field to point to us */
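	/* ... but only for volatile databases - persistent databases
	 * keep their record headers unchanged.
	 */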
	hdr = (struct ctdb_ltdb_header *)data.dptr;
	if (!params->persistent) {
		hdr->dmaster = params->ctdb->pnn;
		hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
	}

	/* add the record to the blob ready to send to the nodes */
	recdata = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
	if (recdata == NULL) {
		params->failed = true;
		return -1;
	}
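	/* Grow the marshall buffer in pulldb_preallocation_size chunks
	 * so that appending records does not realloc on every record.
	 */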
	if (params->len + recdata->length >= params->allocated_len) {
		params->allocated_len = recdata->length + params->len + params->ctdb->tunable.pulldb_preallocation_size;
		params->recdata = talloc_realloc_size(NULL, params->recdata, params->allocated_len);
	}
	if (params->recdata == NULL) {
		DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u\n",
			 recdata->length + params->len));
		params->failed = true;
		return -1;
	}
	params->recdata->count++;
	memcpy(params->len+(uint8_t *)params->recdata, recdata, recdata->length);
	params->len += recdata->length;
	talloc_free(recdata);

	return 0;
}
/*
  push the recdb database out to all nodes
 */
static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
			       bool persistent,
			       struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
{
	struct recdb_data params;
	struct ctdb_marshall_buffer *recdata;
	TDB_DATA outdata;
	TALLOC_CTX *tmp_ctx;
	uint32_t *nodes;

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
	CTDB_NO_MEMORY(ctdb, recdata);

	recdata->db_id = dbid;

	params.ctdb = ctdb;
	params.recdata = recdata;
	params.len = offsetof(struct ctdb_marshall_buffer, data);
	params.allocated_len = params.len;
	params.failed = false;
	params.persistent = persistent;

	if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
		talloc_free(params.recdata);
		talloc_free(tmp_ctx);
		return -1;
	}

	if (params.failed) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
		talloc_free(params.recdata);
		talloc_free(tmp_ctx);
		return -1;
	}

	recdata = params.recdata;

	outdata.dptr = (void *)recdata;
	outdata.dsize = params.len;

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
					nodes, 0,
					CONTROL_TIMEOUT(), false, outdata,
					NULL, NULL,
					NULL) != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
		talloc_free(recdata);
		talloc_free(tmp_ctx);
		return -1;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
			     dbid, recdata->count));

	talloc_free(recdata);
	talloc_free(tmp_ctx);

	return 0;
}
/*
  go through a full recovery on one database
 */
static int recover_database(struct ctdb_recoverd *rec,
			    TALLOC_CTX *mem_ctx,
			    uint32_t dbid,
			    bool persistent,
			    uint32_t pnn,
			    struct ctdb_node_map *nodemap,
			    uint32_t transaction_id)
{
	struct tdb_wrap *recdb;
	int ret;
	struct ctdb_context *ctdb = rec->ctdb;
	TDB_DATA data;
	struct ctdb_control_wipe_database w;
	uint32_t *nodes;

	recdb = create_recdb(ctdb, mem_ctx);
	if (recdb == NULL) {
		return -1;
	}

	/* pull all remote databases onto the recdb */
	ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
		return -1;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));

	/* wipe all the remote databases. This is safe as we are in a transaction */
	w.db_id = dbid;
	w.transaction_id = transaction_id;

	data.dptr = (void *)&w;
	data.dsize = sizeof(w);

	nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
					nodes, 0,
					CONTROL_TIMEOUT(), false, data,
					NULL, NULL,
					NULL) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
		talloc_free(recdb);
		return -1;
	}

	/* push out the correct database. This sets the dmaster and skips
	   the empty records */
	ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
	if (ret != 0) {
		talloc_free(recdb);
		return -1;
	}

	/* all done with this database */
	talloc_free(recdb);

	return 0;
}
static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
					 struct ctdb_recoverd *rec,
					 struct ctdb_node_map *nodemap,
					 uint32_t *culprit)
{
	int j;
	int ret;
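
	/* Two lists are refreshed from every active node: the public
	 * IPs it knows about and the subset it can actually host; any
	 * failure is reported through *culprit.
	 */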
	if (ctdb->num_nodes != nodemap->num) {
		DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
				  ctdb->num_nodes, nodemap->num));
		if (culprit) {
			*culprit = ctdb->pnn;
		}
		return -1;
	}

	for (j=0; j<nodemap->num; j++) {
		/* For readability */
		struct ctdb_node *node = ctdb->nodes[j];

		/* release any existing data */
		if (node->known_public_ips) {
			talloc_free(node->known_public_ips);
			node->known_public_ips = NULL;
		}
		if (node->available_public_ips) {
			talloc_free(node->available_public_ips);
			node->available_public_ips = NULL;
		}

		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		/* Retrieve the list of known public IPs from the node */
		ret = ctdb_ctrl_get_public_ips_flags(ctdb,
					CONTROL_TIMEOUT(),
					node->pnn,
					ctdb->nodes,
					0,
					&node->known_public_ips);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,
			      ("Failed to read known public IPs from node: %u\n",
			       node->pnn));
			if (culprit) {
				*culprit = node->pnn;
			}
			return -1;
		}

		if (ctdb->do_checkpublicip &&
		    !ctdb_op_is_disabled(rec->takeover_run) &&
		    verify_remote_ip_allocation(ctdb,
						node->known_public_ips,
						node->pnn)) {
			DEBUG(DEBUG_ERR,("Trigger IP reallocation\n"));
			rec->need_takeover_run = true;
		}

		/* Retrieve the list of available public IPs from the node */
		ret = ctdb_ctrl_get_public_ips_flags(ctdb,
					CONTROL_TIMEOUT(),
					node->pnn,
					ctdb->nodes,
					CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
					&node->available_public_ips);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,
			      ("Failed to read available public IPs from node: %u\n",
			       node->pnn));
			if (culprit) {
				*culprit = node->pnn;
			}
			return -1;
		}
	}

	return 0;
}
/* when we start a recovery, make sure all nodes use the same reclock file
   setting
*/
static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
{
	struct ctdb_context *ctdb = rec->ctdb;
	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
	TDB_DATA data;
	uint32_t *nodes;

	if (ctdb->recovery_lock_file == NULL) {
		data.dptr  = NULL;
		data.dsize = 0;
	} else {
		data.dsize = strlen(ctdb->recovery_lock_file) + 1;
		data.dptr  = (uint8_t *)ctdb->recovery_lock_file;
	}

	nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
					nodes, 0,
					CONTROL_TIMEOUT(),
					false, data,
					NULL, NULL,
					rec) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	talloc_free(tmp_ctx);
	return 0;
}
/*
 * this callback is called for every node that failed to execute ctdb_takeover_run()
 * and sets a flag to re-run the takeover run.
 */
static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	DEBUG(DEBUG_ERR, ("Node %u failed the takeover run\n", node_pnn));

	if (callback_data != NULL) {
		struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);

		DEBUG(DEBUG_ERR, ("Setting node %u as recovery fail culprit\n", node_pnn));

		ctdb_set_culprit(rec, node_pnn);
	}
}
static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
{
	struct ctdb_context *ctdb = rec->ctdb;
	int i;
	struct ctdb_banning_state *ban_state;
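
	/* A node is banned once it has collected banning credits
	 * equal to twice the cluster's node count.
	 */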
	*self_ban = false;
	for (i=0; i<ctdb->num_nodes; i++) {
		if (ctdb->nodes[i]->ban_state == NULL) {
			continue;
		}
		ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
		if (ban_state->count < 2*ctdb->num_nodes) {
			continue;
		}

		DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
			ctdb->nodes[i]->pnn, ban_state->count,
			ctdb->tunable.recovery_ban_period));
		ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
		ban_state->count = 0;

		/* Banning ourselves? */
		if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
			*self_ban = true;
		}
	}
}
static bool do_takeover_run(struct ctdb_recoverd *rec,
			    struct ctdb_node_map *nodemap,
			    bool banning_credits_on_fail)
{
	uint32_t *nodes = NULL;
	struct srvid_request_data dtr;
	TDB_DATA data;
	int i;
	uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
	int ret;
	bool ok;

	DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));

	if (ctdb_op_is_in_progress(rec->takeover_run)) {
		DEBUG(DEBUG_ERR, (__location__
				  " takeover run already in progress\n"));
		ok = false;
		goto done;
	}

	if (!ctdb_op_begin(rec->takeover_run)) {
		ok = false;
		goto done;
	}

	/* Disable IP checks (takeover runs, really) on other nodes
	 * while doing this takeover run.  This will stop those other
	 * nodes from triggering takeover runs when they think they
	 * should be hosting an IP but it isn't yet on an interface.
	 * Don't wait for replies since a failure here might cause
	 * some noise in the logs but will not actually cause a
	 * problem.
	 */
	dtr.srvid = 0; /* No reply */
	dtr.pnn = -1;

	data.dptr  = (uint8_t*)&dtr;
	data.dsize = sizeof(dtr);

	nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);

	/* Disable for 60 seconds.  This can be a tunable later if
	 * necessary.
	 */
	dtr.data = 60;
	for (i = 0; i < talloc_array_length(nodes); i++) {
		if (ctdb_client_send_message(rec->ctdb, nodes[i],
					     CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
					     data) != 0) {
			DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
		}
	}

	ret = ctdb_takeover_run(rec->ctdb, nodemap,
				rec->force_rebalance_nodes,
				takeover_fail_callback,
				banning_credits_on_fail ? rec : NULL);

	/* Reenable takeover runs and IP checks on other nodes */
	dtr.data = 0;
	for (i = 0; i < talloc_array_length(nodes); i++) {
		if (ctdb_client_send_message(rec->ctdb, nodes[i],
					     CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
					     data) != 0) {
			DEBUG(DEBUG_INFO,("Failed to reenable takeover runs\n"));
		}
	}

	if (ret != 0) {
		DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
		ok = false;
		goto done;
	}

	ok = true;
	/* Takeover run was successful so clear force rebalance targets */
	if (rebalance_nodes == rec->force_rebalance_nodes) {
		TALLOC_FREE(rec->force_rebalance_nodes);
	} else {
		DEBUG(DEBUG_WARNING,
		      ("Rebalance target nodes changed during takeover run - not clearing\n"));
	}
done:
	rec->need_takeover_run = !ok;
	talloc_free(nodes);
	ctdb_op_end(rec->takeover_run);

	DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
	return ok;
}
/*
  we are the recmaster, and recovery is needed - start a recovery run
 */
static int do_recovery(struct ctdb_recoverd *rec,
		       TALLOC_CTX *mem_ctx, uint32_t pnn,
		       struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
{
	struct ctdb_context *ctdb = rec->ctdb;
	int i, j, ret;
	uint32_t generation;
	struct ctdb_dbid_map *dbmap;
	TDB_DATA data;
	uint32_t *nodes;
	struct timeval start_time;
	uint32_t culprit = (uint32_t)-1;
	bool self_ban;

	DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));

	/* if recovery fails, force it again */
	rec->need_recovery = true;

	if (!ctdb_op_begin(rec->recovery)) {
		return -1;
	}

	if (rec->election_timeout) {
		/* an election is in progress */
		DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
		goto fail;
	}

	ban_misbehaving_nodes(rec, &self_ban);
	if (self_ban) {
		DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
		goto fail;
	}

	if (ctdb->recovery_lock_file != NULL) {
		if (ctdb_recovery_have_lock(ctdb)) {
			DEBUG(DEBUG_NOTICE, ("Already holding recovery lock\n"));
		} else {
			start_time = timeval_current();
			DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock (%s)\n",
					     ctdb->recovery_lock_file));
			if (!ctdb_recovery_lock(ctdb)) {
				if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
					/* If ctdb is trying first recovery, it's
					 * possible that current node does not know
					 * yet who the recmaster is.
					 */
					DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
							  " - retrying recovery\n"));
					goto fail;
				}

				DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
						 "and banning ourselves for %u seconds\n",
						 ctdb->tunable.recovery_ban_period));
				ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
				goto fail;
			}
			ctdb_ctrl_report_recd_lock_latency(ctdb,
							   CONTROL_TIMEOUT(),
							   timeval_elapsed(&start_time));
			DEBUG(DEBUG_NOTICE,
			      ("Recovery lock taken successfully by recovery daemon\n"));
		}
	}
	DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));

	/* get a list of all databases */
	ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
		goto fail;
	}

	/* we do the db creation before we set the recovery mode, so the freeze happens
	   on all databases we will be dealing with. */

	/* verify that we have all the databases any other node has */
	ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
		goto fail;
	}

	/* verify that all other nodes have all our databases */
	ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
		goto fail;
	}
	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));

	/* update the database priority for all remote databases */
	ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
	}
	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));


	/* update all other nodes to use the same setting for reclock files
	   as the local recovery master.
	*/
	sync_recovery_lock_file_across_cluster(rec);
	/* set recovery mode to active on all nodes */
	ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
		goto fail;
	}

	/* execute the "startrecovery" event script on all nodes */
	ret = run_startrecovery_eventscript(rec, nodemap);
	if (ret!=0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
		goto fail;
	}

	/*
	  update all nodes to have the same flags that we have
	 */
	for (i=0;i<nodemap->num;i++) {
		if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
			continue;
		}

		ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
		if (ret != 0) {
			if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
				DEBUG(DEBUG_WARNING, (__location__ " Unable to update flags on inactive node %d\n", i));
			} else {
				DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
				goto fail;
			}
		}
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
2012 /* pick a new generation number */
2013 generation = new_generation();
2015 /* change the vnnmap on this node to use the new generation
2016 number but not on any other nodes.
2017 this guarantees that if we abort the recovery prematurely
2018 for some reason (a node stops responding?)
2019 that we can just return immediately and we will reenter
2020 recovery shortly again.
2021 I.e. we deliberately leave the cluster with an inconsistent
2022 generation id to allow us to abort recovery at any stage and
2023 just restart it from scratch.
2025 vnnmap->generation = generation;
2026 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
2027 if (ret != 0) {
2028 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
2029 goto fail;
2032 data.dptr = (void *)&generation;
2033 data.dsize = sizeof(uint32_t);
2035 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2036 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
2037 nodes, 0,
2038 CONTROL_TIMEOUT(), false, data,
2039 NULL,
2040 transaction_start_fail_callback,
2041 rec) != 0) {
2042 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
2043 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
2044 nodes, 0,
2045 CONTROL_TIMEOUT(), false, tdb_null,
2046 NULL,
2047 NULL,
2048 NULL) != 0) {
2049 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
2051 goto fail;
2054 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
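/* Note on ordering: the per-database recovery below is bracketed by
 * the cluster-wide TRANSACTION_START broadcast above and the
 * TRANSACTION_COMMIT broadcast after the loop, so the rebuilt
 * database contents and the new generation number become visible on
 * all nodes together rather than one database at a time.
 */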
2056 for (i=0;i<dbmap->num;i++) {
2057 ret = recover_database(rec, mem_ctx,
2058 dbmap->dbs[i].dbid,
2059 dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
2060 pnn, nodemap, generation);
2061 if (ret != 0) {
2062 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
2063 goto fail;
2067 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
2069 /* commit all the changes */
2070 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
2071 nodes, 0,
2072 CONTROL_TIMEOUT(), false, data,
2073 NULL, NULL,
2074 NULL) != 0) {
2075 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
2076 goto fail;
2079 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
2082 /* update the capabilities for all nodes */
2083 ret = update_capabilities(rec, nodemap);
2084 if (ret!=0) {
2085 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2086 goto fail;
2089 /* build a new vnn map with all the currently active and
2090 unbanned nodes */
2091 generation = new_generation();
2092 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
2093 CTDB_NO_MEMORY(ctdb, vnnmap);
2094 vnnmap->generation = generation;
2095 vnnmap->size = 0;
2096 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
2097 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2098 for (i=j=0;i<nodemap->num;i++) {
2099 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2100 continue;
2102 if (!ctdb_node_has_capabilities(rec->caps,
2103 ctdb->nodes[i]->pnn,
2104 CTDB_CAP_LMASTER)) {
2105 /* this node cannot be an lmaster */
2106 DEBUG(DEBUG_DEBUG, ("Node %d can't be an lmaster, skipping it\n", i));
2107 continue;
2110 vnnmap->size++;
2111 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
2112 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2113 vnnmap->map[j++] = nodemap->nodes[i].pnn;
2116 if (vnnmap->size == 0) {
2117 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
2118 vnnmap->size++;
2119 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
2120 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2121 vnnmap->map[0] = pnn;
2124 /* update to the new vnnmap on all nodes */
2125 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
2126 if (ret != 0) {
2127 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
2128 goto fail;
2131 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
2133 /* update recmaster to point to us for all nodes */
2134 ret = set_recovery_master(ctdb, nodemap, pnn);
2135 if (ret!=0) {
2136 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
2137 goto fail;
2140 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
2142 /* disable recovery mode */
2143 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
2144 if (ret != 0) {
2145 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
2146 goto fail;
2149 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
2151 /* Fetch known/available public IPs from each active node */
2152 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
2153 if (ret != 0) {
2154 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2155 culprit));
2156 rec->need_takeover_run = true;
2157 goto fail;
2160 do_takeover_run(rec, nodemap, false);
2162 /* execute the "recovered" event script on all nodes */
2163 ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
2164 if (ret!=0) {
2165 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
2166 goto fail;
2169 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
2171 /* send a message to all clients telling them that the cluster
2172 has been reconfigured */
2173 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2174 CTDB_SRVID_RECONFIGURE, tdb_null);
2175 if (ret != 0) {
2176 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
2177 goto fail;
2180 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
2182 rec->need_recovery = false;
2183 ctdb_op_end(rec->recovery);
2185 /* we managed to complete a full recovery, make sure to forgive
2186 any past sins by the nodes that could now participate in the
2187 recovery.
2189 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
2190 for (i=0;i<nodemap->num;i++) {
2191 struct ctdb_banning_state *ban_state;
2193 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2194 continue;
2197 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
2198 if (ban_state == NULL) {
2199 continue;
2202 ban_state->count = 0;
2205 /* We just finished a recovery successfully.
2206 We now wait for rerecovery_timeout before we allow
2207 another recovery to take place.
2209 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
2210 ctdb_op_disable(rec->recovery, ctdb->ev,
2211 ctdb->tunable.rerecovery_timeout);
2212 return 0;
2214 fail:
2215 ctdb_op_end(rec->recovery);
2216 return -1;
2221 elections are won by first checking the number of connected nodes, then
2222 the priority time, then the pnn
2224 struct election_message {
2225 uint32_t num_connected;
2226 struct timeval priority_time;
2227 uint32_t pnn;
2228 uint32_t node_flags;
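/* A worked example of the ordering described above, with values that
 * are illustrative only: if node A has num_connected=4,
 * priority_time=10:00:05, pnn=3 and node B has num_connected=4,
 * priority_time=10:00:01, pnn=1, the connected counts tie and node B
 * wins on priority_time, because the longest-running node has the
 * earliest priority_time. The pnn is only compared as a final
 * tie-breaker (higher pnn wins) when both other fields are equal.
 * See ctdb_election_win() below for the implementation.
 */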
2232 form this node's election data
2234 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
2236 int ret, i;
2237 struct ctdb_node_map *nodemap;
2238 struct ctdb_context *ctdb = rec->ctdb;
2240 ZERO_STRUCTP(em);
2242 em->pnn = rec->ctdb->pnn;
2243 em->priority_time = rec->priority_time;
2245 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
2246 if (ret != 0) {
2247 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
2248 return;
2251 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
2252 em->node_flags = rec->node_flags;
2254 for (i=0;i<nodemap->num;i++) {
2255 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2256 em->num_connected++;
2260 /* we shouldn't try to win this election if we can't be a recmaster */
2261 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2262 em->num_connected = 0;
2263 em->priority_time = timeval_current();
2266 talloc_free(nodemap);
2270 see if the given election data wins
2272 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
2274 struct election_message myem;
2275 int cmp = 0;
2277 ctdb_election_data(rec, &myem);
2279 /* we can't win if we don't have the recmaster capability */
2280 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2281 return false;
2284 /* we can't win if we are banned */
2285 if (rec->node_flags & NODE_FLAGS_BANNED) {
2286 return false;
2289 /* we can't win if we are stopped */
2290 if (rec->node_flags & NODE_FLAGS_STOPPED) {
2291 return false;
2294 /* we will automatically win if the other node is banned */
2295 if (em->node_flags & NODE_FLAGS_BANNED) {
2296 return true;
2299 /* we will automatically win if the other node is stopped */
2300 if (em->node_flags & NODE_FLAGS_STOPPED) {
2301 return true;
2304 /* try to use the most connected node */
2305 if (cmp == 0) {
2306 cmp = (int)myem.num_connected - (int)em->num_connected;
2309 /* then the longest running node */
2310 if (cmp == 0) {
2311 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
2314 if (cmp == 0) {
2315 cmp = (int)myem.pnn - (int)em->pnn;
2318 return cmp > 0;
2322 send out an election request
2324 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
2326 int ret;
2327 TDB_DATA election_data;
2328 struct election_message emsg;
2329 uint64_t srvid;
2330 struct ctdb_context *ctdb = rec->ctdb;
2332 srvid = CTDB_SRVID_RECOVERY;
2334 ctdb_election_data(rec, &emsg);
2336 election_data.dsize = sizeof(struct election_message);
2337 election_data.dptr = (unsigned char *)&emsg;
2340 /* first we assume we will win the election and set the
2341 recovery master to be ourselves on the current node
2343 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
2344 if (ret != 0) {
2345 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster on local node\n"));
2346 return -1;
2350 /* send an election message to all active nodes */
2351 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
2352 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
2356 this function will unban all nodes in the cluster
2358 static void unban_all_nodes(struct ctdb_context *ctdb)
2360 int ret, i;
2361 struct ctdb_node_map *nodemap;
2362 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2364 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2365 if (ret != 0) {
2366 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
2367 return;
2370 for (i=0;i<nodemap->num;i++) {
2371 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
2372 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
2373 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(),
2374 nodemap->nodes[i].pnn, 0,
2375 NODE_FLAGS_BANNED);
2376 if (ret != 0) {
2377 DEBUG(DEBUG_ERR, (__location__ " failed to reset ban state\n"));
2382 talloc_free(tmp_ctx);
2387 we think we are winning the election - send a broadcast election request
2389 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
2391 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2392 int ret;
2394 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
2395 if (ret != 0) {
2396 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
2399 talloc_free(rec->send_election_te);
2400 rec->send_election_te = NULL;
2404 handler for memory dumps
2406 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
2407 TDB_DATA data, void *private_data)
2409 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2410 TDB_DATA *dump;
2411 int ret;
2412 struct srvid_request *rd;
2414 if (data.dsize != sizeof(struct srvid_request)) {
2415 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2416 talloc_free(tmp_ctx);
2417 return;
2419 rd = (struct srvid_request *)data.dptr;
2421 dump = talloc_zero(tmp_ctx, TDB_DATA);
2422 if (dump == NULL) {
2423 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
2424 talloc_free(tmp_ctx);
2425 return;
2427 ret = ctdb_dump_memory(ctdb, dump);
2428 if (ret != 0) {
2429 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
2430 talloc_free(tmp_ctx);
2431 return;
2434 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
2436 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
2437 if (ret != 0) {
2438 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
2439 talloc_free(tmp_ctx);
2440 return;
2443 talloc_free(tmp_ctx);
2447 handler for reload_nodes
2449 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
2450 TDB_DATA data, void *private_data)
2452 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2454 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
2456 ctdb_load_nodes_file(rec->ctdb);
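/*
  timer callback for a deferred rebalance - if any nodes are still
  queued in rec->force_rebalance_nodes when this fires, do a
  takeover run
 */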
2460 static void ctdb_rebalance_timeout(struct event_context *ev,
2461 struct timed_event *te,
2462 struct timeval t, void *p)
2464 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2466 if (rec->force_rebalance_nodes == NULL) {
2467 DEBUG(DEBUG_ERR,
2468 ("Rebalance timeout occurred - no nodes to rebalance\n"));
2469 return;
2472 DEBUG(DEBUG_NOTICE,
2473 ("Rebalance timeout occurred - do takeover run\n"));
2474 do_takeover_run(rec, rec->nodemap, false);
2478 static void recd_node_rebalance_handler(struct ctdb_context *ctdb,
2479 uint64_t srvid,
2480 TDB_DATA data, void *private_data)
2482 uint32_t pnn;
2483 uint32_t *t;
2484 int len;
2485 uint32_t deferred_rebalance;
2486 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2488 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
2489 return;
2492 if (data.dsize != sizeof(uint32_t)) {
2493 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
2494 return;
2497 pnn = *(uint32_t *)&data.dptr[0];
2499 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
2501 /* Copy any existing list of nodes rather than reallocating it
2502 * in place. The deferred rebalance timer below is allocated off
2503 * this array, so freeing the old array is what cancels any
2504 * pending timeout; a plain realloc would not guarantee that.
2506 len = (rec->force_rebalance_nodes != NULL) ?
2507 talloc_array_length(rec->force_rebalance_nodes) :
2508 0;
2510 /* This allows duplicates to be added but they don't cause
2511 * harm. A call to add a duplicate PNN arguably means that
2512 * the timeout should be reset, so this is the simplest
2513 * solution.
2515 t = talloc_zero_array(rec, uint32_t, len+1);
2516 CTDB_NO_MEMORY_VOID(ctdb, t);
2517 if (len > 0) {
2518 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
2520 t[len] = pnn;
2522 talloc_free(rec->force_rebalance_nodes);
2524 rec->force_rebalance_nodes = t;
2526 /* If configured, setup a deferred takeover run to make sure
2527 * that certain nodes get IPs rebalanced to them. This will
2528 * be cancelled if a successful takeover run happens before
2529 * the timeout. Assign tunable value to variable for
2530 * readability.
2532 deferred_rebalance = ctdb->tunable.deferred_rebalance_on_node_add;
2533 if (deferred_rebalance != 0) {
2534 event_add_timed(ctdb->ev, rec->force_rebalance_nodes,
2535 timeval_current_ofs(deferred_rebalance, 0),
2536 ctdb_rebalance_timeout, rec);
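/*
  record a public ip assignment reported by another daemon; only the
  recovery master maintains the ip assignment tree, so any node that
  is not the recmaster ignores the message
 */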
2542 static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid,
2543 TDB_DATA data, void *private_data)
2545 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2546 struct ctdb_public_ip *ip;
2548 if (rec->recmaster != rec->ctdb->pnn) {
2549 DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
2550 return;
2553 if (data.dsize != sizeof(struct ctdb_public_ip)) {
2554 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
2555 return;
2558 ip = (struct ctdb_public_ip *)data.dptr;
2560 update_ip_assignment_tree(rec->ctdb, ip);
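/*
  common helper for the disable_takeover_runs and disable_recoveries
  handlers below: validate the request, disable the given operation
  for the requested timeout and reply with our pnn on success or an
  error value on failure
 */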
2563 static void srvid_disable_and_reply(struct ctdb_context *ctdb,
2564 TDB_DATA data,
2565 struct ctdb_op_state *op_state)
2567 struct srvid_request_data *r;
2568 uint32_t timeout;
2569 TDB_DATA result;
2570 int32_t ret = 0;
2572 /* Validate input data */
2573 if (data.dsize != sizeof(struct srvid_request_data)) {
2574 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data: %lu, "
2575 "expecting %lu\n", (long unsigned)data.dsize,
2576 (long unsigned)sizeof(struct srvid_request_data)));
2577 return;
2579 if (data.dptr == NULL) {
2580 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2581 return;
2584 r = (struct srvid_request_data *)data.dptr;
2585 timeout = r->data;
2587 ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
2588 if (ret != 0) {
2589 goto done;
2592 /* Returning our PNN tells the caller that we succeeded */
2593 ret = ctdb_get_pnn(ctdb);
2594 done:
2595 result.dsize = sizeof(int32_t);
2596 result.dptr = (uint8_t *)&ret;
2597 srvid_request_reply(ctdb, (struct srvid_request *)r, result);
2600 static void disable_takeover_runs_handler(struct ctdb_context *ctdb,
2601 uint64_t srvid, TDB_DATA data,
2602 void *private_data)
2604 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2605 struct ctdb_recoverd);
2607 srvid_disable_and_reply(ctdb, data, rec->takeover_run);
2610 /* Backward compatibility for this SRVID */
2611 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
2612 TDB_DATA data, void *private_data)
2614 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2615 struct ctdb_recoverd);
2616 uint32_t timeout;
2618 if (data.dsize != sizeof(uint32_t)) {
2619 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data: %lu "
2620 "expecting %lu\n", (long unsigned)data.dsize,
2621 (long unsigned)sizeof(uint32_t)));
2622 return;
2624 if (data.dptr == NULL) {
2625 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2626 return;
2629 timeout = *((uint32_t *)data.dptr);
2631 ctdb_op_disable(rec->takeover_run, ctdb->ev, timeout);
2634 static void disable_recoveries_handler(struct ctdb_context *ctdb,
2635 uint64_t srvid, TDB_DATA data,
2636 void *private_data)
2638 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2639 struct ctdb_recoverd);
2641 srvid_disable_and_reply(ctdb, data, rec->recovery);
2645 handler for ip reallocate, just add it to the list of requests and
2646 handle this later in the monitor_cluster loop so we do not recurse
2647 with other requests to takeover_run()
2649 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
2650 TDB_DATA data, void *private_data)
2652 struct srvid_request *request;
2653 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2654 struct ctdb_recoverd);
2656 if (data.dsize != sizeof(struct srvid_request)) {
2657 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2658 return;
2661 request = (struct srvid_request *)data.dptr;
2663 srvid_request_add(ctdb, &rec->reallocate_requests, request);
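/*
  called on the recovery master from the main monitoring loop:
  refresh the public ip lists from all connected nodes, do a single
  takeover run and then answer every queued reallocate request,
  replying with our pnn on success or an error value otherwise
 */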
2666 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
2667 struct ctdb_recoverd *rec)
2669 TDB_DATA result;
2670 int32_t ret;
2671 uint32_t culprit;
2672 struct srvid_requests *current;
2674 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
2676 /* Only process requests that are currently pending. More
2677 * might come in while the takeover run is in progress and
2678 * they will need to be processed later since they might
2679 * be in response to flag changes.
2681 current = rec->reallocate_requests;
2682 rec->reallocate_requests = NULL;
2684 /* update the list of public ips that a node can handle for
2685 all connected nodes
2687 ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
2688 if (ret != 0) {
2689 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2690 culprit));
2691 rec->need_takeover_run = true;
2693 if (ret == 0) {
2694 if (do_takeover_run(rec, rec->nodemap, false)) {
2695 ret = ctdb_get_pnn(ctdb);
2696 } else {
2697 ret = -1;
2701 result.dsize = sizeof(int32_t);
2702 result.dptr = (uint8_t *)&ret;
2704 srvid_requests_reply(ctdb, &current, result);
2709 handler for recovery master elections
2711 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
2712 TDB_DATA data, void *private_data)
2714 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2715 int ret;
2716 struct election_message *em = (struct election_message *)data.dptr;
2718 /* Ignore election packets from ourself */
2719 if (ctdb->pnn == em->pnn) {
2720 return;
2723 /* we got an election packet - update the timeout for the election */
2724 talloc_free(rec->election_timeout);
2725 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2726 fast_start ?
2727 timeval_current_ofs(0, 500000) :
2728 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2729 ctdb_election_timeout, rec);
2731 /* someone called an election. check their election data
2732 and if we disagree and we would rather be the elected node,
2733 send a new election message to all other nodes
2735 if (ctdb_election_win(rec, em)) {
2736 if (!rec->send_election_te) {
2737 rec->send_election_te = event_add_timed(ctdb->ev, rec,
2738 timeval_current_ofs(0, 500000),
2739 election_send_request, rec);
2741 /*unban_all_nodes(ctdb);*/
2742 return;
2745 /* we didn't win */
2746 TALLOC_FREE(rec->send_election_te);
2748 if (ctdb->recovery_lock_file != NULL) {
2749 /* Release the recovery lock file */
2750 if (ctdb_recovery_have_lock(ctdb)) {
2751 ctdb_recovery_unlock(ctdb);
2752 unban_all_nodes(ctdb);
2756 /* ok, let that guy become recmaster then */
2757 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
2758 if (ret != 0) {
2759 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster on local node\n"));
2760 return;
2763 return;
2768 force the start of the election process
2770 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2771 struct ctdb_node_map *nodemap)
2773 int ret;
2774 struct ctdb_context *ctdb = rec->ctdb;
2776 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2778 /* set all nodes to recovery mode to stop all internode traffic */
2779 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
2780 if (ret != 0) {
2781 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
2782 return;
2785 talloc_free(rec->election_timeout);
2786 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2787 fast_start ?
2788 timeval_current_ofs(0, 500000) :
2789 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2790 ctdb_election_timeout, rec);
2792 ret = send_election_request(rec, pnn);
2793 if (ret!=0) {
2794 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election\n"));
2795 return;
2798 /* wait for a few seconds to collect all responses */
2799 ctdb_wait_election(rec);
2805 handler for when a node changes its flags
2807 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
2808 TDB_DATA data, void *private_data)
2810 int ret;
2811 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2812 struct ctdb_node_map *nodemap=NULL;
2813 TALLOC_CTX *tmp_ctx;
2814 int i;
2815 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2816 int disabled_flag_changed;
2818 if (data.dsize != sizeof(*c)) {
2819 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2820 return;
2823 tmp_ctx = talloc_new(ctdb);
2824 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2826 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2827 if (ret != 0) {
2828 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2829 talloc_free(tmp_ctx);
2830 return;
2834 for (i=0;i<nodemap->num;i++) {
2835 if (nodemap->nodes[i].pnn == c->pnn) break;
2838 if (i == nodemap->num) {
2839 DEBUG(DEBUG_CRIT,(__location__ " Flag change for non-existent node %u\n", c->pnn));
2840 talloc_free(tmp_ctx);
2841 return;
2844 if (c->old_flags != c->new_flags) {
2845 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
2848 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2850 nodemap->nodes[i].flags = c->new_flags;
2852 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2853 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2855 if (ret == 0) {
2856 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2857 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2860 if (ret == 0 &&
2861 ctdb->recovery_master == ctdb->pnn &&
2862 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2863 /* Only do the takeover run if the permanently-disabled or
2864 unhealthy flags changed, since these cause an ip failover
2865 but not a recovery.
2866 If the node became disconnected or banned this will also
2867 lead to an ip address failover, but that is handled
2868 during recovery
2870 if (disabled_flag_changed) {
2871 rec->need_takeover_run = true;
2875 talloc_free(tmp_ctx);
2879 handler for when we need to push out flag changes to all other nodes
2881 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2882 TDB_DATA data, void *private_data)
2884 int ret;
2885 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2886 struct ctdb_node_map *nodemap=NULL;
2887 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2888 uint32_t recmaster;
2889 uint32_t *nodes;
2891 /* find the recovery master */
2892 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
2893 if (ret != 0) {
2894 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
2895 talloc_free(tmp_ctx);
2896 return;
2899 /* read the node flags from the recmaster */
2900 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
2901 if (ret != 0) {
2902 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recmaster node %u\n", recmaster));
2903 talloc_free(tmp_ctx);
2904 return;
2906 if (c->pnn >= nodemap->num) {
2907 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2908 talloc_free(tmp_ctx);
2909 return;
2912 /* send the flags update to all connected nodes */
2913 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2915 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2916 nodes, 0, CONTROL_TIMEOUT(),
2917 false, data,
2918 NULL, NULL,
2919 NULL) != 0) {
2920 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2922 talloc_free(tmp_ctx);
2923 return;
2926 talloc_free(tmp_ctx);
2930 struct verify_recmode_normal_data {
2931 uint32_t count;
2932 enum monitor_result status;
2935 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2937 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2940 /* one more node has responded with recmode data */
2941 rmdata->count--;
2943 /* if we failed to get the recmode, then return an error and let
2944 the main loop try again.
2946 if (state->state != CTDB_CONTROL_DONE) {
2947 if (rmdata->status == MONITOR_OK) {
2948 rmdata->status = MONITOR_FAILED;
2950 return;
2953 /* if we got a response, then the recmode will be stored in the
2954 status field
2956 if (state->status != CTDB_RECOVERY_NORMAL) {
2957 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2958 rmdata->status = MONITOR_RECOVERY_NEEDED;
2961 return;
2965 /* verify that all nodes are in normal recovery mode */
2966 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2968 struct verify_recmode_normal_data *rmdata;
2969 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2970 struct ctdb_client_control_state *state;
2971 enum monitor_result status;
2972 int j;
2974 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2975 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2976 rmdata->count = 0;
2977 rmdata->status = MONITOR_OK;
2979 /* loop over all active nodes and send an async getrecmode call to
2980 them */
2981 for (j=0; j<nodemap->num; j++) {
2982 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2983 continue;
2985 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2986 CONTROL_TIMEOUT(),
2987 nodemap->nodes[j].pnn);
2988 if (state == NULL) {
2989 /* we failed to send the control, treat this as
2990 an error and try again next iteration
2992 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2993 talloc_free(mem_ctx);
2994 return MONITOR_FAILED;
2997 /* set up the callback functions */
2998 state->async.fn = verify_recmode_normal_callback;
2999 state->async.private_data = rmdata;
3001 /* one more control to wait for to complete */
3002 rmdata->count++;
3006 /* now wait for up to the maximum number of seconds allowed
3007 or until all nodes we expect a response from have replied
3009 while (rmdata->count > 0) {
3010 event_loop_once(ctdb->ev);
3013 status = rmdata->status;
3014 talloc_free(mem_ctx);
3015 return status;
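/* verify_recmode() above and verify_recmaster() below share the same
 * fan-out pattern: send one async control to every active node, track
 * the number of outstanding replies in rmdata->count and pump the
 * event loop until every reply (or failure) has been accounted for.
 */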
3019 struct verify_recmaster_data {
3020 struct ctdb_recoverd *rec;
3021 uint32_t count;
3022 uint32_t pnn;
3023 enum monitor_result status;
3026 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
3028 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
3031 /* one more node has responded with recmaster data */
3032 rmdata->count--;
3034 /* if we failed to get the recmaster, then return an error and let
3035 the main loop try again.
3037 if (state->state != CTDB_CONTROL_DONE) {
3038 if (rmdata->status == MONITOR_OK) {
3039 rmdata->status = MONITOR_FAILED;
3041 return;
3044 /* if we got a response, then the recmaster will be stored in the
3045 status field
3047 if (state->status != rmdata->pnn) {
3048 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
3049 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
3050 rmdata->status = MONITOR_ELECTION_NEEDED;
3053 return;
3057 /* verify that all nodes agree that we are the recmaster */
3058 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
3060 struct ctdb_context *ctdb = rec->ctdb;
3061 struct verify_recmaster_data *rmdata;
3062 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3063 struct ctdb_client_control_state *state;
3064 enum monitor_result status;
3065 int j;
3067 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
3068 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
3069 rmdata->rec = rec;
3070 rmdata->count = 0;
3071 rmdata->pnn = pnn;
3072 rmdata->status = MONITOR_OK;
3074 /* loop over all active nodes and send an async getrecmaster call to
3075 them */
3076 for (j=0; j<nodemap->num; j++) {
3077 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3078 continue;
3080 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
3081 CONTROL_TIMEOUT(),
3082 nodemap->nodes[j].pnn);
3083 if (state == NULL) {
3084 /* we failed to send the control, treat this as
3085 an error and try again next iteration
3087 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
3088 talloc_free(mem_ctx);
3089 return MONITOR_FAILED;
3092 /* set up the callback functions */
3093 state->async.fn = verify_recmaster_callback;
3094 state->async.private_data = rmdata;
3096 /* one more control to wait for to complete */
3097 rmdata->count++;
3101 /* now wait for up to the maximum number of seconds allowed
3102 or until all nodes we expect a response from have replied
3104 while (rmdata->count > 0) {
3105 event_loop_once(ctdb->ev);
3108 status = rmdata->status;
3109 talloc_free(mem_ctx);
3110 return status;
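/*
  compare the local node's current interface list against the copy
  cached in rec->ifaces; a change in interface count, name or link
  state - or a failure to read the list at all - is reported as a
  change so that the caller can force a takeover run
 */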
3113 static bool interfaces_have_changed(struct ctdb_context *ctdb,
3114 struct ctdb_recoverd *rec)
3116 struct ctdb_control_get_ifaces *ifaces = NULL;
3117 TALLOC_CTX *mem_ctx;
3118 bool ret = false;
3120 mem_ctx = talloc_new(NULL);
3122 /* Read the interfaces from the local node */
3123 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
3124 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
3125 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
3126 /* We could return an error. However, this will be
3127 * rare so we'll decide that the interfaces have
3128 * actually changed, just in case.
3130 talloc_free(mem_ctx);
3131 return true;
3134 if (!rec->ifaces) {
3135 /* We haven't been here before so things have changed */
3136 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
3137 ret = true;
3138 } else if (rec->ifaces->num != ifaces->num) {
3139 /* Number of interfaces has changed */
3140 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
3141 rec->ifaces->num, ifaces->num));
3142 ret = true;
3143 } else {
3144 /* See if interface names or link states have changed */
3145 int i;
3146 for (i = 0; i < rec->ifaces->num; i++) {
3147 struct ctdb_control_iface_info * iface = &rec->ifaces->ifaces[i];
3148 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
3149 DEBUG(DEBUG_NOTICE,
3150 ("Interface in slot %d changed: %s => %s\n",
3151 i, iface->name, ifaces->ifaces[i].name));
3152 ret = true;
3153 break;
3155 if (iface->link_state != ifaces->ifaces[i].link_state) {
3156 DEBUG(DEBUG_NOTICE,
3157 ("Interface %s changed state: %d => %d\n",
3158 iface->name, iface->link_state,
3159 ifaces->ifaces[i].link_state));
3160 ret = true;
3161 break;
3166 talloc_free(rec->ifaces);
3167 rec->ifaces = talloc_steal(rec, ifaces);
3169 talloc_free(mem_ctx);
3170 return ret;
3173 /* called to check that the local allocation of public ip addresses is ok.
3175 static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
3177 TALLOC_CTX *mem_ctx = talloc_new(NULL);
3178 struct ctdb_uptime *uptime1 = NULL;
3179 struct ctdb_uptime *uptime2 = NULL;
3180 int ret, j;
3181 bool need_takeover_run = false;
3183 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
3184 CTDB_CURRENT_NODE, &uptime1);
3185 if (ret != 0) {
3186 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
3187 talloc_free(mem_ctx);
3188 return -1;
3191 if (interfaces_have_changed(ctdb, rec)) {
3192 DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
3193 "local node %u - force takeover run\n",
3194 pnn));
3195 need_takeover_run = true;
3198 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
3199 CTDB_CURRENT_NODE, &uptime2);
3200 if (ret != 0) {
3201 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
3202 talloc_free(mem_ctx);
3203 return -1;
3206 /* skip the check if the startrecovery time has changed */
3207 if (timeval_compare(&uptime1->last_recovery_started,
3208 &uptime2->last_recovery_started) != 0) {
3209 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3210 talloc_free(mem_ctx);
3211 return 0;
3214 /* skip the check if the endrecovery time has changed */
3215 if (timeval_compare(&uptime1->last_recovery_finished,
3216 &uptime2->last_recovery_finished) != 0) {
3217 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3218 talloc_free(mem_ctx);
3219 return 0;
3222 /* skip the check if we have started but not finished recovery */
3223 if (timeval_compare(&uptime1->last_recovery_finished,
3224 &uptime1->last_recovery_started) != 1) {
3225 DEBUG(DEBUG_INFO, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
3226 talloc_free(mem_ctx);
3228 return 0;
3231 /* verify that we have the ip addresses we should have
3232 and we don't have ones we shouldn't have.
3233 if we find an inconsistency we ask the recovery master
3234 to perform a takeover run, and we release any ip we are
3235 still serving that we should not be.
3236 also if an ip's pnn is -1 and we are healthy and can host the ip
3237 we request an ip reallocation.
3239 if (ctdb->tunable.disable_ip_failover == 0) {
3240 struct ctdb_all_public_ips *ips = NULL;
3242 /* read the *available* IPs from the local node */
3243 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
3244 if (ret != 0) {
3245 DEBUG(DEBUG_ERR, ("Unable to get available public IPs from local node %u\n", pnn));
3246 talloc_free(mem_ctx);
3247 return -1;
3250 for (j=0; j<ips->num; j++) {
3251 if (ips->ips[j].pnn == -1 &&
3252 nodemap->nodes[pnn].flags == 0) {
3253 DEBUG(DEBUG_CRIT,("Public IP '%s' is not assigned and we could serve it\n",
3254 ctdb_addr_to_str(&ips->ips[j].addr)));
3255 need_takeover_run = true;
3259 talloc_free(ips);
3261 /* read the *known* IPs from the local node */
3262 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
3263 if (ret != 0) {
3264 DEBUG(DEBUG_ERR, ("Unable to get known public IPs from local node %u\n", pnn));
3265 talloc_free(mem_ctx);
3266 return -1;
3269 for (j=0; j<ips->num; j++) {
3270 if (ips->ips[j].pnn == pnn) {
3271 if (ctdb->do_checkpublicip && !ctdb_sys_have_ip(&ips->ips[j].addr)) {
3272 DEBUG(DEBUG_CRIT,("Public IP '%s' is assigned to us but not on an interface\n",
3273 ctdb_addr_to_str(&ips->ips[j].addr)));
3274 need_takeover_run = true;
3276 } else {
3277 if (ctdb->do_checkpublicip &&
3278 ctdb_sys_have_ip(&ips->ips[j].addr)) {
3280 DEBUG(DEBUG_CRIT,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
3281 ctdb_addr_to_str(&ips->ips[j].addr)));
3283 if (ctdb_ctrl_release_ip(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ips->ips[j]) != 0) {
3284 DEBUG(DEBUG_ERR,("Failed to release local IP address\n"));
3291 if (need_takeover_run) {
3292 struct srvid_request rd;
3293 TDB_DATA data;
3295 DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
3297 rd.pnn = ctdb->pnn;
3298 rd.srvid = 0;
3299 data.dptr = (uint8_t *)&rd;
3300 data.dsize = sizeof(rd);
3302 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
3303 if (ret != 0) {
3304 DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
3307 talloc_free(mem_ctx);
3308 return 0;
3312 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
3314 struct ctdb_node_map **remote_nodemaps = callback_data;
3316 if (node_pnn >= ctdb->num_nodes) {
3317 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
3318 return;
3321 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
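/*
  fan out a GET_NODEMAP control to all active nodes;
  async_getnodemap_callback() above files each reply into the
  remote_nodemaps array, indexed by pnn
 */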
3325 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
3326 struct ctdb_node_map *nodemap,
3327 struct ctdb_node_map **remote_nodemaps)
3329 uint32_t *nodes;
3331 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
3332 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
3333 nodes, 0,
3334 CONTROL_TIMEOUT(), false, tdb_null,
3335 async_getnodemap_callback,
3336 NULL,
3337 remote_nodemaps) != 0) {
3338 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
3340 return -1;
3343 return 0;
3346 static int update_recovery_lock_file(struct ctdb_context *ctdb)
3348 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
3349 const char *reclockfile;
3351 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
3352 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
3353 talloc_free(tmp_ctx);
3354 return -1;
3357 if (reclockfile == NULL) {
3358 if (ctdb->recovery_lock_file != NULL) {
3359 DEBUG(DEBUG_NOTICE,("Recovery lock file disabled\n"));
3360 talloc_free(ctdb->recovery_lock_file);
3361 ctdb->recovery_lock_file = NULL;
3362 ctdb_recovery_unlock(ctdb);
3364 talloc_free(tmp_ctx);
3365 return 0;
3368 if (ctdb->recovery_lock_file == NULL) {
3369 DEBUG(DEBUG_NOTICE,
3370 ("Recovery lock file enabled (%s)\n", reclockfile));
3371 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3372 ctdb_recovery_unlock(ctdb);
3373 talloc_free(tmp_ctx);
3374 return 0;
3378 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
3379 talloc_free(tmp_ctx);
3380 return 0;
3383 DEBUG(DEBUG_NOTICE,
3384 ("Recovery lock file changed (now %s)\n", reclockfile));
3385 talloc_free(ctdb->recovery_lock_file);
3386 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3387 ctdb_recovery_unlock(ctdb);
3389 talloc_free(tmp_ctx);
3390 return 0;
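/*
  one pass of the recovery daemon's monitoring logic: check that the
  main daemon is alive, refresh tunables, runstate and the recovery
  lock setting, re-read the nodemap and vnnmap, ban misbehaving nodes
  and handle elections. If we are the recovery master, also verify
  that flags, nodemaps and vnnmaps are consistent across all active
  nodes and trigger a recovery or a takeover run when they are not.
 */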
3393 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
3394 TALLOC_CTX *mem_ctx)
3396 uint32_t pnn;
3397 struct ctdb_node_map *nodemap=NULL;
3398 struct ctdb_node_map *recmaster_nodemap=NULL;
3399 struct ctdb_node_map **remote_nodemaps=NULL;
3400 struct ctdb_vnn_map *vnnmap=NULL;
3401 struct ctdb_vnn_map *remote_vnnmap=NULL;
3402 uint32_t num_lmasters;
3403 int32_t debug_level;
3404 int i, j, ret;
3405 bool self_ban;
3408 /* verify that the main daemon is still running */
3409 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
3410 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3411 exit(-1);
3414 /* ping the local daemon to tell it we are alive */
3415 ctdb_ctrl_recd_ping(ctdb);
3417 if (rec->election_timeout) {
3418 /* an election is in progress */
3419 return;
3422 /* read the debug level from the parent and update locally */
3423 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
3424 if (ret !=0) {
3425 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
3426 return;
3428 DEBUGLEVEL = debug_level;
3430 /* get relevant tunables */
3431 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
3432 if (ret != 0) {
3433 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
3434 return;
3437 /* get runstate */
3438 ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
3439 CTDB_CURRENT_NODE, &ctdb->runstate);
3440 if (ret != 0) {
3441 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
3442 return;
3445 /* get the current recovery lock file from the server */
3446 if (update_recovery_lock_file(ctdb) != 0) {
3447 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
3448 return;
3451 /* Make sure that if recovery lock verification becomes
3452 disabled, we close the file
3454 if (ctdb->recovery_lock_file == NULL) {
3455 ctdb_recovery_unlock(ctdb);
3458 pnn = ctdb_get_pnn(ctdb);
3460 /* get the vnnmap */
3461 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
3462 if (ret != 0) {
3463 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
3464 return;
3468 /* get number of nodes */
3469 if (rec->nodemap) {
3470 talloc_free(rec->nodemap);
3471 rec->nodemap = NULL;
3472 nodemap=NULL;
3474 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
3475 if (ret != 0) {
3476 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
3477 return;
3479 nodemap = rec->nodemap;
3481 /* remember our own node flags */
3482 rec->node_flags = nodemap->nodes[pnn].flags;
3484 ban_misbehaving_nodes(rec, &self_ban);
3485 if (self_ban) {
3486 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
3487 return;
3490 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
3491 also frozen and that the recmode is set to active.
3493 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
3494 /* If this node has become inactive then we want to
3495 * reduce the chances of it taking over the recovery
3496 * master role when it becomes active again. This
3497 * helps to stabilise the recovery master role so that
3498 * it stays on the most stable node.
3500 rec->priority_time = timeval_current();
3502 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3503 if (ret != 0) {
3504 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3506 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3507 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
3509 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3510 if (ret != 0) {
3511 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
3513 return;
3515 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
3516 if (ret != 0) {
3517 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node in STOPPED or BANNED state\n"));
3518 return;
3522 /* If this node is stopped or banned then it is not the recovery
3523 * master, so don't do anything. This prevents a stopped or banned
3524 * node from starting an election and sending unnecessary controls.
3526 return;
3529 /* check which node is the recovery master */
3530 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
3531 if (ret != 0) {
3532 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
3533 return;
3536 /* If we are not the recmaster then do some housekeeping */
3537 if (rec->recmaster != pnn) {
3538 /* Ignore any IP reallocate requests - only recmaster
3539 * processes them
3541 TALLOC_FREE(rec->reallocate_requests);
3542 /* Clear any nodes that should be force rebalanced in
3543 * the next takeover run. If the recovery master role
3544 * has moved then we don't want to process these some
3545 * time in the future.
3547 TALLOC_FREE(rec->force_rebalance_nodes);
3550 /* This is a special case. When the recovery daemon starts,
3551 * recmaster is set to -1. If the node was not started in the
3552 * stopped state, start an election to decide the recovery master
3554 if (rec->recmaster == (uint32_t)-1) {
3555 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
3556 force_election(rec, pnn, nodemap);
3557 return;
3560 /* update the capabilities for all nodes */
3561 ret = update_capabilities(rec, nodemap);
3562 if (ret != 0) {
3563 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
3564 return;
3568 * If the current recmaster does not have CTDB_CAP_RECMASTER,
3569 * but we have, then force an election and try to become the new
3570 * recmaster.
3572 if (!ctdb_node_has_capabilities(rec->caps,
3573 rec->recmaster,
3574 CTDB_CAP_RECMASTER) &&
3575 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
3576 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
3577 DEBUG(DEBUG_ERR, (__location__ " Current recmaster node %u does not have CAP_RECMASTER,"
3578 " but we (node %u) have - force an election\n",
3579 rec->recmaster, pnn));
3580 force_election(rec, pnn, nodemap);
3581 return;
3584 /* verify that the recmaster node is still active */
3585 for (j=0; j<nodemap->num; j++) {
3586 if (nodemap->nodes[j].pnn==rec->recmaster) {
3587 break;
3591 if (j == nodemap->num) {
3592 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
3593 force_election(rec, pnn, nodemap);
3594 return;
3597 /* if recovery master is disconnected we must elect a new recmaster */
3598 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
3599 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
3600 force_election(rec, pnn, nodemap);
3601 return;
3604 /* get nodemap from the recovery master to check if it is inactive */
3605 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3606 mem_ctx, &recmaster_nodemap);
3607 if (ret != 0) {
3608 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
3609 nodemap->nodes[j].pnn));
3610 return;
3614 if ((recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) &&
3615 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
3616 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
3618 * update our nodemap to carry the recmaster's notion of
3619 * its own flags, so that we don't keep freezing the
3620 * inactive recmaster node...
3622 nodemap->nodes[j].flags = recmaster_nodemap->nodes[j].flags;
3623 force_election(rec, pnn, nodemap);
3624 return;
3627 /* verify that we have all the ip addresses we should have and we don't
3628 * have addresses we shouldn't have.
3630 if (ctdb->tunable.disable_ip_failover == 0 &&
3631 !ctdb_op_is_disabled(rec->takeover_run)) {
3632 if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
3633 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
3638 /* if we are not the recmaster then we do not need to check
3639 if recovery is needed
3641 if (pnn != rec->recmaster) {
3642 return;
3646 /* ensure our local copies of flags are right */
3647 ret = update_local_flags(rec, nodemap);
3648 if (ret == MONITOR_ELECTION_NEEDED) {
3649 DEBUG(DEBUG_NOTICE,("update_local_flags() indicated a re-election is needed\n"));
3650 force_election(rec, pnn, nodemap);
3651 return;
3653 if (ret != MONITOR_OK) {
3654 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3655 return;
3658 if (ctdb->num_nodes != nodemap->num) {
3659 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3660 ctdb_load_nodes_file(ctdb);
3661 return;
3664 /* verify that all active nodes agree that we are the recmaster */
3665 switch (verify_recmaster(rec, nodemap, pnn)) {
3666 case MONITOR_RECOVERY_NEEDED:
3667 /* cannot happen */
3668 return;
3669 case MONITOR_ELECTION_NEEDED:
3670 force_election(rec, pnn, nodemap);
3671 return;
3672 case MONITOR_OK:
3673 break;
3674 case MONITOR_FAILED:
3675 return;
3679 if (rec->need_recovery) {
3680 /* a previous recovery didn't finish */
3681 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3682 return;
3685 /* verify that all active nodes are in normal mode
3686 and not in recovery mode
3688 switch (verify_recmode(ctdb, nodemap)) {
3689 case MONITOR_RECOVERY_NEEDED:
3690 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3691 return;
3692 case MONITOR_FAILED:
3693 return;
3694 case MONITOR_ELECTION_NEEDED:
3695 /* cannot happen */
3696 case MONITOR_OK:
3697 break;
3701 if (ctdb->recovery_lock_file != NULL) {
3702 /* We must already hold the recovery lock */
3703 if (!ctdb_recovery_have_lock(ctdb)) {
3704 DEBUG(DEBUG_ERR,("Failed recovery lock sanity check. Force a recovery\n"));
3705 ctdb_set_culprit(rec, ctdb->pnn);
3706 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3707 return;
3712 /* if there are takeovers requested, perform it and notify the waiters */
3713 if (!ctdb_op_is_disabled(rec->takeover_run) &&
3714 rec->reallocate_requests) {
3715 process_ipreallocate_requests(ctdb, rec);
3718 /* If recoveries are disabled then there is no use doing any
3719 * nodemap or flags checks. Recoveries might be disabled due
3720 * to "reloadnodes", so doing these checks might cause an
3721 * unnecessary recovery. */
3722 if (ctdb_op_is_disabled(rec->recovery)) {
3723 return;
3726 /* get the nodemap for all active remote nodes
3728 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3729 if (remote_nodemaps == NULL) {
3730 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3731 return;
3733 for(i=0; i<nodemap->num; i++) {
3734 remote_nodemaps[i] = NULL;
3736 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3737 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3738 return;
3741 /* verify that all other nodes have the same nodemap as we have
3743 for (j=0; j<nodemap->num; j++) {
3744 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3745 continue;
3748 if (remote_nodemaps[j] == NULL) {
3749 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3750 ctdb_set_culprit(rec, j);
3752 return;
3755 /* if the nodes disagree on how many nodes there are
3756 then this is a good reason to try recovery
3758 if (remote_nodemaps[j]->num != nodemap->num) {
3759 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3760 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3761 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3762 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3763 return;
3766 /* if the nodes disagree on which nodes exist and are
3767 active, then that is also a good reason to do recovery
3769 for (i=0;i<nodemap->num;i++) {
3770 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3771 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3772 nodemap->nodes[j].pnn, i,
3773 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3774 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3775 do_recovery(rec, mem_ctx, pnn, nodemap,
3776 vnnmap);
3777 return;
3783 * Update node flags obtained from each active node. This ensures we have
3784 * up-to-date information for all the nodes.
3786 for (j=0; j<nodemap->num; j++) {
3787 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3788 continue;
3790 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
3793 for (j=0; j<nodemap->num; j++) {
3794 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3795 continue;
3798 /* verify the flags are consistent
3800 for (i=0; i<nodemap->num; i++) {
3801 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3802 continue;
3805 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3806 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3807 nodemap->nodes[j].pnn,
3808 nodemap->nodes[i].pnn,
3809 remote_nodemaps[j]->nodes[i].flags,
3810 nodemap->nodes[i].flags));
3811 if (i == j) {
3812 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3813 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3814 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3815 do_recovery(rec, mem_ctx, pnn, nodemap,
3816 vnnmap);
3817 return;
3818 } else {
3819 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3820 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3821 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3822 do_recovery(rec, mem_ctx, pnn, nodemap,
3823 vnnmap);
3824 return;
3831 /* count how many active nodes with the lmaster capability there are */
3832 num_lmasters = 0;
3833 for (i=0; i<nodemap->num; i++) {
3834 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
3835 if (ctdb_node_has_capabilities(rec->caps,
3836 ctdb->nodes[i]->pnn,
3837 CTDB_CAP_LMASTER)) {
3838 num_lmasters++;
3844 /* There must be the same number of lmasters in the vnn map as
3845 * there are active nodes with the lmaster capability... or
3846 * do a recovery.
3848 if (vnnmap->size != num_lmasters) {
3849 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
3850 vnnmap->size, num_lmasters));
3851 ctdb_set_culprit(rec, ctdb->pnn);
3852 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3853 return;
3856 /* verify that all active nodes in the nodemap also exist in
3857 the vnnmap.
3859 for (j=0; j<nodemap->num; j++) {
3860 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3861 continue;
3863 if (nodemap->nodes[j].pnn == pnn) {
3864 continue;
3867 for (i=0; i<vnnmap->size; i++) {
3868 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3869 break;
3872 if (i == vnnmap->size) {
3873 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3874 nodemap->nodes[j].pnn));
3875 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3876 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3877 return;
3882 /* verify that all other nodes have the same vnnmap
3883 and are from the same generation
3885 for (j=0; j<nodemap->num; j++) {
3886 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3887 continue;
3889 if (nodemap->nodes[j].pnn == pnn) {
3890 continue;
3893 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3894 mem_ctx, &remote_vnnmap);
3895 if (ret != 0) {
3896 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3897 nodemap->nodes[j].pnn));
3898 return;
3901 /* verify the vnnmap generation is the same */
3902 if (vnnmap->generation != remote_vnnmap->generation) {
3903 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3904 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3905 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3906 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3907 return;
		/* verify the vnnmap size is the same */
		if (vnnmap->size != remote_vnnmap->size) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
				  nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* verify the vnnmap is the same */
		for (i=0;i<vnnmap->size;i++) {
			if (remote_vnnmap->map[i] != vnnmap->map[i]) {
				DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
					  nodemap->nodes[j].pnn));
				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
				do_recovery(rec, mem_ctx, pnn, nodemap,
					    vnnmap);
				return;
			}
		}
	}

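	/* The element-by-element comparison above matters because the
	 * lmaster for a record is derived positionally from the map,
	 * roughly (see ctdb_lmaster()):
	 *
	 *	lmaster_pnn = vnnmap->map[ctdb_hash(&key) % vnnmap->size];
	 *
	 * so two maps holding the same nodes in a different order would
	 * still route records to different lmasters. */
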
	/* we might need to change who has what IP assigned */
	if (rec->need_takeover_run) {
		uint32_t culprit = (uint32_t)-1;

		rec->need_takeover_run = false;

		/* update the list of public ips that a node can handle for
		   all connected nodes
		*/
		ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
					 culprit));
			rec->need_takeover_run = true;
			return;
		}

		/* execute the "startrecovery" event script on all nodes */
		ret = run_startrecovery_eventscript(rec, nodemap);
		if (ret!=0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
			ctdb_set_culprit(rec, ctdb->pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* If the takeover run fails, the offending nodes are
		 * assigned ban culprit counts and the takeover is retried.
		 * If takeover runs fail repeatedly, the node ends up
		 * banned.
		 *
		 * If rec->need_takeover_run is not set back to true on such
		 * a failure, monitoring stays disabled cluster-wide (it was
		 * disabled via the startrecovery eventscript) and will not
		 * get re-enabled.
		 */
		if (!do_takeover_run(rec, nodemap, true)) {
			return;
		}

		/* execute the "recovered" event script on all nodes */
		ret = run_recovered_eventscript(rec, nodemap, "monitor_cluster");
#if 0
		// we can't check whether the event completed successfully
		// since this script WILL fail if the node is in recovery mode
		// and if that race happens, the code here would just cause a second
		// cascading recovery.
		if (ret!=0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
			ctdb_set_culprit(rec, ctdb->pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		}
#endif
	}
}

/*
  the main monitoring loop
 */
static void monitor_cluster(struct ctdb_context *ctdb)
{
	struct ctdb_recoverd *rec;

	DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));

	rec = talloc_zero(ctdb, struct ctdb_recoverd);
	CTDB_NO_MEMORY_FATAL(ctdb, rec);

	rec->ctdb = ctdb;

	rec->takeover_run = ctdb_op_init(rec, "takeover runs");
	CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);

	rec->recovery = ctdb_op_init(rec, "recoveries");
	CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);

	rec->priority_time = timeval_current();

	/* register a message port for sending memory dumps */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);

	/* register a message port for recovery elections */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);

	/* when nodes are disabled/enabled */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);

	/* when we are asked to push out a flag change */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);

	/* register a message port for vacuum fetch */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);

	/* register a message port for reloadnodes */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);

	/* register a message port for performing a takeover run */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);

	/* register a message port for disabling the ip check for a short while */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);

	/* register a message port for updating the recovery daemon's node assignment for an ip */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);

	/* register a message port for forcing a rebalance of a node next
	   reallocation */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);

	/* Register a message port for disabling takeover runs */
	ctdb_client_set_message_handler(ctdb,
					CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
					disable_takeover_runs_handler, rec);

	/* Register a message port for disabling recoveries */
	ctdb_client_set_message_handler(ctdb,
					CTDB_SRVID_DISABLE_RECOVERIES,
					disable_recoveries_handler, rec);

	/* register a message port for detaching a database */
	ctdb_client_set_message_handler(ctdb,
					CTDB_SRVID_DETACH_DATABASE,
					detach_database_handler, rec);

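	/* All of the handlers above follow one pattern: any ctdb client can
	 * drive the recovery daemon by sending a message to the matching
	 * SRVID.  A minimal, hypothetical client-side sketch (the client
	 * context and target pnn here are illustrative only):
	 *
	 *	uint32_t target = 2;	// node to flag for rebalancing
	 *	TDB_DATA data = { .dptr = (uint8_t *)&target,
	 *			  .dsize = sizeof(target) };
	 *	ctdb_client_send_message(client, recmaster_pnn,
	 *				 CTDB_SRVID_REBALANCE_NODE, data);
	 *
	 * which is roughly what the "ctdb rebalancenode" command does. */
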
	for (;;) {
		TALLOC_CTX *mem_ctx = talloc_new(ctdb);
		struct timeval start;
		double elapsed;

		if (!mem_ctx) {
			DEBUG(DEBUG_CRIT,(__location__
					  " Failed to create temp context\n"));
			exit(-1);
		}

		start = timeval_current();
		main_loop(ctdb, rec, mem_ctx);
		talloc_free(mem_ctx);

		/* we only check for recovery once every second */
		elapsed = timeval_elapsed(&start);
		if (elapsed < ctdb->tunable.recover_interval) {
			ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
					  - elapsed);
		}
	}
}

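/* Pacing note for the loop above: with the default RecoverInterval
 * tunable of 1 second, a main_loop() pass that takes 0.2s is followed by
 * a 0.8s ctdb_wait_timeout(), so cluster state is re-examined at most
 * once per RecoverInterval rather than in a busy loop. */
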
/*
  event handler for when the main ctdbd dies
 */
static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
				 uint16_t flags, void *private_data)
{
	DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
	_exit(1);
}

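/* This handler watches the read end of the pipe created in
 * ctdb_start_recoverd() below.  The parent never writes to the pipe; the
 * fd only becomes readable (EOF) once the parent exits and the kernel
 * closes the write end, which makes the pipe a cheap, portable
 * parent-death notification. */
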
/*
  called regularly to verify that the recovery daemon is still running
 */
static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
			    struct timeval t, void *p)
{
	struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);

	if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
		DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));

		event_add_timed(ctdb->ev, ctdb, timeval_zero(),
				ctdb_restart_recd, ctdb);

		return;
	}

	event_add_timed(ctdb->ev, ctdb->recd_ctx,
			timeval_current_ofs(30, 0),
			ctdb_check_recd, ctdb);
}

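/* ctdb_kill() with signal 0 sends no signal at all: as documented for
 * kill(2), signal 0 only performs the pid existence and permission
 * checks, making it a cheap liveness probe.  Note the asymmetry in the
 * two timers above: the restart event hangs off ctdb itself so that it
 * survives teardown of recd_ctx, while the periodic re-check hangs off
 * recd_ctx and is cancelled with it. */
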
static void recd_sig_child_handler(struct event_context *ev,
	struct signal_event *se, int signum, int count,
	void *dont_care,
	void *private_data)
{
//	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
	int status;
	pid_t pid = -1;

	while (pid != 0) {
		pid = waitpid(-1, &status, WNOHANG);
		if (pid == -1) {
			if (errno != ECHILD) {
				DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno), errno));
			}
			return;
		}
		if (pid > 0) {
			DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
		}
	}
}

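/* The WNOHANG loop above reaps every exited child in a single pass.
 * This matters because POSIX does not queue SIGCHLD: several children
 * exiting close together can be folded into one signal delivery, so a
 * handler that called waitpid() only once could leave zombies behind. */
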
/*
  startup the recovery daemon as a child of the main ctdb daemon
 */
int ctdb_start_recoverd(struct ctdb_context *ctdb)
{
	int fd[2];
	struct signal_event *se;
	struct tevent_fd *fde;

	if (pipe(fd) != 0) {
		return -1;
	}

	ctdb->recoverd_pid = ctdb_fork(ctdb);
	if (ctdb->recoverd_pid == -1) {
		return -1;
	}

	if (ctdb->recoverd_pid != 0) {
		talloc_free(ctdb->recd_ctx);
		ctdb->recd_ctx = talloc_new(ctdb);
		CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);

		close(fd[0]);
		event_add_timed(ctdb->ev, ctdb->recd_ctx,
				timeval_current_ofs(30, 0),
				ctdb_check_recd, ctdb);
		return 0;
	}

	close(fd[1]);

	srandom(getpid() ^ time(NULL));

	ctdb_set_process_name("ctdb_recoverd");
	if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
		DEBUG(DEBUG_CRIT, (__location__ " ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
		exit(1);
	}

	DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));

	fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
			   ctdb_recoverd_parent, &fd[0]);
	tevent_fd_set_auto_close(fde);

	/* set up a handler to pick up sigchld */
	se = event_add_signal(ctdb->ev, ctdb,
			      SIGCHLD, 0,
			      recd_sig_child_handler,
			      ctdb);
	if (se == NULL) {
		DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
		exit(1);
	}

	monitor_cluster(ctdb);

	DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
	return -1;
}

/*
  shutdown the recovery daemon
 */
void ctdb_stop_recoverd(struct ctdb_context *ctdb)
{
	if (ctdb->recoverd_pid == 0) {
		return;
	}

	DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
	ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);

	TALLOC_FREE(ctdb->recd_ctx);
	TALLOC_FREE(ctdb->recd_ping_count);
}

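/* TALLOC_FREE(), unlike a bare talloc_free(), is a no-op on a NULL
 * pointer and resets the pointer to NULL after freeing.  That leaves
 * recd_ctx and recd_ping_count in a state from which a subsequent
 * ctdb_start_recoverd() can safely reinitialise them. */
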
static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te,
			      struct timeval t, void *private_data)
{
	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);

	DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
	ctdb_stop_recoverd(ctdb);
	ctdb_start_recoverd(ctdb);
}