ctdb-recoverd: Remove redundant condition when checking recovery lock
[Samba.git] / ctdb / server / ctdb_recoverd.c
blob e76a0d02b9c1b078e44c7fd71704740ab547bfd9
1 /*
2 ctdb recovery daemon
4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
20 #include "includes.h"
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
25 #include "popt.h"
26 #include "cmdline.h"
27 #include "../include/ctdb_client.h"
28 #include "../include/ctdb_private.h"
29 #include "lib/tdb_wrap/tdb_wrap.h"
30 #include "lib/util/dlinklist.h"
33 /* List of SRVID requests that need to be processed */
34 struct srvid_list {
35 struct srvid_list *next, *prev;
36 struct srvid_request *request;
39 struct srvid_requests {
40 struct srvid_list *requests;
43 static void srvid_request_reply(struct ctdb_context *ctdb,
44 struct srvid_request *request,
45 TDB_DATA result)
47 /* Someone that sent srvid==0 does not want a reply */
48 if (request->srvid == 0) {
49 talloc_free(request);
50 return;
53 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
54 result) == 0) {
55 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
56 (unsigned)request->pnn,
57 (unsigned long long)request->srvid));
58 } else {
59 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
60 (unsigned)request->pnn,
61 (unsigned long long)request->srvid));
64 talloc_free(request);
67 static void srvid_requests_reply(struct ctdb_context *ctdb,
68 struct srvid_requests **requests,
69 TDB_DATA result)
71 struct srvid_list *r;
73 for (r = (*requests)->requests; r != NULL; r = r->next) {
74 srvid_request_reply(ctdb, r->request, result);
77 /* Free the list structure... */
78 TALLOC_FREE(*requests);
81 static void srvid_request_add(struct ctdb_context *ctdb,
82 struct srvid_requests **requests,
83 struct srvid_request *request)
85 struct srvid_list *t;
86 int32_t ret;
87 TDB_DATA result;
89 if (*requests == NULL) {
90 *requests = talloc_zero(ctdb, struct srvid_requests);
91 if (*requests == NULL) {
92 goto nomem;
96 t = talloc_zero(*requests, struct srvid_list);
97 if (t == NULL) {
98 /* If *requests was just allocated above then free it */
99 if ((*requests)->requests == NULL) {
100 TALLOC_FREE(*requests);
102 goto nomem;
105 t->request = (struct srvid_request *)talloc_steal(t, request);
106 DLIST_ADD((*requests)->requests, t);
108 return;
110 nomem:
111 /* Failed to add the request to the list. Send a fail. */
112 DEBUG(DEBUG_ERR, (__location__
113 " Out of memory, failed to queue SRVID request\n"));
114 ret = -ENOMEM;
115 result.dsize = sizeof(ret);
116 result.dptr = (uint8_t *)&ret;
117 srvid_request_reply(ctdb, request, result);
120 /* An abstraction to allow an operation (takeover runs, recoveries,
121 * ...) to be disabled for a given timeout */
122 struct ctdb_op_state {
123 struct tevent_timer *timer;
124 bool in_progress;
125 const char *name;
128 static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
130 struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
132 if (state != NULL) {
133 state->in_progress = false;
134 state->name = name;
137 return state;
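/* An operation is considered disabled while a re-enable timeout timer is pending */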
140 static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
142 return state->timer != NULL;
145 static bool ctdb_op_begin(struct ctdb_op_state *state)
147 if (ctdb_op_is_disabled(state)) {
148 DEBUG(DEBUG_NOTICE,
149 ("Unable to begin - %s are disabled\n", state->name));
150 return false;
153 state->in_progress = true;
154 return true;
157 static bool ctdb_op_end(struct ctdb_op_state *state)
159 return state->in_progress = false;
162 static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
164 return state->in_progress;
167 static void ctdb_op_enable(struct ctdb_op_state *state)
169 TALLOC_FREE(state->timer);
172 static void ctdb_op_timeout_handler(struct event_context *ev,
173 struct timed_event *te,
174 struct timeval yt, void *p)
176 struct ctdb_op_state *state =
177 talloc_get_type(p, struct ctdb_op_state);
179 DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
180 ctdb_op_enable(state);
183 static int ctdb_op_disable(struct ctdb_op_state *state,
184 struct tevent_context *ev,
185 uint32_t timeout)
187 if (timeout == 0) {
188 DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
189 ctdb_op_enable(state);
190 return 0;
193 if (state->in_progress) {
194 DEBUG(DEBUG_ERR,
195 ("Unable to disable %s - in progress\n", state->name));
196 return -EAGAIN;
199 DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
200 state->name, timeout));
202 /* Clear any old timers */
203 talloc_free(state->timer);
205 /* Arrange for the timeout to occur */
206 state->timer = tevent_add_timer(ev, state,
207 timeval_current_ofs(timeout, 0),
208 ctdb_op_timeout_handler, state);
209 if (state->timer == NULL) {
210 DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
211 return -ENOMEM;
214 return 0;
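/* Per-node banning state: accumulated banning credits and when misbehaviour was last reported */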
217 struct ctdb_banning_state {
218 uint32_t count;
219 struct timeval last_reported_time;
223 private state of recovery daemon
225 struct ctdb_recoverd {
226 struct ctdb_context *ctdb;
227 uint32_t recmaster;
228 uint32_t last_culprit_node;
229 struct ctdb_node_map *nodemap;
230 struct timeval priority_time;
231 bool need_takeover_run;
232 bool need_recovery;
233 uint32_t node_flags;
234 struct timed_event *send_election_te;
235 struct timed_event *election_timeout;
236 struct vacuum_info *vacuum_info;
237 struct srvid_requests *reallocate_requests;
238 struct ctdb_op_state *takeover_run;
239 struct ctdb_op_state *recovery;
240 struct ctdb_control_get_ifaces *ifaces;
241 uint32_t *force_rebalance_nodes;
242 struct ctdb_node_capabilities *caps;
245 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
246 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
248 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data);
251 ban a node for a period of time
253 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
255 int ret;
256 struct ctdb_context *ctdb = rec->ctdb;
257 struct ctdb_ban_time bantime;
259 if (!ctdb_validate_pnn(ctdb, pnn)) {
260 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
261 return;
264 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
266 bantime.pnn = pnn;
267 bantime.time = ban_time;
269 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
270 if (ret != 0) {
271 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
272 return;
277 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
281 remember the trouble maker
283 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
285 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
286 struct ctdb_banning_state *ban_state;
288 if (culprit >= ctdb->num_nodes) {
289 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
290 return;
293 /* If we are banned or stopped, do not set other nodes as culprits */
294 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
295 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
296 return;
299 if (ctdb->nodes[culprit]->ban_state == NULL) {
300 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
301 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
305 ban_state = ctdb->nodes[culprit]->ban_state;
306 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
307 /* this was the first time in a long while this node
308 misbehaved so we will forgive any old transgressions.
310 ban_state->count = 0;
313 ban_state->count += count;
314 ban_state->last_reported_time = timeval_current();
315 rec->last_culprit_node = culprit;
319 remember the trouble maker
321 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
323 ctdb_set_culprit_count(rec, culprit, 1);
327 /* this callback is called for every node that failed to execute the
328 recovered event
330 static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
332 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
334 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));
336 ctdb_set_culprit(rec, node_pnn);
340 run the "recovered" eventscript on all nodes
342 static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, const char *caller)
344 TALLOC_CTX *tmp_ctx;
345 uint32_t *nodes;
346 struct ctdb_context *ctdb = rec->ctdb;
348 tmp_ctx = talloc_new(ctdb);
349 CTDB_NO_MEMORY(ctdb, tmp_ctx);
351 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
352 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
353 nodes, 0,
354 CONTROL_TIMEOUT(), false, tdb_null,
355 NULL, recovered_fail_callback,
356 rec) != 0) {
357 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
359 talloc_free(tmp_ctx);
360 return -1;
363 talloc_free(tmp_ctx);
364 return 0;
367 /* this callback is called for every node that failed to execute the
368 start recovery event
370 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
372 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
374 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
376 ctdb_set_culprit(rec, node_pnn);
380 run the "startrecovery" eventscript on all nodes
382 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
384 TALLOC_CTX *tmp_ctx;
385 uint32_t *nodes;
386 struct ctdb_context *ctdb = rec->ctdb;
388 tmp_ctx = talloc_new(ctdb);
389 CTDB_NO_MEMORY(ctdb, tmp_ctx);
391 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
392 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
393 nodes, 0,
394 CONTROL_TIMEOUT(), false, tdb_null,
395 NULL,
396 startrecovery_fail_callback,
397 rec) != 0) {
398 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
399 talloc_free(tmp_ctx);
400 return -1;
403 talloc_free(tmp_ctx);
404 return 0;
408 update the node capabilities for all connected nodes
410 static int update_capabilities(struct ctdb_recoverd *rec,
411 struct ctdb_node_map *nodemap)
413 uint32_t *capp;
414 TALLOC_CTX *tmp_ctx;
415 struct ctdb_node_capabilities *caps;
416 struct ctdb_context *ctdb = rec->ctdb;
418 tmp_ctx = talloc_new(rec);
419 CTDB_NO_MEMORY(ctdb, tmp_ctx);
421 caps = ctdb_get_capabilities(ctdb, tmp_ctx,
422 CONTROL_TIMEOUT(), nodemap);
424 if (caps == NULL) {
425 DEBUG(DEBUG_ERR,
426 (__location__ " Failed to get node capabilities\n"));
427 talloc_free(tmp_ctx);
428 return -1;
431 capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
432 if (capp == NULL) {
433 DEBUG(DEBUG_ERR,
434 (__location__
435 " Capabilities don't include current node.\n"));
436 talloc_free(tmp_ctx);
437 return -1;
439 ctdb->capabilities = *capp;
441 TALLOC_FREE(rec->caps);
442 rec->caps = talloc_steal(rec, caps);
444 talloc_free(tmp_ctx);
445 return 0;
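/* Called for each node that fails to freeze when recovery mode is set to active */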
448 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
450 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
452 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
453 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
456 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
458 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
460 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
461 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
465 change recovery mode on all nodes
467 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
469 TDB_DATA data;
470 uint32_t *nodes;
471 TALLOC_CTX *tmp_ctx;
473 tmp_ctx = talloc_new(ctdb);
474 CTDB_NO_MEMORY(ctdb, tmp_ctx);
476 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
478 data.dsize = sizeof(uint32_t);
479 data.dptr = (unsigned char *)&rec_mode;
481 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
482 nodes, 0,
483 CONTROL_TIMEOUT(),
484 false, data,
485 NULL, NULL,
486 NULL) != 0) {
487 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
488 talloc_free(tmp_ctx);
489 return -1;
492 /* freeze all nodes */
493 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
494 int i;
496 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
497 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
498 nodes, i,
499 CONTROL_TIMEOUT(),
500 false, tdb_null,
501 NULL,
502 set_recmode_fail_callback,
503 rec) != 0) {
504 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
505 talloc_free(tmp_ctx);
506 return -1;
511 talloc_free(tmp_ctx);
512 return 0;
516 change recovery master on all nodes
518 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
520 TDB_DATA data;
521 TALLOC_CTX *tmp_ctx;
522 uint32_t *nodes;
524 tmp_ctx = talloc_new(ctdb);
525 CTDB_NO_MEMORY(ctdb, tmp_ctx);
527 data.dsize = sizeof(uint32_t);
528 data.dptr = (unsigned char *)&pnn;
530 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
531 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
532 nodes, 0,
533 CONTROL_TIMEOUT(), false, data,
534 NULL, NULL,
535 NULL) != 0) {
536 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
537 talloc_free(tmp_ctx);
538 return -1;
541 talloc_free(tmp_ctx);
542 return 0;
545 /* update all remote nodes to use the same db priority that we have
546 this can fail if the remote node has not yet been upgraded to
547 support this function, so we always return success and never fail
548 a recovery if this call fails.
550 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
551 struct ctdb_node_map *nodemap,
552 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
554 int db;
556 /* step through all local databases */
557 for (db=0; db<dbmap->num;db++) {
558 struct ctdb_db_priority db_prio;
559 int ret;
561 db_prio.db_id = dbmap->dbs[db].dbid;
562 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
563 if (ret != 0) {
564 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
565 continue;
568 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
570 ret = ctdb_ctrl_set_db_priority(ctdb, CONTROL_TIMEOUT(),
571 CTDB_CURRENT_NODE, &db_prio);
572 if (ret != 0) {
573 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n",
574 db_prio.db_id));
578 return 0;
582 ensure all other nodes have attached to any databases that we have
584 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
585 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
587 int i, j, db, ret;
588 struct ctdb_dbid_map *remote_dbmap;
590 /* verify that all other nodes have all our databases */
591 for (j=0; j<nodemap->num; j++) {
592 /* we don't need to check ourselves */
593 if (nodemap->nodes[j].pnn == pnn) {
594 continue;
596 /* dont check nodes that are unavailable */
597 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
598 continue;
601 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
602 mem_ctx, &remote_dbmap);
603 if (ret != 0) {
604 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", nodemap->nodes[j].pnn));
605 return -1;
608 /* step through all local databases */
609 for (db=0; db<dbmap->num;db++) {
610 const char *name;
613 for (i=0;i<remote_dbmap->num;i++) {
614 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
615 break;
618 /* the remote node already has this database */
619 if (i!=remote_dbmap->num) {
620 continue;
622 /* ok so we need to create this database */
623 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
624 dbmap->dbs[db].dbid, mem_ctx,
625 &name);
626 if (ret != 0) {
627 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
628 return -1;
630 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
631 nodemap->nodes[j].pnn,
632 mem_ctx, name,
633 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
634 if (ret != 0) {
635 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
636 return -1;
641 return 0;
646 ensure we are attached to any databases that anyone else is attached to
648 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
649 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
651 int i, j, db, ret;
652 struct ctdb_dbid_map *remote_dbmap;
654 /* verify that we have all databases any other node has */
655 for (j=0; j<nodemap->num; j++) {
656 /* we don't need to check ourselves */
657 if (nodemap->nodes[j].pnn == pnn) {
658 continue;
660 /* dont check nodes that are unavailable */
661 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
662 continue;
665 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
666 mem_ctx, &remote_dbmap);
667 if (ret != 0) {
668 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", nodemap->nodes[j].pnn));
669 return -1;
672 /* step through all databases on the remote node */
673 for (db=0; db<remote_dbmap->num;db++) {
674 const char *name;
676 for (i=0;i<(*dbmap)->num;i++) {
677 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
678 break;
681 /* we already have this db locally */
682 if (i!=(*dbmap)->num) {
683 continue;
685 /* ok so we need to create this database and
686 rebuild dbmap
688 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
689 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
690 if (ret != 0) {
691 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
692 nodemap->nodes[j].pnn));
693 return -1;
695 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
696 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
697 if (ret != 0) {
698 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
699 return -1;
701 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
702 if (ret != 0) {
703 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
704 return -1;
709 return 0;
714 pull the remote database contents from one node into the recdb
716 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
717 struct tdb_wrap *recdb, uint32_t dbid)
719 int ret;
720 TDB_DATA outdata;
721 struct ctdb_marshall_buffer *reply;
722 struct ctdb_rec_data *recdata;
723 int i;
724 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
726 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
727 CONTROL_TIMEOUT(), &outdata);
728 if (ret != 0) {
729 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
730 talloc_free(tmp_ctx);
731 return -1;
734 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
736 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
737 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
738 talloc_free(tmp_ctx);
739 return -1;
742 recdata = (struct ctdb_rec_data *)&reply->data[0];
744 for (i=0;
745 i<reply->count;
746 recdata = (struct ctdb_rec_data *)(recdata->length + (uint8_t *)recdata), i++) {
747 TDB_DATA key, data;
748 struct ctdb_ltdb_header *hdr;
749 TDB_DATA existing;
751 key.dptr = &recdata->data[0];
752 key.dsize = recdata->keylen;
753 data.dptr = &recdata->data[key.dsize];
754 data.dsize = recdata->datalen;
756 hdr = (struct ctdb_ltdb_header *)data.dptr;
758 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
759 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
760 talloc_free(tmp_ctx);
761 return -1;
764 /* fetch the existing record, if any */
765 existing = tdb_fetch(recdb->tdb, key);
767 if (existing.dptr != NULL) {
768 struct ctdb_ltdb_header header;
769 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
770 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
771 (unsigned)existing.dsize, srcnode));
772 free(existing.dptr);
773 talloc_free(tmp_ctx);
774 return -1;
776 header = *(struct ctdb_ltdb_header *)existing.dptr;
777 free(existing.dptr);
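/* Keep the pulled record only if it has a higher RSN than the existing
   record, or the same RSN while the existing copy is not dmastered here */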
778 if (!(header.rsn < hdr->rsn ||
779 (header.dmaster != ctdb_get_pnn(ctdb) &&
780 header.rsn == hdr->rsn))) {
781 continue;
785 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
786 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
787 talloc_free(tmp_ctx);
788 return -1;
792 talloc_free(tmp_ctx);
794 return 0;
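/* Callback state used when scanning for the node holding the highest
   sequence number for a persistent database */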
798 struct pull_seqnum_cbdata {
799 int failed;
800 uint32_t pnn;
801 uint64_t seqnum;
804 static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
806 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
807 uint64_t seqnum;
809 if (cb_data->failed != 0) {
810 DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
811 return;
814 if (res != 0) {
815 DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
816 cb_data->failed = 1;
817 return;
820 if (outdata.dsize != sizeof(uint64_t)) {
821 DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
822 cb_data->failed = 1;
823 return;
826 seqnum = *((uint64_t *)outdata.dptr);
828 if (seqnum > cb_data->seqnum ||
829 (cb_data->pnn == -1 && seqnum == 0)) {
830 cb_data->seqnum = seqnum;
831 cb_data->pnn = node_pnn;
835 static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
837 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
839 DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
840 cb_data->failed = 1;
843 static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
844 struct ctdb_recoverd *rec,
845 struct ctdb_node_map *nodemap,
846 struct tdb_wrap *recdb, uint32_t dbid)
848 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
849 uint32_t *nodes;
850 TDB_DATA data;
851 uint32_t outdata[2];
852 struct pull_seqnum_cbdata *cb_data;
854 DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));
856 outdata[0] = dbid;
857 outdata[1] = 0;
859 data.dsize = sizeof(outdata);
860 data.dptr = (uint8_t *)&outdata[0];
862 cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
863 if (cb_data == NULL) {
864 DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
865 talloc_free(tmp_ctx);
866 return -1;
869 cb_data->failed = 0;
870 cb_data->pnn = -1;
871 cb_data->seqnum = 0;
873 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
874 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
875 nodes, 0,
876 CONTROL_TIMEOUT(), false, data,
877 pull_seqnum_cb,
878 pull_seqnum_fail_cb,
879 cb_data) != 0) {
880 DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));
882 talloc_free(tmp_ctx);
883 return -1;
886 if (cb_data->failed != 0) {
887 DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
888 talloc_free(tmp_ctx);
889 return -1;
892 if (cb_data->pnn == -1) {
893 DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
894 talloc_free(tmp_ctx);
895 return -1;
898 DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum));
900 if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
901 DEBUG(DEBUG_ERR, ("Failed to pull highest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
902 talloc_free(tmp_ctx);
903 return -1;
906 talloc_free(tmp_ctx);
907 return 0;
912 pull all the remote database contents into the recdb
914 static int pull_remote_database(struct ctdb_context *ctdb,
915 struct ctdb_recoverd *rec,
916 struct ctdb_node_map *nodemap,
917 struct tdb_wrap *recdb, uint32_t dbid,
918 bool persistent)
920 int j;
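/* For persistent databases the whole database can be recovered from the
   single node with the highest sequence number, if that tunable is set */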
922 if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
923 int ret;
924 ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
925 if (ret == 0) {
926 return 0;
930 /* pull all records from all other nodes across onto this node
931 (this merges based on rsn)
933 for (j=0; j<nodemap->num; j++) {
934 /* dont merge from nodes that are unavailable */
935 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
936 continue;
938 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
939 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
940 nodemap->nodes[j].pnn));
941 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
942 return -1;
946 return 0;
951 update flags on all active nodes
953 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
955 int ret;
957 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
958 if (ret != 0) {
959 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
960 return -1;
963 return 0;
967 ensure all nodes have the same vnnmap as we do
969 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
970 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
972 int j, ret;
974 /* push the new vnn map out to all the nodes */
975 for (j=0; j<nodemap->num; j++) {
976 /* dont push to nodes that are unavailable */
977 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
978 continue;
981 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
982 if (ret != 0) {
983 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
984 return -1;
988 return 0;
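/* State for a queued vacuum-fetch request received from another node */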
992 struct vacuum_info {
993 struct vacuum_info *next, *prev;
994 struct ctdb_recoverd *rec;
995 uint32_t srcnode;
996 struct ctdb_db_context *ctdb_db;
997 struct ctdb_marshall_buffer *recs;
998 struct ctdb_rec_data *r;
1001 static void vacuum_fetch_next(struct vacuum_info *v);
1004 called when a vacuum fetch has completed - just free it and do the next one
1006 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
1008 talloc_free(state);
1013 process the next element from the vacuum list
1015 static void vacuum_fetch_next(struct vacuum_info *v)
1017 struct ctdb_call call;
1018 struct ctdb_rec_data *r;
1020 while (v->recs->count) {
1021 struct ctdb_client_call_state *state;
1022 TDB_DATA data;
1023 struct ctdb_ltdb_header *hdr;
1025 ZERO_STRUCT(call);
1026 call.call_id = CTDB_NULL_FUNC;
1027 call.flags = CTDB_IMMEDIATE_MIGRATION;
1028 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
1030 r = v->r;
1031 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
1032 v->recs->count--;
1034 call.key.dptr = &r->data[0];
1035 call.key.dsize = r->keylen;
1037 /* ensure we don't block this daemon - just skip a record if we can't get
1038 the chainlock */
1039 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
1040 continue;
1043 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
1044 if (data.dptr == NULL) {
1045 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
1046 continue;
1049 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
1050 free(data.dptr);
1051 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
1052 continue;
1055 hdr = (struct ctdb_ltdb_header *)data.dptr;
1056 if (hdr->dmaster == v->rec->ctdb->pnn) {
1057 /* its already local */
1058 free(data.dptr);
1059 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
1060 continue;
1063 free(data.dptr);
1065 state = ctdb_call_send(v->ctdb_db, &call);
1066 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
1067 if (state == NULL) {
1068 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
1069 talloc_free(v);
1070 return;
1072 state->async.fn = vacuum_fetch_callback;
1073 state->async.private_data = NULL;
1076 talloc_free(v);
1081 destroy a vacuum info structure
1083 static int vacuum_info_destructor(struct vacuum_info *v)
1085 DLIST_REMOVE(v->rec->vacuum_info, v);
1086 return 0;
1091 handler for vacuum fetch
1093 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
1094 TDB_DATA data, void *private_data)
1096 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1097 struct ctdb_marshall_buffer *recs;
1098 int ret, i;
1099 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1100 const char *name;
1101 struct ctdb_dbid_map *dbmap=NULL;
1102 bool persistent = false;
1103 struct ctdb_db_context *ctdb_db;
1104 struct ctdb_rec_data *r;
1105 uint32_t srcnode;
1106 struct vacuum_info *v;
1108 recs = (struct ctdb_marshall_buffer *)data.dptr;
1109 r = (struct ctdb_rec_data *)&recs->data[0];
1111 if (recs->count == 0) {
1112 talloc_free(tmp_ctx);
1113 return;
1116 srcnode = r->reqid;
1118 for (v=rec->vacuum_info;v;v=v->next) {
1119 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
1120 /* we're already working on records from this node */
1121 talloc_free(tmp_ctx);
1122 return;
1126 /* work out if the database is persistent */
1127 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
1128 if (ret != 0) {
1129 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
1130 talloc_free(tmp_ctx);
1131 return;
1134 for (i=0;i<dbmap->num;i++) {
1135 if (dbmap->dbs[i].dbid == recs->db_id) {
1136 persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
1137 break;
1140 if (i == dbmap->num) {
1141 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
1142 talloc_free(tmp_ctx);
1143 return;
1146 /* find the name of this database */
1147 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
1148 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
1149 talloc_free(tmp_ctx);
1150 return;
1153 /* attach to it */
1154 ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
1155 if (ctdb_db == NULL) {
1156 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
1157 talloc_free(tmp_ctx);
1158 return;
1161 v = talloc_zero(rec, struct vacuum_info);
1162 if (v == NULL) {
1163 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
1164 talloc_free(tmp_ctx);
1165 return;
1168 v->rec = rec;
1169 v->srcnode = srcnode;
1170 v->ctdb_db = ctdb_db;
1171 v->recs = talloc_memdup(v, recs, data.dsize);
1172 if (v->recs == NULL) {
1173 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
1174 talloc_free(v);
1175 talloc_free(tmp_ctx);
1176 return;
1178 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
1180 DLIST_ADD(rec->vacuum_info, v);
1182 talloc_set_destructor(v, vacuum_info_destructor);
1184 vacuum_fetch_next(v);
1185 talloc_free(tmp_ctx);
1190 * handler for database detach
1192 static void detach_database_handler(struct ctdb_context *ctdb, uint64_t srvid,
1193 TDB_DATA data, void *private_data)
1195 struct ctdb_recoverd *rec = talloc_get_type(private_data,
1196 struct ctdb_recoverd);
1197 uint32_t db_id;
1198 struct vacuum_info *v, *vnext;
1199 struct ctdb_db_context *ctdb_db;
1201 if (data.dsize != sizeof(db_id)) {
1202 return;
1204 db_id = *(uint32_t *)data.dptr;
1206 ctdb_db = find_ctdb_db(ctdb, db_id);
1207 if (ctdb_db == NULL) {
1208 /* database is not attached */
1209 return;
1212 /* Stop any active vacuum fetch */
1213 v = rec->vacuum_info;
1214 while (v != NULL) {
1215 vnext = v->next;
1217 if (v->ctdb_db->db_id == db_id) {
1218 talloc_free(v);
1220 v = vnext;
1223 DLIST_REMOVE(ctdb->db_list, ctdb_db);
1225 DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
1226 ctdb_db->db_name));
1227 talloc_free(ctdb_db);
1231 called when ctdb_wait_timeout should finish
1233 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
1234 struct timeval yt, void *p)
1236 uint32_t *timed_out = (uint32_t *)p;
1237 (*timed_out) = 1;
1241 wait for a given number of seconds
1243 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
1245 uint32_t timed_out = 0;
1246 time_t usecs = (secs - (time_t)secs) * 1000000;
1247 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), ctdb_wait_handler, &timed_out);
1248 while (!timed_out) {
1249 event_loop_once(ctdb->ev);
1254 called when an election times out (ends)
1256 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
1257 struct timeval t, void *p)
1259 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1260 rec->election_timeout = NULL;
1261 fast_start = false;
1263 DEBUG(DEBUG_WARNING,("Election period ended\n"));
1268 wait for an election to finish. It finishes election_timeout seconds after
1269 the last election packet is received
1271 static void ctdb_wait_election(struct ctdb_recoverd *rec)
1273 struct ctdb_context *ctdb = rec->ctdb;
1274 while (rec->election_timeout) {
1275 event_loop_once(ctdb->ev);
1280 Update our local flags from all remote connected nodes.
1281 This is only run when we are, or believe we are, the recovery master
1283 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
1285 int j;
1286 struct ctdb_context *ctdb = rec->ctdb;
1287 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1289 /* get the nodemap for all active remote nodes and verify
1290 they are the same as for this node
1292 for (j=0; j<nodemap->num; j++) {
1293 struct ctdb_node_map *remote_nodemap=NULL;
1294 int ret;
1296 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
1297 continue;
1299 if (nodemap->nodes[j].pnn == ctdb->pnn) {
1300 continue;
1303 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1304 mem_ctx, &remote_nodemap);
1305 if (ret != 0) {
1306 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
1307 nodemap->nodes[j].pnn));
1308 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
1309 talloc_free(mem_ctx);
1310 return MONITOR_FAILED;
1312 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
1313 /* We should tell our daemon about this so it
1314 updates its flags or else we will log the same
1315 message again in the next iteration of recovery.
1316 Since we are the recovery master we can just as
1317 well update the flags on all nodes.
1319 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
1320 if (ret != 0) {
1321 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
1322 return -1;
1325 /* Update our local copy of the flags in the recovery
1326 daemon.
1328 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1329 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
1330 nodemap->nodes[j].flags));
1331 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1333 talloc_free(remote_nodemap);
1335 talloc_free(mem_ctx);
1336 return MONITOR_OK;
1340 /* Create a new random generation id.
1341 The generation id can not be the INVALID_GENERATION id
1343 static uint32_t new_generation(void)
1345 uint32_t generation;
1347 while (1) {
1348 generation = random();
1350 if (generation != INVALID_GENERATION) {
1351 break;
1355 return generation;
1360 create a temporary working database
1362 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1364 char *name;
1365 struct tdb_wrap *recdb;
1366 unsigned tdb_flags;
1368 /* open up the temporary recovery database */
1369 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1370 ctdb->db_directory_state,
1371 ctdb->pnn);
1372 if (name == NULL) {
1373 return NULL;
1375 unlink(name);
1377 tdb_flags = TDB_NOLOCK;
1378 if (ctdb->valgrinding) {
1379 tdb_flags |= TDB_NOMMAP;
1381 tdb_flags |= (TDB_INCOMPATIBLE_HASH | TDB_DISALLOW_NESTING);
1383 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1384 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1385 if (recdb == NULL) {
1386 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1389 talloc_free(name);
1391 return recdb;
1396 a traverse function for pulling all relevant records from recdb
1398 struct recdb_data {
1399 struct ctdb_context *ctdb;
1400 struct ctdb_marshall_buffer *recdata;
1401 uint32_t len;
1402 uint32_t allocated_len;
1403 bool failed;
1404 bool persistent;
1407 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1409 struct recdb_data *params = (struct recdb_data *)p;
1410 struct ctdb_rec_data *recdata;
1411 struct ctdb_ltdb_header *hdr;
1414 * skip empty records - but NOT for persistent databases:
1416 * The record-by-record mode of recovery deletes empty records.
1417 * For persistent databases, this can lead to data corruption
1418 * by deleting records that should be there:
1420 * - Assume the cluster has been running for a while.
1422 * - A record R in a persistent database has been created and
1423 * deleted a couple of times, the last operation being deletion,
1424 * leaving an empty record with a high RSN, say 10.
1426 * - Now a node N is turned off.
1428 * - This leaves the local copy of the database on N with the empty
1429 * copy of R and RSN 10. On all other nodes, the recovery has deleted
1430 * the copy of record R.
1432 * - Now the record is created again while node N is turned off.
1433 * This creates R with RSN = 1 on all nodes except for N.
1435 * - Now node N is turned on again. The following recovery will chose
1436 * the older empty copy of R due to RSN 10 > RSN 1.
1438 * ==> Hence the record is gone after the recovery.
1440 * On databases like Samba's registry, this can damage the higher-level
1441 * data structures built from the various tdb-level records.
1443 if (!params->persistent && data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1444 return 0;
1447 /* update the dmaster field to point to us */
1448 hdr = (struct ctdb_ltdb_header *)data.dptr;
1449 if (!params->persistent) {
1450 hdr->dmaster = params->ctdb->pnn;
1451 hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
1454 /* add the record to the blob ready to send to the nodes */
1455 recdata = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1456 if (recdata == NULL) {
1457 params->failed = true;
1458 return -1;
1460 if (params->len + recdata->length >= params->allocated_len) {
1461 params->allocated_len = recdata->length + params->len + params->ctdb->tunable.pulldb_preallocation_size;
1462 params->recdata = talloc_realloc_size(NULL, params->recdata, params->allocated_len);
1464 if (params->recdata == NULL) {
1465 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u\n",
1466 recdata->length + params->len));
1467 params->failed = true;
1468 return -1;
1470 params->recdata->count++;
1471 memcpy(params->len+(uint8_t *)params->recdata, recdata, recdata->length);
1472 params->len += recdata->length;
1473 talloc_free(recdata);
1475 return 0;
1479 push the recdb database out to all nodes
1481 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1482 bool persistent,
1483 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1485 struct recdb_data params;
1486 struct ctdb_marshall_buffer *recdata;
1487 TDB_DATA outdata;
1488 TALLOC_CTX *tmp_ctx;
1489 uint32_t *nodes;
1491 tmp_ctx = talloc_new(ctdb);
1492 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1494 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1495 CTDB_NO_MEMORY(ctdb, recdata);
1497 recdata->db_id = dbid;
1499 params.ctdb = ctdb;
1500 params.recdata = recdata;
1501 params.len = offsetof(struct ctdb_marshall_buffer, data);
1502 params.allocated_len = params.len;
1503 params.failed = false;
1504 params.persistent = persistent;
1506 if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
1507 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1508 talloc_free(params.recdata);
1509 talloc_free(tmp_ctx);
1510 return -1;
1513 if (params.failed) {
1514 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1515 talloc_free(params.recdata);
1516 talloc_free(tmp_ctx);
1517 return -1;
1520 recdata = params.recdata;
1522 outdata.dptr = (void *)recdata;
1523 outdata.dsize = params.len;
1525 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1526 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1527 nodes, 0,
1528 CONTROL_TIMEOUT(), false, outdata,
1529 NULL, NULL,
1530 NULL) != 0) {
1531 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1532 talloc_free(recdata);
1533 talloc_free(tmp_ctx);
1534 return -1;
1537 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1538 dbid, recdata->count));
1540 talloc_free(recdata);
1541 talloc_free(tmp_ctx);
1543 return 0;
1548 go through a full recovery on one database
1550 static int recover_database(struct ctdb_recoverd *rec,
1551 TALLOC_CTX *mem_ctx,
1552 uint32_t dbid,
1553 bool persistent,
1554 uint32_t pnn,
1555 struct ctdb_node_map *nodemap,
1556 uint32_t transaction_id)
1558 struct tdb_wrap *recdb;
1559 int ret;
1560 struct ctdb_context *ctdb = rec->ctdb;
1561 TDB_DATA data;
1562 struct ctdb_control_wipe_database w;
1563 uint32_t *nodes;
1565 recdb = create_recdb(ctdb, mem_ctx);
1566 if (recdb == NULL) {
1567 return -1;
1570 /* pull all remote databases onto the recdb */
1571 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1572 if (ret != 0) {
1573 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1574 return -1;
1577 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1579 /* wipe all the remote databases. This is safe as we are in a transaction */
1580 w.db_id = dbid;
1581 w.transaction_id = transaction_id;
1583 data.dptr = (void *)&w;
1584 data.dsize = sizeof(w);
1586 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1587 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1588 nodes, 0,
1589 CONTROL_TIMEOUT(), false, data,
1590 NULL, NULL,
1591 NULL) != 0) {
1592 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1593 talloc_free(recdb);
1594 return -1;
1597 /* push out the correct database. This sets the dmaster and skips
1598 the empty records */
1599 ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1600 if (ret != 0) {
1601 talloc_free(recdb);
1602 return -1;
1605 /* all done with this database */
1606 talloc_free(recdb);
1608 return 0;
1611 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1612 struct ctdb_recoverd *rec,
1613 struct ctdb_node_map *nodemap,
1614 uint32_t *culprit)
1616 int j;
1617 int ret;
1619 if (ctdb->num_nodes != nodemap->num) {
1620 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1621 ctdb->num_nodes, nodemap->num));
1622 if (culprit) {
1623 *culprit = ctdb->pnn;
1625 return -1;
1628 for (j=0; j<nodemap->num; j++) {
1629 /* For readability */
1630 struct ctdb_node *node = ctdb->nodes[j];
1632 /* release any existing data */
1633 if (node->known_public_ips) {
1634 talloc_free(node->known_public_ips);
1635 node->known_public_ips = NULL;
1637 if (node->available_public_ips) {
1638 talloc_free(node->available_public_ips);
1639 node->available_public_ips = NULL;
1642 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1643 continue;
1646 /* Retrieve the list of known public IPs from the node */
1647 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1648 CONTROL_TIMEOUT(),
1649 node->pnn,
1650 ctdb->nodes,
1652 &node->known_public_ips);
1653 if (ret != 0) {
1654 DEBUG(DEBUG_ERR,
1655 ("Failed to read known public IPs from node: %u\n",
1656 node->pnn));
1657 if (culprit) {
1658 *culprit = node->pnn;
1660 return -1;
1663 if (ctdb->do_checkpublicip &&
1664 !ctdb_op_is_disabled(rec->takeover_run) &&
1665 verify_remote_ip_allocation(ctdb,
1666 node->known_public_ips,
1667 node->pnn)) {
1668 DEBUG(DEBUG_ERR,("Trigger IP reallocation\n"));
1669 rec->need_takeover_run = true;
1672 /* Retrieve the list of available public IPs from the node */
1673 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1674 CONTROL_TIMEOUT(),
1675 node->pnn,
1676 ctdb->nodes,
1677 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1678 &node->available_public_ips);
1679 if (ret != 0) {
1680 DEBUG(DEBUG_ERR,
1681 ("Failed to read available public IPs from node: %u\n",
1682 node->pnn));
1683 if (culprit) {
1684 *culprit = node->pnn;
1686 return -1;
1690 return 0;
1693 /* when we start a recovery, make sure all nodes use the same reclock file
1694 setting
1696 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
1698 struct ctdb_context *ctdb = rec->ctdb;
1699 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
1700 TDB_DATA data;
1701 uint32_t *nodes;
1703 if (ctdb->recovery_lock_file == NULL) {
1704 data.dptr = NULL;
1705 data.dsize = 0;
1706 } else {
1707 data.dsize = strlen(ctdb->recovery_lock_file) + 1;
1708 data.dptr = (uint8_t *)ctdb->recovery_lock_file;
1711 nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
1712 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
1713 nodes, 0,
1714 CONTROL_TIMEOUT(),
1715 false, data,
1716 NULL, NULL,
1717 rec) != 0) {
1718 DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
1719 talloc_free(tmp_ctx);
1720 return -1;
1723 talloc_free(tmp_ctx);
1724 return 0;
1729 * this callback is called for every node that failed to execute ctdb_takeover_run()
1730 * and set a flag to re-run the takeover run.
1732 static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
1734 DEBUG(DEBUG_ERR, ("Node %u failed the takeover run\n", node_pnn));
1736 if (callback_data != NULL) {
1737 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
1739 DEBUG(DEBUG_ERR, ("Setting node %u as recovery fail culprit\n", node_pnn));
1741 ctdb_set_culprit(rec, node_pnn);
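/* Ban any node that has accumulated too many banning credits; sets *self_ban
   if this node ends up banning itself */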
1746 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
1748 struct ctdb_context *ctdb = rec->ctdb;
1749 int i;
1750 struct ctdb_banning_state *ban_state;
1752 *self_ban = false;
1753 for (i=0; i<ctdb->num_nodes; i++) {
1754 if (ctdb->nodes[i]->ban_state == NULL) {
1755 continue;
1757 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1758 if (ban_state->count < 2*ctdb->num_nodes) {
1759 continue;
1762 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
1763 ctdb->nodes[i]->pnn, ban_state->count,
1764 ctdb->tunable.recovery_ban_period));
1765 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1766 ban_state->count = 0;
1768 /* Banning ourself? */
1769 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
1770 *self_ban = true;
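/* Perform a cluster-wide IP takeover run, temporarily disabling takeover
   runs on the other nodes while it is in progress */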
1775 static bool do_takeover_run(struct ctdb_recoverd *rec,
1776 struct ctdb_node_map *nodemap,
1777 bool banning_credits_on_fail)
1779 uint32_t *nodes = NULL;
1780 struct srvid_request_data dtr;
1781 TDB_DATA data;
1782 int i;
1783 uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
1784 int ret;
1785 bool ok;
1787 DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
1789 if (ctdb_op_is_in_progress(rec->takeover_run)) {
1790 DEBUG(DEBUG_ERR, (__location__
1791 " takeover run already in progress \n"));
1792 ok = false;
1793 goto done;
1796 if (!ctdb_op_begin(rec->takeover_run)) {
1797 ok = false;
1798 goto done;
1801 /* Disable IP checks (takeover runs, really) on other nodes
1802 * while doing this takeover run. This will stop those other
1803 * nodes from triggering takeover runs when they think they should
1804 * be hosting an IP but it isn't yet on an interface. Don't
1805 * wait for replies since a failure here might cause some
1806 * noise in the logs but will not actually cause a problem.
1808 dtr.srvid = 0; /* No reply */
1809 dtr.pnn = -1;
1811 data.dptr = (uint8_t*)&dtr;
1812 data.dsize = sizeof(dtr);
1814 nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
1816 /* Disable for 60 seconds. This can be a tunable later if
1817 * necessary.
1819 dtr.data = 60;
1820 for (i = 0; i < talloc_array_length(nodes); i++) {
1821 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1822 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1823 data) != 0) {
1824 DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
1828 ret = ctdb_takeover_run(rec->ctdb, nodemap,
1829 rec->force_rebalance_nodes,
1830 takeover_fail_callback,
1831 banning_credits_on_fail ? rec : NULL);
1833 /* Reenable takeover runs and IP checks on other nodes */
1834 dtr.data = 0;
1835 for (i = 0; i < talloc_array_length(nodes); i++) {
1836 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1837 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1838 data) != 0) {
1839 DEBUG(DEBUG_INFO,("Failed to reenable takeover runs\n"));
1843 if (ret != 0) {
1844 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
1845 ok = false;
1846 goto done;
1849 ok = true;
1850 /* Takeover run was successful so clear force rebalance targets */
1851 if (rebalance_nodes == rec->force_rebalance_nodes) {
1852 TALLOC_FREE(rec->force_rebalance_nodes);
1853 } else {
1854 DEBUG(DEBUG_WARNING,
1855 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1857 done:
1858 rec->need_takeover_run = !ok;
1859 talloc_free(nodes);
1860 ctdb_op_end(rec->takeover_run);
1862 DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
1863 return ok;
1868 we are the recmaster, and recovery is needed - start a recovery run
1870 static int do_recovery(struct ctdb_recoverd *rec,
1871 TALLOC_CTX *mem_ctx, uint32_t pnn,
1872 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1874 struct ctdb_context *ctdb = rec->ctdb;
1875 int i, j, ret;
1876 uint32_t generation;
1877 struct ctdb_dbid_map *dbmap;
1878 TDB_DATA data;
1879 uint32_t *nodes;
1880 struct timeval start_time;
1881 uint32_t culprit = (uint32_t)-1;
1882 bool self_ban;
1884 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1886 /* if recovery fails, force it again */
1887 rec->need_recovery = true;
1889 if (!ctdb_op_begin(rec->recovery)) {
1890 return -1;
1893 if (rec->election_timeout) {
1894 /* an election is in progress */
1895 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
1896 goto fail;
1899 ban_misbehaving_nodes(rec, &self_ban);
1900 if (self_ban) {
1901 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
1902 goto fail;
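/* If a recovery lock is configured, make sure this node holds it before
   touching any databases */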
1905 if (ctdb->recovery_lock_file != NULL) {
1906 if (ctdb_recovery_have_lock(ctdb)) {
1907 DEBUG(DEBUG_NOTICE, ("Already holding recovery lock\n"));
1908 } else {
1909 start_time = timeval_current();
1910 DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock (%s)\n",
1911 ctdb->recovery_lock_file));
1912 if (!ctdb_recovery_lock(ctdb)) {
1913 if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
1914 /* If ctdb is trying first recovery, it's
1915 * possible that current node does not know
1916 * yet who the recmaster is.
1918 DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
1919 " - retrying recovery\n"));
1920 goto fail;
1923 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
1924 "and ban ourself for %u seconds\n",
1925 ctdb->tunable.recovery_ban_period));
1926 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
1927 goto fail;
1929 ctdb_ctrl_report_recd_lock_latency(ctdb,
1930 CONTROL_TIMEOUT(),
1931 timeval_elapsed(&start_time));
1932 DEBUG(DEBUG_NOTICE,
1933 ("Recovery lock taken successfully by recovery daemon\n"));
1937 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1939 /* get a list of all databases */
1940 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1941 if (ret != 0) {
1942 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1943 goto fail;
1946 /* we do the db creation before we set the recovery mode, so the freeze happens
1947 on all databases we will be dealing with. */
1949 /* verify that we have all the databases any other node has */
1950 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1951 if (ret != 0) {
1952 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1953 goto fail;
1956 /* verify that all other nodes have all our databases */
1957 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1958 if (ret != 0) {
1959 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1960 goto fail;
1962 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1964 /* update the database priority for all remote databases */
1965 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1966 if (ret != 0) {
1967 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1969 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1972 /* update all other nodes to use the same setting for reclock files
1973 as the local recovery master.
1975 sync_recovery_lock_file_across_cluster(rec);
1977 /* set recovery mode to active on all nodes */
1978 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1979 if (ret != 0) {
1980 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1981 goto fail;
1984 /* execute the "startrecovery" event script on all nodes */
1985 ret = run_startrecovery_eventscript(rec, nodemap);
1986 if (ret!=0) {
1987 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1988 goto fail;
1992 update all nodes to have the same flags that we have
1994 for (i=0;i<nodemap->num;i++) {
1995 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1996 continue;
1999 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
2000 if (ret != 0) {
2001 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2002 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
2003 } else {
2004 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
2005 goto fail;
2010 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
2012 /* pick a new generation number */
2013 generation = new_generation();
2015 /* change the vnnmap on this node to use the new generation
2016 number but not on any other nodes.
2017 this guarantees that if we abort the recovery prematurely
2018 for some reason (a node stops responding?)
2019 that we can just return immediately and we will reenter
2020 recovery shortly again.
2021 I.e. we deliberately leave the cluster with an inconsistent
2022 generation id to allow us to abort recovery at any stage and
2023 just restart it from scratch.
2025 vnnmap->generation = generation;
2026 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
2027 if (ret != 0) {
2028 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
2029 goto fail;
2032 data.dptr = (void *)&generation;
2033 data.dsize = sizeof(uint32_t);
2035 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2036 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
2037 nodes, 0,
2038 CONTROL_TIMEOUT(), false, data,
2039 NULL,
2040 transaction_start_fail_callback,
2041 rec) != 0) {
2042 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
2043 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
2044 nodes, 0,
2045 CONTROL_TIMEOUT(), false, tdb_null,
2046 NULL,
2047 NULL,
2048 NULL) != 0) {
2049 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
2051 goto fail;
2054 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
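/* In outline, recover_database() pulls each database from all nodes,
merges the records (keeping the most recent copy of each) and pushes
the result back out, all under the transaction started above. */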
2056 for (i=0;i<dbmap->num;i++) {
2057 ret = recover_database(rec, mem_ctx,
2058 dbmap->dbs[i].dbid,
2059 dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
2060 pnn, nodemap, generation);
2061 if (ret != 0) {
2062 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
2063 goto fail;
2067 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
2069 /* commit all the changes */
2070 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
2071 nodes, 0,
2072 CONTROL_TIMEOUT(), false, data,
2073 NULL, NULL,
2074 NULL) != 0) {
2075 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
2076 goto fail;
2079 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
2082 /* update the capabilities for all nodes */
2083 ret = update_capabilities(rec, nodemap);
2084 if (ret!=0) {
2085 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2086 goto fail;
2089 /* build a new vnn map with all the currently active and
2090 unbanned nodes */
2091 generation = new_generation();
2092 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
2093 CTDB_NO_MEMORY(ctdb, vnnmap);
2094 vnnmap->generation = generation;
2095 vnnmap->size = 0;
2096 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
2097 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2098 for (i=j=0;i<nodemap->num;i++) {
2099 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2100 continue;
2102 if (!ctdb_node_has_capabilities(rec->caps,
2103 ctdb->nodes[i]->pnn,
2104 CTDB_CAP_LMASTER)) {
2105 /* this node can not be an lmaster */
2106 DEBUG(DEBUG_DEBUG, ("Node %d can't be an LMASTER, skipping it\n", i));
2107 continue;
2110 vnnmap->size++;
2111 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
2112 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2113 vnnmap->map[j++] = nodemap->nodes[i].pnn;
2116 if (vnnmap->size == 0) {
2117 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
2118 vnnmap->size++;
2119 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
2120 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2121 vnnmap->map[0] = pnn;
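/* Note: record keys are hashed onto this map to select each record's
lmaster, so the map must never be left empty. */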
2124 /* update to the new vnnmap on all nodes */
2125 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
2126 if (ret != 0) {
2127 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
2128 goto fail;
2131 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
2133 /* update recmaster to point to us for all nodes */
2134 ret = set_recovery_master(ctdb, nodemap, pnn);
2135 if (ret!=0) {
2136 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
2137 goto fail;
2140 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
2142 /* disable recovery mode */
2143 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
2144 if (ret != 0) {
2145 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
2146 goto fail;
2149 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
2151 /* Fetch known/available public IPs from each active node */
2152 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
2153 if (ret != 0) {
2154 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2155 culprit));
2156 rec->need_takeover_run = true;
2157 goto fail;
2160 do_takeover_run(rec, nodemap, false);
2162 /* execute the "recovered" event script on all nodes */
2163 ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
2164 if (ret!=0) {
2165 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
2166 goto fail;
2169 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
2171 /* send a message to all clients telling them that the cluster
2172 has been reconfigured */
2173 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2174 CTDB_SRVID_RECONFIGURE, tdb_null);
2175 if (ret != 0) {
2176 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
2177 goto fail;
2180 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
2182 rec->need_recovery = false;
2183 ctdb_op_end(rec->recovery);
2185 /* we managed to complete a full recovery, make sure to forgive
2186 any past sins by the nodes that could now participate in the
2187 recovery.
2189 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
2190 for (i=0;i<nodemap->num;i++) {
2191 struct ctdb_banning_state *ban_state;
2193 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2194 continue;
2197 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
2198 if (ban_state == NULL) {
2199 continue;
2202 ban_state->count = 0;
2205 /* We just finished a recovery successfully.
2206 We now wait for rerecovery_timeout before we allow
2207 another recovery to take place.
2209 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
2210 ctdb_op_disable(rec->recovery, ctdb->ev,
2211 ctdb->tunable.rerecovery_timeout);
2212 return 0;
2214 fail:
2215 ctdb_op_end(rec->recovery);
2216 return -1;
2221 elections are won by first checking the number of connected nodes, then
2222 the priority time, then the pnn
2224 struct election_message {
2225 uint32_t num_connected;
2226 struct timeval priority_time;
2227 uint32_t pnn;
2228 uint32_t node_flags;
2232 form this nodes election data
2234 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
2236 int ret, i;
2237 struct ctdb_node_map *nodemap;
2238 struct ctdb_context *ctdb = rec->ctdb;
2240 ZERO_STRUCTP(em);
2242 em->pnn = rec->ctdb->pnn;
2243 em->priority_time = rec->priority_time;
2245 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
2246 if (ret != 0) {
2247 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
2248 return;
2251 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
2252 em->node_flags = rec->node_flags;
2254 for (i=0;i<nodemap->num;i++) {
2255 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2256 em->num_connected++;
2260 /* we shouldn't try to win this election if we can't be a recmaster */
2261 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2262 em->num_connected = 0;
2263 em->priority_time = timeval_current();
2266 talloc_free(nodemap);
2270 see if the given election data wins
2272 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
2274 struct election_message myem;
2275 int cmp = 0;
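/* Compare our own election data against the received data in the
order described above: capability and banned/stopped checks first,
then the most connected node, then the earliest priority_time (the
longest-running daemon), with the larger pnn as the final
tie-breaker. A positive cmp means we should win. */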
2277 ctdb_election_data(rec, &myem);
2279 /* we can't win if we don't have the recmaster capability */
2280 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2281 return false;
2284 /* we can't win if we are banned */
2285 if (rec->node_flags & NODE_FLAGS_BANNED) {
2286 return false;
2289 /* we can't win if we are stopped */
2290 if (rec->node_flags & NODE_FLAGS_STOPPED) {
2291 return false;
2294 /* we will automatically win if the other node is banned */
2295 if (em->node_flags & NODE_FLAGS_BANNED) {
2296 return true;
2299 /* we will automatically win if the other node is stopped */
2300 if (em->node_flags & NODE_FLAGS_STOPPED) {
2301 return true;
2304 /* try to use the most connected node */
2305 if (cmp == 0) {
2306 cmp = (int)myem.num_connected - (int)em->num_connected;
2309 /* then the longest running node */
2310 if (cmp == 0) {
2311 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
2314 if (cmp == 0) {
2315 cmp = (int)myem.pnn - (int)em->pnn;
2318 return cmp > 0;
2322 send out an election request
2324 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
2326 int ret;
2327 TDB_DATA election_data;
2328 struct election_message emsg;
2329 uint64_t srvid;
2330 struct ctdb_context *ctdb = rec->ctdb;
2332 srvid = CTDB_SRVID_RECOVERY;
2334 ctdb_election_data(rec, &emsg);
2336 election_data.dsize = sizeof(struct election_message);
2337 election_data.dptr = (unsigned char *)&emsg;
2340 /* first we assume we will win the election and set
2341 recovery master to be ourselves on the current node
2343 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
2344 if (ret != 0) {
2345 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster on the local node\n"));
2346 return -1;
2350 /* send an election message to all active nodes */
2351 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
2352 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
2356 this function will unban all nodes in the cluster
2358 static void unban_all_nodes(struct ctdb_context *ctdb)
2360 int ret, i;
2361 struct ctdb_node_map *nodemap;
2362 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2364 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2365 if (ret != 0) {
2366 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
2367 return;
2370 for (i=0;i<nodemap->num;i++) {
2371 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
2372 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
2373 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(),
2374 nodemap->nodes[i].pnn, 0,
2375 NODE_FLAGS_BANNED);
2376 if (ret != 0) {
2377 DEBUG(DEBUG_ERR, (__location__ " failed to reset ban state\n"));
2382 talloc_free(tmp_ctx);
2387 we think we are winning the election - send a broadcast election request
2389 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
2391 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2392 int ret;
2394 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
2395 if (ret != 0) {
2396 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
2399 talloc_free(rec->send_election_te);
2400 rec->send_election_te = NULL;
2404 handler for memory dumps
2406 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
2407 TDB_DATA data, void *private_data)
2409 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2410 TDB_DATA *dump;
2411 int ret;
2412 struct srvid_request *rd;
2414 if (data.dsize != sizeof(struct srvid_request)) {
2415 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2416 talloc_free(tmp_ctx);
2417 return;
2419 rd = (struct srvid_request *)data.dptr;
2421 dump = talloc_zero(tmp_ctx, TDB_DATA);
2422 if (dump == NULL) {
2423 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
2424 talloc_free(tmp_ctx);
2425 return;
2427 ret = ctdb_dump_memory(ctdb, dump);
2428 if (ret != 0) {
2429 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
2430 talloc_free(tmp_ctx);
2431 return;
2434 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
2436 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
2437 if (ret != 0) {
2438 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
2439 talloc_free(tmp_ctx);
2440 return;
2443 talloc_free(tmp_ctx);
2447 handler for reload_nodes
2449 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
2450 TDB_DATA data, void *private_data)
2452 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2454 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
2456 ctdb_load_nodes_file(rec->ctdb);
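/* called when the deferred-rebalance timeout fires: if any nodes are
still queued for rebalancing, trigger a takeover run now */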
2460 static void ctdb_rebalance_timeout(struct event_context *ev,
2461 struct timed_event *te,
2462 struct timeval t, void *p)
2464 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2466 if (rec->force_rebalance_nodes == NULL) {
2467 DEBUG(DEBUG_ERR,
2468 ("Rebalance timeout occurred - no nodes to rebalance\n"));
2469 return;
2472 DEBUG(DEBUG_NOTICE,
2473 ("Rebalance timeout occurred - do takeover run\n"));
2474 do_takeover_run(rec, rec->nodemap, false);
2478 static void recd_node_rebalance_handler(struct ctdb_context *ctdb,
2479 uint64_t srvid,
2480 TDB_DATA data, void *private_data)
2482 uint32_t pnn;
2483 uint32_t *t;
2484 int len;
2485 uint32_t deferred_rebalance;
2486 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2488 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
2489 return;
2492 if (data.dsize != sizeof(uint32_t)) {
2493 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
2494 return;
2497 pnn = *(uint32_t *)&data.dptr[0];
2499 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
2501 /* Copy any existing list of nodes. There's probably some
2502 * sort of realloc variant that will do this but we need to
2503 * make sure that freeing the old array also cancels the timer
2504 * event for the timeout... not sure if realloc will do that.
2506 len = (rec->force_rebalance_nodes != NULL) ?
2507 talloc_array_length(rec->force_rebalance_nodes) :
2510 /* This allows duplicates to be added but they don't cause
2511 * harm. A call to add a duplicate PNN arguably means that
2512 * the timeout should be reset, so this is the simplest
2513 * solution.
2515 t = talloc_zero_array(rec, uint32_t, len+1);
2516 CTDB_NO_MEMORY_VOID(ctdb, t);
2517 if (len > 0) {
2518 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
2520 t[len] = pnn;
2522 talloc_free(rec->force_rebalance_nodes);
2524 rec->force_rebalance_nodes = t;
2526 /* If configured, set up a deferred takeover run to make sure
2527 * that certain nodes get IPs rebalanced to them. This will
2528 * be cancelled if a successful takeover run happens before
2529 * the timeout. Assign tunable value to variable for
2530 * readability.
2532 deferred_rebalance = ctdb->tunable.deferred_rebalance_on_node_add;
2533 if (deferred_rebalance != 0) {
2534 event_add_timed(ctdb->ev, rec->force_rebalance_nodes,
2535 timeval_current_ofs(deferred_rebalance, 0),
2536 ctdb_rebalance_timeout, rec);
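/* handler for public IP update messages: only the recovery master
records the new assignment in its IP assignment tree; other nodes
ignore the message */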
2542 static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid,
2543 TDB_DATA data, void *private_data)
2545 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2546 struct ctdb_public_ip *ip;
2548 if (rec->recmaster != rec->ctdb->pnn) {
2549 DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
2550 return;
2553 if (data.dsize != sizeof(struct ctdb_public_ip)) {
2554 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
2555 return;
2558 ip = (struct ctdb_public_ip *)data.dptr;
2560 update_ip_assignment_tree(rec->ctdb, ip);
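/* Disable the given operation (takeover runs or recoveries) for the
timeout carried in the request, then reply to the sender with our
PNN on success or an error code on failure. */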
2563 static void srvid_disable_and_reply(struct ctdb_context *ctdb,
2564 TDB_DATA data,
2565 struct ctdb_op_state *op_state)
2567 struct srvid_request_data *r;
2568 uint32_t timeout;
2569 TDB_DATA result;
2570 int32_t ret = 0;
2572 /* Validate input data */
2573 if (data.dsize != sizeof(struct srvid_request_data)) {
2574 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2575 "expecting %lu\n", (long unsigned)data.dsize,
2576 (long unsigned)sizeof(struct srvid_request_data)));
2577 return;
2579 if (data.dptr == NULL) {
2580 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2581 return;
2584 r = (struct srvid_request_data *)data.dptr;
2585 timeout = r->data;
2587 ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
2588 if (ret != 0) {
2589 goto done;
2592 /* Returning our PNN tells the caller that we succeeded */
2593 ret = ctdb_get_pnn(ctdb);
2594 done:
2595 result.dsize = sizeof(int32_t);
2596 result.dptr = (uint8_t *)&ret;
2597 srvid_request_reply(ctdb, (struct srvid_request *)r, result);
2600 static void disable_takeover_runs_handler(struct ctdb_context *ctdb,
2601 uint64_t srvid, TDB_DATA data,
2602 void *private_data)
2604 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2605 struct ctdb_recoverd);
2607 srvid_disable_and_reply(ctdb, data, rec->takeover_run);
2610 /* Backward compatibility for this SRVID */
2611 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
2612 TDB_DATA data, void *private_data)
2614 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2615 struct ctdb_recoverd);
2616 uint32_t timeout;
2618 if (data.dsize != sizeof(uint32_t)) {
2619 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2620 "expecting %lu\n", (long unsigned)data.dsize,
2621 (long unsigned)sizeof(uint32_t)));
2622 return;
2624 if (data.dptr == NULL) {
2625 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2626 return;
2629 timeout = *((uint32_t *)data.dptr);
2631 ctdb_op_disable(rec->takeover_run, ctdb->ev, timeout);
2634 static void disable_recoveries_handler(struct ctdb_context *ctdb,
2635 uint64_t srvid, TDB_DATA data,
2636 void *private_data)
2638 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2639 struct ctdb_recoverd);
2641 srvid_disable_and_reply(ctdb, data, rec->recovery);
2645 handler for ip reallocate, just add it to the list of requests and
2646 handle this later in the monitor_cluster loop so we do not recurse
2647 with other requests to takeover_run()
2649 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
2650 TDB_DATA data, void *private_data)
2652 struct srvid_request *request;
2653 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2654 struct ctdb_recoverd);
2656 if (data.dsize != sizeof(struct srvid_request)) {
2657 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2658 return;
2661 request = (struct srvid_request *)data.dptr;
2663 srvid_request_add(ctdb, &rec->reallocate_requests, request);
2666 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
2667 struct ctdb_recoverd *rec)
2669 TDB_DATA result;
2670 int32_t ret;
2671 uint32_t culprit;
2672 struct srvid_requests *current;
2674 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
2676 /* Only process requests that are currently pending. More
2677 * might come in while the takeover run is in progress and
2678 * they will need to be processed later since they might
2679 be in response to flag changes.
2681 current = rec->reallocate_requests;
2682 rec->reallocate_requests = NULL;
2684 /* update the list of public ips that a node can handle for
2685 all connected nodes
2687 ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
2688 if (ret != 0) {
2689 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2690 culprit));
2691 rec->need_takeover_run = true;
2693 if (ret == 0) {
2694 if (do_takeover_run(rec, rec->nodemap, false)) {
2695 ret = ctdb_get_pnn(ctdb);
2696 } else {
2697 ret = -1;
2701 result.dsize = sizeof(int32_t);
2702 result.dptr = (uint8_t *)&ret;
2704 srvid_requests_reply(ctdb, &current, result);
2709 handler for recovery master elections
2711 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
2712 TDB_DATA data, void *private_data)
2714 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2715 int ret;
2716 struct election_message *em = (struct election_message *)data.dptr;
2718 /* Ignore election packets from ourself */
2719 if (ctdb->pnn == em->pnn) {
2720 return;
2723 /* we got an election packet - update the timeout for the election */
2724 talloc_free(rec->election_timeout);
2725 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2726 fast_start ?
2727 timeval_current_ofs(0, 500000) :
2728 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2729 ctdb_election_timeout, rec);
2731 /* someone called an election. check their election data
2732 and if we disagree and we would rather be the elected node,
2733 send a new election message to all other nodes
2735 if (ctdb_election_win(rec, em)) {
2736 if (!rec->send_election_te) {
2737 rec->send_election_te = event_add_timed(ctdb->ev, rec,
2738 timeval_current_ofs(0, 500000),
2739 election_send_request, rec);
2741 /*unban_all_nodes(ctdb);*/
2742 return;
2745 /* we didn't win */
2746 TALLOC_FREE(rec->send_election_te);
2748 /* Release the recovery lock file */
2749 if (ctdb_recovery_have_lock(ctdb)) {
2750 ctdb_recovery_unlock(ctdb);
2751 unban_all_nodes(ctdb);
2754 /* ok, let that guy become recmaster then */
2755 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
2756 if (ret != 0) {
2757 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster\n"));
2758 return;
2761 return;
2766 force the start of the election process
2768 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2769 struct ctdb_node_map *nodemap)
2771 int ret;
2772 struct ctdb_context *ctdb = rec->ctdb;
2774 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2776 /* set all nodes to recovery mode to stop all internode traffic */
2777 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
2778 if (ret != 0) {
2779 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
2780 return;
2783 talloc_free(rec->election_timeout);
2784 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2785 fast_start ?
2786 timeval_current_ofs(0, 500000) :
2787 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2788 ctdb_election_timeout, rec);
2790 ret = send_election_request(rec, pnn);
2791 if (ret!=0) {
2792 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2793 return;
2796 /* wait for a few seconds to collect all responses */
2797 ctdb_wait_election(rec);
2803 handler for when a node changes its flags
2805 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
2806 TDB_DATA data, void *private_data)
2808 int ret;
2809 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2810 struct ctdb_node_map *nodemap=NULL;
2811 TALLOC_CTX *tmp_ctx;
2812 int i;
2813 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2814 int disabled_flag_changed;
2816 if (data.dsize != sizeof(*c)) {
2817 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2818 return;
2821 tmp_ctx = talloc_new(ctdb);
2822 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2824 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2825 if (ret != 0) {
2826 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2827 talloc_free(tmp_ctx);
2828 return;
2832 for (i=0;i<nodemap->num;i++) {
2833 if (nodemap->nodes[i].pnn == c->pnn) break;
2836 if (i == nodemap->num) {
2837 DEBUG(DEBUG_CRIT,(__location__ " Flag change for non-existent node %u\n", c->pnn));
2838 talloc_free(tmp_ctx);
2839 return;
2842 if (c->old_flags != c->new_flags) {
2843 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
2846 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2848 nodemap->nodes[i].flags = c->new_flags;
2850 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2851 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2853 if (ret == 0) {
2854 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2855 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2858 if (ret == 0 &&
2859 ctdb->recovery_master == ctdb->pnn &&
2860 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2861 /* Only do the takeover run if the perm disabled or unhealthy
2862 flags changed since these will cause an ip failover but not
2863 a recovery.
2864 If the node became disconnected or banned this will also
2865 lead to an ip address failover but that is handled
2866 during recovery
2868 if (disabled_flag_changed) {
2869 rec->need_takeover_run = true;
2873 talloc_free(tmp_ctx);
2877 handler for when we need to push out flag changes to all other nodes
2879 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2880 TDB_DATA data, void *private_data)
2882 int ret;
2883 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2884 struct ctdb_node_map *nodemap=NULL;
2885 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2886 uint32_t recmaster;
2887 uint32_t *nodes;
2889 /* find the recovery master */
2890 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
2891 if (ret != 0) {
2892 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
2893 talloc_free(tmp_ctx);
2894 return;
2897 /* read the node flags from the recmaster */
2898 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
2899 if (ret != 0) {
2900 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2901 talloc_free(tmp_ctx);
2902 return;
2904 if (c->pnn >= nodemap->num) {
2905 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2906 talloc_free(tmp_ctx);
2907 return;
2910 /* send the flags update to all connected nodes */
2911 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2913 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2914 nodes, 0, CONTROL_TIMEOUT(),
2915 false, data,
2916 NULL, NULL,
2917 NULL) != 0) {
2918 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2920 talloc_free(tmp_ctx);
2921 return;
2924 talloc_free(tmp_ctx);
2928 struct verify_recmode_normal_data {
2929 uint32_t count;
2930 enum monitor_result status;
2933 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2935 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2938 /* one more node has responded with recmode data*/
2939 rmdata->count--;
2941 /* if we failed to get the recmode, then return an error and let
2942 the main loop try again.
2944 if (state->state != CTDB_CONTROL_DONE) {
2945 if (rmdata->status == MONITOR_OK) {
2946 rmdata->status = MONITOR_FAILED;
2948 return;
2951 /* if we got a response, then the recmode will be stored in the
2952 status field
2954 if (state->status != CTDB_RECOVERY_NORMAL) {
2955 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2956 rmdata->status = MONITOR_RECOVERY_NEEDED;
2959 return;
2963 /* verify that all nodes are in normal recovery mode */
2964 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2966 struct verify_recmode_normal_data *rmdata;
2967 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2968 struct ctdb_client_control_state *state;
2969 enum monitor_result status;
2970 int j;
2972 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2973 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2974 rmdata->count = 0;
2975 rmdata->status = MONITOR_OK;
2977 /* loop over all active nodes and send an async getrecmode call to
2978 them*/
2979 for (j=0; j<nodemap->num; j++) {
2980 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2981 continue;
2983 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2984 CONTROL_TIMEOUT(),
2985 nodemap->nodes[j].pnn);
2986 if (state == NULL) {
2987 /* we failed to send the control, treat this as
2988 an error and try again next iteration
2990 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2991 talloc_free(mem_ctx);
2992 return MONITOR_FAILED;
2995 /* set up the callback functions */
2996 state->async.fn = verify_recmode_normal_callback;
2997 state->async.private_data = rmdata;
2999 /* one more control to wait for to complete */
3000 rmdata->count++;
3004 /* now wait for up to the maximum number of seconds allowed
3005 or until all nodes we expect a response from have replied
3007 while (rmdata->count > 0) {
3008 event_loop_once(ctdb->ev);
3011 status = rmdata->status;
3012 talloc_free(mem_ctx);
3013 return status;
3017 struct verify_recmaster_data {
3018 struct ctdb_recoverd *rec;
3019 uint32_t count;
3020 uint32_t pnn;
3021 enum monitor_result status;
3024 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
3026 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
3029 /* one more node has responded with recmaster data*/
3030 rmdata->count--;
3032 /* if we failed to get the recmaster, then return an error and let
3033 the main loop try again.
3035 if (state->state != CTDB_CONTROL_DONE) {
3036 if (rmdata->status == MONITOR_OK) {
3037 rmdata->status = MONITOR_FAILED;
3039 return;
3042 /* if we got a response, then the recmaster will be stored in the
3043 status field
3045 if (state->status != rmdata->pnn) {
3046 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
3047 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
3048 rmdata->status = MONITOR_ELECTION_NEEDED;
3051 return;
3055 /* verify that all nodes agree that we are the recmaster */
3056 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
3058 struct ctdb_context *ctdb = rec->ctdb;
3059 struct verify_recmaster_data *rmdata;
3060 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3061 struct ctdb_client_control_state *state;
3062 enum monitor_result status;
3063 int j;
3065 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
3066 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
3067 rmdata->rec = rec;
3068 rmdata->count = 0;
3069 rmdata->pnn = pnn;
3070 rmdata->status = MONITOR_OK;
3072 /* loop over all active nodes and send an async getrecmaster call to
3073 them*/
3074 for (j=0; j<nodemap->num; j++) {
3075 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3076 continue;
3078 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
3079 CONTROL_TIMEOUT(),
3080 nodemap->nodes[j].pnn);
3081 if (state == NULL) {
3082 /* we failed to send the control, treat this as
3083 an error and try again next iteration
3085 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
3086 talloc_free(mem_ctx);
3087 return MONITOR_FAILED;
3090 /* set up the callback functions */
3091 state->async.fn = verify_recmaster_callback;
3092 state->async.private_data = rmdata;
3094 /* one more control to wait for to complete */
3095 rmdata->count++;
3099 /* now wait for up to the maximum number of seconds allowed
3100 or until all nodes we expect a response from have replied
3102 while (rmdata->count > 0) {
3103 event_loop_once(ctdb->ev);
3106 status = rmdata->status;
3107 talloc_free(mem_ctx);
3108 return status;
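/* Compare the local interface list (names and link states) against
the copy cached from the previous iteration; returns true when
anything differs so the caller can force a takeover run. */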
3111 static bool interfaces_have_changed(struct ctdb_context *ctdb,
3112 struct ctdb_recoverd *rec)
3114 struct ctdb_control_get_ifaces *ifaces = NULL;
3115 TALLOC_CTX *mem_ctx;
3116 bool ret = false;
3118 mem_ctx = talloc_new(NULL);
3120 /* Read the interfaces from the local node */
3121 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
3122 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
3123 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
3124 /* We could return an error. However, this will be
3125 * rare so we'll decide that the interfaces have
3126 * actually changed, just in case.
3128 talloc_free(mem_ctx);
3129 return true;
3132 if (!rec->ifaces) {
3133 /* We haven't been here before so things have changed */
3134 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
3135 ret = true;
3136 } else if (rec->ifaces->num != ifaces->num) {
3137 /* Number of interfaces has changed */
3138 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
3139 rec->ifaces->num, ifaces->num));
3140 ret = true;
3141 } else {
3142 /* See if interface names or link states have changed */
3143 int i;
3144 for (i = 0; i < rec->ifaces->num; i++) {
3145 struct ctdb_control_iface_info * iface = &rec->ifaces->ifaces[i];
3146 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
3147 DEBUG(DEBUG_NOTICE,
3148 ("Interface in slot %d changed: %s => %s\n",
3149 i, iface->name, ifaces->ifaces[i].name));
3150 ret = true;
3151 break;
3153 if (iface->link_state != ifaces->ifaces[i].link_state) {
3154 DEBUG(DEBUG_NOTICE,
3155 ("Interface %s changed state: %d => %d\n",
3156 iface->name, iface->link_state,
3157 ifaces->ifaces[i].link_state));
3158 ret = true;
3159 break;
3164 talloc_free(rec->ifaces);
3165 rec->ifaces = talloc_steal(rec, ifaces);
3167 talloc_free(mem_ctx);
3168 return ret;
3171 /* called to check that the local allocation of public ip addresses is ok.
3173 static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
3175 TALLOC_CTX *mem_ctx = talloc_new(NULL);
3176 struct ctdb_uptime *uptime1 = NULL;
3177 struct ctdb_uptime *uptime2 = NULL;
3178 int ret, j;
3179 bool need_takeover_run = false;
3181 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
3182 CTDB_CURRENT_NODE, &uptime1);
3183 if (ret != 0) {
3184 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
3185 talloc_free(mem_ctx);
3186 return -1;
3189 if (interfaces_have_changed(ctdb, rec)) {
3190 DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
3191 "local node %u - force takeover run\n",
3192 pnn));
3193 need_takeover_run = true;
3196 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
3197 CTDB_CURRENT_NODE, &uptime2);
3198 if (ret != 0) {
3199 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
3200 talloc_free(mem_ctx);
3201 return -1;
3204 /* skip the check if the startrecovery time has changed */
3205 if (timeval_compare(&uptime1->last_recovery_started,
3206 &uptime2->last_recovery_started) != 0) {
3207 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3208 talloc_free(mem_ctx);
3209 return 0;
3212 /* skip the check if the endrecovery time has changed */
3213 if (timeval_compare(&uptime1->last_recovery_finished,
3214 &uptime2->last_recovery_finished) != 0) {
3215 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3216 talloc_free(mem_ctx);
3217 return 0;
3220 /* skip the check if we have started but not finished recovery */
3221 if (timeval_compare(&uptime1->last_recovery_finished,
3222 &uptime1->last_recovery_started) != 1) {
3223 DEBUG(DEBUG_INFO, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
3224 talloc_free(mem_ctx);
3226 return 0;
3229 /* verify that we have the ip addresses we should have
3230 and we don't have ones we shouldn't have.
3231 if we find an inconsistency we set recmode to
3232 active on the local node and wait for the recmaster
3233 to do a full-blown recovery.
3234 also if the pnn is -1 and we are healthy and can host the ip
3235 we also request an ip reallocation.
3237 if (ctdb->tunable.disable_ip_failover == 0) {
3238 struct ctdb_all_public_ips *ips = NULL;
3240 /* read the *available* IPs from the local node */
3241 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
3242 if (ret != 0) {
3243 DEBUG(DEBUG_ERR, ("Unable to get available public IPs from local node %u\n", pnn));
3244 talloc_free(mem_ctx);
3245 return -1;
3248 for (j=0; j<ips->num; j++) {
3249 if (ips->ips[j].pnn == -1 &&
3250 nodemap->nodes[pnn].flags == 0) {
3251 DEBUG(DEBUG_CRIT,("Public IP '%s' is not assigned and we could serve it\n",
3252 ctdb_addr_to_str(&ips->ips[j].addr)));
3253 need_takeover_run = true;
3257 talloc_free(ips);
3259 /* read the *known* IPs from the local node */
3260 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
3261 if (ret != 0) {
3262 DEBUG(DEBUG_ERR, ("Unable to get known public IPs from local node %u\n", pnn));
3263 talloc_free(mem_ctx);
3264 return -1;
3267 for (j=0; j<ips->num; j++) {
3268 if (ips->ips[j].pnn == pnn) {
3269 if (ctdb->do_checkpublicip && !ctdb_sys_have_ip(&ips->ips[j].addr)) {
3270 DEBUG(DEBUG_CRIT,("Public IP '%s' is assigned to us but not on an interface\n",
3271 ctdb_addr_to_str(&ips->ips[j].addr)));
3272 need_takeover_run = true;
3274 } else {
3275 if (ctdb->do_checkpublicip &&
3276 ctdb_sys_have_ip(&ips->ips[j].addr)) {
3278 DEBUG(DEBUG_CRIT,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
3279 ctdb_addr_to_str(&ips->ips[j].addr)));
3281 if (ctdb_ctrl_release_ip(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ips->ips[j]) != 0) {
3282 DEBUG(DEBUG_ERR,("Failed to release local IP address\n"));
3289 if (need_takeover_run) {
3290 struct srvid_request rd;
3291 TDB_DATA data;
3293 DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
3295 rd.pnn = ctdb->pnn;
3296 rd.srvid = 0;
3297 data.dptr = (uint8_t *)&rd;
3298 data.dsize = sizeof(rd);
3300 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
3301 if (ret != 0) {
3302 DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
3305 talloc_free(mem_ctx);
3306 return 0;
3310 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
3312 struct ctdb_node_map **remote_nodemaps = callback_data;
3314 if (node_pnn >= ctdb->num_nodes) {
3315 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
3316 return;
3319 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
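/* Fetch the nodemap from every active node in parallel; the callback
above stores each reply in remote_nodemaps[] indexed by pnn. */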
3323 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
3324 struct ctdb_node_map *nodemap,
3325 struct ctdb_node_map **remote_nodemaps)
3327 uint32_t *nodes;
3329 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
3330 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
3331 nodes, 0,
3332 CONTROL_TIMEOUT(), false, tdb_null,
3333 async_getnodemap_callback,
3334 NULL,
3335 remote_nodemaps) != 0) {
3336 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
3338 return -1;
3341 return 0;
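/* Re-read the recovery lock file setting from the main daemon and
drop any lock we hold whenever the setting changes (enabled,
disabled or pointed at a different file than before). */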
3344 static int update_recovery_lock_file(struct ctdb_context *ctdb)
3346 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
3347 const char *reclockfile;
3349 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
3350 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
3351 talloc_free(tmp_ctx);
3352 return -1;
3355 if (reclockfile == NULL) {
3356 if (ctdb->recovery_lock_file != NULL) {
3357 DEBUG(DEBUG_NOTICE,("Recovery lock file disabled\n"));
3358 talloc_free(ctdb->recovery_lock_file);
3359 ctdb->recovery_lock_file = NULL;
3360 ctdb_recovery_unlock(ctdb);
3362 talloc_free(tmp_ctx);
3363 return 0;
3366 if (ctdb->recovery_lock_file == NULL) {
3367 DEBUG(DEBUG_NOTICE,
3368 ("Recovery lock file enabled (%s)\n", reclockfile));
3369 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3370 ctdb_recovery_unlock(ctdb);
3371 talloc_free(tmp_ctx);
3372 return 0;
3376 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
3377 talloc_free(tmp_ctx);
3378 return 0;
3381 DEBUG(DEBUG_NOTICE,
3382 ("Recovery lock file changed (now %s)\n", reclockfile));
3383 talloc_free(ctdb->recovery_lock_file);
3384 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3385 ctdb_recovery_unlock(ctdb);
3387 talloc_free(tmp_ctx);
3388 return 0;
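/* One pass of the recovery daemon's monitoring logic: sanity-check
the local daemon, make sure a sane recovery master exists, and
trigger an election or a recovery whenever an inconsistency is
found. */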
3391 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
3392 TALLOC_CTX *mem_ctx)
3394 uint32_t pnn;
3395 struct ctdb_node_map *nodemap=NULL;
3396 struct ctdb_node_map *recmaster_nodemap=NULL;
3397 struct ctdb_node_map **remote_nodemaps=NULL;
3398 struct ctdb_vnn_map *vnnmap=NULL;
3399 struct ctdb_vnn_map *remote_vnnmap=NULL;
3400 uint32_t num_lmasters;
3401 int32_t debug_level;
3402 int i, j, ret;
3403 bool self_ban;
3406 /* verify that the main daemon is still running */
3407 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
3408 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3409 exit(-1);
3412 /* ping the local daemon to tell it we are alive */
3413 ctdb_ctrl_recd_ping(ctdb);
3415 if (rec->election_timeout) {
3416 /* an election is in progress */
3417 return;
3420 /* read the debug level from the parent and update locally */
3421 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
3422 if (ret !=0) {
3423 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
3424 return;
3426 DEBUGLEVEL = debug_level;
3428 /* get relevant tunables */
3429 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
3430 if (ret != 0) {
3431 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
3432 return;
3435 /* get runstate */
3436 ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
3437 CTDB_CURRENT_NODE, &ctdb->runstate);
3438 if (ret != 0) {
3439 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
3440 return;
3443 /* get the current recovery lock file from the server */
3444 if (update_recovery_lock_file(ctdb) != 0) {
3445 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
3446 return;
3449 /* Make sure that if recovery lock verification becomes disabled,
3450 we close the file
3452 if (ctdb->recovery_lock_file == NULL) {
3453 ctdb_recovery_unlock(ctdb);
3456 pnn = ctdb_get_pnn(ctdb);
3458 /* get the vnnmap */
3459 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
3460 if (ret != 0) {
3461 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
3462 return;
3466 /* get number of nodes */
3467 if (rec->nodemap) {
3468 talloc_free(rec->nodemap);
3469 rec->nodemap = NULL;
3470 nodemap=NULL;
3472 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
3473 if (ret != 0) {
3474 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
3475 return;
3477 nodemap = rec->nodemap;
3479 /* remember our own node flags */
3480 rec->node_flags = nodemap->nodes[pnn].flags;
3482 ban_misbehaving_nodes(rec, &self_ban);
3483 if (self_ban) {
3484 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
3485 return;
3488 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
3489 also frozen and that the recmode is set to active.
3491 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
3492 /* If this node has become inactive then we want to
3493 * reduce the chances of it taking over the recovery
3494 * master role when it becomes active again. This
3495 * helps to stabilise the recovery master role so that
3496 * it stays on the most stable node.
3498 rec->priority_time = timeval_current();
3500 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3501 if (ret != 0) {
3502 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3504 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3505 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
3507 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3508 if (ret != 0) {
3509 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
3511 return;
3513 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
3514 if (ret != 0) {
3515 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node in STOPPED or BANNED state\n"));
3516 return;
3520 /* If this node is stopped or banned then it is not the recovery
3521 * master, so don't do anything. This prevents a stopped or banned
3522 * node from starting an election and sending unnecessary controls.
3524 return;
3527 /* check which node is the recovery master */
3528 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
3529 if (ret != 0) {
3530 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
3531 return;
3534 /* If we are not the recmaster then do some housekeeping */
3535 if (rec->recmaster != pnn) {
3536 /* Ignore any IP reallocate requests - only recmaster
3537 * processes them
3539 TALLOC_FREE(rec->reallocate_requests);
3540 /* Clear any nodes that should be force rebalanced in
3541 * the next takeover run. If the recovery master role
3542 * has moved then we don't want to process these some
3543 * time in the future.
3545 TALLOC_FREE(rec->force_rebalance_nodes);
3548 /* This is a special case. When the recovery daemon is started, recmaster
3549 * is set to -1. If the node was not started in the stopped state, then
3550 * start an election to decide the recovery master
3552 if (rec->recmaster == (uint32_t)-1) {
3553 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
3554 force_election(rec, pnn, nodemap);
3555 return;
3558 /* update the capabilities for all nodes */
3559 ret = update_capabilities(rec, nodemap);
3560 if (ret != 0) {
3561 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
3562 return;
3566 * If the current recmaster does not have CTDB_CAP_RECMASTER,
3567 * but we have, then force an election and try to become the new
3568 * recmaster.
3570 if (!ctdb_node_has_capabilities(rec->caps,
3571 rec->recmaster,
3572 CTDB_CAP_RECMASTER) &&
3573 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
3574 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
3575 DEBUG(DEBUG_ERR, (__location__ " Current recmaster node %u does not have CAP_RECMASTER,"
3576 " but we (node %u) have - force an election\n",
3577 rec->recmaster, pnn));
3578 force_election(rec, pnn, nodemap);
3579 return;
3582 /* verify that the recmaster node is still active */
3583 for (j=0; j<nodemap->num; j++) {
3584 if (nodemap->nodes[j].pnn==rec->recmaster) {
3585 break;
3589 if (j == nodemap->num) {
3590 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
3591 force_election(rec, pnn, nodemap);
3592 return;
3595 /* if recovery master is disconnected we must elect a new recmaster */
3596 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
3597 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
3598 force_election(rec, pnn, nodemap);
3599 return;
3602 /* get nodemap from the recovery master to check if it is inactive */
3603 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3604 mem_ctx, &recmaster_nodemap);
3605 if (ret != 0) {
3606 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
3607 nodemap->nodes[j].pnn));
3608 return;
3612 if ((recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) &&
3613 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
3614 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
3616 * update our nodemap to carry the recmaster's notion of
3617 * its own flags, so that we don't keep freezing the
3618 * inactive recmaster node...
3620 nodemap->nodes[j].flags = recmaster_nodemap->nodes[j].flags;
3621 force_election(rec, pnn, nodemap);
3622 return;
3625 /* verify that we have all ip addresses we should have and we don't
3626 * have addresses we shouldn't have.
3628 if (ctdb->tunable.disable_ip_failover == 0 &&
3629 !ctdb_op_is_disabled(rec->takeover_run)) {
3630 if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
3631 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
3636 /* if we are not the recmaster then we do not need to check
3637 if recovery is needed
3639 if (pnn != rec->recmaster) {
3640 return;
3644 /* ensure our local copies of flags are right */
3645 ret = update_local_flags(rec, nodemap);
3646 if (ret == MONITOR_ELECTION_NEEDED) {
3647 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
3648 force_election(rec, pnn, nodemap);
3649 return;
3651 if (ret != MONITOR_OK) {
3652 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3653 return;
3656 if (ctdb->num_nodes != nodemap->num) {
3657 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3658 ctdb_load_nodes_file(ctdb);
3659 return;
3662 /* verify that all active nodes agree that we are the recmaster */
3663 switch (verify_recmaster(rec, nodemap, pnn)) {
3664 case MONITOR_RECOVERY_NEEDED:
3665 /* can not happen */
3666 return;
3667 case MONITOR_ELECTION_NEEDED:
3668 force_election(rec, pnn, nodemap);
3669 return;
3670 case MONITOR_OK:
3671 break;
3672 case MONITOR_FAILED:
3673 return;
3677 if (rec->need_recovery) {
3678 /* a previous recovery didn't finish */
3679 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3680 return;
3683 /* verify that all active nodes are in normal mode
3684 and not in recovery mode
3686 switch (verify_recmode(ctdb, nodemap)) {
3687 case MONITOR_RECOVERY_NEEDED:
3688 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3689 return;
3690 case MONITOR_FAILED:
3691 return;
3692 case MONITOR_ELECTION_NEEDED:
3693 /* can not happen */
3694 case MONITOR_OK:
3695 break;
3699 if (ctdb->recovery_lock_file != NULL) {
3700 /* We must already hold the recovery lock */
3701 if (!ctdb_recovery_have_lock(ctdb)) {
3702 DEBUG(DEBUG_ERR,("Failed recovery lock sanity check. Force a recovery\n"));
3703 ctdb_set_culprit(rec, ctdb->pnn);
3704 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3705 return;
3710 /* if there are takeovers requested, perform it and notify the waiters */
3711 if (!ctdb_op_is_disabled(rec->takeover_run) &&
3712 rec->reallocate_requests) {
3713 process_ipreallocate_requests(ctdb, rec);
3716 /* If recoveries are disabled then there is no use doing any
3717 * nodemap or flags checks. Recoveries might be disabled due
3718 * to "reloadnodes", so doing these checks might cause an
3719 * unnecessary recovery. */
3720 if (ctdb_op_is_disabled(rec->recovery)) {
3721 return;
3724 /* get the nodemap for all active remote nodes
3726 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3727 if (remote_nodemaps == NULL) {
3728 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3729 return;
3731 for(i=0; i<nodemap->num; i++) {
3732 remote_nodemaps[i] = NULL;
3734 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3735 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3736 return;
3739 /* verify that all other nodes have the same nodemap as we have
3741 for (j=0; j<nodemap->num; j++) {
3742 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3743 continue;
3746 if (remote_nodemaps[j] == NULL) {
3747 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3748 ctdb_set_culprit(rec, j);
3750 return;
3753 /* if the nodes disagree on how many nodes there are
3754 then this is a good reason to try recovery
3756 if (remote_nodemaps[j]->num != nodemap->num) {
3757 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3758 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3759 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3760 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3761 return;
3764 /* if the nodes disagree on which nodes exist and are
3765 active, then that is also a good reason to do recovery
3767 for (i=0;i<nodemap->num;i++) {
3768 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3769 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3770 nodemap->nodes[j].pnn, i,
3771 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3772 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3773 do_recovery(rec, mem_ctx, pnn, nodemap,
3774 vnnmap);
3775 return;
3781 * Update node flags obtained from each active node. This ensures we have
3782 * up-to-date information for all the nodes.
3784 for (j=0; j<nodemap->num; j++) {
3785 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3786 continue;
3788 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
3791 for (j=0; j<nodemap->num; j++) {
3792 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3793 continue;
3796 /* verify the flags are consistent
3798 for (i=0; i<nodemap->num; i++) {
3799 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3800 continue;
3803 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3804 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3805 nodemap->nodes[j].pnn,
3806 nodemap->nodes[i].pnn,
3807 remote_nodemaps[j]->nodes[i].flags,
3808 nodemap->nodes[i].flags));
3809 if (i == j) {
3810 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3811 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3812 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3813 do_recovery(rec, mem_ctx, pnn, nodemap,
3814 vnnmap);
3815 return;
3816 } else {
3817 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3818 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3819 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3820 do_recovery(rec, mem_ctx, pnn, nodemap,
3821 vnnmap);
3822 return;
3829 /* count how many active nodes there are */
3830 num_lmasters = 0;
3831 for (i=0; i<nodemap->num; i++) {
3832 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
3833 if (ctdb_node_has_capabilities(rec->caps,
3834 ctdb->nodes[i]->pnn,
3835 CTDB_CAP_LMASTER)) {
3836 num_lmasters++;
3842 /* There must be the same number of lmasters in the vnn map as
3843 * there are active nodes with the lmaster capability... or
3844 * do a recovery.
3846 if (vnnmap->size != num_lmasters) {
3847 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
3848 vnnmap->size, num_lmasters));
3849 ctdb_set_culprit(rec, ctdb->pnn);
3850 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3851 return;
3854 /* verify that all active nodes in the nodemap also exist in
3855 the vnnmap.
3857 for (j=0; j<nodemap->num; j++) {
3858 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3859 continue;
3861 if (nodemap->nodes[j].pnn == pnn) {
3862 continue;
3865 for (i=0; i<vnnmap->size; i++) {
3866 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3867 break;
3870 if (i == vnnmap->size) {
3871 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3872 nodemap->nodes[j].pnn));
3873 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3874 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3875 return;
3880 /* verify that all other nodes have the same vnnmap
3881 and are from the same generation
3883 for (j=0; j<nodemap->num; j++) {
3884 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3885 continue;
3887 if (nodemap->nodes[j].pnn == pnn) {
3888 continue;
3891 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3892 mem_ctx, &remote_vnnmap);
3893 if (ret != 0) {
3894 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3895 nodemap->nodes[j].pnn));
3896 return;
		/* verify the vnnmap generation is the same */
		if (vnnmap->generation != remote_vnnmap->generation) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
					  nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* verify the vnnmap size is the same */
		if (vnnmap->size != remote_vnnmap->size) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
					  nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* verify the vnnmap is the same */
		for (i=0; i<vnnmap->size; i++) {
			if (remote_vnnmap->map[i] != vnnmap->map[i]) {
				DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
						  nodemap->nodes[j].pnn));
				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
				do_recovery(rec, mem_ctx, pnn, nodemap,
					    vnnmap);
				return;
			}
		}
	}

	/* we might need to change who has what IP assigned */
	if (rec->need_takeover_run) {
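		/* pnn of the node that caused the public IP reload below to
		 * fail, if any; only used for the error message.
		 */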
		uint32_t culprit = (uint32_t)-1;

		rec->need_takeover_run = false;

		/* update the list of public ips that a node can handle for
		   all connected nodes
		*/
		ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
					 culprit));
			rec->need_takeover_run = true;
			return;
		}

		/* execute the "startrecovery" event script on all nodes */
		ret = run_startrecovery_eventscript(rec, nodemap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
			ctdb_set_culprit(rec, ctdb->pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* If the takeover run fails, the offending nodes are assigned
		 * ban culprit counts and the takeover run is retried.  If it
		 * fails repeatedly, the offending node will eventually be
		 * banned.
		 *
		 * If rec->need_takeover_run is not set back to true on this
		 * failure, monitoring remains disabled cluster-wide (it was
		 * disabled via the startrecovery eventscript) and will not get
		 * enabled again.
		 */
		if (!do_takeover_run(rec, nodemap, true)) {
			return;
		}

		/* execute the "recovered" event script on all nodes */
		ret = run_recovered_eventscript(rec, nodemap, "monitor_cluster");
#if 0
		// We can't check whether the event completed successfully,
		// since this script WILL fail if the node is in recovery mode
		// and, if that race happens, the code here would just cause a
		// second, cascading recovery.
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
			ctdb_set_culprit(rec, ctdb->pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		}
#endif
	}
}

/*
  the main monitoring loop
 */
static void monitor_cluster(struct ctdb_context *ctdb)
{
	struct ctdb_recoverd *rec;

	DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));

	rec = talloc_zero(ctdb, struct ctdb_recoverd);
	CTDB_NO_MEMORY_FATAL(ctdb, rec);

	rec->ctdb = ctdb;

	rec->takeover_run = ctdb_op_init(rec, "takeover runs");
	CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);

	rec->recovery = ctdb_op_init(rec, "recoveries");
	CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);

	rec->priority_time = timeval_current();

	/* register a message port for sending memory dumps */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);

	/* register a message port for recovery elections */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);

	/* when nodes are disabled/enabled */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);

	/* when we are asked to push out a flag change */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);

	/* register a message port for vacuum fetch */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);

	/* register a message port for reloadnodes */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);

	/* register a message port for performing a takeover run */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);

	/* register a message port for disabling the ip check for a short while */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);

	/* register a message port for updating the recovery daemons node assignment for an ip */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);

	/* register a message port for forcing a rebalance of a node next
	   reallocation */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);

	/* Register a message port for disabling takeover runs */
	ctdb_client_set_message_handler(ctdb,
					CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
					disable_takeover_runs_handler, rec);

	/* Register a message port for disabling recoveries */
	ctdb_client_set_message_handler(ctdb,
					CTDB_SRVID_DISABLE_RECOVERIES,
					disable_recoveries_handler, rec);

	/* register a message port for detaching database */
	ctdb_client_set_message_handler(ctdb,
					CTDB_SRVID_DETACH_DATABASE,
					detach_database_handler, rec);
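
	/* Endless monitoring loop: each iteration runs one pass of
	 * main_loop() in its own temporary talloc context, then sleeps so
	 * that passes are not run more often than the recover interval.
	 */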
	for (;;) {
		TALLOC_CTX *mem_ctx = talloc_new(ctdb);
		struct timeval start;
		double elapsed;

		if (!mem_ctx) {
			DEBUG(DEBUG_CRIT,(__location__
					  " Failed to create temp context\n"));
			exit(-1);
		}

		start = timeval_current();
		main_loop(ctdb, rec, mem_ctx);
		talloc_free(mem_ctx);

		/* we only check for recovery at most once every
		 * 'recover_interval' seconds
		 */
		elapsed = timeval_elapsed(&start);
		if (elapsed < ctdb->tunable.recover_interval) {
			ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
					  - elapsed);
		}
	}
}

/*
  event handler for when the main ctdbd dies
 */
static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
				 uint16_t flags, void *private_data)
{
	DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
	_exit(1);
}

/*
  called regularly to verify that the recovery daemon is still running
 */
static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
			    struct timeval yt, void *p)
{
	struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);

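	/* Signal 0 does not actually send a signal; it only checks whether
	 * the recovery daemon process still exists.
	 */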
	if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
		DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));

		event_add_timed(ctdb->ev, ctdb, timeval_zero(),
				ctdb_restart_recd, ctdb);

		return;
	}

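	/* The recovery daemon is still alive; check again in 30 seconds. */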
	event_add_timed(ctdb->ev, ctdb->recd_ctx,
			timeval_current_ofs(30, 0),
			ctdb_check_recd, ctdb);
}

static void recd_sig_child_handler(struct event_context *ev,
	struct signal_event *se, int signum, int count,
	void *dont_care,
	void *private_data)
{
//	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
	int status;
	pid_t pid = -1;

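	/* Reap every child that has exited so that no zombies are left
	 * behind; WNOHANG keeps waitpid() from blocking.
	 */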
	while (pid != 0) {
		pid = waitpid(-1, &status, WNOHANG);
		if (pid == -1) {
			if (errno != ECHILD) {
				DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno), errno));
			}
			return;
		}
		if (pid > 0) {
			DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
		}
	}
}

/*
  startup the recovery daemon as a child of the main ctdb daemon
 */
int ctdb_start_recoverd(struct ctdb_context *ctdb)
{
	int fd[2];
	struct signal_event *se;
	struct tevent_fd *fde;

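	/* The pipe is used only to detect the death of the parent: the
	 * child holds the read end open and exits as soon as it becomes
	 * readable (EOF), which happens when the parent goes away.
	 */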
	if (pipe(fd) != 0) {
		return -1;
	}

	ctdb->recoverd_pid = ctdb_fork(ctdb);
	if (ctdb->recoverd_pid == -1) {
		return -1;
	}

	if (ctdb->recoverd_pid != 0) {
		talloc_free(ctdb->recd_ctx);
		ctdb->recd_ctx = talloc_new(ctdb);
		CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);

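		/* Parent: close the read end but keep the write end open so
		 * the child sees EOF when we exit, and arrange to check
		 * periodically that the recovery daemon is still running.
		 */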
		close(fd[0]);
		event_add_timed(ctdb->ev, ctdb->recd_ctx,
				timeval_current_ofs(30, 0),
				ctdb_check_recd, ctdb);
		return 0;
	}

	close(fd[1]);

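	/* Child process: from here on this is the recovery daemon.  The
	 * read end of the pipe stays open and is watched below so that we
	 * exit when the parent dies.
	 */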
	srandom(getpid() ^ time(NULL));

	ctdb_set_process_name("ctdb_recoverd");
	if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
		DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
		exit(1);
	}

	DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));

	fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
			   ctdb_recoverd_parent, &fd[0]);
	tevent_fd_set_auto_close(fde);

	/* set up a handler to pick up sigchld */
	se = event_add_signal(ctdb->ev, ctdb,
			      SIGCHLD, 0,
			      recd_sig_child_handler,
			      ctdb);
	if (se == NULL) {
		DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
		exit(1);
	}

	monitor_cluster(ctdb);

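	/* monitor_cluster() is an endless loop and should never return. */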
	DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
	return -1;
}

/*
  shutdown the recovery daemon
 */
void ctdb_stop_recoverd(struct ctdb_context *ctdb)
{
	if (ctdb->recoverd_pid == 0) {
		return;
	}

	DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
	ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);

	TALLOC_FREE(ctdb->recd_ctx);
	TALLOC_FREE(ctdb->recd_ping_count);
}

static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te,
			      struct timeval t, void *private_data)
{
	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);

	DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
	ctdb_stop_recoverd(ctdb);
	ctdb_start_recoverd(ctdb);
}