ctdb-recoverd/vacuum: remove unneeded prototype.
[Samba.git] / ctdb / server / ctdb_recoverd.c
blob 1b4ac50500a296cb2102dcab2a158e01ae8b9e55
1 /*
2 ctdb recovery daemon
4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
20 #include "includes.h"
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
25 #include "popt.h"
26 #include "cmdline.h"
27 #include "../include/ctdb_client.h"
28 #include "../include/ctdb_private.h"
29 #include "lib/tdb_wrap/tdb_wrap.h"
30 #include "lib/util/dlinklist.h"
33 /* List of SRVID requests that need to be processed */
34 struct srvid_list {
35 struct srvid_list *next, *prev;
36 struct srvid_request *request;
39 struct srvid_requests {
40 struct srvid_list *requests;
43 static void srvid_request_reply(struct ctdb_context *ctdb,
44 struct srvid_request *request,
45 TDB_DATA result)
47 /* Someone that sent srvid==0 does not want a reply */
48 if (request->srvid == 0) {
49 talloc_free(request);
50 return;
53 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
54 result) == 0) {
55 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
56 (unsigned)request->pnn,
57 (unsigned long long)request->srvid));
58 } else {
59 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
60 (unsigned)request->pnn,
61 (unsigned long long)request->srvid));
64 talloc_free(request);
67 static void srvid_requests_reply(struct ctdb_context *ctdb,
68 struct srvid_requests **requests,
69 TDB_DATA result)
71 struct srvid_list *r;
73 for (r = (*requests)->requests; r != NULL; r = r->next) {
74 srvid_request_reply(ctdb, r->request, result);
77 /* Free the list structure... */
78 TALLOC_FREE(*requests);
81 static void srvid_request_add(struct ctdb_context *ctdb,
82 struct srvid_requests **requests,
83 struct srvid_request *request)
85 struct srvid_list *t;
86 int32_t ret;
87 TDB_DATA result;
89 if (*requests == NULL) {
90 *requests = talloc_zero(ctdb, struct srvid_requests);
91 if (*requests == NULL) {
92 goto nomem;
96 t = talloc_zero(*requests, struct srvid_list);
97 if (t == NULL) {
98 /* If *requests was just allocated above then free it */
99 if ((*requests)->requests == NULL) {
100 TALLOC_FREE(*requests);
102 goto nomem;
105 t->request = (struct srvid_request *)talloc_steal(t, request);
106 DLIST_ADD((*requests)->requests, t);
108 return;
110 nomem:
111 /* Failed to add the request to the list. Send a fail. */
112 DEBUG(DEBUG_ERR, (__location__
113 " Out of memory, failed to queue SRVID request\n"));
114 ret = -ENOMEM;
115 result.dsize = sizeof(ret);
116 result.dptr = (uint8_t *)&ret;
117 srvid_request_reply(ctdb, request, result);
120 /* An abstraction to allow an operation (takeover runs, recoveries,
121 * ...) to be disabled for a given timeout */
122 struct ctdb_op_state {
123 struct tevent_timer *timer;
124 bool in_progress;
125 const char *name;
128 static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
130 struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
132 if (state != NULL) {
133 state->in_progress = false;
134 state->name = name;
137 return state;
140 static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
142 return state->timer != NULL;
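/* The disabled state is represented purely by the presence of the
   re-enable timer: ctdb_op_enable() below clears it by freeing the timer,
   and the timeout handler re-enables the operation the same way. */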
145 static bool ctdb_op_begin(struct ctdb_op_state *state)
147 if (ctdb_op_is_disabled(state)) {
148 DEBUG(DEBUG_NOTICE,
149 ("Unable to begin - %s are disabled\n", state->name));
150 return false;
153 state->in_progress = true;
154 return true;
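/* Clear the in-progress flag. The assignment below evaluates to false,
   so this always returns false; callers typically ignore the result. */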
157 static bool ctdb_op_end(struct ctdb_op_state *state)
159 return state->in_progress = false;
162 static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
164 return state->in_progress;
167 static void ctdb_op_enable(struct ctdb_op_state *state)
169 TALLOC_FREE(state->timer);
172 static void ctdb_op_timeout_handler(struct event_context *ev,
173 struct timed_event *te,
174 struct timeval yt, void *p)
176 struct ctdb_op_state *state =
177 talloc_get_type(p, struct ctdb_op_state);
179 DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
180 ctdb_op_enable(state);
183 static int ctdb_op_disable(struct ctdb_op_state *state,
184 struct tevent_context *ev,
185 uint32_t timeout)
187 if (timeout == 0) {
188 DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
189 ctdb_op_enable(state);
190 return 0;
193 if (state->in_progress) {
194 DEBUG(DEBUG_ERR,
195 ("Unable to disable %s - in progress\n", state->name));
196 return -EAGAIN;
199 DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
200 state->name, timeout));
202 /* Clear any old timers */
203 talloc_free(state->timer);
205 /* Arrange for the timeout to occur */
206 state->timer = tevent_add_timer(ev, state,
207 timeval_current_ofs(timeout, 0),
208 ctdb_op_timeout_handler, state);
209 if (state->timer == NULL) {
210 DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
211 return -ENOMEM;
214 return 0;
217 struct ctdb_banning_state {
218 uint32_t count;
219 struct timeval last_reported_time;
223 private state of recovery daemon
225 struct ctdb_recoverd {
226 struct ctdb_context *ctdb;
227 uint32_t recmaster;
228 uint32_t last_culprit_node;
229 struct ctdb_node_map *nodemap;
230 struct timeval priority_time;
231 bool need_takeover_run;
232 bool need_recovery;
233 uint32_t node_flags;
234 struct timed_event *send_election_te;
235 struct timed_event *election_timeout;
236 struct vacuum_info *vacuum_info;
237 struct srvid_requests *reallocate_requests;
238 struct ctdb_op_state *takeover_run;
239 struct ctdb_op_state *recovery;
240 struct ctdb_control_get_ifaces *ifaces;
241 uint32_t *force_rebalance_nodes;
242 struct ctdb_node_capabilities *caps;
245 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
246 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
248 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data);
251 ban a node for a period of time
253 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
255 int ret;
256 struct ctdb_context *ctdb = rec->ctdb;
257 struct ctdb_ban_time bantime;
259 if (!ctdb_validate_pnn(ctdb, pnn)) {
260 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
261 return;
264 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
266 bantime.pnn = pnn;
267 bantime.time = ban_time;
269 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
270 if (ret != 0) {
271 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
272 return;
277 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
281 remember the trouble maker
283 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
285 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
286 struct ctdb_banning_state *ban_state;
288 if (culprit > ctdb->num_nodes) {
289 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
290 return;
293 /* If we are banned or stopped, do not set other nodes as culprits */
294 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
295 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
296 return;
299 if (ctdb->nodes[culprit]->ban_state == NULL) {
300 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
301 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
305 ban_state = ctdb->nodes[culprit]->ban_state;
306 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
307 /* this was the first time in a long while this node
308 misbehaved so we will forgive any old transgressions.
310 ban_state->count = 0;
313 ban_state->count += count;
314 ban_state->last_reported_time = timeval_current();
315 rec->last_culprit_node = culprit;
319 remember the trouble maker
321 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
323 ctdb_set_culprit_count(rec, culprit, 1);
327 /* this callback is called for every node that failed to execute the
328 recovered event
330 static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
332 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
334 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));
336 ctdb_set_culprit(rec, node_pnn);
340 run the "recovered" eventscript on all nodes
342 static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, const char *caller)
344 TALLOC_CTX *tmp_ctx;
345 uint32_t *nodes;
346 struct ctdb_context *ctdb = rec->ctdb;
348 tmp_ctx = talloc_new(ctdb);
349 CTDB_NO_MEMORY(ctdb, tmp_ctx);
351 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
352 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
353 nodes, 0,
354 CONTROL_TIMEOUT(), false, tdb_null,
355 NULL, recovered_fail_callback,
356 rec) != 0) {
357 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
359 talloc_free(tmp_ctx);
360 return -1;
363 talloc_free(tmp_ctx);
364 return 0;
367 /* this callback is called for every node that failed to execute the
368 start recovery event
370 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
372 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
374 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
376 ctdb_set_culprit(rec, node_pnn);
380 run the "startrecovery" eventscript on all nodes
382 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
384 TALLOC_CTX *tmp_ctx;
385 uint32_t *nodes;
386 struct ctdb_context *ctdb = rec->ctdb;
388 tmp_ctx = talloc_new(ctdb);
389 CTDB_NO_MEMORY(ctdb, tmp_ctx);
391 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
392 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
393 nodes, 0,
394 CONTROL_TIMEOUT(), false, tdb_null,
395 NULL,
396 startrecovery_fail_callback,
397 rec) != 0) {
398 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
399 talloc_free(tmp_ctx);
400 return -1;
403 talloc_free(tmp_ctx);
404 return 0;
408 update the node capabilities for all connected nodes
410 static int update_capabilities(struct ctdb_recoverd *rec,
411 struct ctdb_node_map *nodemap)
413 uint32_t *capp;
414 TALLOC_CTX *tmp_ctx;
415 struct ctdb_node_capabilities *caps;
416 struct ctdb_context *ctdb = rec->ctdb;
418 tmp_ctx = talloc_new(rec);
419 CTDB_NO_MEMORY(ctdb, tmp_ctx);
421 caps = ctdb_get_capabilities(ctdb, tmp_ctx,
422 CONTROL_TIMEOUT(), nodemap);
424 if (caps == NULL) {
425 DEBUG(DEBUG_ERR,
426 (__location__ " Failed to get node capabilities\n"));
427 talloc_free(tmp_ctx);
428 return -1;
431 capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
432 if (capp == NULL) {
433 DEBUG(DEBUG_ERR,
434 (__location__
435 " Capabilities don't include current node.\n"));
436 talloc_free(tmp_ctx);
437 return -1;
439 ctdb->capabilities = *capp;
441 TALLOC_FREE(rec->caps);
442 rec->caps = talloc_steal(rec, caps);
444 talloc_free(tmp_ctx);
445 return 0;
448 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
450 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
452 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
453 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
456 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
458 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
460 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
461 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
465 change recovery mode on all nodes
467 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
469 TDB_DATA data;
470 uint32_t *nodes;
471 TALLOC_CTX *tmp_ctx;
473 tmp_ctx = talloc_new(ctdb);
474 CTDB_NO_MEMORY(ctdb, tmp_ctx);
476 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
478 data.dsize = sizeof(uint32_t);
479 data.dptr = (unsigned char *)&rec_mode;
481 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
482 nodes, 0,
483 CONTROL_TIMEOUT(),
484 false, data,
485 NULL, NULL,
486 NULL) != 0) {
487 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
488 talloc_free(tmp_ctx);
489 return -1;
492 /* freeze all nodes */
493 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
494 int i;
496 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
497 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
498 nodes, i,
499 CONTROL_TIMEOUT(),
500 false, tdb_null,
501 NULL,
502 set_recmode_fail_callback,
503 rec) != 0) {
504 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
505 talloc_free(tmp_ctx);
506 return -1;
511 talloc_free(tmp_ctx);
512 return 0;
516 change recovery master on all nodes
518 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
520 TDB_DATA data;
521 TALLOC_CTX *tmp_ctx;
522 uint32_t *nodes;
524 tmp_ctx = talloc_new(ctdb);
525 CTDB_NO_MEMORY(ctdb, tmp_ctx);
527 data.dsize = sizeof(uint32_t);
528 data.dptr = (unsigned char *)&pnn;
530 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
531 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
532 nodes, 0,
533 CONTROL_TIMEOUT(), false, data,
534 NULL, NULL,
535 NULL) != 0) {
536 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
537 talloc_free(tmp_ctx);
538 return -1;
541 talloc_free(tmp_ctx);
542 return 0;
545 /* update all remote nodes to use the same db priority that we have
546 this can fail if the remote node has not yet been upgraded to
547 support this function, so we always return success and never fail
548 a recovery if this call fails.
550 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
551 struct ctdb_node_map *nodemap,
552 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
554 int db;
556 /* step through all local databases */
557 for (db=0; db<dbmap->num;db++) {
558 struct ctdb_db_priority db_prio;
559 int ret;
561 db_prio.db_id = dbmap->dbs[db].dbid;
562 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
563 if (ret != 0) {
564 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
565 continue;
568 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
570 ret = ctdb_ctrl_set_db_priority(ctdb, CONTROL_TIMEOUT(),
571 CTDB_CURRENT_NODE, &db_prio);
572 if (ret != 0) {
573 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n",
574 db_prio.db_id));
578 return 0;
582 ensure all other nodes have attached to any databases that we have
584 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
585 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
587 int i, j, db, ret;
588 struct ctdb_dbid_map *remote_dbmap;
590 /* verify that all other nodes have all our databases */
591 for (j=0; j<nodemap->num; j++) {
592 /* we don't need to check ourselves */
593 if (nodemap->nodes[j].pnn == pnn) {
594 continue;
596 /* dont check nodes that are unavailable */
597 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
598 continue;
601 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
602 mem_ctx, &remote_dbmap);
603 if (ret != 0) {
604 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
605 return -1;
608 /* step through all local databases */
609 for (db=0; db<dbmap->num;db++) {
610 const char *name;
613 for (i=0;i<remote_dbmap->num;i++) {
614 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
615 break;
618 /* the remote node already has this database */
619 if (i!=remote_dbmap->num) {
620 continue;
622 /* ok so we need to create this database */
623 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
624 dbmap->dbs[db].dbid, mem_ctx,
625 &name);
626 if (ret != 0) {
627 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
628 return -1;
630 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
631 nodemap->nodes[j].pnn,
632 mem_ctx, name,
633 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
634 if (ret != 0) {
635 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
636 return -1;
641 return 0;
646 ensure we are attached to any databases that anyone else is attached to
648 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
649 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
651 int i, j, db, ret;
652 struct ctdb_dbid_map *remote_dbmap;
654 /* verify that we have all databases any other node has */
655 for (j=0; j<nodemap->num; j++) {
656 /* we don't need to check ourselves */
657 if (nodemap->nodes[j].pnn == pnn) {
658 continue;
660 /* dont check nodes that are unavailable */
661 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
662 continue;
665 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
666 mem_ctx, &remote_dbmap);
667 if (ret != 0) {
668 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
669 return -1;
672 /* step through all databases on the remote node */
673 for (db=0; db<remote_dbmap->num;db++) {
674 const char *name;
676 for (i=0;i<(*dbmap)->num;i++) {
677 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
678 break;
681 /* we already have this db locally */
682 if (i!=(*dbmap)->num) {
683 continue;
685 /* ok so we need to create this database and
686 rebuild dbmap
688 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
689 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
690 if (ret != 0) {
691 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
692 nodemap->nodes[j].pnn));
693 return -1;
695 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
696 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
697 if (ret != 0) {
698 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
699 return -1;
701 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
702 if (ret != 0) {
703 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
704 return -1;
709 return 0;
714 pull the remote database contents from one node into the recdb
716 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
717 struct tdb_wrap *recdb, uint32_t dbid)
719 int ret;
720 TDB_DATA outdata;
721 struct ctdb_marshall_buffer *reply;
722 struct ctdb_rec_data *recdata;
723 int i;
724 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
726 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
727 CONTROL_TIMEOUT(), &outdata);
728 if (ret != 0) {
729 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
730 talloc_free(tmp_ctx);
731 return -1;
734 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
736 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
737 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
738 talloc_free(tmp_ctx);
739 return -1;
742 recdata = (struct ctdb_rec_data *)&reply->data[0];
744 for (i=0;
745 i<reply->count;
746 recdata = (struct ctdb_rec_data *)(recdata->length + (uint8_t *)recdata), i++) {
747 TDB_DATA key, data;
748 struct ctdb_ltdb_header *hdr;
749 TDB_DATA existing;
751 key.dptr = &recdata->data[0];
752 key.dsize = recdata->keylen;
753 data.dptr = &recdata->data[key.dsize];
754 data.dsize = recdata->datalen;
756 hdr = (struct ctdb_ltdb_header *)data.dptr;
758 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
759 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
760 talloc_free(tmp_ctx);
761 return -1;
764 /* fetch the existing record, if any */
765 existing = tdb_fetch(recdb->tdb, key);
767 if (existing.dptr != NULL) {
768 struct ctdb_ltdb_header header;
769 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
770 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
771 (unsigned)existing.dsize, srcnode));
772 free(existing.dptr);
773 talloc_free(tmp_ctx);
774 return -1;
776 header = *(struct ctdb_ltdb_header *)existing.dptr;
777 free(existing.dptr);
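/* Only overwrite the copy already stored in the recdb when the pulled
   record has a higher RSN, or the same RSN while the stored copy is not
   dmastered by this node; otherwise skip the pulled record. */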
778 if (!(header.rsn < hdr->rsn ||
779 (header.dmaster != ctdb_get_pnn(ctdb) &&
780 header.rsn == hdr->rsn))) {
781 continue;
785 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
786 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
787 talloc_free(tmp_ctx);
788 return -1;
792 talloc_free(tmp_ctx);
794 return 0;
798 struct pull_seqnum_cbdata {
799 int failed;
800 uint32_t pnn;
801 uint64_t seqnum;
804 static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
806 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
807 uint64_t seqnum;
809 if (cb_data->failed != 0) {
810 DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
811 return;
814 if (res != 0) {
815 DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
816 cb_data->failed = 1;
817 return;
820 if (outdata.dsize != sizeof(uint64_t)) {
821 DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
822 cb_data->failed = -1;
823 return;
826 seqnum = *((uint64_t *)outdata.dptr);
828 if (seqnum > cb_data->seqnum ||
829 (cb_data->pnn == -1 && seqnum == 0)) {
830 cb_data->seqnum = seqnum;
831 cb_data->pnn = node_pnn;
835 static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
837 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
839 DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
840 cb_data->failed = 1;
843 static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
844 struct ctdb_recoverd *rec,
845 struct ctdb_node_map *nodemap,
846 struct tdb_wrap *recdb, uint32_t dbid)
848 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
849 uint32_t *nodes;
850 TDB_DATA data;
851 uint32_t outdata[2];
852 struct pull_seqnum_cbdata *cb_data;
854 DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));
856 outdata[0] = dbid;
857 outdata[1] = 0;
859 data.dsize = sizeof(outdata);
860 data.dptr = (uint8_t *)&outdata[0];
862 cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
863 if (cb_data == NULL) {
864 DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
865 talloc_free(tmp_ctx);
866 return -1;
869 cb_data->failed = 0;
870 cb_data->pnn = -1;
871 cb_data->seqnum = 0;
873 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
874 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
875 nodes, 0,
876 CONTROL_TIMEOUT(), false, data,
877 pull_seqnum_cb,
878 pull_seqnum_fail_cb,
879 cb_data) != 0) {
880 DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));
882 talloc_free(tmp_ctx);
883 return -1;
886 if (cb_data->failed != 0) {
887 DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
888 talloc_free(tmp_ctx);
889 return -1;
892 if (cb_data->pnn == -1) {
893 DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
894 talloc_free(tmp_ctx);
895 return -1;
898 DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum));
900 if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
901 DEBUG(DEBUG_ERR, ("Failed to pull higest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
902 talloc_free(tmp_ctx);
903 return -1;
906 talloc_free(tmp_ctx);
907 return 0;
912 pull all the remote database contents into the recdb
914 static int pull_remote_database(struct ctdb_context *ctdb,
915 struct ctdb_recoverd *rec,
916 struct ctdb_node_map *nodemap,
917 struct tdb_wrap *recdb, uint32_t dbid,
918 bool persistent)
920 int j;
922 if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
923 int ret;
924 ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
925 if (ret == 0) {
926 return 0;
930 /* pull all records from all other nodes across onto this node
931 (this merges based on rsn)
933 for (j=0; j<nodemap->num; j++) {
934 /* dont merge from nodes that are unavailable */
935 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
936 continue;
938 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
939 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
940 nodemap->nodes[j].pnn));
941 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
942 return -1;
946 return 0;
951 update flags on all active nodes
953 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
955 int ret;
957 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
958 if (ret != 0) {
959 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
960 return -1;
963 return 0;
967 ensure all nodes have the same vnnmap we do
969 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
970 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
972 int j, ret;
974 /* push the new vnn map out to all the nodes */
975 for (j=0; j<nodemap->num; j++) {
976 /* dont push to nodes that are unavailable */
977 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
978 continue;
981 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
982 if (ret != 0) {
983 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
984 return -1;
988 return 0;
992 struct vacuum_info {
993 struct vacuum_info *next, *prev;
994 struct ctdb_recoverd *rec;
995 uint32_t srcnode;
996 struct ctdb_db_context *ctdb_db;
997 struct ctdb_marshall_buffer *recs;
998 struct ctdb_rec_data *r;
1003 called when a vacuum fetch has completed - just free the call state
1005 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
1007 talloc_free(state);
1012 process the next element from the vacuum list
1014 static void vacuum_fetch_next(struct vacuum_info *v)
1016 struct ctdb_call call;
1017 struct ctdb_rec_data *r;
1019 while (v->recs->count) {
1020 struct ctdb_client_call_state *state;
1021 TDB_DATA data;
1022 struct ctdb_ltdb_header *hdr;
1024 ZERO_STRUCT(call);
1025 call.call_id = CTDB_NULL_FUNC;
1026 call.flags = CTDB_IMMEDIATE_MIGRATION;
1027 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
1029 r = v->r;
1030 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
1031 v->recs->count--;
1033 call.key.dptr = &r->data[0];
1034 call.key.dsize = r->keylen;
1036 /* ensure we don't block this daemon - just skip a record if we can't get
1037 the chainlock */
1038 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
1039 continue;
1042 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
1043 if (data.dptr == NULL) {
1044 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
1045 continue;
1048 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
1049 free(data.dptr);
1050 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
1051 continue;
1054 hdr = (struct ctdb_ltdb_header *)data.dptr;
1055 if (hdr->dmaster == v->rec->ctdb->pnn) {
1056 /* its already local */
1057 free(data.dptr);
1058 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
1059 continue;
1062 free(data.dptr);
1064 state = ctdb_call_send(v->ctdb_db, &call);
1065 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
1066 if (state == NULL) {
1067 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
1068 talloc_free(v);
1069 return;
1071 state->async.fn = vacuum_fetch_callback;
1072 state->async.private_data = NULL;
1075 talloc_free(v);
1080 destroy a vacuum info structure
1082 static int vacuum_info_destructor(struct vacuum_info *v)
1084 DLIST_REMOVE(v->rec->vacuum_info, v);
1085 return 0;
1090 handler for vacuum fetch
1092 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
1093 TDB_DATA data, void *private_data)
1095 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1096 struct ctdb_marshall_buffer *recs;
1097 int ret, i;
1098 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1099 const char *name;
1100 struct ctdb_dbid_map *dbmap=NULL;
1101 bool persistent = false;
1102 struct ctdb_db_context *ctdb_db;
1103 struct ctdb_rec_data *r;
1104 uint32_t srcnode;
1105 struct vacuum_info *v;
1107 recs = (struct ctdb_marshall_buffer *)data.dptr;
1108 r = (struct ctdb_rec_data *)&recs->data[0];
1110 if (recs->count == 0) {
1111 talloc_free(tmp_ctx);
1112 return;
1115 srcnode = r->reqid;
1117 for (v=rec->vacuum_info;v;v=v->next) {
1118 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
1119 /* we're already working on records from this node */
1120 talloc_free(tmp_ctx);
1121 return;
1125 /* work out if the database is persistent */
1126 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
1127 if (ret != 0) {
1128 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
1129 talloc_free(tmp_ctx);
1130 return;
1133 for (i=0;i<dbmap->num;i++) {
1134 if (dbmap->dbs[i].dbid == recs->db_id) {
1135 persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
1136 break;
1139 if (i == dbmap->num) {
1140 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
1141 talloc_free(tmp_ctx);
1142 return;
1145 /* find the name of this database */
1146 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
1147 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
1148 talloc_free(tmp_ctx);
1149 return;
1152 /* attach to it */
1153 ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
1154 if (ctdb_db == NULL) {
1155 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
1156 talloc_free(tmp_ctx);
1157 return;
1160 v = talloc_zero(rec, struct vacuum_info);
1161 if (v == NULL) {
1162 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
1163 talloc_free(tmp_ctx);
1164 return;
1167 v->rec = rec;
1168 v->srcnode = srcnode;
1169 v->ctdb_db = ctdb_db;
1170 v->recs = talloc_memdup(v, recs, data.dsize);
1171 if (v->recs == NULL) {
1172 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
1173 talloc_free(v);
1174 talloc_free(tmp_ctx);
1175 return;
1177 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
1179 DLIST_ADD(rec->vacuum_info, v);
1181 talloc_set_destructor(v, vacuum_info_destructor);
1183 vacuum_fetch_next(v);
1184 talloc_free(tmp_ctx);
1189 * handler for database detach
1191 static void detach_database_handler(struct ctdb_context *ctdb, uint64_t srvid,
1192 TDB_DATA data, void *private_data)
1194 struct ctdb_recoverd *rec = talloc_get_type(private_data,
1195 struct ctdb_recoverd);
1196 uint32_t db_id;
1197 struct vacuum_info *v, *vnext;
1198 struct ctdb_db_context *ctdb_db;
1200 if (data.dsize != sizeof(db_id)) {
1201 return;
1203 db_id = *(uint32_t *)data.dptr;
1205 ctdb_db = find_ctdb_db(ctdb, db_id);
1206 if (ctdb_db == NULL) {
1207 /* database is not attached */
1208 return;
1211 /* Stop any active vacuum fetch */
1212 v = rec->vacuum_info;
1213 while (v != NULL) {
1214 vnext = v->next;
1216 if (v->ctdb_db->db_id == db_id) {
1217 talloc_free(v);
1219 v = vnext;
1222 DLIST_REMOVE(ctdb->db_list, ctdb_db);
1224 DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
1225 ctdb_db->db_name));
1226 talloc_free(ctdb_db);
1230 called when ctdb_wait_timeout should finish
1232 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
1233 struct timeval yt, void *p)
1235 uint32_t *timed_out = (uint32_t *)p;
1236 (*timed_out) = 1;
1240 wait for a given number of seconds
1242 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
1244 uint32_t timed_out = 0;
1245 time_t usecs = (secs - (time_t)secs) * 1000000;
1246 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), ctdb_wait_handler, &timed_out);
1247 while (!timed_out) {
1248 event_loop_once(ctdb->ev);
1253 called when an election times out (ends)
1255 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
1256 struct timeval t, void *p)
1258 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1259 rec->election_timeout = NULL;
1260 fast_start = false;
1262 DEBUG(DEBUG_WARNING,("Election period ended\n"));
1267 wait for an election to finish. It finishes election_timeout seconds after
1268 the last election packet is received
1270 static void ctdb_wait_election(struct ctdb_recoverd *rec)
1272 struct ctdb_context *ctdb = rec->ctdb;
1273 while (rec->election_timeout) {
1274 event_loop_once(ctdb->ev);
1279 Update our local flags from all remote connected nodes.
1280 This is only run when we are, or believe we are, the recovery master
1282 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
1284 int j;
1285 struct ctdb_context *ctdb = rec->ctdb;
1286 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1288 /* get the nodemap for all active remote nodes and verify
1289 they are the same as for this node
1291 for (j=0; j<nodemap->num; j++) {
1292 struct ctdb_node_map *remote_nodemap=NULL;
1293 int ret;
1295 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
1296 continue;
1298 if (nodemap->nodes[j].pnn == ctdb->pnn) {
1299 continue;
1302 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1303 mem_ctx, &remote_nodemap);
1304 if (ret != 0) {
1305 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
1306 nodemap->nodes[j].pnn));
1307 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
1308 talloc_free(mem_ctx);
1309 return MONITOR_FAILED;
1311 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
1312 /* We should tell our daemon about this so it
1313 updates its flags or else we will log the same
1314 message again in the next iteration of recovery.
1315 Since we are the recovery master we can just as
1316 well update the flags on all nodes.
1318 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
1319 if (ret != 0) {
1320 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
1321 return -1;
1324 /* Update our local copy of the flags in the recovery
1325 daemon.
1327 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1328 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
1329 nodemap->nodes[j].flags));
1330 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1332 talloc_free(remote_nodemap);
1334 talloc_free(mem_ctx);
1335 return MONITOR_OK;
1339 /* Create a new random generation id.
1340 The generation id cannot be the INVALID_GENERATION id
1342 static uint32_t new_generation(void)
1344 uint32_t generation;
1346 while (1) {
1347 generation = random();
1349 if (generation != INVALID_GENERATION) {
1350 break;
1354 return generation;
1359 create a temporary working database
1361 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1363 char *name;
1364 struct tdb_wrap *recdb;
1365 unsigned tdb_flags;
1367 /* open up the temporary recovery database */
1368 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1369 ctdb->db_directory_state,
1370 ctdb->pnn);
1371 if (name == NULL) {
1372 return NULL;
1374 unlink(name);
1376 tdb_flags = TDB_NOLOCK;
1377 if (ctdb->valgrinding) {
1378 tdb_flags |= TDB_NOMMAP;
1380 tdb_flags |= (TDB_INCOMPATIBLE_HASH | TDB_DISALLOW_NESTING);
1382 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1383 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1384 if (recdb == NULL) {
1385 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1388 talloc_free(name);
1390 return recdb;
1395 a traverse function for pulling all relevant records from recdb
1397 struct recdb_data {
1398 struct ctdb_context *ctdb;
1399 struct ctdb_marshall_buffer *recdata;
1400 uint32_t len;
1401 uint32_t allocated_len;
1402 bool failed;
1403 bool persistent;
1406 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1408 struct recdb_data *params = (struct recdb_data *)p;
1409 struct ctdb_rec_data *recdata;
1410 struct ctdb_ltdb_header *hdr;
1413 * skip empty records - but NOT for persistent databases:
1415 * The record-by-record mode of recovery deletes empty records.
1416 * For persistent databases, this can lead to data corruption
1417 * by deleting records that should be there:
1419 * - Assume the cluster has been running for a while.
1421 * - A record R in a persistent database has been created and
1422 * deleted a couple of times, the last operation being deletion,
1423 * leaving an empty record with a high RSN, say 10.
1425 * - Now a node N is turned off.
1427 * - This leaves the local database copy of D on N with the empty
1428 * copy of R and RSN 10. On all other nodes, the recovery has deleted
1429 * the copy of record R.
1431 * - Now the record is created again while node N is turned off.
1432 * This creates R with RSN = 1 on all nodes except for N.
1434 * - Now node N is turned on again. The following recovery will choose
1435 * the older empty copy of R due to RSN 10 > RSN 1.
1437 * ==> Hence the record is gone after the recovery.
1439 * On databases like Samba's registry, this can damage the higher-level
1440 * data structures built from the various tdb-level records.
1442 if (!params->persistent && data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1443 return 0;
1446 /* update the dmaster field to point to us */
1447 hdr = (struct ctdb_ltdb_header *)data.dptr;
1448 if (!params->persistent) {
1449 hdr->dmaster = params->ctdb->pnn;
1450 hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
1453 /* add the record to the blob ready to send to the nodes */
1454 recdata = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1455 if (recdata == NULL) {
1456 params->failed = true;
1457 return -1;
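/* Grow the marshall buffer to fit the new record plus a preallocation
   margin (pulldb_preallocation_size), so it is not reallocated for every
   record. */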
1459 if (params->len + recdata->length >= params->allocated_len) {
1460 params->allocated_len = recdata->length + params->len + params->ctdb->tunable.pulldb_preallocation_size;
1461 params->recdata = talloc_realloc_size(NULL, params->recdata, params->allocated_len);
1463 if (params->recdata == NULL) {
1464 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u\n",
1465 recdata->length + params->len));
1466 params->failed = true;
1467 return -1;
1469 params->recdata->count++;
1470 memcpy(params->len+(uint8_t *)params->recdata, recdata, recdata->length);
1471 params->len += recdata->length;
1472 talloc_free(recdata);
1474 return 0;
1478 push the recdb database out to all nodes
1480 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1481 bool persistent,
1482 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1484 struct recdb_data params;
1485 struct ctdb_marshall_buffer *recdata;
1486 TDB_DATA outdata;
1487 TALLOC_CTX *tmp_ctx;
1488 uint32_t *nodes;
1490 tmp_ctx = talloc_new(ctdb);
1491 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1493 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1494 CTDB_NO_MEMORY(ctdb, recdata);
1496 recdata->db_id = dbid;
1498 params.ctdb = ctdb;
1499 params.recdata = recdata;
1500 params.len = offsetof(struct ctdb_marshall_buffer, data);
1501 params.allocated_len = params.len;
1502 params.failed = false;
1503 params.persistent = persistent;
1505 if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
1506 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1507 talloc_free(params.recdata);
1508 talloc_free(tmp_ctx);
1509 return -1;
1512 if (params.failed) {
1513 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1514 talloc_free(params.recdata);
1515 talloc_free(tmp_ctx);
1516 return -1;
1519 recdata = params.recdata;
1521 outdata.dptr = (void *)recdata;
1522 outdata.dsize = params.len;
1524 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1525 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1526 nodes, 0,
1527 CONTROL_TIMEOUT(), false, outdata,
1528 NULL, NULL,
1529 NULL) != 0) {
1530 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1531 talloc_free(recdata);
1532 talloc_free(tmp_ctx);
1533 return -1;
1536 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1537 dbid, recdata->count));
1539 talloc_free(recdata);
1540 talloc_free(tmp_ctx);
1542 return 0;
1547 go through a full recovery on one database
1549 static int recover_database(struct ctdb_recoverd *rec,
1550 TALLOC_CTX *mem_ctx,
1551 uint32_t dbid,
1552 bool persistent,
1553 uint32_t pnn,
1554 struct ctdb_node_map *nodemap,
1555 uint32_t transaction_id)
1557 struct tdb_wrap *recdb;
1558 int ret;
1559 struct ctdb_context *ctdb = rec->ctdb;
1560 TDB_DATA data;
1561 struct ctdb_control_wipe_database w;
1562 uint32_t *nodes;
1564 recdb = create_recdb(ctdb, mem_ctx);
1565 if (recdb == NULL) {
1566 return -1;
1569 /* pull all remote databases onto the recdb */
1570 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1571 if (ret != 0) {
1572 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1573 return -1;
1576 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1578 /* wipe all the remote databases. This is safe as we are in a transaction */
1579 w.db_id = dbid;
1580 w.transaction_id = transaction_id;
1582 data.dptr = (void *)&w;
1583 data.dsize = sizeof(w);
1585 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1586 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1587 nodes, 0,
1588 CONTROL_TIMEOUT(), false, data,
1589 NULL, NULL,
1590 NULL) != 0) {
1591 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1592 talloc_free(recdb);
1593 return -1;
1596 /* push out the correct database. This sets the dmaster and skips
1597 the empty records */
1598 ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1599 if (ret != 0) {
1600 talloc_free(recdb);
1601 return -1;
1604 /* all done with this database */
1605 talloc_free(recdb);
1607 return 0;
1610 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1611 struct ctdb_recoverd *rec,
1612 struct ctdb_node_map *nodemap,
1613 uint32_t *culprit)
1615 int j;
1616 int ret;
1618 if (ctdb->num_nodes != nodemap->num) {
1619 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1620 ctdb->num_nodes, nodemap->num));
1621 if (culprit) {
1622 *culprit = ctdb->pnn;
1624 return -1;
1627 for (j=0; j<nodemap->num; j++) {
1628 /* For readability */
1629 struct ctdb_node *node = ctdb->nodes[j];
1631 /* release any existing data */
1632 if (node->known_public_ips) {
1633 talloc_free(node->known_public_ips);
1634 node->known_public_ips = NULL;
1636 if (node->available_public_ips) {
1637 talloc_free(node->available_public_ips);
1638 node->available_public_ips = NULL;
1641 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1642 continue;
1645 /* Retrieve the list of known public IPs from the node */
1646 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1647 CONTROL_TIMEOUT(),
1648 node->pnn,
1649 ctdb->nodes,
1651 &node->known_public_ips);
1652 if (ret != 0) {
1653 DEBUG(DEBUG_ERR,
1654 ("Failed to read known public IPs from node: %u\n",
1655 node->pnn));
1656 if (culprit) {
1657 *culprit = node->pnn;
1659 return -1;
1662 if (ctdb->do_checkpublicip &&
1663 !ctdb_op_is_disabled(rec->takeover_run) &&
1664 verify_remote_ip_allocation(ctdb,
1665 node->known_public_ips,
1666 node->pnn)) {
1667 DEBUG(DEBUG_ERR,("Trigger IP reallocation\n"));
1668 rec->need_takeover_run = true;
1671 /* Retrieve the list of available public IPs from the node */
1672 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1673 CONTROL_TIMEOUT(),
1674 node->pnn,
1675 ctdb->nodes,
1676 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1677 &node->available_public_ips);
1678 if (ret != 0) {
1679 DEBUG(DEBUG_ERR,
1680 ("Failed to read available public IPs from node: %u\n",
1681 node->pnn));
1682 if (culprit) {
1683 *culprit = node->pnn;
1685 return -1;
1689 return 0;
1692 /* when we start a recovery, make sure all nodes use the same reclock file
1693 setting
1695 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
1697 struct ctdb_context *ctdb = rec->ctdb;
1698 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
1699 TDB_DATA data;
1700 uint32_t *nodes;
1702 if (ctdb->recovery_lock_file == NULL) {
1703 data.dptr = NULL;
1704 data.dsize = 0;
1705 } else {
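/* Send the path including its terminating NUL so the receiving node
   gets a complete C string. */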
1706 data.dsize = strlen(ctdb->recovery_lock_file) + 1;
1707 data.dptr = (uint8_t *)ctdb->recovery_lock_file;
1710 nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
1711 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
1712 nodes, 0,
1713 CONTROL_TIMEOUT(),
1714 false, data,
1715 NULL, NULL,
1716 rec) != 0) {
1717 DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
1718 talloc_free(tmp_ctx);
1719 return -1;
1722 talloc_free(tmp_ctx);
1723 return 0;
1728 * this callback is called for every node that failed to execute ctdb_takeover_run()
1729 * and sets a flag to re-run the takeover run.
1731 static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
1733 DEBUG(DEBUG_ERR, ("Node %u failed the takeover run\n", node_pnn));
1735 if (callback_data != NULL) {
1736 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
1738 DEBUG(DEBUG_ERR, ("Setting node %u as recovery fail culprit\n", node_pnn));
1740 ctdb_set_culprit(rec, node_pnn);
1745 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
1747 struct ctdb_context *ctdb = rec->ctdb;
1748 int i;
1749 struct ctdb_banning_state *ban_state;
1751 *self_ban = false;
1752 for (i=0; i<ctdb->num_nodes; i++) {
1753 if (ctdb->nodes[i]->ban_state == NULL) {
1754 continue;
1756 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
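/* A node is banned only once it has accumulated banning credits equal to
   twice the number of nodes; counts below that threshold are tolerated. */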
1757 if (ban_state->count < 2*ctdb->num_nodes) {
1758 continue;
1761 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
1762 ctdb->nodes[i]->pnn, ban_state->count,
1763 ctdb->tunable.recovery_ban_period));
1764 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1765 ban_state->count = 0;
1767 /* Banning ourself? */
1768 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
1769 *self_ban = true;
1774 static bool do_takeover_run(struct ctdb_recoverd *rec,
1775 struct ctdb_node_map *nodemap,
1776 bool banning_credits_on_fail)
1778 uint32_t *nodes = NULL;
1779 struct srvid_request_data dtr;
1780 TDB_DATA data;
1781 int i;
1782 uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
1783 int ret;
1784 bool ok;
1786 DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
1788 if (ctdb_op_is_in_progress(rec->takeover_run)) {
1789 DEBUG(DEBUG_ERR, (__location__
1790 " takeover run already in progress \n"));
1791 ok = false;
1792 goto done;
1795 if (!ctdb_op_begin(rec->takeover_run)) {
1796 ok = false;
1797 goto done;
1800 /* Disable IP checks (takeover runs, really) on other nodes
1801 * while doing this takeover run. This will stop those other
1802 nodes from triggering takeover runs when they think they should
1803 * be hosting an IP but it isn't yet on an interface. Don't
1804 * wait for replies since a failure here might cause some
1805 * noise in the logs but will not actually cause a problem.
1807 dtr.srvid = 0; /* No reply */
1808 dtr.pnn = -1;
1810 data.dptr = (uint8_t*)&dtr;
1811 data.dsize = sizeof(dtr);
1813 nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
1815 /* Disable for 60 seconds. This can be a tunable later if
1816 * necessary.
1818 dtr.data = 60;
1819 for (i = 0; i < talloc_array_length(nodes); i++) {
1820 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1821 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1822 data) != 0) {
1823 DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
1827 ret = ctdb_takeover_run(rec->ctdb, nodemap,
1828 rec->force_rebalance_nodes,
1829 takeover_fail_callback,
1830 banning_credits_on_fail ? rec : NULL);
1832 /* Reenable takeover runs and IP checks on other nodes */
1833 dtr.data = 0;
1834 for (i = 0; i < talloc_array_length(nodes); i++) {
1835 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1836 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1837 data) != 0) {
1838 DEBUG(DEBUG_INFO,("Failed to reenable takeover runs\n"));
1842 if (ret != 0) {
1843 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
1844 ok = false;
1845 goto done;
1848 ok = true;
1849 /* Takeover run was successful so clear force rebalance targets */
1850 if (rebalance_nodes == rec->force_rebalance_nodes) {
1851 TALLOC_FREE(rec->force_rebalance_nodes);
1852 } else {
1853 DEBUG(DEBUG_WARNING,
1854 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1856 done:
1857 rec->need_takeover_run = !ok;
1858 talloc_free(nodes);
1859 ctdb_op_end(rec->takeover_run);
1861 DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
1862 return ok;
1867 we are the recmaster, and recovery is needed - start a recovery run
1869 static int do_recovery(struct ctdb_recoverd *rec,
1870 TALLOC_CTX *mem_ctx, uint32_t pnn,
1871 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1873 struct ctdb_context *ctdb = rec->ctdb;
1874 int i, j, ret;
1875 uint32_t generation;
1876 struct ctdb_dbid_map *dbmap;
1877 TDB_DATA data;
1878 uint32_t *nodes;
1879 struct timeval start_time;
1880 uint32_t culprit = (uint32_t)-1;
1881 bool self_ban;
1883 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1885 /* if recovery fails, force it again */
1886 rec->need_recovery = true;
1888 if (!ctdb_op_begin(rec->recovery)) {
1889 return -1;
1892 if (rec->election_timeout) {
1893 /* an election is in progress */
1894 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
1895 goto fail;
1898 ban_misbehaving_nodes(rec, &self_ban);
1899 if (self_ban) {
1900 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
1901 goto fail;
1904 if (ctdb->recovery_lock_file != NULL) {
1905 if (ctdb_recovery_have_lock(ctdb)) {
1906 DEBUG(DEBUG_NOTICE, ("Already holding recovery lock\n"));
1907 } else {
1908 start_time = timeval_current();
1909 DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock (%s)\n",
1910 ctdb->recovery_lock_file));
1911 if (!ctdb_recovery_lock(ctdb)) {
1912 if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
1913 /* If ctdb is attempting its first recovery, it's
1914 * possible that the current node does not yet
1915 * know who the recmaster is.
1917 DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
1918 " - retrying recovery\n"));
1919 goto fail;
1922 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
1923 "and ban ourself for %u seconds\n",
1924 ctdb->tunable.recovery_ban_period));
1925 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
1926 goto fail;
1928 ctdb_ctrl_report_recd_lock_latency(ctdb,
1929 CONTROL_TIMEOUT(),
1930 timeval_elapsed(&start_time));
1931 DEBUG(DEBUG_NOTICE,
1932 ("Recovery lock taken successfully by recovery daemon\n"));
1936 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1938 /* get a list of all databases */
1939 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1940 if (ret != 0) {
1941 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1942 goto fail;
1945 /* we do the db creation before we set the recovery mode, so the freeze happens
1946 on all databases we will be dealing with. */
1948 /* verify that we have all the databases any other node has */
1949 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1950 if (ret != 0) {
1951 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1952 goto fail;
1955 /* verify that all other nodes have all our databases */
1956 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1957 if (ret != 0) {
1958 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1959 goto fail;
1961 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1963 /* update the database priority for all remote databases */
1964 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1965 if (ret != 0) {
1966 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1968 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1971 /* update all other nodes to use the same setting for reclock files
1972 as the local recovery master.
1974 sync_recovery_lock_file_across_cluster(rec);
1976 /* set recovery mode to active on all nodes */
1977 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1978 if (ret != 0) {
1979 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1980 goto fail;
1983 /* execute the "startrecovery" event script on all nodes */
1984 ret = run_startrecovery_eventscript(rec, nodemap);
1985 if (ret!=0) {
1986 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1987 goto fail;
1991 update all nodes to have the same flags that we have
1993 for (i=0;i<nodemap->num;i++) {
1994 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1995 continue;
1998 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1999 if (ret != 0) {
2000 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2001 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
2002 } else {
2003 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
2004 goto fail;
2009 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
2011 /* pick a new generation number */
2012 generation = new_generation();
2014 /* change the vnnmap on this node to use the new generation
2015 number but not on any other nodes.
2016 this guarantees that if we abort the recovery prematurely
2017 for some reason (a node stops responding?)
2018 that we can just return immediately and we will reenter
2019 recovery shortly again.
2020 I.e. we deliberately leave the cluster with an inconsistent
2021 generation id to allow us to abort recovery at any stage and
2022 just restart it from scratch.
2024 vnnmap->generation = generation;
2025 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
2026 if (ret != 0) {
2027 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
2028 goto fail;
2031 data.dptr = (void *)&generation;
2032 data.dsize = sizeof(uint32_t);
2034 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2035 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
2036 nodes, 0,
2037 CONTROL_TIMEOUT(), false, data,
2038 NULL,
2039 transaction_start_fail_callback,
2040 rec) != 0) {
2041 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
2042 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
2043 nodes, 0,
2044 CONTROL_TIMEOUT(), false, tdb_null,
2045 NULL,
2046 NULL,
2047 NULL) != 0) {
2048 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
2050 goto fail;
2053 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
2055 for (i=0;i<dbmap->num;i++) {
2056 ret = recover_database(rec, mem_ctx,
2057 dbmap->dbs[i].dbid,
2058 dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
2059 pnn, nodemap, generation);
2060 if (ret != 0) {
2061 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
2062 goto fail;
2066 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
2068 /* commit all the changes */
2069 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
2070 nodes, 0,
2071 CONTROL_TIMEOUT(), false, data,
2072 NULL, NULL,
2073 NULL) != 0) {
2074 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
2075 goto fail;
2078 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
2081 /* update the capabilities for all nodes */
2082 ret = update_capabilities(rec, nodemap);
2083 if (ret!=0) {
2084 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2085 goto fail;
2088 /* build a new vnn map with all the currently active and
2089 unbanned nodes */
2090 generation = new_generation();
2091 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
2092 CTDB_NO_MEMORY(ctdb, vnnmap);
2093 vnnmap->generation = generation;
2094 vnnmap->size = 0;
2095 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
2096 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2097 for (i=j=0;i<nodemap->num;i++) {
2098 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2099 continue;
2101 if (!ctdb_node_has_capabilities(rec->caps,
2102 ctdb->nodes[i]->pnn,
2103 CTDB_CAP_LMASTER)) {
2104 /* this node can not be an lmaster */
2105 DEBUG(DEBUG_DEBUG, ("Node %d can't be an LMASTER, skipping it\n", i));
2106 continue;
2109 vnnmap->size++;
2110 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
2111 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2112 vnnmap->map[j++] = nodemap->nodes[i].pnn;
2115 if (vnnmap->size == 0) {
2116 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
2117 vnnmap->size++;
2118 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
2119 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2120 vnnmap->map[0] = pnn;
2123 /* update to the new vnnmap on all nodes */
2124 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
2125 if (ret != 0) {
2126 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
2127 goto fail;
2130 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
2132 /* update recmaster to point to us for all nodes */
2133 ret = set_recovery_master(ctdb, nodemap, pnn);
2134 if (ret!=0) {
2135 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
2136 goto fail;
2139 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
2141 /* disable recovery mode */
2142 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
2143 if (ret != 0) {
2144 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
2145 goto fail;
2148 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
2150 /* Fetch known/available public IPs from each active node */
2151 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
2152 if (ret != 0) {
2153 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2154 culprit));
2155 rec->need_takeover_run = true;
2156 goto fail;
2159 do_takeover_run(rec, nodemap, false);
2161 /* execute the "recovered" event script on all nodes */
2162 ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
2163 if (ret!=0) {
2164 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
2165 goto fail;
2168 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
2170 /* send a message to all clients telling them that the cluster
2171 has been reconfigured */
2172 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2173 CTDB_SRVID_RECONFIGURE, tdb_null);
2174 if (ret != 0) {
2175 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
2176 goto fail;
2179 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
2181 rec->need_recovery = false;
2182 ctdb_op_end(rec->recovery);
2184 /* we managed to complete a full recovery, make sure to forgive
2185 any past sins by the nodes that could now participate in the
2186 recovery.
2188 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
2189 for (i=0;i<nodemap->num;i++) {
2190 struct ctdb_banning_state *ban_state;
2192 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2193 continue;
2196 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
2197 if (ban_state == NULL) {
2198 continue;
2201 ban_state->count = 0;
2204 /* We just finished a recovery successfully.
2205 We now wait for rerecovery_timeout before we allow
2206 another recovery to take place.
2208 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
2209 ctdb_op_disable(rec->recovery, ctdb->ev,
2210 ctdb->tunable.rerecovery_timeout);
2211 return 0;
2213 fail:
2214 ctdb_op_end(rec->recovery);
2215 return -1;
2220 elections are won by first checking the number of connected nodes, then
2221 the priority time, then the pnn
2223 struct election_message {
2224 uint32_t num_connected;
2225 struct timeval priority_time;
2226 uint32_t pnn;
2227 uint32_t node_flags;
2231 form this node's election data
2233 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
2235 int ret, i;
2236 struct ctdb_node_map *nodemap;
2237 struct ctdb_context *ctdb = rec->ctdb;
2239 ZERO_STRUCTP(em);
2241 em->pnn = rec->ctdb->pnn;
2242 em->priority_time = rec->priority_time;
2244 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
2245 if (ret != 0) {
2246 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
2247 return;
2250 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
2251 em->node_flags = rec->node_flags;
2253 for (i=0;i<nodemap->num;i++) {
2254 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2255 em->num_connected++;
2259 /* we shouldn't try to win this election if we can't be a recmaster */
2260 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2261 em->num_connected = 0;
2262 em->priority_time = timeval_current();
2265 talloc_free(nodemap);
2269 see if the given election data wins
2271 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
2273 struct election_message myem;
2274 int cmp = 0;
2276 ctdb_election_data(rec, &myem);
2278 /* we can't win if we don't have the recmaster capability */
2279 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2280 return false;
2283 /* we can't win if we are banned */
2284 if (rec->node_flags & NODE_FLAGS_BANNED) {
2285 return false;
2288 /* we can't win if we are stopped */
2289 if (rec->node_flags & NODE_FLAGS_STOPPED) {
2290 return false;
2293 /* we will automatically win if the other node is banned */
2294 if (em->node_flags & NODE_FLAGS_BANNED) {
2295 return true;
2298 /* we will automatically win if the other node is stopped */
2299 if (em->node_flags & NODE_FLAGS_STOPPED) {
2300 return true;
2303 /* try to use the most connected node */
2304 if (cmp == 0) {
2305 cmp = (int)myem.num_connected - (int)em->num_connected;
2308 /* then the longest running node */
2309 if (cmp == 0) {
2310 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
2313 if (cmp == 0) {
2314 cmp = (int)myem.pnn - (int)em->pnn;
2317 return cmp > 0;
2321 send out an election request
2323 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
2325 int ret;
2326 TDB_DATA election_data;
2327 struct election_message emsg;
2328 uint64_t srvid;
2329 struct ctdb_context *ctdb = rec->ctdb;
2331 srvid = CTDB_SRVID_RECOVERY;
2333 ctdb_election_data(rec, &emsg);
2335 election_data.dsize = sizeof(struct election_message);
2336 election_data.dptr = (unsigned char *)&emsg;
2339 /* first we assume we will win the election and set the
2340 recovery master to be ourselves on the current node
2342 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
2343 if (ret != 0) {
2344 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster on local node\n"));
2345 return -1;
2349 /* send an election message to all active nodes */
2350 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
2351 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
2355 this function will unban all nodes in the cluster
2357 static void unban_all_nodes(struct ctdb_context *ctdb)
2359 int ret, i;
2360 struct ctdb_node_map *nodemap;
2361 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2363 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2364 if (ret != 0) {
2365 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
2366 return;
2369 for (i=0;i<nodemap->num;i++) {
2370 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
2371 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
2372 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(),
2373 nodemap->nodes[i].pnn, 0,
2374 NODE_FLAGS_BANNED);
2375 if (ret != 0) {
2376 DEBUG(DEBUG_ERR, (__location__ " failed to reset ban state\n"));
2381 talloc_free(tmp_ctx);
2386 we think we are winning the election - send a broadcast election request
2388 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
2390 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2391 int ret;
2393 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
2394 if (ret != 0) {
2395 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
2398 talloc_free(rec->send_election_te);
2399 rec->send_election_te = NULL;
2403 handler for memory dumps
2405 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
2406 TDB_DATA data, void *private_data)
2408 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2409 TDB_DATA *dump;
2410 int ret;
2411 struct srvid_request *rd;
2413 if (data.dsize != sizeof(struct srvid_request)) {
2414 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2415 talloc_free(tmp_ctx);
2416 return;
2418 rd = (struct srvid_request *)data.dptr;
2420 dump = talloc_zero(tmp_ctx, TDB_DATA);
2421 if (dump == NULL) {
2422 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
2423 talloc_free(tmp_ctx);
2424 return;
2426 ret = ctdb_dump_memory(ctdb, dump);
2427 if (ret != 0) {
2428 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
2429 talloc_free(tmp_ctx);
2430 return;
2433 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
2435 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
2436 if (ret != 0) {
2437 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
2438 talloc_free(tmp_ctx);
2439 return;
2442 talloc_free(tmp_ctx);
2446 handler for reload_nodes
2448 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
2449 TDB_DATA data, void *private_data)
2451 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2453 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
2455 ctdb_load_nodes_file(rec->ctdb);
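/*
  timer callback for a deferred rebalance: if any nodes are still queued
  for forced rebalancing when the timeout fires, run a takeover run so
  that IPs can be moved to them
 */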
2459 static void ctdb_rebalance_timeout(struct event_context *ev,
2460 struct timed_event *te,
2461 struct timeval t, void *p)
2463 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2465 if (rec->force_rebalance_nodes == NULL) {
2466 DEBUG(DEBUG_ERR,
2467 ("Rebalance timeout occurred - no nodes to rebalance\n"));
2468 return;
2471 DEBUG(DEBUG_NOTICE,
2472 ("Rebalance timeout occurred - do takeover run\n"));
2473 do_takeover_run(rec, rec->nodemap, false);
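/*
  handler for node rebalance requests: the message carries the PNN of a
  node that IPs should be rebalanced to.  Only the recovery master acts
  on these messages.  The PNN is appended to rec->force_rebalance_nodes
  and, if deferred_rebalance_on_node_add is set, a timer is armed to
  force a takeover run after that timeout.
 */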
2477 static void recd_node_rebalance_handler(struct ctdb_context *ctdb,
2478 uint64_t srvid,
2479 TDB_DATA data, void *private_data)
2481 uint32_t pnn;
2482 uint32_t *t;
2483 int len;
2484 uint32_t deferred_rebalance;
2485 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2487 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
2488 return;
2491 if (data.dsize != sizeof(uint32_t)) {
2492 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
2493 return;
2496 pnn = *(uint32_t *)&data.dptr[0];
2498 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
2500 /* Copy any existing list of nodes. There's probably some
2501 * sort of realloc variant that will do this but we need to
2502 * make sure that freeing the old array also cancels the timer
2503 * event for the timeout... not sure if realloc will do that.
2505 len = (rec->force_rebalance_nodes != NULL) ?
2506 talloc_array_length(rec->force_rebalance_nodes) :
2509 /* This allows duplicates to be added but they don't cause
2510 * harm. A call to add a duplicate PNN arguably means that
2511 * the timeout should be reset, so this is the simplest
2512 * solution.
2514 t = talloc_zero_array(rec, uint32_t, len+1);
2515 CTDB_NO_MEMORY_VOID(ctdb, t);
2516 if (len > 0) {
2517 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
2519 t[len] = pnn;
2521 talloc_free(rec->force_rebalance_nodes);
2523 rec->force_rebalance_nodes = t;
2525 /* If configured, setup a deferred takeover run to make sure
2526 * that certain nodes get IPs rebalanced to them. This will
2527 * be cancelled if a successful takeover run happens before
2528 * the timeout. Assign tunable value to variable for
2529 * readability.
2531 deferred_rebalance = ctdb->tunable.deferred_rebalance_on_node_add;
2532 if (deferred_rebalance != 0) {
2533 event_add_timed(ctdb->ev, rec->force_rebalance_nodes,
2534 timeval_current_ofs(deferred_rebalance, 0),
2535 ctdb_rebalance_timeout, rec);
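/*
  handler for public IP assignment updates: on the recovery master,
  record which node now hosts the given public IP in the IP assignment
  tree
 */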
2541 static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid,
2542 TDB_DATA data, void *private_data)
2544 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2545 struct ctdb_public_ip *ip;
2547 if (rec->recmaster != rec->ctdb->pnn) {
2548 DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
2549 return;
2552 if (data.dsize != sizeof(struct ctdb_public_ip)) {
2553 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
2554 return;
2557 ip = (struct ctdb_public_ip *)data.dptr;
2559 update_ip_assignment_tree(rec->ctdb, ip);
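/*
  common helper for the "disable takeover runs" and "disable recoveries"
  messages: validate the request, disable the given operation for the
  requested number of seconds and reply with our PNN on success or an
  error code on failure
 */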
2562 static void srvid_disable_and_reply(struct ctdb_context *ctdb,
2563 TDB_DATA data,
2564 struct ctdb_op_state *op_state)
2566 struct srvid_request_data *r;
2567 uint32_t timeout;
2568 TDB_DATA result;
2569 int32_t ret = 0;
2571 /* Validate input data */
2572 if (data.dsize != sizeof(struct srvid_request_data)) {
2573 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2574 "expecting %lu\n", (long unsigned)data.dsize,
2575 (long unsigned)sizeof(struct srvid_request)));
2576 return;
2578 if (data.dptr == NULL) {
2579 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2580 return;
2583 r = (struct srvid_request_data *)data.dptr;
2584 timeout = r->data;
2586 ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
2587 if (ret != 0) {
2588 goto done;
2591 /* Returning our PNN tells the caller that we succeeded */
2592 ret = ctdb_get_pnn(ctdb);
2593 done:
2594 result.dsize = sizeof(int32_t);
2595 result.dptr = (uint8_t *)&ret;
2596 srvid_request_reply(ctdb, (struct srvid_request *)r, result);
2599 static void disable_takeover_runs_handler(struct ctdb_context *ctdb,
2600 uint64_t srvid, TDB_DATA data,
2601 void *private_data)
2603 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2604 struct ctdb_recoverd);
2606 srvid_disable_and_reply(ctdb, data, rec->takeover_run);
2609 /* Backward compatibility for this SRVID */
2610 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
2611 TDB_DATA data, void *private_data)
2613 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2614 struct ctdb_recoverd);
2615 uint32_t timeout;
2617 if (data.dsize != sizeof(uint32_t)) {
2618 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2619 "expecting %lu\n", (long unsigned)data.dsize,
2620 (long unsigned)sizeof(uint32_t)));
2621 return;
2623 if (data.dptr == NULL) {
2624 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2625 return;
2628 timeout = *((uint32_t *)data.dptr);
2630 ctdb_op_disable(rec->takeover_run, ctdb->ev, timeout);
2633 static void disable_recoveries_handler(struct ctdb_context *ctdb,
2634 uint64_t srvid, TDB_DATA data,
2635 void *private_data)
2637 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2638 struct ctdb_recoverd);
2640 srvid_disable_and_reply(ctdb, data, rec->recovery);
2644 handler for ip reallocation requests: just add the request to the list and
2645 handle it later in the monitor_cluster loop so we do not recurse
2646 with other requests to takeover_run()
2648 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
2649 TDB_DATA data, void *private_data)
2651 struct srvid_request *request;
2652 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2653 struct ctdb_recoverd);
2655 if (data.dsize != sizeof(struct srvid_request)) {
2656 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2657 return;
2660 request = (struct srvid_request *)data.dptr;
2662 srvid_request_add(ctdb, &rec->reallocate_requests, request);
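/*
  process all queued ip reallocation requests: refresh the public IP
  information from all connected nodes, perform a takeover run and then
  reply to every queued requester with our PNN on success or an error
  code on failure
 */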
2665 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
2666 struct ctdb_recoverd *rec)
2668 TDB_DATA result;
2669 int32_t ret;
2670 uint32_t culprit;
2671 struct srvid_requests *current;
2673 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
2675 /* Only process requests that are currently pending. More
2676 * might come in while the takeover run is in progress and
2677 * they will need to be processed later since they might
2678 * be in response to flag changes.
2680 current = rec->reallocate_requests;
2681 rec->reallocate_requests = NULL;
2683 /* update the list of public ips that a node can handle for
2684 all connected nodes
2686 ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
2687 if (ret != 0) {
2688 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2689 culprit));
2690 rec->need_takeover_run = true;
2692 if (ret == 0) {
2693 if (do_takeover_run(rec, rec->nodemap, false)) {
2694 ret = ctdb_get_pnn(ctdb);
2695 } else {
2696 ret = -1;
2700 result.dsize = sizeof(int32_t);
2701 result.dptr = (uint8_t *)&ret;
2703 srvid_requests_reply(ctdb, &current, result);
2708 handler for recovery master elections
2710 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
2711 TDB_DATA data, void *private_data)
2713 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2714 int ret;
2715 struct election_message *em = (struct election_message *)data.dptr;
2717 /* Ignore election packets from ourself */
2718 if (ctdb->pnn == em->pnn) {
2719 return;
2722 /* we got an election packet - update the timeout for the election */
2723 talloc_free(rec->election_timeout);
2724 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2725 fast_start ?
2726 timeval_current_ofs(0, 500000) :
2727 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2728 ctdb_election_timeout, rec);
2730 /* someone called an election. check their election data
2731 and if we disagree and we would rather be the elected node,
2732 send a new election message to all other nodes
2734 if (ctdb_election_win(rec, em)) {
2735 if (!rec->send_election_te) {
2736 rec->send_election_te = event_add_timed(ctdb->ev, rec,
2737 timeval_current_ofs(0, 500000),
2738 election_send_request, rec);
2740 /*unban_all_nodes(ctdb);*/
2741 return;
2744 /* we didn't win */
2745 TALLOC_FREE(rec->send_election_te);
2747 /* Release the recovery lock file */
2748 if (ctdb_recovery_have_lock(ctdb)) {
2749 ctdb_recovery_unlock(ctdb);
2750 unban_all_nodes(ctdb);
2753 /* ok, let that guy become recmaster then */
2754 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
2755 if (ret != 0) {
2756 DEBUG(DEBUG_ERR, (__location__ " failed to set new recmaster\n"));
2757 return;
2760 return;
2765 force the start of the election process
2767 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2768 struct ctdb_node_map *nodemap)
2770 int ret;
2771 struct ctdb_context *ctdb = rec->ctdb;
2773 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2775 /* set all nodes to recovery mode to stop all internode traffic */
2776 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
2777 if (ret != 0) {
2778 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
2779 return;
2782 talloc_free(rec->election_timeout);
2783 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2784 fast_start ?
2785 timeval_current_ofs(0, 500000) :
2786 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2787 ctdb_election_timeout, rec);
2789 ret = send_election_request(rec, pnn);
2790 if (ret!=0) {
2791 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2792 return;
2795 /* wait for a few seconds to collect all responses */
2796 ctdb_wait_election(rec);
2802 handler for when a node changes its flags
2804 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
2805 TDB_DATA data, void *private_data)
2807 int ret;
2808 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2809 struct ctdb_node_map *nodemap=NULL;
2810 TALLOC_CTX *tmp_ctx;
2811 int i;
2812 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2813 int disabled_flag_changed;
2815 if (data.dsize != sizeof(*c)) {
2816 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2817 return;
2820 tmp_ctx = talloc_new(ctdb);
2821 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2823 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2824 if (ret != 0) {
2825 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2826 talloc_free(tmp_ctx);
2827 return;
2831 for (i=0;i<nodemap->num;i++) {
2832 if (nodemap->nodes[i].pnn == c->pnn) break;
2835 if (i == nodemap->num) {
2836 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existent node %u\n", c->pnn));
2837 talloc_free(tmp_ctx);
2838 return;
2841 if (c->old_flags != c->new_flags) {
2842 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
2845 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2847 nodemap->nodes[i].flags = c->new_flags;
2849 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2850 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2852 if (ret == 0) {
2853 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2854 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2857 if (ret == 0 &&
2858 ctdb->recovery_master == ctdb->pnn &&
2859 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2860 /* Only do the takeover run if the perm disabled or unhealthy
2861 flags changed, since these will cause an ip failover but not
2862 a recovery.
2863 If the node became disconnected or banned, this will also
2864 lead to an ip address failover, but that is handled
2865 during recovery.
2867 if (disabled_flag_changed) {
2868 rec->need_takeover_run = true;
2872 talloc_free(tmp_ctx);
2876 handler for when we need to push out flag changes to all other nodes
2878 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2879 TDB_DATA data, void *private_data)
2881 int ret;
2882 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2883 struct ctdb_node_map *nodemap=NULL;
2884 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2885 uint32_t recmaster;
2886 uint32_t *nodes;
2888 /* find the recovery master */
2889 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
2890 if (ret != 0) {
2891 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
2892 talloc_free(tmp_ctx);
2893 return;
2896 /* read the node flags from the recmaster */
2897 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
2898 if (ret != 0) {
2899 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2900 talloc_free(tmp_ctx);
2901 return;
2903 if (c->pnn >= nodemap->num) {
2904 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2905 talloc_free(tmp_ctx);
2906 return;
2909 /* send the flags update to all connected nodes */
2910 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2912 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2913 nodes, 0, CONTROL_TIMEOUT(),
2914 false, data,
2915 NULL, NULL,
2916 NULL) != 0) {
2917 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2919 talloc_free(tmp_ctx);
2920 return;
2923 talloc_free(tmp_ctx);
2927 struct verify_recmode_normal_data {
2928 uint32_t count;
2929 enum monitor_result status;
2932 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2934 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2937 /* one more node has responded with recmode data*/
2938 rmdata->count--;
2940 /* if we failed to get the recmode, then return an error and let
2941 the main loop try again.
2943 if (state->state != CTDB_CONTROL_DONE) {
2944 if (rmdata->status == MONITOR_OK) {
2945 rmdata->status = MONITOR_FAILED;
2947 return;
2950 /* if we got a response, then the recmode will be stored in the
2951 status field
2953 if (state->status != CTDB_RECOVERY_NORMAL) {
2954 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2955 rmdata->status = MONITOR_RECOVERY_NEEDED;
2958 return;
2962 /* verify that all nodes are in normal recovery mode */
2963 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2965 struct verify_recmode_normal_data *rmdata;
2966 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2967 struct ctdb_client_control_state *state;
2968 enum monitor_result status;
2969 int j;
2971 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2972 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2973 rmdata->count = 0;
2974 rmdata->status = MONITOR_OK;
2976 /* loop over all active nodes and send an async getrecmode call to
2977 them*/
2978 for (j=0; j<nodemap->num; j++) {
2979 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2980 continue;
2982 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2983 CONTROL_TIMEOUT(),
2984 nodemap->nodes[j].pnn);
2985 if (state == NULL) {
2986 /* we failed to send the control, treat this as
2987 an error and try again next iteration
2989 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2990 talloc_free(mem_ctx);
2991 return MONITOR_FAILED;
2994 /* set up the callback functions */
2995 state->async.fn = verify_recmode_normal_callback;
2996 state->async.private_data = rmdata;
2998 /* one more control to wait for to complete */
2999 rmdata->count++;
3003 /* now wait for up to the maximum number of seconds allowed
3004 or until all nodes we expect a response from have replied
3006 while (rmdata->count > 0) {
3007 event_loop_once(ctdb->ev);
3010 status = rmdata->status;
3011 talloc_free(mem_ctx);
3012 return status;
3016 struct verify_recmaster_data {
3017 struct ctdb_recoverd *rec;
3018 uint32_t count;
3019 uint32_t pnn;
3020 enum monitor_result status;
3023 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
3025 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
3028 /* one more node has responded with recmaster data*/
3029 rmdata->count--;
3031 /* if we failed to get the recmaster, then return an error and let
3032 the main loop try again.
3034 if (state->state != CTDB_CONTROL_DONE) {
3035 if (rmdata->status == MONITOR_OK) {
3036 rmdata->status = MONITOR_FAILED;
3038 return;
3041 /* if we got a response, then the recmaster will be stored in the
3042 status field
3044 if (state->status != rmdata->pnn) {
3045 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
3046 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
3047 rmdata->status = MONITOR_ELECTION_NEEDED;
3050 return;
3054 /* verify that all nodes agree that we are the recmaster */
3055 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
3057 struct ctdb_context *ctdb = rec->ctdb;
3058 struct verify_recmaster_data *rmdata;
3059 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3060 struct ctdb_client_control_state *state;
3061 enum monitor_result status;
3062 int j;
3064 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
3065 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
3066 rmdata->rec = rec;
3067 rmdata->count = 0;
3068 rmdata->pnn = pnn;
3069 rmdata->status = MONITOR_OK;
3071 /* loop over all active nodes and send an async getrecmaster call to
3072 them*/
3073 for (j=0; j<nodemap->num; j++) {
3074 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3075 continue;
3077 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
3078 CONTROL_TIMEOUT(),
3079 nodemap->nodes[j].pnn);
3080 if (state == NULL) {
3081 /* we failed to send the control, treat this as
3082 an error and try again next iteration
3084 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
3085 talloc_free(mem_ctx);
3086 return MONITOR_FAILED;
3089 /* set up the callback functions */
3090 state->async.fn = verify_recmaster_callback;
3091 state->async.private_data = rmdata;
3093 /* one more control to wait for to complete */
3094 rmdata->count++;
3098 /* now wait for up to the maximum number of seconds allowed
3099 or until all nodes we expect a response from have replied
3101 while (rmdata->count > 0) {
3102 event_loop_once(ctdb->ev);
3105 status = rmdata->status;
3106 talloc_free(mem_ctx);
3107 return status;
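/*
  compare the public network interfaces reported by the local daemon
  against the copy cached from the previous iteration; returns true if
  the number of interfaces, an interface name or a link state has
  changed, in which case the caller forces a takeover run
 */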
3110 static bool interfaces_have_changed(struct ctdb_context *ctdb,
3111 struct ctdb_recoverd *rec)
3113 struct ctdb_control_get_ifaces *ifaces = NULL;
3114 TALLOC_CTX *mem_ctx;
3115 bool ret = false;
3117 mem_ctx = talloc_new(NULL);
3119 /* Read the interfaces from the local node */
3120 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
3121 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
3122 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
3123 /* We could return an error. However, this will be
3124 * rare so we'll decide that the interfaces have
3125 * actually changed, just in case.
3127 talloc_free(mem_ctx);
3128 return true;
3131 if (!rec->ifaces) {
3132 /* We haven't been here before so things have changed */
3133 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
3134 ret = true;
3135 } else if (rec->ifaces->num != ifaces->num) {
3136 /* Number of interfaces has changed */
3137 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
3138 rec->ifaces->num, ifaces->num));
3139 ret = true;
3140 } else {
3141 /* See if interface names or link states have changed */
3142 int i;
3143 for (i = 0; i < rec->ifaces->num; i++) {
3144 struct ctdb_control_iface_info * iface = &rec->ifaces->ifaces[i];
3145 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
3146 DEBUG(DEBUG_NOTICE,
3147 ("Interface in slot %d changed: %s => %s\n",
3148 i, iface->name, ifaces->ifaces[i].name));
3149 ret = true;
3150 break;
3152 if (iface->link_state != ifaces->ifaces[i].link_state) {
3153 DEBUG(DEBUG_NOTICE,
3154 ("Interface %s changed state: %d => %d\n",
3155 iface->name, iface->link_state,
3156 ifaces->ifaces[i].link_state));
3157 ret = true;
3158 break;
3163 talloc_free(rec->ifaces);
3164 rec->ifaces = talloc_steal(rec, ifaces);
3166 talloc_free(mem_ctx);
3167 return ret;
3170 /* called to check that the local allocation of public ip addresses is ok.
3172 static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
3174 TALLOC_CTX *mem_ctx = talloc_new(NULL);
3175 struct ctdb_uptime *uptime1 = NULL;
3176 struct ctdb_uptime *uptime2 = NULL;
3177 int ret, j;
3178 bool need_takeover_run = false;
3180 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
3181 CTDB_CURRENT_NODE, &uptime1);
3182 if (ret != 0) {
3183 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
3184 talloc_free(mem_ctx);
3185 return -1;
3188 if (interfaces_have_changed(ctdb, rec)) {
3189 DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
3190 "local node %u - force takeover run\n",
3191 pnn));
3192 need_takeover_run = true;
3195 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
3196 CTDB_CURRENT_NODE, &uptime2);
3197 if (ret != 0) {
3198 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
3199 talloc_free(mem_ctx);
3200 return -1;
3203 /* skip the check if the startrecovery time has changed */
3204 if (timeval_compare(&uptime1->last_recovery_started,
3205 &uptime2->last_recovery_started) != 0) {
3206 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3207 talloc_free(mem_ctx);
3208 return 0;
3211 /* skip the check if the endrecovery time has changed */
3212 if (timeval_compare(&uptime1->last_recovery_finished,
3213 &uptime2->last_recovery_finished) != 0) {
3214 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3215 talloc_free(mem_ctx);
3216 return 0;
3219 /* skip the check if we have started but not finished recovery */
3220 if (timeval_compare(&uptime1->last_recovery_finished,
3221 &uptime1->last_recovery_started) != 1) {
3222 DEBUG(DEBUG_INFO, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
3223 talloc_free(mem_ctx);
3225 return 0;
3228 /* verify that we have the ip addresses we should have
3229 and we don't have ones we shouldn't have.
3230 if we find an inconsistency we set recmode to
3231 active on the local node and wait for the recmaster
3232 to do a full blown recovery.
3233 also if the pnn is -1 and we are healthy and can host the ip
3234 we also request an ip reallocation.
3236 if (ctdb->tunable.disable_ip_failover == 0) {
3237 struct ctdb_all_public_ips *ips = NULL;
3239 /* read the *available* IPs from the local node */
3240 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
3241 if (ret != 0) {
3242 DEBUG(DEBUG_ERR, ("Unable to get available public IPs from local node %u\n", pnn));
3243 talloc_free(mem_ctx);
3244 return -1;
3247 for (j=0; j<ips->num; j++) {
3248 if (ips->ips[j].pnn == -1 &&
3249 nodemap->nodes[pnn].flags == 0) {
3250 DEBUG(DEBUG_CRIT,("Public IP '%s' is not assigned and we could serve it\n",
3251 ctdb_addr_to_str(&ips->ips[j].addr)));
3252 need_takeover_run = true;
3256 talloc_free(ips);
3258 /* read the *known* IPs from the local node */
3259 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
3260 if (ret != 0) {
3261 DEBUG(DEBUG_ERR, ("Unable to get known public IPs from local node %u\n", pnn));
3262 talloc_free(mem_ctx);
3263 return -1;
3266 for (j=0; j<ips->num; j++) {
3267 if (ips->ips[j].pnn == pnn) {
3268 if (ctdb->do_checkpublicip && !ctdb_sys_have_ip(&ips->ips[j].addr)) {
3269 DEBUG(DEBUG_CRIT,("Public IP '%s' is assigned to us but not on an interface\n",
3270 ctdb_addr_to_str(&ips->ips[j].addr)));
3271 need_takeover_run = true;
3273 } else {
3274 if (ctdb->do_checkpublicip &&
3275 ctdb_sys_have_ip(&ips->ips[j].addr)) {
3277 DEBUG(DEBUG_CRIT,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
3278 ctdb_addr_to_str(&ips->ips[j].addr)));
3280 if (ctdb_ctrl_release_ip(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ips->ips[j]) != 0) {
3281 DEBUG(DEBUG_ERR,("Failed to release local IP address\n"));
3288 if (need_takeover_run) {
3289 struct srvid_request rd;
3290 TDB_DATA data;
3292 DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
3294 rd.pnn = ctdb->pnn;
3295 rd.srvid = 0;
3296 data.dptr = (uint8_t *)&rd;
3297 data.dsize = sizeof(rd);
3299 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
3300 if (ret != 0) {
3301 DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
3304 talloc_free(mem_ctx);
3305 return 0;
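/*
  callback for the async GET_NODEMAP control: stash the nodemap returned
  by each remote node in the remote_nodemaps array, indexed by PNN
 */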
3309 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
3311 struct ctdb_node_map **remote_nodemaps = callback_data;
3313 if (node_pnn >= ctdb->num_nodes) {
3314 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
3315 return;
3318 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
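/*
  fetch the nodemap from every active node in parallel so that the
  results can be cross-checked against our own view of the cluster
 */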
3322 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
3323 struct ctdb_node_map *nodemap,
3324 struct ctdb_node_map **remote_nodemaps)
3326 uint32_t *nodes;
3328 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
3329 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
3330 nodes, 0,
3331 CONTROL_TIMEOUT(), false, tdb_null,
3332 async_getnodemap_callback,
3333 NULL,
3334 remote_nodemaps) != 0) {
3335 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
3337 return -1;
3340 return 0;
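/*
  read the current reclock file setting from the local daemon and update
  our cached copy; whenever the setting is enabled, disabled or changed,
  any recovery lock held on the old file is released
 */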
3343 static int update_recovery_lock_file(struct ctdb_context *ctdb)
3345 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
3346 const char *reclockfile;
3348 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
3349 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
3350 talloc_free(tmp_ctx);
3351 return -1;
3354 if (reclockfile == NULL) {
3355 if (ctdb->recovery_lock_file != NULL) {
3356 DEBUG(DEBUG_NOTICE,("Recovery lock file disabled\n"));
3357 talloc_free(ctdb->recovery_lock_file);
3358 ctdb->recovery_lock_file = NULL;
3359 ctdb_recovery_unlock(ctdb);
3361 talloc_free(tmp_ctx);
3362 return 0;
3365 if (ctdb->recovery_lock_file == NULL) {
3366 DEBUG(DEBUG_NOTICE,
3367 ("Recovery lock file enabled (%s)\n", reclockfile));
3368 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3369 ctdb_recovery_unlock(ctdb);
3370 talloc_free(tmp_ctx);
3371 return 0;
3375 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
3376 talloc_free(tmp_ctx);
3377 return 0;
3380 DEBUG(DEBUG_NOTICE,
3381 ("Recovery lock file changed (now %s)\n", reclockfile));
3382 talloc_free(ctdb->recovery_lock_file);
3383 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3384 ctdb_recovery_unlock(ctdb);
3386 talloc_free(tmp_ctx);
3387 return 0;
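/*
  one iteration of the recovery daemon's monitoring logic: verify that
  the main daemon is alive, refresh tunables and cluster state, force an
  election if the recovery master is missing or unsuitable, and, on the
  recovery master, cross-check nodemaps, flags and vnnmaps across the
  cluster, triggering a recovery or a takeover run when an inconsistency
  is found
 */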
3390 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
3391 TALLOC_CTX *mem_ctx)
3393 uint32_t pnn;
3394 struct ctdb_node_map *nodemap=NULL;
3395 struct ctdb_node_map *recmaster_nodemap=NULL;
3396 struct ctdb_node_map **remote_nodemaps=NULL;
3397 struct ctdb_vnn_map *vnnmap=NULL;
3398 struct ctdb_vnn_map *remote_vnnmap=NULL;
3399 uint32_t num_lmasters;
3400 int32_t debug_level;
3401 int i, j, ret;
3402 bool self_ban;
3405 /* verify that the main daemon is still running */
3406 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
3407 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3408 exit(-1);
3411 /* ping the local daemon to tell it we are alive */
3412 ctdb_ctrl_recd_ping(ctdb);
3414 if (rec->election_timeout) {
3415 /* an election is in progress */
3416 return;
3419 /* read the debug level from the parent and update locally */
3420 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
3421 if (ret !=0) {
3422 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
3423 return;
3425 DEBUGLEVEL = debug_level;
3427 /* get relevant tunables */
3428 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
3429 if (ret != 0) {
3430 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
3431 return;
3434 /* get runstate */
3435 ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
3436 CTDB_CURRENT_NODE, &ctdb->runstate);
3437 if (ret != 0) {
3438 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
3439 return;
3442 /* get the current recovery lock file from the server */
3443 if (update_recovery_lock_file(ctdb) != 0) {
3444 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
3445 return;
3448 /* Make sure that if recovery lock verification becomes disabled,
3449 we close the file
3451 if (ctdb->recovery_lock_file == NULL) {
3452 ctdb_recovery_unlock(ctdb);
3455 pnn = ctdb_get_pnn(ctdb);
3457 /* get the vnnmap */
3458 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
3459 if (ret != 0) {
3460 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
3461 return;
3465 /* get number of nodes */
3466 if (rec->nodemap) {
3467 talloc_free(rec->nodemap);
3468 rec->nodemap = NULL;
3469 nodemap=NULL;
3471 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
3472 if (ret != 0) {
3473 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
3474 return;
3476 nodemap = rec->nodemap;
3478 /* remember our own node flags */
3479 rec->node_flags = nodemap->nodes[pnn].flags;
3481 ban_misbehaving_nodes(rec, &self_ban);
3482 if (self_ban) {
3483 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
3484 return;
3487 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
3488 also frozen and that the recmode is set to active.
3490 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
3491 /* If this node has become inactive then we want to
3492 * reduce the chances of it taking over the recovery
3493 * master role when it becomes active again. This
3494 * helps to stabilise the recovery master role so that
3495 * it stays on the most stable node.
3497 rec->priority_time = timeval_current();
3499 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3500 if (ret != 0) {
3501 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3503 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3504 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
3506 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3507 if (ret != 0) {
3508 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
3510 return;
3512 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
3513 if (ret != 0) {
3514 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node in STOPPED or BANNED state\n"));
3515 return;
3519 /* If this node is stopped or banned then it is not the recovery
3520 * master, so don't do anything. This prevents a stopped or banned
3521 * node from starting an election and sending unnecessary controls.
3523 return;
3526 /* check which node is the recovery master */
3527 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
3528 if (ret != 0) {
3529 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
3530 return;
3533 /* If we are not the recmaster then do some housekeeping */
3534 if (rec->recmaster != pnn) {
3535 /* Ignore any IP reallocate requests - only recmaster
3536 * processes them
3538 TALLOC_FREE(rec->reallocate_requests);
3539 /* Clear any nodes that should be force rebalanced in
3540 * the next takeover run. If the recovery master role
3541 * has moved then we don't want to process these some
3542 * time in the future.
3544 TALLOC_FREE(rec->force_rebalance_nodes);
3547 /* This is a special case. When the recovery daemon is started, recmaster
3548 * is set to -1. If a node is not started in the stopped state, then
3549 * start an election to decide the recovery master
3551 if (rec->recmaster == (uint32_t)-1) {
3552 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
3553 force_election(rec, pnn, nodemap);
3554 return;
3557 /* update the capabilities for all nodes */
3558 ret = update_capabilities(rec, nodemap);
3559 if (ret != 0) {
3560 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
3561 return;
3565 * If the current recmaster does not have CTDB_CAP_RECMASTER,
3566 * but we have, then force an election and try to become the new
3567 * recmaster.
3569 if (!ctdb_node_has_capabilities(rec->caps,
3570 rec->recmaster,
3571 CTDB_CAP_RECMASTER) &&
3572 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
3573 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
3574 DEBUG(DEBUG_ERR, (__location__ " Current recmaster node %u does not have CAP_RECMASTER,"
3575 " but we (node %u) have - force an election\n",
3576 rec->recmaster, pnn));
3577 force_election(rec, pnn, nodemap);
3578 return;
3581 /* verify that the recmaster node is still active */
3582 for (j=0; j<nodemap->num; j++) {
3583 if (nodemap->nodes[j].pnn==rec->recmaster) {
3584 break;
3588 if (j == nodemap->num) {
3589 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
3590 force_election(rec, pnn, nodemap);
3591 return;
3594 /* if recovery master is disconnected we must elect a new recmaster */
3595 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
3596 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
3597 force_election(rec, pnn, nodemap);
3598 return;
3601 /* get nodemap from the recovery master to check if it is inactive */
3602 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3603 mem_ctx, &recmaster_nodemap);
3604 if (ret != 0) {
3605 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
3606 nodemap->nodes[j].pnn));
3607 return;
3611 if ((recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) &&
3612 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
3613 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
3615 * update our nodemap to carry the recmaster's notion of
3616 * its own flags, so that we don't keep freezing the
3617 * inactive recmaster node...
3619 nodemap->nodes[j].flags = recmaster_nodemap->nodes[j].flags;
3620 force_election(rec, pnn, nodemap);
3621 return;
3624 /* verify that we have all ip addresses we should have and we don't
3625 * have addresses we shouldn't have.
3627 if (ctdb->tunable.disable_ip_failover == 0 &&
3628 !ctdb_op_is_disabled(rec->takeover_run)) {
3629 if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
3630 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
3635 /* if we are not the recmaster then we do not need to check
3636 if recovery is needed
3638 if (pnn != rec->recmaster) {
3639 return;
3643 /* ensure our local copies of flags are right */
3644 ret = update_local_flags(rec, nodemap);
3645 if (ret == MONITOR_ELECTION_NEEDED) {
3646 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
3647 force_election(rec, pnn, nodemap);
3648 return;
3650 if (ret != MONITOR_OK) {
3651 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3652 return;
3655 if (ctdb->num_nodes != nodemap->num) {
3656 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3657 ctdb_load_nodes_file(ctdb);
3658 return;
3661 /* verify that all active nodes agree that we are the recmaster */
3662 switch (verify_recmaster(rec, nodemap, pnn)) {
3663 case MONITOR_RECOVERY_NEEDED:
3664 /* can not happen */
3665 return;
3666 case MONITOR_ELECTION_NEEDED:
3667 force_election(rec, pnn, nodemap);
3668 return;
3669 case MONITOR_OK:
3670 break;
3671 case MONITOR_FAILED:
3672 return;
3676 if (rec->need_recovery) {
3677 /* a previous recovery didn't finish */
3678 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3679 return;
3682 /* verify that all active nodes are in normal mode
3683 and not in recovery mode
3685 switch (verify_recmode(ctdb, nodemap)) {
3686 case MONITOR_RECOVERY_NEEDED:
3687 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3688 return;
3689 case MONITOR_FAILED:
3690 return;
3691 case MONITOR_ELECTION_NEEDED:
3692 /* can not happen */
3693 case MONITOR_OK:
3694 break;
3698 if (ctdb->recovery_lock_file != NULL) {
3699 /* We must already hold the recovery lock */
3700 if (!ctdb_recovery_have_lock(ctdb)) {
3701 DEBUG(DEBUG_ERR,("Failed recovery lock sanity check. Force a recovery\n"));
3702 ctdb_set_culprit(rec, ctdb->pnn);
3703 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3704 return;
3709 /* if there are takeovers requested, perform it and notify the waiters */
3710 if (!ctdb_op_is_disabled(rec->takeover_run) &&
3711 rec->reallocate_requests) {
3712 process_ipreallocate_requests(ctdb, rec);
3715 /* If recoveries are disabled then there is no use doing any
3716 * nodemap or flags checks. Recoveries might be disabled due
3717 * to "reloadnodes", so doing these checks might cause an
3718 * unnecessary recovery. */
3719 if (ctdb_op_is_disabled(rec->recovery)) {
3720 return;
3723 /* get the nodemap for all active remote nodes
3725 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3726 if (remote_nodemaps == NULL) {
3727 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3728 return;
3730 for(i=0; i<nodemap->num; i++) {
3731 remote_nodemaps[i] = NULL;
3733 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3734 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3735 return;
3738 /* verify that all other nodes have the same nodemap as we have
3740 for (j=0; j<nodemap->num; j++) {
3741 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3742 continue;
3745 if (remote_nodemaps[j] == NULL) {
3746 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3747 ctdb_set_culprit(rec, j);
3749 return;
3752 /* if the nodes disagree on how many nodes there are
3753 then this is a good reason to try recovery
3755 if (remote_nodemaps[j]->num != nodemap->num) {
3756 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3757 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3758 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3759 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3760 return;
3763 /* if the nodes disagree on which nodes exist and are
3764 active, then that is also a good reason to do recovery
3766 for (i=0;i<nodemap->num;i++) {
3767 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3768 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3769 nodemap->nodes[j].pnn, i,
3770 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3771 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3772 do_recovery(rec, mem_ctx, pnn, nodemap,
3773 vnnmap);
3774 return;
3780 * Update node flags obtained from each active node. This ensures we have
3781 * up-to-date information for all the nodes.
3783 for (j=0; j<nodemap->num; j++) {
3784 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3785 continue;
3787 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
3790 for (j=0; j<nodemap->num; j++) {
3791 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3792 continue;
3795 /* verify the flags are consistent
3797 for (i=0; i<nodemap->num; i++) {
3798 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3799 continue;
3802 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3803 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3804 nodemap->nodes[j].pnn,
3805 nodemap->nodes[i].pnn,
3806 remote_nodemaps[j]->nodes[i].flags,
3807 nodemap->nodes[i].flags));
3808 if (i == j) {
3809 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3810 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3811 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3812 do_recovery(rec, mem_ctx, pnn, nodemap,
3813 vnnmap);
3814 return;
3815 } else {
3816 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3817 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3818 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3819 do_recovery(rec, mem_ctx, pnn, nodemap,
3820 vnnmap);
3821 return;
3828 /* count how many active nodes with the lmaster capability there are */
3829 num_lmasters = 0;
3830 for (i=0; i<nodemap->num; i++) {
3831 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
3832 if (ctdb_node_has_capabilities(rec->caps,
3833 ctdb->nodes[i]->pnn,
3834 CTDB_CAP_LMASTER)) {
3835 num_lmasters++;
3841 /* There must be the same number of lmasters in the vnn map as
3842 * there are active nodes with the lmaster capability... or
3843 * do a recovery.
3845 if (vnnmap->size != num_lmasters) {
3846 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
3847 vnnmap->size, num_lmasters));
3848 ctdb_set_culprit(rec, ctdb->pnn);
3849 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3850 return;
3853 /* verify that all active nodes in the nodemap also exist in
3854 the vnnmap.
3856 for (j=0; j<nodemap->num; j++) {
3857 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3858 continue;
3860 if (nodemap->nodes[j].pnn == pnn) {
3861 continue;
3864 for (i=0; i<vnnmap->size; i++) {
3865 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3866 break;
3869 if (i == vnnmap->size) {
3870 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3871 nodemap->nodes[j].pnn));
3872 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3873 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3874 return;
3879 /* verify that all other nodes have the same vnnmap
3880 and are from the same generation
3882 for (j=0; j<nodemap->num; j++) {
3883 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3884 continue;
3886 if (nodemap->nodes[j].pnn == pnn) {
3887 continue;
3890 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3891 mem_ctx, &remote_vnnmap);
3892 if (ret != 0) {
3893 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3894 nodemap->nodes[j].pnn));
3895 return;
3898 /* verify the vnnmap generation is the same */
3899 if (vnnmap->generation != remote_vnnmap->generation) {
3900 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3901 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3902 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3903 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3904 return;
3907 /* verify the vnnmap size is the same */
3908 if (vnnmap->size != remote_vnnmap->size) {
3909 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3910 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3911 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3912 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3913 return;
3916 /* verify the vnnmap is the same */
3917 for (i=0;i<vnnmap->size;i++) {
3918 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3919 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3920 nodemap->nodes[j].pnn));
3921 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3922 do_recovery(rec, mem_ctx, pnn, nodemap,
3923 vnnmap);
3924 return;
3929 /* we might need to change who has what IP assigned */
3930 if (rec->need_takeover_run) {
3931 uint32_t culprit = (uint32_t)-1;
3933 rec->need_takeover_run = false;
3935 /* update the list of public ips that a node can handle for
3936 all connected nodes
3938 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
3939 if (ret != 0) {
3940 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
3941 culprit));
3942 rec->need_takeover_run = true;
3943 return;
3946 /* execute the "startrecovery" event script on all nodes */
3947 ret = run_startrecovery_eventscript(rec, nodemap);
3948 if (ret!=0) {
3949 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
3950 ctdb_set_culprit(rec, ctdb->pnn);
3951 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3952 return;
3955 /* If the takeover run fails, the offending nodes are
3956 * assigned ban culprit counts and the takeover run is retried.
3957 * If it fails repeatedly, the offending node eventually gets
3958 * banned.
3960 * If rec->need_takeover_run is not set back to true on such a
3961 * failure, monitoring remains disabled cluster-wide (it is turned
3962 * off by the startrecovery eventscript) and will never be re-enabled.
3964 if (!do_takeover_run(rec, nodemap, true)) {
3965 return;
3968 /* execute the "recovered" event script on all nodes */
3969 ret = run_recovered_eventscript(rec, nodemap, "monitor_cluster");
3970 #if 0
3971 // we can't check whether the event completed successfully
3972 // since this script WILL fail if the node is in recovery mode
3973 // and if that race happens, the code here would just cause a second
3974 // cascading recovery.
3975 if (ret!=0) {
3976 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3977 ctdb_set_culprit(rec, ctdb->pnn);
3978 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3980 #endif
3985 the main monitoring loop
3987 static void monitor_cluster(struct ctdb_context *ctdb)
3989 struct ctdb_recoverd *rec;
3991 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
3993 rec = talloc_zero(ctdb, struct ctdb_recoverd);
3994 CTDB_NO_MEMORY_FATAL(ctdb, rec);
3996 rec->ctdb = ctdb;
3998 rec->takeover_run = ctdb_op_init(rec, "takeover runs");
3999 CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);
4001 rec->recovery = ctdb_op_init(rec, "recoveries");
4002 CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);
4004 rec->priority_time = timeval_current();
4006 /* register a message port for sending memory dumps */
4007 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
4009 /* register a message port for recovery elections */
4010 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
4012 /* when nodes are disabled/enabled */
4013 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
4015 /* when we are asked to push out a flag change */
4016 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
4018 /* register a message port for vacuum fetch */
4019 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
4021 /* register a message port for reloadnodes */
4022 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
4024 /* register a message port for performing a takeover run */
4025 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
4027 /* register a message port for disabling the ip check for a short while */
4028 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
4030 /* register a message port for updating the recovery daemons node assignment for an ip */
4031 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);
4033 /* register a message port for forcing a rebalance of a node at the next
4034 reallocation */
4035 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
4037 /* Register a message port for disabling takeover runs */
4038 ctdb_client_set_message_handler(ctdb,
4039 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
4040 disable_takeover_runs_handler, rec);
4042 /* Register a message port for disabling recoveries */
4043 ctdb_client_set_message_handler(ctdb,
4044 CTDB_SRVID_DISABLE_RECOVERIES,
4045 disable_recoveries_handler, rec);
4047 /* register a message port for detaching database */
4048 ctdb_client_set_message_handler(ctdb,
4049 CTDB_SRVID_DETACH_DATABASE,
4050 detach_database_handler, rec);
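/* Illustrative sketch only (not part of this file): a ctdb client can drive
 * one of the message ports registered above by sending a struct srvid_request
 * to the recovery master.  For a takeover run that would look roughly like
 * the following, where recmaster_pnn and reply_srvid are placeholders chosen
 * by the caller:
 *
 *	struct srvid_request rq;
 *	TDB_DATA data;
 *
 *	rq.pnn   = ctdb_get_pnn(ctdb);	// node the reply should be sent to
 *	rq.srvid = reply_srvid;		// srvid the caller listens on
 *	data.dptr  = (uint8_t *)&rq;
 *	data.dsize = sizeof(rq);
 *	ctdb_client_send_message(ctdb, recmaster_pnn,
 *				 CTDB_SRVID_TAKEOVER_RUN, data);
 *
 * The handler registered above is then expected to reply to rq.pnn/rq.srvid
 * once the takeover run has completed.
 */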
4052 for (;;) {
4053 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
4054 struct timeval start;
4055 double elapsed;
4057 if (!mem_ctx) {
4058 DEBUG(DEBUG_CRIT,(__location__
4059 " Failed to create temp context\n"));
4060 exit(-1);
4063 start = timeval_current();
4064 main_loop(ctdb, rec, mem_ctx);
4065 talloc_free(mem_ctx);
4067 /* we only check for recovery once every RecoverInterval seconds (by default, one second) */
4068 elapsed = timeval_elapsed(&start);
4069 if (elapsed < ctdb->tunable.recover_interval) {
4070 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
4071 - elapsed);
4077 event handler for when the main ctdbd dies
4079 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
4080 uint16_t flags, void *private_data)
4082 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
4083 _exit(1);
4087 called regularly to verify that the recovery daemon is still running
4089 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
4090 struct timeval yt, void *p)
4092 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
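/* sending signal 0 does not deliver a signal; it merely checks that the
   recovery daemon pid still exists */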
4094 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
4095 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
4097 event_add_timed(ctdb->ev, ctdb, timeval_zero(),
4098 ctdb_restart_recd, ctdb);
4100 return;
4103 event_add_timed(ctdb->ev, ctdb->recd_ctx,
4104 timeval_current_ofs(30, 0),
4105 ctdb_check_recd, ctdb);
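/* SIGCHLD handler for the recovery daemon: reap any exited children
   (non-blocking, via WNOHANG) so they cannot linger as zombies */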
4108 static void recd_sig_child_handler(struct event_context *ev,
4109 struct signal_event *se, int signum, int count,
4110 void *dont_care,
4111 void *private_data)
4113 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4114 int status;
4115 pid_t pid = -1;
4117 while (pid != 0) {
4118 pid = waitpid(-1, &status, WNOHANG);
4119 if (pid == -1) {
4120 if (errno != ECHILD) {
4121 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
4123 return;
4125 if (pid > 0) {
4126 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
4132 start up the recovery daemon as a child of the main ctdb daemon
4134 int ctdb_start_recoverd(struct ctdb_context *ctdb)
4136 int fd[2];
4137 struct signal_event *se;
4138 struct tevent_fd *fde;
4140 if (pipe(fd) != 0) {
4141 return -1;
4144 ctdb->recoverd_pid = ctdb_fork(ctdb);
4145 if (ctdb->recoverd_pid == -1) {
4146 return -1;
4149 if (ctdb->recoverd_pid != 0) {
4150 talloc_free(ctdb->recd_ctx);
4151 ctdb->recd_ctx = talloc_new(ctdb);
4152 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
4154 close(fd[0]);
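/* parent: keep the write end of the pipe open; it is closed implicitly when
   ctdbd exits, which is how the child detects that its parent is gone */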
4155 event_add_timed(ctdb->ev, ctdb->recd_ctx,
4156 timeval_current_ofs(30, 0),
4157 ctdb_check_recd, ctdb);
4158 return 0;
4161 close(fd[1]);
4163 srandom(getpid() ^ time(NULL));
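/* give the forked child its own random seed so it does not continue the
   parent's random sequence */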
4165 ctdb_set_process_name("ctdb_recoverd");
4166 if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
4167 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
4168 exit(1);
4171 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
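/* child: watch the read end of the pipe; the parent never writes to it, so
   the only event that can arrive is EOF when the main daemon goes away, at
   which point ctdb_recoverd_parent() exits this process */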
4173 fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
4174 ctdb_recoverd_parent, &fd[0]);
4175 tevent_fd_set_auto_close(fde);
4177 /* set up a handler to pick up sigchld */
4178 se = event_add_signal(ctdb->ev, ctdb,
4179 SIGCHLD, 0,
4180 recd_sig_child_handler,
4181 ctdb);
4182 if (se == NULL) {
4183 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
4184 exit(1);
4187 monitor_cluster(ctdb);
4189 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
4190 return -1;
4194 shut down the recovery daemon
4196 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
4198 if (ctdb->recoverd_pid == 0) {
4199 return;
4202 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
4203 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
4205 TALLOC_FREE(ctdb->recd_ctx);
4206 TALLOC_FREE(ctdb->recd_ping_count);
4209 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te,
4210 struct timeval t, void *private_data)
4212 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4214 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
4215 ctdb_stop_recoverd(ctdb);
4216 ctdb_start_recoverd(ctdb);