[Samba.git] / ctdb / server / ctdb_recoverd.c
blob be53de615f766d58d745109a04f23d44e185159b
1 /*
2 ctdb recovery daemon
4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
20 #include "includes.h"
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
25 #include "popt.h"
26 #include "cmdline.h"
27 #include "../include/ctdb_client.h"
28 #include "../include/ctdb_private.h"
29 #include "lib/tdb_wrap/tdb_wrap.h"
30 #include "lib/util/dlinklist.h"
33 /* List of SRVID requests that need to be processed */
34 struct srvid_list {
35 struct srvid_list *next, *prev;
36 struct srvid_request *request;
39 struct srvid_requests {
40 struct srvid_list *requests;
43 static void srvid_request_reply(struct ctdb_context *ctdb,
44 struct srvid_request *request,
45 TDB_DATA result)
47 /* Someone that sent srvid==0 does not want a reply */
48 if (request->srvid == 0) {
49 talloc_free(request);
50 return;
53 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
54 result) == 0) {
55 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
56 (unsigned)request->pnn,
57 (unsigned long long)request->srvid));
58 } else {
59 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
60 (unsigned)request->pnn,
61 (unsigned long long)request->srvid));
64 talloc_free(request);
67 static void srvid_requests_reply(struct ctdb_context *ctdb,
68 struct srvid_requests **requests,
69 TDB_DATA result)
71 struct srvid_list *r;
73 for (r = (*requests)->requests; r != NULL; r = r->next) {
74 srvid_request_reply(ctdb, r->request, result);
77 /* Free the list structure... */
78 TALLOC_FREE(*requests);
81 static void srvid_request_add(struct ctdb_context *ctdb,
82 struct srvid_requests **requests,
83 struct srvid_request *request)
85 struct srvid_list *t;
86 int32_t ret;
87 TDB_DATA result;
89 if (*requests == NULL) {
90 *requests = talloc_zero(ctdb, struct srvid_requests);
91 if (*requests == NULL) {
92 goto nomem;
96 t = talloc_zero(*requests, struct srvid_list);
97 if (t == NULL) {
98 /* If *requests was just allocated above then free it */
99 if ((*requests)->requests == NULL) {
100 TALLOC_FREE(*requests);
102 goto nomem;
105 t->request = (struct srvid_request *)talloc_steal(t, request);
106 DLIST_ADD((*requests)->requests, t);
108 return;
110 nomem:
111 /* Failed to add the request to the list. Send a fail. */
112 DEBUG(DEBUG_ERR, (__location__
113 " Out of memory, failed to queue SRVID request\n"));
114 ret = -ENOMEM;
115 result.dsize = sizeof(ret);
116 result.dptr = (uint8_t *)&ret;
117 srvid_request_reply(ctdb, request, result);
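/*
 * Usage sketch (illustrative, not verbatim from this file): once the
 * operation the requests were queued for has completed, every queued
 * request is answered with a single int32_t status, mirroring the nomem
 * path in srvid_request_add() above:
 *
 *	int32_t status = 0;
 *	TDB_DATA result;
 *
 *	result.dsize = sizeof(status);
 *	result.dptr = (uint8_t *)&status;
 *	srvid_requests_reply(ctdb, &rec->reallocate_requests, result);
 *
 * Requests queued with srvid == 0 are freed without a reply; all others get
 * a message back to their (pnn, srvid) pair.
 */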
120 /* An abstraction to allow an operation (takeover runs, recoveries,
121 * ...) to be disabled for a given timeout */
122 struct ctdb_op_state {
123 struct tevent_timer *timer;
124 bool in_progress;
125 const char *name;
128 static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
130 struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
132 if (state != NULL) {
133 state->in_progress = false;
134 state->name = name;
137 return state;
140 static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
142 return state->timer != NULL;
145 static bool ctdb_op_begin(struct ctdb_op_state *state)
147 if (ctdb_op_is_disabled(state)) {
148 DEBUG(DEBUG_NOTICE,
149 ("Unable to begin - %s are disabled\n", state->name));
150 return false;
153 state->in_progress = true;
154 return true;
157 static bool ctdb_op_end(struct ctdb_op_state *state)
159 return state->in_progress = false;
162 static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
164 return state->in_progress;
167 static void ctdb_op_enable(struct ctdb_op_state *state)
169 TALLOC_FREE(state->timer);
172 static void ctdb_op_timeout_handler(struct event_context *ev,
173 struct timed_event *te,
174 struct timeval yt, void *p)
176 struct ctdb_op_state *state =
177 talloc_get_type(p, struct ctdb_op_state);
179 DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
180 ctdb_op_enable(state);
183 static int ctdb_op_disable(struct ctdb_op_state *state,
184 struct tevent_context *ev,
185 uint32_t timeout)
187 if (timeout == 0) {
188 DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
189 ctdb_op_enable(state);
190 return 0;
193 if (state->in_progress) {
194 DEBUG(DEBUG_ERR,
195 ("Unable to disable %s - in progress\n", state->name));
196 return -EAGAIN;
199 DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
200 state->name, timeout));
202 /* Clear any old timers */
203 talloc_free(state->timer);
205 /* Arrange for the timeout to occur */
206 state->timer = tevent_add_timer(ev, state,
207 timeval_current_ofs(timeout, 0),
208 ctdb_op_timeout_handler, state);
209 if (state->timer == NULL) {
210 DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
211 return -ENOMEM;
214 return 0;
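/*
 * Usage sketch (illustrative): a control or SRVID handler that received a
 * uint32_t timeout would disable an operation with
 *
 *	ret = ctdb_op_disable(rec->takeover_run, ctdb->ev, timeout);
 *
 * A timeout of 0 re-enables immediately; a non-zero timeout arms a tevent
 * timer and ctdb_op_timeout_handler() re-enables the operation when it
 * fires.  While disabled, ctdb_op_begin() refuses to start the operation.
 */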
217 struct ctdb_banning_state {
218 uint32_t count;
219 struct timeval last_reported_time;
223 private state of recovery daemon
225 struct ctdb_recoverd {
226 struct ctdb_context *ctdb;
227 uint32_t recmaster;
228 uint32_t last_culprit_node;
229 struct ctdb_node_map *nodemap;
230 struct timeval priority_time;
231 bool need_takeover_run;
232 bool need_recovery;
233 uint32_t node_flags;
234 struct timed_event *send_election_te;
235 struct timed_event *election_timeout;
236 struct srvid_requests *reallocate_requests;
237 struct ctdb_op_state *takeover_run;
238 struct ctdb_op_state *recovery;
239 struct ctdb_control_get_ifaces *ifaces;
240 uint32_t *force_rebalance_nodes;
241 struct ctdb_node_capabilities *caps;
244 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
245 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
247 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data);
250 ban a node for a period of time
252 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
254 int ret;
255 struct ctdb_context *ctdb = rec->ctdb;
256 struct ctdb_ban_time bantime;
258 if (!ctdb_validate_pnn(ctdb, pnn)) {
259 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
260 return;
263 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
265 bantime.pnn = pnn;
266 bantime.time = ban_time;
268 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
269 if (ret != 0) {
270 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
271 return;
276 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
280 remember the troublemaker
282 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
284 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
285 struct ctdb_banning_state *ban_state;
287 if (culprit > ctdb->num_nodes) {
288 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
289 return;
292 /* If we are banned or stopped, do not set other nodes as culprits */
293 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
294 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
295 return;
298 if (ctdb->nodes[culprit]->ban_state == NULL) {
299 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
300 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
304 ban_state = ctdb->nodes[culprit]->ban_state;
305 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
306 /* this was the first time in a long while this node
307 misbehaved so we will forgive any old transgressions.
309 ban_state->count = 0;
312 ban_state->count += count;
313 ban_state->last_reported_time = timeval_current();
314 rec->last_culprit_node = culprit;
318 remember the troublemaker
320 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
322 ctdb_set_culprit_count(rec, culprit, 1);
326 /* this callback is called for every node that failed to execute the
327 recovered event
329 static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
331 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
333 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));
335 ctdb_set_culprit(rec, node_pnn);
339 run the "recovered" eventscript on all nodes
341 static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, const char *caller)
343 TALLOC_CTX *tmp_ctx;
344 uint32_t *nodes;
345 struct ctdb_context *ctdb = rec->ctdb;
347 tmp_ctx = talloc_new(ctdb);
348 CTDB_NO_MEMORY(ctdb, tmp_ctx);
350 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
351 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
352 nodes, 0,
353 CONTROL_TIMEOUT(), false, tdb_null,
354 NULL, recovered_fail_callback,
355 rec) != 0) {
356 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
358 talloc_free(tmp_ctx);
359 return -1;
362 talloc_free(tmp_ctx);
363 return 0;
366 /* this callback is called for every node that failed to execute the
367 start recovery event
369 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
371 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
373 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
375 ctdb_set_culprit(rec, node_pnn);
379 run the "startrecovery" eventscript on all nodes
381 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
383 TALLOC_CTX *tmp_ctx;
384 uint32_t *nodes;
385 struct ctdb_context *ctdb = rec->ctdb;
387 tmp_ctx = talloc_new(ctdb);
388 CTDB_NO_MEMORY(ctdb, tmp_ctx);
390 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
391 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
392 nodes, 0,
393 CONTROL_TIMEOUT(), false, tdb_null,
394 NULL,
395 startrecovery_fail_callback,
396 rec) != 0) {
397 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
398 talloc_free(tmp_ctx);
399 return -1;
402 talloc_free(tmp_ctx);
403 return 0;
407 update the node capabilities for all connected nodes
409 static int update_capabilities(struct ctdb_recoverd *rec,
410 struct ctdb_node_map *nodemap)
412 uint32_t *capp;
413 TALLOC_CTX *tmp_ctx;
414 struct ctdb_node_capabilities *caps;
415 struct ctdb_context *ctdb = rec->ctdb;
417 tmp_ctx = talloc_new(rec);
418 CTDB_NO_MEMORY(ctdb, tmp_ctx);
420 caps = ctdb_get_capabilities(ctdb, tmp_ctx,
421 CONTROL_TIMEOUT(), nodemap);
423 if (caps == NULL) {
424 DEBUG(DEBUG_ERR,
425 (__location__ " Failed to get node capabilities\n"));
426 talloc_free(tmp_ctx);
427 return -1;
430 capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
431 if (capp == NULL) {
432 DEBUG(DEBUG_ERR,
433 (__location__
434 " Capabilities don't include current node.\n"));
435 talloc_free(tmp_ctx);
436 return -1;
438 ctdb->capabilities = *capp;
440 TALLOC_FREE(rec->caps);
441 rec->caps = talloc_steal(rec, caps);
443 talloc_free(tmp_ctx);
444 return 0;
447 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
449 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
451 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
452 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
455 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
457 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
459 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
460 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
464 change recovery mode on all nodes
466 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
468 TDB_DATA data;
469 uint32_t *nodes;
470 TALLOC_CTX *tmp_ctx;
472 tmp_ctx = talloc_new(ctdb);
473 CTDB_NO_MEMORY(ctdb, tmp_ctx);
475 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
477 data.dsize = sizeof(uint32_t);
478 data.dptr = (unsigned char *)&rec_mode;
480 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
481 nodes, 0,
482 CONTROL_TIMEOUT(),
483 false, data,
484 NULL, NULL,
485 NULL) != 0) {
486 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
487 talloc_free(tmp_ctx);
488 return -1;
491 /* freeze all nodes */
492 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
493 int i;
495 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
496 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
497 nodes, i,
498 CONTROL_TIMEOUT(),
499 false, tdb_null,
500 NULL,
501 set_recmode_fail_callback,
502 rec) != 0) {
503 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
504 talloc_free(tmp_ctx);
505 return -1;
510 talloc_free(tmp_ctx);
511 return 0;
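/*
 * Note on the freeze step above: switching to CTDB_RECOVERY_ACTIVE first
 * sets the recovery mode on all active nodes and then freezes the databases
 * one priority level at a time (1..NUM_DB_PRIORITIES).  A node that fails
 * to freeze is charged nodemap->num banning credits through
 * set_recmode_fail_callback(), so repeated failures quickly push it over
 * the banning threshold used in ban_misbehaving_nodes().
 */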
515 change recovery master on all nodes
517 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
519 TDB_DATA data;
520 TALLOC_CTX *tmp_ctx;
521 uint32_t *nodes;
523 tmp_ctx = talloc_new(ctdb);
524 CTDB_NO_MEMORY(ctdb, tmp_ctx);
526 data.dsize = sizeof(uint32_t);
527 data.dptr = (unsigned char *)&pnn;
529 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
530 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
531 nodes, 0,
532 CONTROL_TIMEOUT(), false, data,
533 NULL, NULL,
534 NULL) != 0) {
535 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
536 talloc_free(tmp_ctx);
537 return -1;
540 talloc_free(tmp_ctx);
541 return 0;
544 /* update all remote nodes to use the same db priority that we have
545 this can fail if the remote node has not yet been upgraded to
546 support this function, so we always return success and never fail
547 a recovery if this call fails.
549 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
550 struct ctdb_node_map *nodemap,
551 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
553 int db;
555 /* step through all local databases */
556 for (db=0; db<dbmap->num;db++) {
557 struct ctdb_db_priority db_prio;
558 int ret;
560 db_prio.db_id = dbmap->dbs[db].dbid;
561 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
562 if (ret != 0) {
563 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
564 continue;
567 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
569 ret = ctdb_ctrl_set_db_priority(ctdb, CONTROL_TIMEOUT(),
570 CTDB_CURRENT_NODE, &db_prio);
571 if (ret != 0) {
572 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n",
573 db_prio.db_id));
577 return 0;
581 ensure all other nodes have attached to any databases that we have
583 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
584 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
586 int i, j, db, ret;
587 struct ctdb_dbid_map *remote_dbmap;
589 /* verify that all other nodes have all our databases */
590 for (j=0; j<nodemap->num; j++) {
591 /* we don't need to check ourselves */
592 if (nodemap->nodes[j].pnn == pnn) {
593 continue;
595 /* dont check nodes that are unavailable */
596 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
597 continue;
600 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
601 mem_ctx, &remote_dbmap);
602 if (ret != 0) {
603 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
604 return -1;
607 /* step through all local databases */
608 for (db=0; db<dbmap->num;db++) {
609 const char *name;
612 for (i=0;i<remote_dbmap->num;i++) {
613 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
614 break;
617 /* the remote node already has this database */
618 if (i!=remote_dbmap->num) {
619 continue;
621 /* ok so we need to create this database */
622 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
623 dbmap->dbs[db].dbid, mem_ctx,
624 &name);
625 if (ret != 0) {
626 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
627 return -1;
629 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
630 nodemap->nodes[j].pnn,
631 mem_ctx, name,
632 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
633 if (ret != 0) {
634 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
635 return -1;
640 return 0;
645 ensure we are attached to any databases that anyone else is attached to
647 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
648 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
650 int i, j, db, ret;
651 struct ctdb_dbid_map *remote_dbmap;
653 /* verify that we have all databases any other node has */
654 for (j=0; j<nodemap->num; j++) {
655 /* we don't need to check ourselves */
656 if (nodemap->nodes[j].pnn == pnn) {
657 continue;
659 /* dont check nodes that are unavailable */
660 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
661 continue;
664 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
665 mem_ctx, &remote_dbmap);
666 if (ret != 0) {
667 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
668 return -1;
671 /* step through all databases on the remote node */
672 for (db=0; db<remote_dbmap->num;db++) {
673 const char *name;
675 for (i=0;i<(*dbmap)->num;i++) {
676 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
677 break;
680 /* we already have this db locally */
681 if (i!=(*dbmap)->num) {
682 continue;
684 /* ok so we need to create this database and
685 rebuild dbmap
687 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
688 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
689 if (ret != 0) {
690 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
691 nodemap->nodes[j].pnn));
692 return -1;
694 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
695 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
696 if (ret != 0) {
697 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
698 return -1;
700 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
701 if (ret != 0) {
702 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
703 return -1;
708 return 0;
713 pull the remote database contents from one node into the recdb
715 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
716 struct tdb_wrap *recdb, uint32_t dbid)
718 int ret;
719 TDB_DATA outdata;
720 struct ctdb_marshall_buffer *reply;
721 struct ctdb_rec_data *recdata;
722 int i;
723 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
725 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
726 CONTROL_TIMEOUT(), &outdata);
727 if (ret != 0) {
728 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
729 talloc_free(tmp_ctx);
730 return -1;
733 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
735 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
736 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
737 talloc_free(tmp_ctx);
738 return -1;
741 recdata = (struct ctdb_rec_data *)&reply->data[0];
743 for (i=0;
744 i<reply->count;
745 recdata = (struct ctdb_rec_data *)(recdata->length + (uint8_t *)recdata), i++) {
746 TDB_DATA key, data;
747 struct ctdb_ltdb_header *hdr;
748 TDB_DATA existing;
750 key.dptr = &recdata->data[0];
751 key.dsize = recdata->keylen;
752 data.dptr = &recdata->data[key.dsize];
753 data.dsize = recdata->datalen;
755 hdr = (struct ctdb_ltdb_header *)data.dptr;
757 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
758 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
759 talloc_free(tmp_ctx);
760 return -1;
763 /* fetch the existing record, if any */
764 existing = tdb_fetch(recdb->tdb, key);
766 if (existing.dptr != NULL) {
767 struct ctdb_ltdb_header header;
768 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
769 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
770 (unsigned)existing.dsize, srcnode));
771 free(existing.dptr);
772 talloc_free(tmp_ctx);
773 return -1;
775 header = *(struct ctdb_ltdb_header *)existing.dptr;
776 free(existing.dptr);
777 if (!(header.rsn < hdr->rsn ||
778 (header.dmaster != ctdb_get_pnn(ctdb) &&
779 header.rsn == hdr->rsn))) {
780 continue;
784 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
785 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
786 talloc_free(tmp_ctx);
787 return -1;
791 talloc_free(tmp_ctx);
793 return 0;
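/*
 * The merge rule used above, written out explicitly (illustrative names):
 * an incoming record replaces the copy already in recdb only if
 *
 *	remote_hdr->rsn > local_hdr.rsn ||
 *	(remote_hdr->rsn == local_hdr.rsn &&
 *	 local_hdr.dmaster != ctdb_get_pnn(ctdb))
 *
 * i.e. the higher RSN wins, and on a tie the remote copy is taken unless
 * the copy we already hold names this node as dmaster.
 */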
797 struct pull_seqnum_cbdata {
798 int failed;
799 uint32_t pnn;
800 uint64_t seqnum;
803 static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
805 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
806 uint64_t seqnum;
808 if (cb_data->failed != 0) {
809 DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
810 return;
813 if (res != 0) {
814 DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
815 cb_data->failed = 1;
816 return;
819 if (outdata.dsize != sizeof(uint64_t)) {
820 DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
821 cb_data->failed = -1;
822 return;
825 seqnum = *((uint64_t *)outdata.dptr);
827 if (seqnum > cb_data->seqnum ||
828 (cb_data->pnn == -1 && seqnum == 0)) {
829 cb_data->seqnum = seqnum;
830 cb_data->pnn = node_pnn;
834 static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
836 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
838 DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
839 cb_data->failed = 1;
842 static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
843 struct ctdb_recoverd *rec,
844 struct ctdb_node_map *nodemap,
845 struct tdb_wrap *recdb, uint32_t dbid)
847 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
848 uint32_t *nodes;
849 TDB_DATA data;
850 uint32_t outdata[2];
851 struct pull_seqnum_cbdata *cb_data;
853 DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));
855 outdata[0] = dbid;
856 outdata[1] = 0;
858 data.dsize = sizeof(outdata);
859 data.dptr = (uint8_t *)&outdata[0];
861 cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
862 if (cb_data == NULL) {
863 DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
864 talloc_free(tmp_ctx);
865 return -1;
868 cb_data->failed = 0;
869 cb_data->pnn = -1;
870 cb_data->seqnum = 0;
872 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
873 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
874 nodes, 0,
875 CONTROL_TIMEOUT(), false, data,
876 pull_seqnum_cb,
877 pull_seqnum_fail_cb,
878 cb_data) != 0) {
879 DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));
881 talloc_free(tmp_ctx);
882 return -1;
885 if (cb_data->failed != 0) {
886 DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
887 talloc_free(tmp_ctx);
888 return -1;
891 if (cb_data->pnn == -1) {
892 DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
893 talloc_free(tmp_ctx);
894 return -1;
897 DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum));
899 if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
900 DEBUG(DEBUG_ERR, ("Failed to pull highest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
901 talloc_free(tmp_ctx);
902 return -1;
905 talloc_free(tmp_ctx);
906 return 0;
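/*
 * For persistent databases this whole-database pull avoids mixing records
 * from different transaction generations: each active node reports its
 * sequence number via CTDB_CONTROL_GET_DB_SEQNUM and the complete database
 * is then pulled from the single node with the highest value.  Whether this
 * path is used is controlled by the recover_pdb_by_seqnum tunable checked
 * in pull_remote_database() below.
 */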
911 pull all the remote database contents into the recdb
913 static int pull_remote_database(struct ctdb_context *ctdb,
914 struct ctdb_recoverd *rec,
915 struct ctdb_node_map *nodemap,
916 struct tdb_wrap *recdb, uint32_t dbid,
917 bool persistent)
919 int j;
921 if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
922 int ret;
923 ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
924 if (ret == 0) {
925 return 0;
929 /* pull all records from all other nodes across onto this node
930 (this merges based on rsn)
932 for (j=0; j<nodemap->num; j++) {
933 /* dont merge from nodes that are unavailable */
934 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
935 continue;
937 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
938 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
939 nodemap->nodes[j].pnn));
940 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
941 return -1;
945 return 0;
950 update flags on all active nodes
952 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
954 int ret;
956 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
957 if (ret != 0) {
958 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
959 return -1;
962 return 0;
966 ensure all nodes have the same vnnmap as we do
968 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
969 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
971 int j, ret;
973 /* push the new vnn map out to all the nodes */
974 for (j=0; j<nodemap->num; j++) {
975 /* dont push to nodes that are unavailable */
976 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
977 continue;
980 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
981 if (ret != 0) {
982 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
983 return -1;
987 return 0;
992 called when a vacuum fetch has completed - just free it and do the next one
994 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
996 talloc_free(state);
1001 * Process one element of the vacuum fetch list:
1002 * Migrate it over to us with the special flag
1003 * CTDB_CALL_FLAG_VACUUM_MIGRATION.
1005 static bool vacuum_fetch_process_one(struct ctdb_db_context *ctdb_db,
1006 uint32_t pnn,
1007 struct ctdb_rec_data *r)
1009 struct ctdb_client_call_state *state;
1010 TDB_DATA data;
1011 struct ctdb_ltdb_header *hdr;
1012 struct ctdb_call call;
1014 ZERO_STRUCT(call);
1015 call.call_id = CTDB_NULL_FUNC;
1016 call.flags = CTDB_IMMEDIATE_MIGRATION;
1017 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
1019 call.key.dptr = &r->data[0];
1020 call.key.dsize = r->keylen;
1022 /* ensure we don't block this daemon - just skip a record if we can't get
1023 the chainlock */
1024 if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, call.key) != 0) {
1025 return true;
1028 data = tdb_fetch(ctdb_db->ltdb->tdb, call.key);
1029 if (data.dptr == NULL) {
1030 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
1031 return true;
1034 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
1035 free(data.dptr);
1036 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
1037 return true;
1040 hdr = (struct ctdb_ltdb_header *)data.dptr;
1041 if (hdr->dmaster == pnn) {
1042 /* its already local */
1043 free(data.dptr);
1044 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
1045 return true;
1048 free(data.dptr);
1050 state = ctdb_call_send(ctdb_db, &call);
1051 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
1052 if (state == NULL) {
1053 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
1054 return false;
1056 state->async.fn = vacuum_fetch_callback;
1057 state->async.private_data = NULL;
1059 return true;
1064 handler for vacuum fetch
1066 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
1067 TDB_DATA data, void *private_data)
1069 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1070 struct ctdb_marshall_buffer *recs;
1071 int ret, i;
1072 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1073 const char *name;
1074 struct ctdb_dbid_map *dbmap=NULL;
1075 bool persistent = false;
1076 struct ctdb_db_context *ctdb_db;
1077 struct ctdb_rec_data *r;
1079 recs = (struct ctdb_marshall_buffer *)data.dptr;
1081 if (recs->count == 0) {
1082 goto done;
1085 /* work out if the database is persistent */
1086 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
1087 if (ret != 0) {
1088 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
1089 goto done;
1092 for (i=0;i<dbmap->num;i++) {
1093 if (dbmap->dbs[i].dbid == recs->db_id) {
1094 persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
1095 break;
1098 if (i == dbmap->num) {
1099 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
1100 goto done;
1103 /* find the name of this database */
1104 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
1105 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
1106 goto done;
1109 /* attach to it */
1110 ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
1111 if (ctdb_db == NULL) {
1112 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
1113 goto done;
1116 r = (struct ctdb_rec_data *)&recs->data[0];
1117 while (recs->count) {
1118 bool ok;
1120 ok = vacuum_fetch_process_one(ctdb_db, rec->ctdb->pnn, r);
1121 if (!ok) {
1122 break;
1125 r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
1126 recs->count--;
1129 done:
1130 talloc_free(tmp_ctx);
1135 * handler for database detach
1137 static void detach_database_handler(struct ctdb_context *ctdb, uint64_t srvid,
1138 TDB_DATA data, void *private_data)
1140 uint32_t db_id;
1141 struct ctdb_db_context *ctdb_db;
1143 if (data.dsize != sizeof(db_id)) {
1144 return;
1146 db_id = *(uint32_t *)data.dptr;
1148 ctdb_db = find_ctdb_db(ctdb, db_id);
1149 if (ctdb_db == NULL) {
1150 /* database is not attached */
1151 return;
1154 DLIST_REMOVE(ctdb->db_list, ctdb_db);
1156 DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
1157 ctdb_db->db_name));
1158 talloc_free(ctdb_db);
1162 called when ctdb_wait_timeout should finish
1164 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
1165 struct timeval yt, void *p)
1167 uint32_t *timed_out = (uint32_t *)p;
1168 (*timed_out) = 1;
1172 wait for a given number of seconds
1174 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
1176 uint32_t timed_out = 0;
1177 time_t usecs = (secs - (time_t)secs) * 1000000;
1178 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), ctdb_wait_handler, &timed_out);
1179 while (!timed_out) {
1180 event_loop_once(ctdb->ev);
1185 called when an election times out (ends)
1187 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
1188 struct timeval t, void *p)
1190 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1191 rec->election_timeout = NULL;
1192 fast_start = false;
1194 DEBUG(DEBUG_WARNING,("Election period ended\n"));
1199 wait for an election to finish. It finishes election_timeout seconds after
1200 the last election packet is received
1202 static void ctdb_wait_election(struct ctdb_recoverd *rec)
1204 struct ctdb_context *ctdb = rec->ctdb;
1205 while (rec->election_timeout) {
1206 event_loop_once(ctdb->ev);
1211 Update our local flags from all remote connected nodes.
1212 This is only run when we are, or believe we are, the recovery master
1214 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
1216 int j;
1217 struct ctdb_context *ctdb = rec->ctdb;
1218 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1220 /* get the nodemap for all active remote nodes and verify
1221 they are the same as for this node
1223 for (j=0; j<nodemap->num; j++) {
1224 struct ctdb_node_map *remote_nodemap=NULL;
1225 int ret;
1227 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
1228 continue;
1230 if (nodemap->nodes[j].pnn == ctdb->pnn) {
1231 continue;
1234 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1235 mem_ctx, &remote_nodemap);
1236 if (ret != 0) {
1237 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
1238 nodemap->nodes[j].pnn));
1239 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
1240 talloc_free(mem_ctx);
1241 return MONITOR_FAILED;
1243 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
1244 /* We should tell our daemon about this so it
1245 updates its flags or else we will log the same
1246 message again in the next iteration of recovery.
1247 Since we are the recovery master we can just as
1248 well update the flags on all nodes.
1250 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
1251 if (ret != 0) {
1252 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
1253 return -1;
1256 /* Update our local copy of the flags in the recovery
1257 daemon.
1259 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1260 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
1261 nodemap->nodes[j].flags));
1262 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1264 talloc_free(remote_nodemap);
1266 talloc_free(mem_ctx);
1267 return MONITOR_OK;
1271 /* Create a new random generation id.
1272 The generation id cannot be the INVALID_GENERATION id
1274 static uint32_t new_generation(void)
1276 uint32_t generation;
1278 while (1) {
1279 generation = random();
1281 if (generation != INVALID_GENERATION) {
1282 break;
1286 return generation;
1291 create a temporary working database
1293 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1295 char *name;
1296 struct tdb_wrap *recdb;
1297 unsigned tdb_flags;
1299 /* open up the temporary recovery database */
1300 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1301 ctdb->db_directory_state,
1302 ctdb->pnn);
1303 if (name == NULL) {
1304 return NULL;
1306 unlink(name);
1308 tdb_flags = TDB_NOLOCK;
1309 if (ctdb->valgrinding) {
1310 tdb_flags |= TDB_NOMMAP;
1312 tdb_flags |= (TDB_INCOMPATIBLE_HASH | TDB_DISALLOW_NESTING);
1314 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1315 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1316 if (recdb == NULL) {
1317 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1320 talloc_free(name);
1322 return recdb;
1327 a traverse function for pulling all relevant records from recdb
1329 struct recdb_data {
1330 struct ctdb_context *ctdb;
1331 struct ctdb_marshall_buffer *recdata;
1332 uint32_t len;
1333 uint32_t allocated_len;
1334 bool failed;
1335 bool persistent;
1338 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1340 struct recdb_data *params = (struct recdb_data *)p;
1341 struct ctdb_rec_data *recdata;
1342 struct ctdb_ltdb_header *hdr;
1345 * skip empty records - but NOT for persistent databases:
1347 * The record-by-record mode of recovery deletes empty records.
1348 * For persistent databases, this can lead to data corruption
1349 * by deleting records that should be there:
1351 * - Assume the cluster has been running for a while.
1353 * - A record R in a persistent database has been created and
1354 * deleted a couple of times, the last operation being deletion,
1355 * leaving an empty record with a high RSN, say 10.
1357 * - Now a node N is turned off.
1359 * - This leaves the local database copy of D on N with the empty
1360 * copy of R and RSN 10. On all other nodes, the recovery has deleted
1361 * the copy of record R.
1363 * - Now the record is created again while node N is turned off.
1364 * This creates R with RSN = 1 on all nodes except for N.
1366 * - Now node N is turned on again. The following recovery will chose
1367 * the older empty copy of R due to RSN 10 > RSN 1.
1369 * ==> Hence the record is gone after the recovery.
1371 * On databases like Samba's registry, this can damage the higher-level
1372 * data structures built from the various tdb-level records.
1374 if (!params->persistent && data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1375 return 0;
1378 /* update the dmaster field to point to us */
1379 hdr = (struct ctdb_ltdb_header *)data.dptr;
1380 if (!params->persistent) {
1381 hdr->dmaster = params->ctdb->pnn;
1382 hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
1385 /* add the record to the blob ready to send to the nodes */
1386 recdata = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1387 if (recdata == NULL) {
1388 params->failed = true;
1389 return -1;
1391 if (params->len + recdata->length >= params->allocated_len) {
1392 params->allocated_len = recdata->length + params->len + params->ctdb->tunable.pulldb_preallocation_size;
1393 params->recdata = talloc_realloc_size(NULL, params->recdata, params->allocated_len);
1395 if (params->recdata == NULL) {
1396 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u\n",
1397 recdata->length + params->len));
1398 params->failed = true;
1399 return -1;
1401 params->recdata->count++;
1402 memcpy(params->len+(uint8_t *)params->recdata, recdata, recdata->length);
1403 params->len += recdata->length;
1404 talloc_free(recdata);
1406 return 0;
1410 push the recdb database out to all nodes
1412 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1413 bool persistent,
1414 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1416 struct recdb_data params;
1417 struct ctdb_marshall_buffer *recdata;
1418 TDB_DATA outdata;
1419 TALLOC_CTX *tmp_ctx;
1420 uint32_t *nodes;
1422 tmp_ctx = talloc_new(ctdb);
1423 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1425 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1426 CTDB_NO_MEMORY(ctdb, recdata);
1428 recdata->db_id = dbid;
1430 params.ctdb = ctdb;
1431 params.recdata = recdata;
1432 params.len = offsetof(struct ctdb_marshall_buffer, data);
1433 params.allocated_len = params.len;
1434 params.failed = false;
1435 params.persistent = persistent;
1437 if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
1438 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1439 talloc_free(params.recdata);
1440 talloc_free(tmp_ctx);
1441 return -1;
1444 if (params.failed) {
1445 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1446 talloc_free(params.recdata);
1447 talloc_free(tmp_ctx);
1448 return -1;
1451 recdata = params.recdata;
1453 outdata.dptr = (void *)recdata;
1454 outdata.dsize = params.len;
1456 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1457 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1458 nodes, 0,
1459 CONTROL_TIMEOUT(), false, outdata,
1460 NULL, NULL,
1461 NULL) != 0) {
1462 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1463 talloc_free(recdata);
1464 talloc_free(tmp_ctx);
1465 return -1;
1468 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1469 dbid, recdata->count));
1471 talloc_free(recdata);
1472 talloc_free(tmp_ctx);
1474 return 0;
1479 go through a full recovery on one database
1481 static int recover_database(struct ctdb_recoverd *rec,
1482 TALLOC_CTX *mem_ctx,
1483 uint32_t dbid,
1484 bool persistent,
1485 uint32_t pnn,
1486 struct ctdb_node_map *nodemap,
1487 uint32_t transaction_id)
1489 struct tdb_wrap *recdb;
1490 int ret;
1491 struct ctdb_context *ctdb = rec->ctdb;
1492 TDB_DATA data;
1493 struct ctdb_control_wipe_database w;
1494 uint32_t *nodes;
1496 recdb = create_recdb(ctdb, mem_ctx);
1497 if (recdb == NULL) {
1498 return -1;
1501 /* pull all remote databases onto the recdb */
1502 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1503 if (ret != 0) {
1504 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1505 return -1;
1508 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1510 /* wipe all the remote databases. This is safe as we are in a transaction */
1511 w.db_id = dbid;
1512 w.transaction_id = transaction_id;
1514 data.dptr = (void *)&w;
1515 data.dsize = sizeof(w);
1517 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1518 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1519 nodes, 0,
1520 CONTROL_TIMEOUT(), false, data,
1521 NULL, NULL,
1522 NULL) != 0) {
1523 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1524 talloc_free(recdb);
1525 return -1;
1528 /* push out the correct database. This sets the dmaster and skips
1529 the empty records */
1530 ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1531 if (ret != 0) {
1532 talloc_free(recdb);
1533 return -1;
1536 /* all done with this database */
1537 talloc_free(recdb);
1539 return 0;
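/*
 * How this is driven from do_recovery() further down (copied shape, not new
 * behaviour):
 *
 *	ret = recover_database(rec, mem_ctx,
 *			       dbmap->dbs[i].dbid,
 *			       dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
 *			       pnn, nodemap, generation);
 *
 * i.e. for every database: pull all remote copies into a throw-away
 * recdb.tdb, wipe the database on every active node under the recovery
 * transaction, then push the merged contents back with CTDB_CONTROL_PUSH_DB.
 */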
1542 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1543 struct ctdb_recoverd *rec,
1544 struct ctdb_node_map *nodemap,
1545 uint32_t *culprit)
1547 int j;
1548 int ret;
1550 if (ctdb->num_nodes != nodemap->num) {
1551 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1552 ctdb->num_nodes, nodemap->num));
1553 if (culprit) {
1554 *culprit = ctdb->pnn;
1556 return -1;
1559 for (j=0; j<nodemap->num; j++) {
1560 /* For readability */
1561 struct ctdb_node *node = ctdb->nodes[j];
1563 /* release any existing data */
1564 if (node->known_public_ips) {
1565 talloc_free(node->known_public_ips);
1566 node->known_public_ips = NULL;
1568 if (node->available_public_ips) {
1569 talloc_free(node->available_public_ips);
1570 node->available_public_ips = NULL;
1573 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1574 continue;
1577 /* Retrieve the list of known public IPs from the node */
1578 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1579 CONTROL_TIMEOUT(),
1580 node->pnn,
1581 ctdb->nodes,
1583 &node->known_public_ips);
1584 if (ret != 0) {
1585 DEBUG(DEBUG_ERR,
1586 ("Failed to read known public IPs from node: %u\n",
1587 node->pnn));
1588 if (culprit) {
1589 *culprit = node->pnn;
1591 return -1;
1594 if (ctdb->do_checkpublicip &&
1595 !ctdb_op_is_disabled(rec->takeover_run) &&
1596 verify_remote_ip_allocation(ctdb,
1597 node->known_public_ips,
1598 node->pnn)) {
1599 DEBUG(DEBUG_ERR,("Trigger IP reallocation\n"));
1600 rec->need_takeover_run = true;
1603 /* Retrieve the list of available public IPs from the node */
1604 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1605 CONTROL_TIMEOUT(),
1606 node->pnn,
1607 ctdb->nodes,
1608 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1609 &node->available_public_ips);
1610 if (ret != 0) {
1611 DEBUG(DEBUG_ERR,
1612 ("Failed to read available public IPs from node: %u\n",
1613 node->pnn));
1614 if (culprit) {
1615 *culprit = node->pnn;
1617 return -1;
1621 return 0;
1624 /* when we start a recovery, make sure all nodes use the same reclock file
1625 setting
1627 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
1629 struct ctdb_context *ctdb = rec->ctdb;
1630 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
1631 TDB_DATA data;
1632 uint32_t *nodes;
1634 if (ctdb->recovery_lock_file == NULL) {
1635 data.dptr = NULL;
1636 data.dsize = 0;
1637 } else {
1638 data.dsize = strlen(ctdb->recovery_lock_file) + 1;
1639 data.dptr = (uint8_t *)ctdb->recovery_lock_file;
1642 nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
1643 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
1644 nodes, 0,
1645 CONTROL_TIMEOUT(),
1646 false, data,
1647 NULL, NULL,
1648 rec) != 0) {
1649 DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
1650 talloc_free(tmp_ctx);
1651 return -1;
1654 talloc_free(tmp_ctx);
1655 return 0;
1660 * this callback is called for every node that failed to execute ctdb_takeover_run()
1661 * and set flag to re-run takeover run.
1663 static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
1665 DEBUG(DEBUG_ERR, ("Node %u failed the takeover run\n", node_pnn));
1667 if (callback_data != NULL) {
1668 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
1670 DEBUG(DEBUG_ERR, ("Setting node %u as recovery fail culprit\n", node_pnn));
1672 ctdb_set_culprit(rec, node_pnn);
1677 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
1679 struct ctdb_context *ctdb = rec->ctdb;
1680 int i;
1681 struct ctdb_banning_state *ban_state;
1683 *self_ban = false;
1684 for (i=0; i<ctdb->num_nodes; i++) {
1685 if (ctdb->nodes[i]->ban_state == NULL) {
1686 continue;
1688 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1689 if (ban_state->count < 2*ctdb->num_nodes) {
1690 continue;
1693 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
1694 ctdb->nodes[i]->pnn, ban_state->count,
1695 ctdb->tunable.recovery_ban_period));
1696 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1697 ban_state->count = 0;
1699 /* Banning ourself? */
1700 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
1701 *self_ban = true;
1706 static bool do_takeover_run(struct ctdb_recoverd *rec,
1707 struct ctdb_node_map *nodemap,
1708 bool banning_credits_on_fail)
1710 uint32_t *nodes = NULL;
1711 struct srvid_request_data dtr;
1712 TDB_DATA data;
1713 int i;
1714 uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
1715 int ret;
1716 bool ok;
1718 DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
1720 if (ctdb_op_is_in_progress(rec->takeover_run)) {
1721 DEBUG(DEBUG_ERR, (__location__
1722 " takeover run already in progress \n"));
1723 ok = false;
1724 goto done;
1727 if (!ctdb_op_begin(rec->takeover_run)) {
1728 ok = false;
1729 goto done;
1732 /* Disable IP checks (takeover runs, really) on other nodes
1733 * while doing this takeover run. This will stop those other
1734 nodes from triggering takeover runs when they think they should
1735 * be hosting an IP but it isn't yet on an interface. Don't
1736 * wait for replies since a failure here might cause some
1737 * noise in the logs but will not actually cause a problem.
1739 dtr.srvid = 0; /* No reply */
1740 dtr.pnn = -1;
1742 data.dptr = (uint8_t*)&dtr;
1743 data.dsize = sizeof(dtr);
1745 nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
1747 /* Disable for 60 seconds. This can be a tunable later if
1748 * necessary.
1750 dtr.data = 60;
1751 for (i = 0; i < talloc_array_length(nodes); i++) {
1752 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1753 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1754 data) != 0) {
1755 DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
1759 ret = ctdb_takeover_run(rec->ctdb, nodemap,
1760 rec->force_rebalance_nodes,
1761 takeover_fail_callback,
1762 banning_credits_on_fail ? rec : NULL);
1764 /* Reenable takeover runs and IP checks on other nodes */
1765 dtr.data = 0;
1766 for (i = 0; i < talloc_array_length(nodes); i++) {
1767 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1768 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1769 data) != 0) {
1770 DEBUG(DEBUG_INFO,("Failed to reenable takeover runs\n"));
1774 if (ret != 0) {
1775 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
1776 ok = false;
1777 goto done;
1780 ok = true;
1781 /* Takeover run was successful so clear force rebalance targets */
1782 if (rebalance_nodes == rec->force_rebalance_nodes) {
1783 TALLOC_FREE(rec->force_rebalance_nodes);
1784 } else {
1785 DEBUG(DEBUG_WARNING,
1786 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1788 done:
1789 rec->need_takeover_run = !ok;
1790 talloc_free(nodes);
1791 ctdb_op_end(rec->takeover_run);
1793 DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
1794 return ok;
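/*
 * While the IP layout is recalculated, the other connected nodes are asked
 * to suppress their own takeover runs by sending data = 60 (seconds) to
 * CTDB_SRVID_DISABLE_TAKEOVER_RUNS with srvid == 0 (no reply expected), and
 * data = 0 afterwards to re-enable them.  A failure to deliver either
 * message is only logged; the takeover run itself proceeds regardless.
 */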
1799 we are the recmaster, and recovery is needed - start a recovery run
1801 static int do_recovery(struct ctdb_recoverd *rec,
1802 TALLOC_CTX *mem_ctx, uint32_t pnn,
1803 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1805 struct ctdb_context *ctdb = rec->ctdb;
1806 int i, j, ret;
1807 uint32_t generation;
1808 struct ctdb_dbid_map *dbmap;
1809 TDB_DATA data;
1810 uint32_t *nodes;
1811 struct timeval start_time;
1812 uint32_t culprit = (uint32_t)-1;
1813 bool self_ban;
1815 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1817 /* if recovery fails, force it again */
1818 rec->need_recovery = true;
1820 if (!ctdb_op_begin(rec->recovery)) {
1821 return -1;
1824 if (rec->election_timeout) {
1825 /* an election is in progress */
1826 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
1827 goto fail;
1830 ban_misbehaving_nodes(rec, &self_ban);
1831 if (self_ban) {
1832 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
1833 goto fail;
1836 if (ctdb->recovery_lock_file != NULL) {
1837 if (ctdb_recovery_have_lock(ctdb)) {
1838 DEBUG(DEBUG_NOTICE, ("Already holding recovery lock\n"));
1839 } else {
1840 start_time = timeval_current();
1841 DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock (%s)\n",
1842 ctdb->recovery_lock_file));
1843 if (!ctdb_recovery_lock(ctdb)) {
1844 if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
1845 /* If ctdb is trying first recovery, it's
1846 * possible that current node does not know
1847 * yet who the recmaster is.
1849 DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
1850 " - retrying recovery\n"));
1851 goto fail;
1854 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
1855 "and ban ourself for %u seconds\n",
1856 ctdb->tunable.recovery_ban_period));
1857 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
1858 goto fail;
1860 ctdb_ctrl_report_recd_lock_latency(ctdb,
1861 CONTROL_TIMEOUT(),
1862 timeval_elapsed(&start_time));
1863 DEBUG(DEBUG_NOTICE,
1864 ("Recovery lock taken successfully by recovery daemon\n"));
1868 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1870 /* get a list of all databases */
1871 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1872 if (ret != 0) {
1873 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1874 goto fail;
1877 /* we do the db creation before we set the recovery mode, so the freeze happens
1878 on all databases we will be dealing with. */
1880 /* verify that we have all the databases any other node has */
1881 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1882 if (ret != 0) {
1883 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1884 goto fail;
1887 /* verify that all other nodes have all our databases */
1888 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1889 if (ret != 0) {
1890 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1891 goto fail;
1893 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1895 /* update the database priority for all remote databases */
1896 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1897 if (ret != 0) {
1898 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1900 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1903 /* update all other nodes to use the same setting for reclock files
1904 as the local recovery master.
1906 sync_recovery_lock_file_across_cluster(rec);
1908 /* set recovery mode to active on all nodes */
1909 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1910 if (ret != 0) {
1911 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1912 goto fail;
1915 /* execute the "startrecovery" event script on all nodes */
1916 ret = run_startrecovery_eventscript(rec, nodemap);
1917 if (ret!=0) {
1918 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1919 goto fail;
1923 update all nodes to have the same flags that we have
1925 for (i=0;i<nodemap->num;i++) {
1926 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1927 continue;
1930 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1931 if (ret != 0) {
1932 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1933 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
1934 } else {
1935 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1936 goto fail;
1941 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1943 /* pick a new generation number */
1944 generation = new_generation();
1946 /* change the vnnmap on this node to use the new generation
1947 number but not on any other nodes.
1948 this guarantees that if we abort the recovery prematurely
1949 for some reason (a node stops responding?)
1950 that we can just return immediately and we will reenter
1951 recovery shortly again.
1952 I.e. we deliberately leave the cluster with an inconsistent
1953 generation id to allow us to abort recovery at any stage and
1954 just restart it from scratch.
1956 vnnmap->generation = generation;
1957 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1958 if (ret != 0) {
1959 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1960 goto fail;
1963 data.dptr = (void *)&generation;
1964 data.dsize = sizeof(uint32_t);
1966 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1967 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1968 nodes, 0,
1969 CONTROL_TIMEOUT(), false, data,
1970 NULL,
1971 transaction_start_fail_callback,
1972 rec) != 0) {
1973 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1974 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1975 nodes, 0,
1976 CONTROL_TIMEOUT(), false, tdb_null,
1977 NULL,
1978 NULL,
1979 NULL) != 0) {
1980 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1982 goto fail;
1985 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
1987 for (i=0;i<dbmap->num;i++) {
1988 ret = recover_database(rec, mem_ctx,
1989 dbmap->dbs[i].dbid,
1990 dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
1991 pnn, nodemap, generation);
1992 if (ret != 0) {
1993 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1994 goto fail;
1998 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
2000 /* commit all the changes */
2001 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
2002 nodes, 0,
2003 CONTROL_TIMEOUT(), false, data,
2004 NULL, NULL,
2005 NULL) != 0) {
2006 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
2007 goto fail;
2010 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
2013 /* update the capabilities for all nodes */
2014 ret = update_capabilities(rec, nodemap);
2015 if (ret!=0) {
2016 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2017 goto fail;
2020 /* build a new vnn map with all the currently active and
2021 unbanned nodes */
2022 generation = new_generation();
2023 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
2024 CTDB_NO_MEMORY(ctdb, vnnmap);
2025 vnnmap->generation = generation;
2026 vnnmap->size = 0;
2027 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
2028 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2029 for (i=j=0;i<nodemap->num;i++) {
2030 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2031 continue;
2033 if (!ctdb_node_has_capabilities(rec->caps,
2034 ctdb->nodes[i]->pnn,
2035 CTDB_CAP_LMASTER)) {
2036 /* this node cannot be an lmaster */
2037 DEBUG(DEBUG_DEBUG, ("Node %d can't be an LMASTER, skipping it\n", i));
2038 continue;
2041 vnnmap->size++;
2042 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
2043 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2044 vnnmap->map[j++] = nodemap->nodes[i].pnn;
2047 if (vnnmap->size == 0) {
2048 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
2049 vnnmap->size++;
2050 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
2051 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2052 vnnmap->map[0] = pnn;
2055 /* update to the new vnnmap on all nodes */
2056 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
2057 if (ret != 0) {
2058 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
2059 goto fail;
2062 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
2064 /* update recmaster to point to us for all nodes */
2065 ret = set_recovery_master(ctdb, nodemap, pnn);
2066 if (ret!=0) {
2067 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
2068 goto fail;
2071 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
2073 /* disable recovery mode */
2074 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
2075 if (ret != 0) {
2076 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
2077 goto fail;
2080 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
2082 /* Fetch known/available public IPs from each active node */
2083 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
2084 if (ret != 0) {
2085 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2086 culprit));
2087 rec->need_takeover_run = true;
2088 goto fail;
2091 do_takeover_run(rec, nodemap, false);
2093 /* execute the "recovered" event script on all nodes */
2094 ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
2095 if (ret!=0) {
2096 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
2097 goto fail;
2100 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
2102 /* send a message to all clients telling them that the cluster
2103 has been reconfigured */
2104 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2105 CTDB_SRVID_RECONFIGURE, tdb_null);
2106 if (ret != 0) {
2107 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
2108 goto fail;
2111 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
2113 rec->need_recovery = false;
2114 ctdb_op_end(rec->recovery);
2116 /* we managed to complete a full recovery, make sure to forgive
2117 any past sins by the nodes that could now participate in the
2118 recovery.
2120 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
2121 for (i=0;i<nodemap->num;i++) {
2122 struct ctdb_banning_state *ban_state;
2124 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2125 continue;
2128 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
2129 if (ban_state == NULL) {
2130 continue;
2133 ban_state->count = 0;
2136 /* We just finished a recovery successfully.
2137 We now wait for rerecovery_timeout before we allow
2138 another recovery to take place.
2140 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
2141 ctdb_op_disable(rec->recovery, ctdb->ev,
2142 ctdb->tunable.rerecovery_timeout);
2143 return 0;
2145 fail:
2146 ctdb_op_end(rec->recovery);
2147 return -1;
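/* A recap of the sequence above: recovery mode is set to ACTIVE
 * everywhere, the 'startrecovery' event is run, node flags are
 * synchronised across the cluster, a new generation is installed in
 * the local vnnmap only, a transaction is started on all active
 * nodes, every database is recovered and the transaction committed, a
 * fresh vnnmap is built from the active lmaster-capable nodes and
 * pushed out together with the recmaster setting, recovery mode goes
 * back to NORMAL, public IPs are reloaded and a takeover run is done,
 * the 'recovered' event is run, clients are told about the
 * reconfiguration, ban counts are reset, and further recoveries are
 * suppressed for rerecovery_timeout seconds.  Any failure jumps to
 * 'fail' and the whole procedure is retried on a later pass.
 */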
2152 elections are won by first checking the number of connected nodes, then
2153 the priority time, then the pnn
2155 struct election_message {
2156 uint32_t num_connected;
2157 struct timeval priority_time;
2158 uint32_t pnn;
2159 uint32_t node_flags;
2163 form this node's election data
2165 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
2167 int ret, i;
2168 struct ctdb_node_map *nodemap;
2169 struct ctdb_context *ctdb = rec->ctdb;
2171 ZERO_STRUCTP(em);
2173 em->pnn = rec->ctdb->pnn;
2174 em->priority_time = rec->priority_time;
2176 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
2177 if (ret != 0) {
2178 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
2179 return;
2182 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
2183 em->node_flags = rec->node_flags;
2185 for (i=0;i<nodemap->num;i++) {
2186 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2187 em->num_connected++;
2191 /* we shouldn't try to win this election if we can't be a recmaster */
2192 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2193 em->num_connected = 0;
2194 em->priority_time = timeval_current();
2197 talloc_free(nodemap);
2201 see if the given election data wins
2203 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
2205 struct election_message myem;
2206 int cmp = 0;
2208 ctdb_election_data(rec, &myem);
2210 /* we can't win if we don't have the recmaster capability */
2211 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2212 return false;
2215 /* we can't win if we are banned */
2216 if (rec->node_flags & NODE_FLAGS_BANNED) {
2217 return false;
2220 /* we can't win if we are stopped */
2221 if (rec->node_flags & NODE_FLAGS_STOPPED) {
2222 return false;
2225 /* we will automatically win if the other node is banned */
2226 if (em->node_flags & NODE_FLAGS_BANNED) {
2227 return true;
2230 /* we will automatically win if the other node is stopped */
2231 if (em->node_flags & NODE_FLAGS_STOPPED) {
2232 return true;
2235 /* try to use the most connected node */
2236 if (cmp == 0) {
2237 cmp = (int)myem.num_connected - (int)em->num_connected;
2240 /* then the longest running node */
2241 if (cmp == 0) {
2242 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
2245 if (cmp == 0) {
2246 cmp = (int)myem.pnn - (int)em->pnn;
2249 return cmp > 0;
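/* A worked example of the ordering above (made-up values): a node
 * reporting num_connected=4 beats one reporting 3 outright.  If both
 * report 4, the node whose recovery daemon has the earlier
 * priority_time (the one that has been running longest) wins.  If
 * even those match, the PNNs break the tie (with the comparison
 * above, the higher PNN wins), so the outcome is always
 * deterministic.
 */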
2253 send out an election request
2255 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
2257 int ret;
2258 TDB_DATA election_data;
2259 struct election_message emsg;
2260 uint64_t srvid;
2261 struct ctdb_context *ctdb = rec->ctdb;
2263 srvid = CTDB_SRVID_RECOVERY;
2265 ctdb_election_data(rec, &emsg);
2267 election_data.dsize = sizeof(struct election_message);
2268 election_data.dptr = (unsigned char *)&emsg;
2271 /* first we assume we will win the election and set the
2272 recovery master to be ourselves on the current node
2274 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
2275 if (ret != 0) {
2276 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
2277 return -1;
2281 /* send an election message to all active nodes */
2282 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
2283 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
2287 this function will unban all nodes in the cluster
2289 static void unban_all_nodes(struct ctdb_context *ctdb)
2291 int ret, i;
2292 struct ctdb_node_map *nodemap;
2293 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2295 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2296 if (ret != 0) {
2297 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
2298 return;
2301 for (i=0;i<nodemap->num;i++) {
2302 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
2303 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
2304 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(),
2305 nodemap->nodes[i].pnn, 0,
2306 NODE_FLAGS_BANNED);
2307 if (ret != 0) {
2308 DEBUG(DEBUG_ERR, (__location__ " failed to reset ban state\n"));
2313 talloc_free(tmp_ctx);
2318 we think we are winning the election - send a broadcast election request
2320 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
2322 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2323 int ret;
2325 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
2326 if (ret != 0) {
2327 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
2330 talloc_free(rec->send_election_te);
2331 rec->send_election_te = NULL;
2335 handler for memory dumps
2337 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
2338 TDB_DATA data, void *private_data)
2340 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2341 TDB_DATA *dump;
2342 int ret;
2343 struct srvid_request *rd;
2345 if (data.dsize != sizeof(struct srvid_request)) {
2346 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2347 talloc_free(tmp_ctx);
2348 return;
2350 rd = (struct srvid_request *)data.dptr;
2352 dump = talloc_zero(tmp_ctx, TDB_DATA);
2353 if (dump == NULL) {
2354 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
2355 talloc_free(tmp_ctx);
2356 return;
2358 ret = ctdb_dump_memory(ctdb, dump);
2359 if (ret != 0) {
2360 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
2361 talloc_free(tmp_ctx);
2362 return;
2365 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
2367 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
2368 if (ret != 0) {
2369 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
2370 talloc_free(tmp_ctx);
2371 return;
2374 talloc_free(tmp_ctx);
2378 handler for reload_nodes
2380 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
2381 TDB_DATA data, void *private_data)
2383 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2385 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
2387 ctdb_load_nodes_file(rec->ctdb);
2391 static void ctdb_rebalance_timeout(struct event_context *ev,
2392 struct timed_event *te,
2393 struct timeval t, void *p)
2395 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2397 if (rec->force_rebalance_nodes == NULL) {
2398 DEBUG(DEBUG_ERR,
2399 ("Rebalance timeout occurred - no nodes to rebalance\n"));
2400 return;
2403 DEBUG(DEBUG_NOTICE,
2404 ("Rebalance timeout occurred - do takeover run\n"));
2405 do_takeover_run(rec, rec->nodemap, false);
2409 static void recd_node_rebalance_handler(struct ctdb_context *ctdb,
2410 uint64_t srvid,
2411 TDB_DATA data, void *private_data)
2413 uint32_t pnn;
2414 uint32_t *t;
2415 int len;
2416 uint32_t deferred_rebalance;
2417 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2419 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
2420 return;
2423 if (data.dsize != sizeof(uint32_t)) {
2424 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
2425 return;
2428 pnn = *(uint32_t *)&data.dptr[0];
2430 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
2432 /* Copy any existing list of nodes. There's probably some
2433 * sort of realloc variant that will do this but we need to
2434 * make sure that freeing the old array also cancels the timer
2435 * event for the timeout... not sure if realloc will do that.
2437 len = (rec->force_rebalance_nodes != NULL) ?
2438 talloc_array_length(rec->force_rebalance_nodes) :
2441 /* This allows duplicates to be added but they don't cause
2442 * harm. A call to add a duplicate PNN arguably means that
2443 * the timeout should be reset, so this is the simplest
2444 * solution.
2446 t = talloc_zero_array(rec, uint32_t, len+1);
2447 CTDB_NO_MEMORY_VOID(ctdb, t);
2448 if (len > 0) {
2449 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
2451 t[len] = pnn;
2453 talloc_free(rec->force_rebalance_nodes);
2455 rec->force_rebalance_nodes = t;
2457 /* If configured, set up a deferred takeover run to make sure
2458 * that certain nodes get IPs rebalanced to them. This will
2459 * be cancelled if a successful takeover run happens before
2460 * the timeout. Assign tunable value to variable for
2461 * readability.
2463 deferred_rebalance = ctdb->tunable.deferred_rebalance_on_node_add;
2464 if (deferred_rebalance != 0) {
2465 event_add_timed(ctdb->ev, rec->force_rebalance_nodes,
2466 timeval_current_ofs(deferred_rebalance, 0),
2467 ctdb_rebalance_timeout, rec);
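/* A note on ownership as the code above is written: the timer event
 * is allocated with rec->force_rebalance_nodes as its talloc parent,
 * so a later talloc_free() of that array (for instance when a
 * successful takeover run clears the list, as described above) also
 * destroys the pending timeout.  That is why the handler copies the
 * array into a fresh allocation rather than using a realloc-style
 * call.
 */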
2473 static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid,
2474 TDB_DATA data, void *private_data)
2476 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2477 struct ctdb_public_ip *ip;
2479 if (rec->recmaster != rec->ctdb->pnn) {
2480 DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
2481 return;
2484 if (data.dsize != sizeof(struct ctdb_public_ip)) {
2485 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
2486 return;
2489 ip = (struct ctdb_public_ip *)data.dptr;
2491 update_ip_assignment_tree(rec->ctdb, ip);
2494 static void srvid_disable_and_reply(struct ctdb_context *ctdb,
2495 TDB_DATA data,
2496 struct ctdb_op_state *op_state)
2498 struct srvid_request_data *r;
2499 uint32_t timeout;
2500 TDB_DATA result;
2501 int32_t ret = 0;
2503 /* Validate input data */
2504 if (data.dsize != sizeof(struct srvid_request_data)) {
2505 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2506 "expecting %lu\n", (long unsigned)data.dsize,
2507 (long unsigned)sizeof(struct srvid_request_data)));
2508 return;
2510 if (data.dptr == NULL) {
2511 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2512 return;
2515 r = (struct srvid_request_data *)data.dptr;
2516 timeout = r->data;
2518 ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
2519 if (ret != 0) {
2520 goto done;
2523 /* Returning our PNN tells the caller that we succeeded */
2524 ret = ctdb_get_pnn(ctdb);
2525 done:
2526 result.dsize = sizeof(int32_t);
2527 result.dptr = (uint8_t *)&ret;
2528 srvid_request_reply(ctdb, (struct srvid_request *)r, result);
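/* The reply convention used here, matching the code above: the
 * request carries the disable timeout (in seconds) in the .data field
 * of a struct srvid_request_data, and the int32 reply payload is this
 * node's PNN on success or the non-zero result of ctdb_op_disable()
 * on failure.
 */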
2531 static void disable_takeover_runs_handler(struct ctdb_context *ctdb,
2532 uint64_t srvid, TDB_DATA data,
2533 void *private_data)
2535 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2536 struct ctdb_recoverd);
2538 srvid_disable_and_reply(ctdb, data, rec->takeover_run);
2541 /* Backward compatibility for this SRVID */
2542 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
2543 TDB_DATA data, void *private_data)
2545 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2546 struct ctdb_recoverd);
2547 uint32_t timeout;
2549 if (data.dsize != sizeof(uint32_t)) {
2550 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2551 "expecting %lu\n", (long unsigned)data.dsize,
2552 (long unsigned)sizeof(uint32_t)));
2553 return;
2555 if (data.dptr == NULL) {
2556 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2557 return;
2560 timeout = *((uint32_t *)data.dptr);
2562 ctdb_op_disable(rec->takeover_run, ctdb->ev, timeout);
2565 static void disable_recoveries_handler(struct ctdb_context *ctdb,
2566 uint64_t srvid, TDB_DATA data,
2567 void *private_data)
2569 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2570 struct ctdb_recoverd);
2572 srvid_disable_and_reply(ctdb, data, rec->recovery);
2576 handler for ip reallocate, just add it to the list of requests and
2577 handle this later in the monitor_cluster loop so we do not recurse
2578 with other requests to takeover_run()
2580 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
2581 TDB_DATA data, void *private_data)
2583 struct srvid_request *request;
2584 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2585 struct ctdb_recoverd);
2587 if (data.dsize != sizeof(struct srvid_request)) {
2588 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2589 return;
2592 request = (struct srvid_request *)data.dptr;
2594 srvid_request_add(ctdb, &rec->reallocate_requests, request);
2597 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
2598 struct ctdb_recoverd *rec)
2600 TDB_DATA result;
2601 int32_t ret;
2602 uint32_t culprit;
2603 struct srvid_requests *current;
2605 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
2607 /* Only process requests that are currently pending. More
2608 * might come in while the takeover run is in progress and
2609 * they will need to be processed later since they might
2610 be in response to flag changes.
2612 current = rec->reallocate_requests;
2613 rec->reallocate_requests = NULL;
2615 /* update the list of public ips that a node can handle for
2616 all connected nodes
2618 ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
2619 if (ret != 0) {
2620 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2621 culprit));
2622 rec->need_takeover_run = true;
2624 if (ret == 0) {
2625 if (do_takeover_run(rec, rec->nodemap, false)) {
2626 ret = ctdb_get_pnn(ctdb);
2627 } else {
2628 ret = -1;
2632 result.dsize = sizeof(int32_t);
2633 result.dptr = (uint8_t *)&ret;
2635 srvid_requests_reply(ctdb, &current, result);
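/* Note the queue handling above: rec->reallocate_requests is detached
 * and reset to NULL before the takeover run starts, so requests that
 * arrive while the run is in progress are answered on a later pass
 * rather than with a possibly stale result.  Every detached request
 * then gets the same reply: this node's PNN on success, -1 on
 * failure.
 */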
2640 handler for recovery master elections
2642 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
2643 TDB_DATA data, void *private_data)
2645 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2646 int ret;
2647 struct election_message *em = (struct election_message *)data.dptr;
2649 /* Ignore election packets from ourself */
2650 if (ctdb->pnn == em->pnn) {
2651 return;
2654 /* we got an election packet - update the timeout for the election */
2655 talloc_free(rec->election_timeout);
2656 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2657 fast_start ?
2658 timeval_current_ofs(0, 500000) :
2659 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2660 ctdb_election_timeout, rec);
2662 /* someone called an election. check their election data
2663 and if we disagree and we would rather be the elected node,
2664 send a new election message to all other nodes
2666 if (ctdb_election_win(rec, em)) {
2667 if (!rec->send_election_te) {
2668 rec->send_election_te = event_add_timed(ctdb->ev, rec,
2669 timeval_current_ofs(0, 500000),
2670 election_send_request, rec);
2672 /*unban_all_nodes(ctdb);*/
2673 return;
2676 /* we didn't win */
2677 TALLOC_FREE(rec->send_election_te);
2679 /* Release the recovery lock file */
2680 if (ctdb_recovery_have_lock(ctdb)) {
2681 ctdb_recovery_unlock(ctdb);
2682 unban_all_nodes(ctdb);
2685 clear_ip_assignment_tree(ctdb);
2687 /* ok, let that guy become recmaster then */
2688 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
2689 if (ret != 0) {
2690 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
2691 return;
2694 return;
2699 force the start of the election process
2701 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2702 struct ctdb_node_map *nodemap)
2704 int ret;
2705 struct ctdb_context *ctdb = rec->ctdb;
2707 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2709 /* set all nodes to recovery mode to stop all internode traffic */
2710 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
2711 if (ret != 0) {
2712 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
2713 return;
2716 talloc_free(rec->election_timeout);
2717 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2718 fast_start ?
2719 timeval_current_ofs(0, 500000) :
2720 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2721 ctdb_election_timeout, rec);
2723 ret = send_election_request(rec, pnn);
2724 if (ret!=0) {
2725 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2726 return;
2729 /* wait for a few seconds to collect all responses */
2730 ctdb_wait_election(rec);
2736 handler for when a node changes its flags
2738 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
2739 TDB_DATA data, void *private_data)
2741 int ret;
2742 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2743 struct ctdb_node_map *nodemap=NULL;
2744 TALLOC_CTX *tmp_ctx;
2745 int i;
2746 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2747 int disabled_flag_changed;
2749 if (data.dsize != sizeof(*c)) {
2750 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2751 return;
2754 tmp_ctx = talloc_new(ctdb);
2755 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2757 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2758 if (ret != 0) {
2759 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2760 talloc_free(tmp_ctx);
2761 return;
2765 for (i=0;i<nodemap->num;i++) {
2766 if (nodemap->nodes[i].pnn == c->pnn) break;
2769 if (i == nodemap->num) {
2770 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existent node %u\n", c->pnn));
2771 talloc_free(tmp_ctx);
2772 return;
2775 if (c->old_flags != c->new_flags) {
2776 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
2779 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2781 nodemap->nodes[i].flags = c->new_flags;
2783 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2784 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2786 if (ret == 0) {
2787 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2788 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2791 if (ret == 0 &&
2792 ctdb->recovery_master == ctdb->pnn &&
2793 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2794 /* Only do the takeover run if the perm disabled or unhealthy
2795 flags changed since these will cause an ip failover but not
2796 a recovery.
2797 If the node became disconnected or banned this will also
2798 lead to an ip address failover but that is handled
2799 during recovery
2801 if (disabled_flag_changed) {
2802 rec->need_takeover_run = true;
2806 talloc_free(tmp_ctx);
2810 handler for when we need to push out flag changes to all other nodes
2812 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2813 TDB_DATA data, void *private_data)
2815 int ret;
2816 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2817 struct ctdb_node_map *nodemap=NULL;
2818 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2819 uint32_t recmaster;
2820 uint32_t *nodes;
2822 /* find the recovery master */
2823 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
2824 if (ret != 0) {
2825 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
2826 talloc_free(tmp_ctx);
2827 return;
2830 /* read the node flags from the recmaster */
2831 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
2832 if (ret != 0) {
2833 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2834 talloc_free(tmp_ctx);
2835 return;
2837 if (c->pnn >= nodemap->num) {
2838 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2839 talloc_free(tmp_ctx);
2840 return;
2843 /* send the flags update to all connected nodes */
2844 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2846 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2847 nodes, 0, CONTROL_TIMEOUT(),
2848 false, data,
2849 NULL, NULL,
2850 NULL) != 0) {
2851 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2853 talloc_free(tmp_ctx);
2854 return;
2857 talloc_free(tmp_ctx);
2861 struct verify_recmode_normal_data {
2862 uint32_t count;
2863 enum monitor_result status;
2866 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2868 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2871 /* one more node has responded with recmode data */
2872 rmdata->count--;
2874 /* if we failed to get the recmode, then return an error and let
2875 the main loop try again.
2877 if (state->state != CTDB_CONTROL_DONE) {
2878 if (rmdata->status == MONITOR_OK) {
2879 rmdata->status = MONITOR_FAILED;
2881 return;
2884 /* if we got a response, then the recmode will be stored in the
2885 status field
2887 if (state->status != CTDB_RECOVERY_NORMAL) {
2888 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2889 rmdata->status = MONITOR_RECOVERY_NEEDED;
2892 return;
2896 /* verify that all nodes are in normal recovery mode */
2897 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2899 struct verify_recmode_normal_data *rmdata;
2900 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2901 struct ctdb_client_control_state *state;
2902 enum monitor_result status;
2903 int j;
2905 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2906 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2907 rmdata->count = 0;
2908 rmdata->status = MONITOR_OK;
2910 /* loop over all active nodes and send an async getrecmode call to
2911 them*/
2912 for (j=0; j<nodemap->num; j++) {
2913 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2914 continue;
2916 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2917 CONTROL_TIMEOUT(),
2918 nodemap->nodes[j].pnn);
2919 if (state == NULL) {
2920 /* we failed to send the control, treat this as
2921 an error and try again next iteration
2923 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2924 talloc_free(mem_ctx);
2925 return MONITOR_FAILED;
2928 /* set up the callback functions */
2929 state->async.fn = verify_recmode_normal_callback;
2930 state->async.private_data = rmdata;
2932 /* one more control to wait for to complete */
2933 rmdata->count++;
2937 /* now wait for up to the maximum number of seconds allowed
2938 or until all nodes we expect a response from have replied
2940 while (rmdata->count > 0) {
2941 event_loop_once(ctdb->ev);
2944 status = rmdata->status;
2945 talloc_free(mem_ctx);
2946 return status;
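/* A sketch of the pattern used above (and again in verify_recmaster()
 * below): a getrecmode control is sent asynchronously to every active
 * node, rmdata->count tracks how many replies are still outstanding,
 * and event_loop_once() is pumped until the callbacks have brought
 * the counter back to zero.  A failed control or an unexpected
 * recovery mode downgrades rmdata->status, which is what the caller
 * acts on.
 */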
2950 struct verify_recmaster_data {
2951 struct ctdb_recoverd *rec;
2952 uint32_t count;
2953 uint32_t pnn;
2954 enum monitor_result status;
2957 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2959 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2962 /* one more node has responded with recmaster data */
2963 rmdata->count--;
2965 /* if we failed to get the recmaster, then return an error and let
2966 the main loop try again.
2968 if (state->state != CTDB_CONTROL_DONE) {
2969 if (rmdata->status == MONITOR_OK) {
2970 rmdata->status = MONITOR_FAILED;
2972 return;
2975 /* if we got a response, then the recmaster will be stored in the
2976 status field
2978 if (state->status != rmdata->pnn) {
2979 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
2980 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2981 rmdata->status = MONITOR_ELECTION_NEEDED;
2984 return;
2988 /* verify that all nodes agree that we are the recmaster */
2989 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2991 struct ctdb_context *ctdb = rec->ctdb;
2992 struct verify_recmaster_data *rmdata;
2993 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2994 struct ctdb_client_control_state *state;
2995 enum monitor_result status;
2996 int j;
2998 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2999 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
3000 rmdata->rec = rec;
3001 rmdata->count = 0;
3002 rmdata->pnn = pnn;
3003 rmdata->status = MONITOR_OK;
3005 /* loop over all active nodes and send an async getrecmaster call to
3006 them*/
3007 for (j=0; j<nodemap->num; j++) {
3008 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3009 continue;
3011 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
3012 CONTROL_TIMEOUT(),
3013 nodemap->nodes[j].pnn);
3014 if (state == NULL) {
3015 /* we failed to send the control, treat this as
3016 an error and try again next iteration
3018 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
3019 talloc_free(mem_ctx);
3020 return MONITOR_FAILED;
3023 /* set up the callback functions */
3024 state->async.fn = verify_recmaster_callback;
3025 state->async.private_data = rmdata;
3027 /* one more control to wait for to complete */
3028 rmdata->count++;
3032 /* now wait for up to the maximum number of seconds allowed
3033 or until all nodes we expect a response from have replied
3035 while (rmdata->count > 0) {
3036 event_loop_once(ctdb->ev);
3039 status = rmdata->status;
3040 talloc_free(mem_ctx);
3041 return status;
3044 static bool interfaces_have_changed(struct ctdb_context *ctdb,
3045 struct ctdb_recoverd *rec)
3047 struct ctdb_control_get_ifaces *ifaces = NULL;
3048 TALLOC_CTX *mem_ctx;
3049 bool ret = false;
3051 mem_ctx = talloc_new(NULL);
3053 /* Read the interfaces from the local node */
3054 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
3055 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
3056 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
3057 /* We could return an error. However, this will be
3058 * rare so we'll decide that the interfaces have
3059 * actually changed, just in case.
3061 talloc_free(mem_ctx);
3062 return true;
3065 if (!rec->ifaces) {
3066 /* We haven't been here before so things have changed */
3067 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
3068 ret = true;
3069 } else if (rec->ifaces->num != ifaces->num) {
3070 /* Number of interfaces has changed */
3071 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
3072 rec->ifaces->num, ifaces->num));
3073 ret = true;
3074 } else {
3075 /* See if interface names or link states have changed */
3076 int i;
3077 for (i = 0; i < rec->ifaces->num; i++) {
3078 struct ctdb_control_iface_info * iface = &rec->ifaces->ifaces[i];
3079 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
3080 DEBUG(DEBUG_NOTICE,
3081 ("Interface in slot %d changed: %s => %s\n",
3082 i, iface->name, ifaces->ifaces[i].name));
3083 ret = true;
3084 break;
3086 if (iface->link_state != ifaces->ifaces[i].link_state) {
3087 DEBUG(DEBUG_NOTICE,
3088 ("Interface %s changed state: %d => %d\n",
3089 iface->name, iface->link_state,
3090 ifaces->ifaces[i].link_state));
3091 ret = true;
3092 break;
3097 talloc_free(rec->ifaces);
3098 rec->ifaces = talloc_steal(rec, ifaces);
3100 talloc_free(mem_ctx);
3101 return ret;
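/* To summarise the checks above: the interface list is treated as
 * changed on the very first fetch, when the number of interfaces
 * differs, or when the interface in any slot has a different name or
 * link state.  The newly fetched list replaces rec->ifaces in every
 * case, so the next call compares against the latest snapshot.
 */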
3104 /* called to check that the local allocation of public ip addresses is ok.
3106 static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
3108 TALLOC_CTX *mem_ctx = talloc_new(NULL);
3109 struct ctdb_uptime *uptime1 = NULL;
3110 struct ctdb_uptime *uptime2 = NULL;
3111 int ret, j;
3112 bool need_takeover_run = false;
3114 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
3115 CTDB_CURRENT_NODE, &uptime1);
3116 if (ret != 0) {
3117 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
3118 talloc_free(mem_ctx);
3119 return -1;
3122 if (interfaces_have_changed(ctdb, rec)) {
3123 DEBUG(DEBUG_NOTICE, ("The interface status has changed on "
3124 "local node %u - force takeover run\n",
3125 pnn));
3126 need_takeover_run = true;
3129 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
3130 CTDB_CURRENT_NODE, &uptime2);
3131 if (ret != 0) {
3132 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
3133 talloc_free(mem_ctx);
3134 return -1;
3137 /* skip the check if the startrecovery time has changed */
3138 if (timeval_compare(&uptime1->last_recovery_started,
3139 &uptime2->last_recovery_started) != 0) {
3140 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3141 talloc_free(mem_ctx);
3142 return 0;
3145 /* skip the check if the endrecovery time has changed */
3146 if (timeval_compare(&uptime1->last_recovery_finished,
3147 &uptime2->last_recovery_finished) != 0) {
3148 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3149 talloc_free(mem_ctx);
3150 return 0;
3153 /* skip the check if we have started but not finished recovery */
3154 if (timeval_compare(&uptime1->last_recovery_finished,
3155 &uptime1->last_recovery_started) != 1) {
3156 DEBUG(DEBUG_INFO, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
3157 talloc_free(mem_ctx);
3159 return 0;
3162 /* verify that we have the ip addresses we should have
3163 and we don't have ones we shouldn't have.
3164 if we find an inconsistency we set recmode to
3165 active on the local node and wait for the recmaster
3166 to do a full blown recovery.
3167 also if the pnn is -1 and we are healthy and can host the ip
3168 we also request an ip reallocation.
3170 if (ctdb->tunable.disable_ip_failover == 0) {
3171 struct ctdb_all_public_ips *ips = NULL;
3173 /* read the *available* IPs from the local node */
3174 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
3175 if (ret != 0) {
3176 DEBUG(DEBUG_ERR, ("Unable to get available public IPs from local node %u\n", pnn));
3177 talloc_free(mem_ctx);
3178 return -1;
3181 for (j=0; j<ips->num; j++) {
3182 if (ips->ips[j].pnn == -1 &&
3183 nodemap->nodes[pnn].flags == 0) {
3184 DEBUG(DEBUG_CRIT,("Public IP '%s' is not assigned and we could serve it\n",
3185 ctdb_addr_to_str(&ips->ips[j].addr)));
3186 need_takeover_run = true;
3190 talloc_free(ips);
3192 /* read the *known* IPs from the local node */
3193 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
3194 if (ret != 0) {
3195 DEBUG(DEBUG_ERR, ("Unable to get known public IPs from local node %u\n", pnn));
3196 talloc_free(mem_ctx);
3197 return -1;
3200 for (j=0; j<ips->num; j++) {
3201 if (ips->ips[j].pnn == pnn) {
3202 if (ctdb->do_checkpublicip && !ctdb_sys_have_ip(&ips->ips[j].addr)) {
3203 DEBUG(DEBUG_CRIT,("Public IP '%s' is assigned to us but not on an interface\n",
3204 ctdb_addr_to_str(&ips->ips[j].addr)));
3205 need_takeover_run = true;
3207 } else {
3208 if (ctdb->do_checkpublicip &&
3209 ctdb_sys_have_ip(&ips->ips[j].addr)) {
3211 DEBUG(DEBUG_CRIT,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
3212 ctdb_addr_to_str(&ips->ips[j].addr)));
3214 if (ctdb_ctrl_release_ip(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ips->ips[j]) != 0) {
3215 DEBUG(DEBUG_ERR,("Failed to release local IP address\n"));
3222 if (need_takeover_run) {
3223 struct srvid_request rd;
3224 TDB_DATA data;
3226 DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
3228 rd.pnn = ctdb->pnn;
3229 rd.srvid = 0;
3230 data.dptr = (uint8_t *)&rd;
3231 data.dsize = sizeof(rd);
3233 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
3234 if (ret != 0) {
3235 DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
3238 talloc_free(mem_ctx);
3239 return 0;
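/* Putting the checks above together: a takeover run is requested when
 * the interface status has changed, when an unassigned public IP
 * could be served by this healthy node, or when an IP assigned to
 * this node is not actually present on one of its interfaces.  An IP
 * that is configured locally but should not be served is released on
 * the spot.  The takeover run itself is not performed here; the
 * function only messages the recovery master
 * (CTDB_SRVID_TAKEOVER_RUN) and the reallocation happens in its
 * monitoring loop.
 */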
3243 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
3245 struct ctdb_node_map **remote_nodemaps = callback_data;
3247 if (node_pnn >= ctdb->num_nodes) {
3248 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
3249 return;
3252 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
3256 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
3257 struct ctdb_node_map *nodemap,
3258 struct ctdb_node_map **remote_nodemaps)
3260 uint32_t *nodes;
3262 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
3263 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
3264 nodes, 0,
3265 CONTROL_TIMEOUT(), false, tdb_null,
3266 async_getnodemap_callback,
3267 NULL,
3268 remote_nodemaps) != 0) {
3269 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
3271 return -1;
3274 return 0;
3277 static int update_recovery_lock_file(struct ctdb_context *ctdb)
3279 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
3280 const char *reclockfile;
3282 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
3283 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
3284 talloc_free(tmp_ctx);
3285 return -1;
3288 if (reclockfile == NULL) {
3289 if (ctdb->recovery_lock_file != NULL) {
3290 DEBUG(DEBUG_NOTICE,("Recovery lock file disabled\n"));
3291 talloc_free(ctdb->recovery_lock_file);
3292 ctdb->recovery_lock_file = NULL;
3293 ctdb_recovery_unlock(ctdb);
3295 talloc_free(tmp_ctx);
3296 return 0;
3299 if (ctdb->recovery_lock_file == NULL) {
3300 DEBUG(DEBUG_NOTICE,
3301 ("Recovery lock file enabled (%s)\n", reclockfile));
3302 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3303 ctdb_recovery_unlock(ctdb);
3304 talloc_free(tmp_ctx);
3305 return 0;
3309 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
3310 talloc_free(tmp_ctx);
3311 return 0;
3314 DEBUG(DEBUG_NOTICE,
3315 ("Recovery lock file changed (now %s)\n", reclockfile));
3316 talloc_free(ctdb->recovery_lock_file);
3317 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3318 ctdb_recovery_unlock(ctdb);
3320 talloc_free(tmp_ctx);
3321 return 0;
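/* The cases above share one rule: whenever the reclock setting
 * changes (disabled, newly enabled, or pointed at a different file),
 * any currently held recovery lock is dropped via
 * ctdb_recovery_unlock() so that it is re-acquired under the new
 * setting.  If the setting is unchanged the function is a no-op.
 */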
3324 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
3325 TALLOC_CTX *mem_ctx)
3327 uint32_t pnn;
3328 struct ctdb_node_map *nodemap=NULL;
3329 struct ctdb_node_map *recmaster_nodemap=NULL;
3330 struct ctdb_node_map **remote_nodemaps=NULL;
3331 struct ctdb_vnn_map *vnnmap=NULL;
3332 struct ctdb_vnn_map *remote_vnnmap=NULL;
3333 uint32_t num_lmasters;
3334 int32_t debug_level;
3335 int i, j, ret;
3336 bool self_ban;
3339 /* verify that the main daemon is still running */
3340 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
3341 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3342 exit(-1);
3345 /* ping the local daemon to tell it we are alive */
3346 ctdb_ctrl_recd_ping(ctdb);
3348 if (rec->election_timeout) {
3349 /* an election is in progress */
3350 return;
3353 /* read the debug level from the parent and update locally */
3354 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
3355 if (ret !=0) {
3356 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
3357 return;
3359 DEBUGLEVEL = debug_level;
3361 /* get relevant tunables */
3362 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
3363 if (ret != 0) {
3364 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
3365 return;
3368 /* get runstate */
3369 ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
3370 CTDB_CURRENT_NODE, &ctdb->runstate);
3371 if (ret != 0) {
3372 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
3373 return;
3376 /* get the current recovery lock file from the server */
3377 if (update_recovery_lock_file(ctdb) != 0) {
3378 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
3379 return;
3382 /* Make sure that if recovery lock verification becomes disabled,
3383 we close the file
3385 if (ctdb->recovery_lock_file == NULL) {
3386 ctdb_recovery_unlock(ctdb);
3389 pnn = ctdb_get_pnn(ctdb);
3391 /* get the vnnmap */
3392 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
3393 if (ret != 0) {
3394 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
3395 return;
3399 /* get number of nodes */
3400 if (rec->nodemap) {
3401 talloc_free(rec->nodemap);
3402 rec->nodemap = NULL;
3403 nodemap=NULL;
3405 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
3406 if (ret != 0) {
3407 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
3408 return;
3410 nodemap = rec->nodemap;
3412 /* remember our own node flags */
3413 rec->node_flags = nodemap->nodes[pnn].flags;
3415 ban_misbehaving_nodes(rec, &self_ban);
3416 if (self_ban) {
3417 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
3418 return;
3421 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
3422 also frozen and that the recmode is set to active.
3424 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
3425 /* If this node has become inactive then we want to
3426 * reduce the chances of it taking over the recovery
3427 * master role when it becomes active again. This
3428 * helps to stabilise the recovery master role so that
3429 * it stays on the most stable node.
3431 rec->priority_time = timeval_current();
3433 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3434 if (ret != 0) {
3435 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3437 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3438 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
3440 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3441 if (ret != 0) {
3442 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
3444 return;
3446 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
3447 if (ret != 0) {
3448 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node in STOPPED or BANNED state\n"));
3449 return;
3453 /* If this node is stopped or banned then it is not the recovery
3454 * master, so don't do anything. This prevents a stopped or banned
3455 * node from starting an election and sending unnecessary controls.
3457 return;
3460 /* check which node is the recovery master */
3461 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
3462 if (ret != 0) {
3463 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
3464 return;
3467 /* If we are not the recmaster then do some housekeeping */
3468 if (rec->recmaster != pnn) {
3469 /* Ignore any IP reallocate requests - only recmaster
3470 * processes them
3472 TALLOC_FREE(rec->reallocate_requests);
3473 /* Clear any nodes that should be force rebalanced in
3474 * the next takeover run. If the recovery master role
3475 * has moved then we don't want to process these some
3476 * time in the future.
3478 TALLOC_FREE(rec->force_rebalance_nodes);
3481 /* This is a special case. When the recovery daemon is started, recmaster
3482 * is set to -1. If the node is not started in the stopped state, then
3483 * start an election to decide the recovery master
3485 if (rec->recmaster == (uint32_t)-1) {
3486 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
3487 force_election(rec, pnn, nodemap);
3488 return;
3491 /* update the capabilities for all nodes */
3492 ret = update_capabilities(rec, nodemap);
3493 if (ret != 0) {
3494 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
3495 return;
3499 * If the current recmaster does not have CTDB_CAP_RECMASTER,
3500 * but we have, then force an election and try to become the new
3501 * recmaster.
3503 if (!ctdb_node_has_capabilities(rec->caps,
3504 rec->recmaster,
3505 CTDB_CAP_RECMASTER) &&
3506 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
3507 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
3508 DEBUG(DEBUG_ERR, (__location__ " Current recmaster node %u does not have CAP_RECMASTER,"
3509 " but we (node %u) have - force an election\n",
3510 rec->recmaster, pnn));
3511 force_election(rec, pnn, nodemap);
3512 return;
3515 /* verify that the recmaster node is still active */
3516 for (j=0; j<nodemap->num; j++) {
3517 if (nodemap->nodes[j].pnn==rec->recmaster) {
3518 break;
3522 if (j == nodemap->num) {
3523 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
3524 force_election(rec, pnn, nodemap);
3525 return;
3528 /* if recovery master is disconnected we must elect a new recmaster */
3529 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
3530 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
3531 force_election(rec, pnn, nodemap);
3532 return;
3535 /* get nodemap from the recovery master to check if it is inactive */
3536 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3537 mem_ctx, &recmaster_nodemap);
3538 if (ret != 0) {
3539 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
3540 nodemap->nodes[j].pnn));
3541 return;
3545 if ((recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) &&
3546 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
3547 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
3549 * update our nodemap to carry the recmaster's notion of
3550 * its own flags, so that we don't keep freezing the
3551 * inactive recmaster node...
3553 nodemap->nodes[j].flags = recmaster_nodemap->nodes[j].flags;
3554 force_election(rec, pnn, nodemap);
3555 return;
3558 /* verify that we have all ip addresses we should have and we don't
3559 * have addresses we shouldn't have.
3561 if (ctdb->tunable.disable_ip_failover == 0 &&
3562 !ctdb_op_is_disabled(rec->takeover_run)) {
3563 if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
3564 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
3569 /* if we are not the recmaster then we do not need to check
3570 if recovery is needed
3572 if (pnn != rec->recmaster) {
3573 return;
3577 /* ensure our local copies of flags are right */
3578 ret = update_local_flags(rec, nodemap);
3579 if (ret == MONITOR_ELECTION_NEEDED) {
3580 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
3581 force_election(rec, pnn, nodemap);
3582 return;
3584 if (ret != MONITOR_OK) {
3585 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3586 return;
3589 if (ctdb->num_nodes != nodemap->num) {
3590 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3591 ctdb_load_nodes_file(ctdb);
3592 return;
3595 /* verify that all active nodes agree that we are the recmaster */
3596 switch (verify_recmaster(rec, nodemap, pnn)) {
3597 case MONITOR_RECOVERY_NEEDED:
3598 /* can not happen */
3599 return;
3600 case MONITOR_ELECTION_NEEDED:
3601 force_election(rec, pnn, nodemap);
3602 return;
3603 case MONITOR_OK:
3604 break;
3605 case MONITOR_FAILED:
3606 return;
3610 if (rec->need_recovery) {
3611 /* a previous recovery didn't finish */
3612 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3613 return;
3616 /* verify that all active nodes are in normal mode
3617 and not in recovery mode
3619 switch (verify_recmode(ctdb, nodemap)) {
3620 case MONITOR_RECOVERY_NEEDED:
3621 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3622 return;
3623 case MONITOR_FAILED:
3624 return;
3625 case MONITOR_ELECTION_NEEDED:
3626 /* can not happen */
3627 case MONITOR_OK:
3628 break;
3632 if (ctdb->recovery_lock_file != NULL) {
3633 /* We must already hold the recovery lock */
3634 if (!ctdb_recovery_have_lock(ctdb)) {
3635 DEBUG(DEBUG_ERR,("Failed recovery lock sanity check. Force a recovery\n"));
3636 ctdb_set_culprit(rec, ctdb->pnn);
3637 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3638 return;
3643 /* if there are takeovers requested, perform it and notify the waiters */
3644 if (!ctdb_op_is_disabled(rec->takeover_run) &&
3645 rec->reallocate_requests) {
3646 process_ipreallocate_requests(ctdb, rec);
3649 /* If recoveries are disabled then there is no use doing any
3650 * nodemap or flags checks. Recoveries might be disabled due
3651 * to "reloadnodes", so doing these checks might cause an
3652 * unnecessary recovery. */
3653 if (ctdb_op_is_disabled(rec->recovery)) {
3654 return;
3657 /* get the nodemap for all active remote nodes
3659 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3660 if (remote_nodemaps == NULL) {
3661 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3662 return;
3664 for(i=0; i<nodemap->num; i++) {
3665 remote_nodemaps[i] = NULL;
3667 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3668 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3669 return;
3672 /* verify that all other nodes have the same nodemap as we have
3674 for (j=0; j<nodemap->num; j++) {
3675 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3676 continue;
3679 if (remote_nodemaps[j] == NULL) {
3680 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3681 ctdb_set_culprit(rec, j);
3683 return;
3686 /* if the nodes disagree on how many nodes there are
3687 then this is a good reason to try recovery
3689 if (remote_nodemaps[j]->num != nodemap->num) {
3690 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3691 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3692 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3693 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3694 return;
3697 /* if the nodes disagree on which nodes exist and are
3698 active, then that is also a good reason to do recovery
3700 for (i=0;i<nodemap->num;i++) {
3701 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3702 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3703 nodemap->nodes[j].pnn, i,
3704 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3705 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3706 do_recovery(rec, mem_ctx, pnn, nodemap,
3707 vnnmap);
3708 return;
3714 * Update node flags obtained from each active node. This ensures we have
3715 * up-to-date information for all the nodes.
3717 for (j=0; j<nodemap->num; j++) {
3718 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3719 continue;
3721 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
3724 for (j=0; j<nodemap->num; j++) {
3725 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3726 continue;
3729 /* verify the flags are consistent
3731 for (i=0; i<nodemap->num; i++) {
3732 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3733 continue;
3736 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3737 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3738 nodemap->nodes[j].pnn,
3739 nodemap->nodes[i].pnn,
3740 remote_nodemaps[j]->nodes[i].flags,
3741 nodemap->nodes[i].flags));
3742 if (i == j) {
3743 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3744 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3745 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3746 do_recovery(rec, mem_ctx, pnn, nodemap,
3747 vnnmap);
3748 return;
3749 } else {
3750 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3751 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3752 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3753 do_recovery(rec, mem_ctx, pnn, nodemap,
3754 vnnmap);
3755 return;
3762 /* count how many active lmaster-capable nodes there are */
3763 num_lmasters = 0;
3764 for (i=0; i<nodemap->num; i++) {
3765 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
3766 if (ctdb_node_has_capabilities(rec->caps,
3767 ctdb->nodes[i]->pnn,
3768 CTDB_CAP_LMASTER)) {
3769 num_lmasters++;
3775 /* There must be the same number of lmasters in the vnn map as
3776 * there are active nodes with the lmaster capability... or
3777 * do a recovery.
3779 if (vnnmap->size != num_lmasters) {
3780 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
3781 vnnmap->size, num_lmasters));
3782 ctdb_set_culprit(rec, ctdb->pnn);
3783 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3784 return;
3787 /* verify that all active nodes in the nodemap also exist in
3788 the vnnmap.
3790 for (j=0; j<nodemap->num; j++) {
3791 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3792 continue;
3794 if (nodemap->nodes[j].pnn == pnn) {
3795 continue;
3798 for (i=0; i<vnnmap->size; i++) {
3799 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3800 break;
3803 if (i == vnnmap->size) {
3804 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3805 nodemap->nodes[j].pnn));
3806 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3807 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3808 return;
3813 /* verify that all other nodes have the same vnnmap
3814 and are from the same generation
3816 for (j=0; j<nodemap->num; j++) {
3817 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3818 continue;
3820 if (nodemap->nodes[j].pnn == pnn) {
3821 continue;
3824 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3825 mem_ctx, &remote_vnnmap);
3826 if (ret != 0) {
3827 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3828 nodemap->nodes[j].pnn));
3829 return;
3832 /* verify the vnnmap generation is the same */
3833 if (vnnmap->generation != remote_vnnmap->generation) {
3834 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3835 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3836 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3837 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3838 return;
3841 /* verify the vnnmap size is the same */
3842 if (vnnmap->size != remote_vnnmap->size) {
3843 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3844 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3845 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3846 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3847 return;
3850 /* verify the vnnmap is the same */
3851 for (i=0;i<vnnmap->size;i++) {
3852 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3853 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3854 nodemap->nodes[j].pnn));
3855 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3856 do_recovery(rec, mem_ctx, pnn, nodemap,
3857 vnnmap);
3858 return;
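/* To recap the consistency checks above, each of which forces a
 * do_recovery(): disagreement on the node count or node identities,
 * disagreement on node flags, a vnnmap whose size does not match the
 * number of active lmaster-capable nodes, an active node missing from
 * the vnnmap, or a remote vnnmap that differs from ours in
 * generation, size or content.
 */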
3863 /* we might need to change who has what IP assigned */
3864 if (rec->need_takeover_run) {
3865 uint32_t culprit = (uint32_t)-1;
3867 rec->need_takeover_run = false;
3869 /* update the list of public ips that a node can handle for
3870 all connected nodes
3872 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
3873 if (ret != 0) {
3874 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
3875 culprit));
3876 rec->need_takeover_run = true;
3877 return;
3880 /* execute the "startrecovery" event script on all nodes */
3881 ret = run_startrecovery_eventscript(rec, nodemap);
3882 if (ret!=0) {
3883 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
3884 ctdb_set_culprit(rec, ctdb->pnn);
3885 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3886 return;
3889 /* If the takeover run fails, the offending nodes are
3890 * assigned ban culprit counts and the takeover is retried.
3891 * If the takeover run fails repeatedly, the node will get
3892 * banned.
3894 * If rec->need_takeover_run is not set to true at this
3895 * failure, monitoring is disabled cluster-wide (via
3896 * startrecovery eventscript) and will not get enabled.
3898 if (!do_takeover_run(rec, nodemap, true)) {
3899 return;
3902 /* execute the "recovered" event script on all nodes */
3903 ret = run_recovered_eventscript(rec, nodemap, "monitor_cluster");
3904 #if 0
3905 // we can't check whether the event completed successfully
3906 // since this script WILL fail if the node is in recovery mode
3907 // and if that race happens, the code here would just cause a second
3908 // cascading recovery.
3909 if (ret!=0) {
3910 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3911 ctdb_set_culprit(rec, ctdb->pnn);
3912 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3913 }
3914 #endif
3915 }
3916 }
3918 /*
3919 the main monitoring loop
3920 */
3921 static void monitor_cluster(struct ctdb_context *ctdb)
3922 {
3923 struct ctdb_recoverd *rec;
3925 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
3927 rec = talloc_zero(ctdb, struct ctdb_recoverd);
3928 CTDB_NO_MEMORY_FATAL(ctdb, rec);
3930 rec->ctdb = ctdb;
3932 rec->takeover_run = ctdb_op_init(rec, "takeover runs");
3933 CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);
3935 rec->recovery = ctdb_op_init(rec, "recoveries");
3936 CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);
3938 rec->priority_time = timeval_current();
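/* The recovery daemon communicates with the local ctdbd and with recovery
   daemons on other nodes via SRVID messages; each
   ctdb_client_set_message_handler() call below binds one SRVID to the
   handler that services it. */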
3940 /* register a message port for sending memory dumps */
3941 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
3943 /* register a message port for recovery elections */
3944 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
3946 /* when nodes are disabled/enabled */
3947 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
3949 /* when we are asked to push out a flag change */
3950 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
3952 /* register a message port for vacuum fetch */
3953 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
3955 /* register a message port for reloadnodes */
3956 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
3958 /* register a message port for performing a takeover run */
3959 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
3961 /* register a message port for disabling the ip check for a short while */
3962 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
3964 /* register a message port for updating the recovery daemons node assignment for an ip */
3965 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);
3967 /* register a message port for forcing a rebalance of a node next
3968 reallocation */
3969 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
3971 /* Register a message port for disabling takeover runs */
3972 ctdb_client_set_message_handler(ctdb,
3973 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
3974 disable_takeover_runs_handler, rec);
3976 /* Register a message port for disabling recoveries */
3977 ctdb_client_set_message_handler(ctdb,
3978 CTDB_SRVID_DISABLE_RECOVERIES,
3979 disable_recoveries_handler, rec);
3981 /* register a message port for detaching database */
3982 ctdb_client_set_message_handler(ctdb,
3983 CTDB_SRVID_DETACH_DATABASE,
3984 detach_database_handler, rec);
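/* All message handlers are now registered; the daemon spends the rest of its
   life in the loop below, running main_loop() once per iteration. */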
3986 for (;;) {
3987 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3988 struct timeval start;
3989 double elapsed;
3991 if (!mem_ctx) {
3992 DEBUG(DEBUG_CRIT,(__location__
3993 " Failed to create temp context\n"));
3994 exit(-1);
3995 }
3997 start = timeval_current();
3998 main_loop(ctdb, rec, mem_ctx);
3999 talloc_free(mem_ctx);
4001 /* we only check for recovery once every recover interval (the RecoverInterval tunable, one second by default) */
4002 elapsed = timeval_elapsed(&start);
4003 if (elapsed < ctdb->tunable.recover_interval) {
4004 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
4005 - elapsed);
4006 }
4007 }
4008 }
4010 /*
4011 event handler for when the main ctdbd dies
4012 */
4013 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
4014 uint16_t flags, void *private_data)
4015 {
4016 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
4017 _exit(1);
4018 }
4020 /*
4021 called regularly to verify that the recovery daemon is still running
4022 */
4023 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
4024 struct timeval yt, void *p)
4025 {
4026 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
4028 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
4029 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
4031 event_add_timed(ctdb->ev, ctdb, timeval_zero(),
4032 ctdb_restart_recd, ctdb);
4034 return;
4035 }
4037 event_add_timed(ctdb->ev, ctdb->recd_ctx,
4038 timeval_current_ofs(30, 0),
4039 ctdb_check_recd, ctdb);
4040 }
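/* SIGCHLD handler: reap any exited children with waitpid(..., WNOHANG) so
   that processes forked by the recovery daemon do not linger as zombies. */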
4042 static void recd_sig_child_handler(struct event_context *ev,
4043 struct signal_event *se, int signum, int count,
4044 void *dont_care,
4045 void *private_data)
4046 {
4047 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4048 int status;
4049 pid_t pid = -1;
4051 while (pid != 0) {
4052 pid = waitpid(-1, &status, WNOHANG);
4053 if (pid == -1) {
4054 if (errno != ECHILD) {
4055 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
4056 }
4057 return;
4058 }
4059 if (pid > 0) {
4060 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
4061 }
4062 }
4063 }
4065 /*
4066 startup the recovery daemon as a child of the main ctdb daemon
4067 */
4068 int ctdb_start_recoverd(struct ctdb_context *ctdb)
4069 {
4070 int fd[2];
4071 struct signal_event *se;
4072 struct tevent_fd *fde;
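/* The pipe created below exists only for parent-death detection: the parent
   keeps the write end and the child watches the read end, so when the main
   ctdbd exits the read end hits EOF and ctdb_recoverd_parent() terminates the
   recovery daemon (see the event_add_fd() call further down). */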
4074 if (pipe(fd) != 0) {
4075 return -1;
4076 }
4078 ctdb->recoverd_pid = ctdb_fork(ctdb);
4079 if (ctdb->recoverd_pid == -1) {
4080 return -1;
4081 }
4083 if (ctdb->recoverd_pid != 0) {
4084 talloc_free(ctdb->recd_ctx);
4085 ctdb->recd_ctx = talloc_new(ctdb);
4086 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
4088 close(fd[0]);
4089 event_add_timed(ctdb->ev, ctdb->recd_ctx,
4090 timeval_current_ofs(30, 0),
4091 ctdb_check_recd, ctdb);
4092 return 0;
4093 }
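/* Child process: from here on this is the recovery daemon itself. */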
4095 close(fd[1]);
4097 srandom(getpid() ^ time(NULL));
4099 ctdb_set_process_name("ctdb_recovered");
4100 if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
4101 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
4102 exit(1);
4103 }
4105 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
4107 fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
4108 ctdb_recoverd_parent, &fd[0]);
4109 tevent_fd_set_auto_close(fde);
4111 /* set up a handler to pick up sigchld */
4112 se = event_add_signal(ctdb->ev, ctdb,
4113 SIGCHLD, 0,
4114 recd_sig_child_handler,
4115 ctdb);
4116 if (se == NULL) {
4117 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
4118 exit(1);
4119 }
4121 monitor_cluster(ctdb);
4123 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
4124 return -1;
4125 }
4127 /*
4128 shutdown the recovery daemon
4129 */
4130 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
4131 {
4132 if (ctdb->recoverd_pid == 0) {
4133 return;
4134 }
4136 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
4137 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
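/* Freeing recd_ctx below also removes the ctdb_check_recd() timer that was
   allocated on it, so a deliberately stopped recovery daemon is not
   immediately restarted by the liveness check. */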
4139 TALLOC_FREE(ctdb->recd_ctx);
4140 TALLOC_FREE(ctdb->recd_ping_count);
4141 }
4143 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te,
4144 struct timeval t, void *private_data)
4145 {
4146 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4148 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
4149 ctdb_stop_recoverd(ctdb);
4150 ctdb_start_recoverd(ctdb);
4151 }
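/* Overview of the recovery daemon lifecycle as seen in this part of the file:

   ctdb_start_recoverd()  - forks the daemon; the parent arms ctdb_check_recd()
                            to poll the child every 30 seconds and restart it
                            via ctdb_restart_recd() if it has exited.
   monitor_cluster()      - the child's endless loop: register the SRVID
                            handlers, then run main_loop() once per recover
                            interval.
   ctdb_recoverd_parent() - exits the child when the main ctdbd goes away.
   ctdb_stop_recoverd()   - sends SIGTERM to the child for an orderly
                            shutdown. */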