ctdb/server/ctdb_recoverd.c
1 /*
2 ctdb recovery daemon
4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
20 #include "replace.h"
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
26 #include <popt.h>
27 #include <talloc.h>
28 #include <tevent.h>
29 #include <tdb.h>
31 #include "lib/tdb_wrap/tdb_wrap.h"
32 #include "lib/util/dlinklist.h"
33 #include "lib/util/debug.h"
34 #include "lib/util/samba_util.h"
35 #include "lib/util/util_process.h"
37 #include "ctdb_private.h"
38 #include "ctdb_client.h"
40 #include "common/system.h"
41 #include "common/cmdline.h"
42 #include "common/common.h"
43 #include "common/logging.h"
45 #include "ctdb_cluster_mutex.h"
47 /* List of SRVID requests that need to be processed */
48 struct srvid_list {
49 struct srvid_list *next, *prev;
50 struct ctdb_srvid_message *request;
53 struct srvid_requests {
54 struct srvid_list *requests;
57 static void srvid_request_reply(struct ctdb_context *ctdb,
58 struct ctdb_srvid_message *request,
59 TDB_DATA result)
61 /* Someone that sent srvid==0 does not want a reply */
62 if (request->srvid == 0) {
63 talloc_free(request);
64 return;
67 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
68 result) == 0) {
69 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
70 (unsigned)request->pnn,
71 (unsigned long long)request->srvid));
72 } else {
73 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
74 (unsigned)request->pnn,
75 (unsigned long long)request->srvid));
78 talloc_free(request);
81 static void srvid_requests_reply(struct ctdb_context *ctdb,
82 struct srvid_requests **requests,
83 TDB_DATA result)
85 struct srvid_list *r;
87 if (*requests == NULL) {
88 return;
91 for (r = (*requests)->requests; r != NULL; r = r->next) {
92 srvid_request_reply(ctdb, r->request, result);
95 /* Free the list structure... */
96 TALLOC_FREE(*requests);
99 static void srvid_request_add(struct ctdb_context *ctdb,
100 struct srvid_requests **requests,
101 struct ctdb_srvid_message *request)
103 struct srvid_list *t;
104 int32_t ret;
105 TDB_DATA result;
107 if (*requests == NULL) {
108 *requests = talloc_zero(ctdb, struct srvid_requests);
109 if (*requests == NULL) {
110 goto nomem;
114 t = talloc_zero(*requests, struct srvid_list);
115 if (t == NULL) {
116 /* If *requests was just allocated above then free it */
117 if ((*requests)->requests == NULL) {
118 TALLOC_FREE(*requests);
120 goto nomem;
123 t->request = (struct ctdb_srvid_message *)talloc_steal(t, request);
124 DLIST_ADD((*requests)->requests, t);
126 return;
128 nomem:
129 /* Failed to add the request to the list. Send a fail. */
130 DEBUG(DEBUG_ERR, (__location__
131 " Out of memory, failed to queue SRVID request\n"));
132 ret = -ENOMEM;
133 result.dsize = sizeof(ret);
134 result.dptr = (uint8_t *)&ret;
135 srvid_request_reply(ctdb, request, result);
138 /* An abstraction to allow an operation (takeover runs, recoveries,
139 * ...) to be disabled for a given timeout */
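/* As implemented below: a non-NULL timer means the operation is
 * currently disabled, and in_progress blocks a new disable request
 * while an operation of this kind is still running. */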
140 struct ctdb_op_state {
141 struct tevent_timer *timer;
142 bool in_progress;
143 const char *name;
146 static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
148 struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
150 if (state != NULL) {
151 state->in_progress = false;
152 state->name = name;
155 return state;
158 static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
160 return state->timer != NULL;
163 static bool ctdb_op_begin(struct ctdb_op_state *state)
165 if (ctdb_op_is_disabled(state)) {
166 DEBUG(DEBUG_NOTICE,
167 ("Unable to begin - %s are disabled\n", state->name));
168 return false;
171 state->in_progress = true;
172 return true;
175 static bool ctdb_op_end(struct ctdb_op_state *state)
177 return state->in_progress = false;
180 static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
182 return state->in_progress;
185 static void ctdb_op_enable(struct ctdb_op_state *state)
187 TALLOC_FREE(state->timer);
190 static void ctdb_op_timeout_handler(struct tevent_context *ev,
191 struct tevent_timer *te,
192 struct timeval yt, void *p)
194 struct ctdb_op_state *state =
195 talloc_get_type(p, struct ctdb_op_state);
197 DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
198 ctdb_op_enable(state);
201 static int ctdb_op_disable(struct ctdb_op_state *state,
202 struct tevent_context *ev,
203 uint32_t timeout)
205 if (timeout == 0) {
206 DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
207 ctdb_op_enable(state);
208 return 0;
211 if (state->in_progress) {
212 DEBUG(DEBUG_ERR,
213 ("Unable to disable %s - in progress\n", state->name));
214 return -EAGAIN;
217 DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
218 state->name, timeout));
220 /* Clear any old timers */
221 talloc_free(state->timer);
223 /* Arrange for the timeout to occur */
224 state->timer = tevent_add_timer(ev, state,
225 timeval_current_ofs(timeout, 0),
226 ctdb_op_timeout_handler, state);
227 if (state->timer == NULL) {
228 DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
229 return -ENOMEM;
232 return 0;
235 struct ctdb_banning_state {
236 uint32_t count;
237 struct timeval last_reported_time;
241 private state of recovery daemon
243 struct ctdb_recoverd {
244 struct ctdb_context *ctdb;
245 uint32_t recmaster;
246 uint32_t last_culprit_node;
247 struct ctdb_node_map_old *nodemap;
248 struct timeval priority_time;
249 bool need_takeover_run;
250 bool need_recovery;
251 uint32_t node_flags;
252 struct tevent_timer *send_election_te;
253 struct tevent_timer *election_timeout;
254 struct srvid_requests *reallocate_requests;
255 struct ctdb_op_state *takeover_run;
256 struct ctdb_op_state *recovery;
257 struct ctdb_iface_list_old *ifaces;
258 uint32_t *force_rebalance_nodes;
259 struct ctdb_node_capabilities *caps;
260 bool frozen_on_inactive;
261 struct ctdb_cluster_mutex_handle *recovery_lock_handle;
264 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
265 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
267 static void ctdb_restart_recd(struct tevent_context *ev,
268 struct tevent_timer *te, struct timeval t,
269 void *private_data);
272 ban a node for a period of time
274 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
276 int ret;
277 struct ctdb_context *ctdb = rec->ctdb;
278 struct ctdb_ban_state bantime;
280 if (!ctdb_validate_pnn(ctdb, pnn)) {
281 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
282 return;
285 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
287 bantime.pnn = pnn;
288 bantime.time = ban_time;
290 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
291 if (ret != 0) {
292 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
293 return;
298 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
302 remember the trouble maker
304 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
306 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
307 struct ctdb_banning_state *ban_state;
309 if (culprit >= ctdb->num_nodes) {
310 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
311 return;
314 /* If we are banned or stopped, do not set other nodes as culprits */
315 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
316 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
317 return;
320 if (ctdb->nodes[culprit]->ban_state == NULL) {
321 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
322 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
326 ban_state = ctdb->nodes[culprit]->ban_state;
327 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
328 /* this was the first time in a long while this node
329 misbehaved so we will forgive any old transgressions.
331 ban_state->count = 0;
334 ban_state->count += count;
335 ban_state->last_reported_time = timeval_current();
336 rec->last_culprit_node = culprit;
340 remember the trouble maker
342 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
344 ctdb_set_culprit_count(rec, culprit, 1);
348 /* this callback is called for every node that failed to execute the
349 recovered event
351 static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
353 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
355 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));
357 ctdb_set_culprit(rec, node_pnn);
361 run the "recovered" eventscript on all nodes
363 static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, const char *caller)
365 TALLOC_CTX *tmp_ctx;
366 uint32_t *nodes;
367 struct ctdb_context *ctdb = rec->ctdb;
369 tmp_ctx = talloc_new(ctdb);
370 CTDB_NO_MEMORY(ctdb, tmp_ctx);
372 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
373 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
374 nodes, 0,
375 CONTROL_TIMEOUT(), false, tdb_null,
376 NULL, recovered_fail_callback,
377 rec) != 0) {
378 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
380 talloc_free(tmp_ctx);
381 return -1;
384 talloc_free(tmp_ctx);
385 return 0;
388 /* this callback is called for every node that failed to execute the
389 start recovery event
391 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
393 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
395 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
397 ctdb_set_culprit(rec, node_pnn);
401 run the "startrecovery" eventscript on all nodes
403 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
405 TALLOC_CTX *tmp_ctx;
406 uint32_t *nodes;
407 struct ctdb_context *ctdb = rec->ctdb;
409 tmp_ctx = talloc_new(ctdb);
410 CTDB_NO_MEMORY(ctdb, tmp_ctx);
412 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
413 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
414 nodes, 0,
415 CONTROL_TIMEOUT(), false, tdb_null,
416 NULL,
417 startrecovery_fail_callback,
418 rec) != 0) {
419 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
420 talloc_free(tmp_ctx);
421 return -1;
424 talloc_free(tmp_ctx);
425 return 0;
429 Retrieve capabilities from all connected nodes
431 static int update_capabilities(struct ctdb_recoverd *rec,
432 struct ctdb_node_map_old *nodemap)
434 uint32_t *capp;
435 TALLOC_CTX *tmp_ctx;
436 struct ctdb_node_capabilities *caps;
437 struct ctdb_context *ctdb = rec->ctdb;
439 tmp_ctx = talloc_new(rec);
440 CTDB_NO_MEMORY(ctdb, tmp_ctx);
442 caps = ctdb_get_capabilities(ctdb, tmp_ctx,
443 CONTROL_TIMEOUT(), nodemap);
445 if (caps == NULL) {
446 DEBUG(DEBUG_ERR,
447 (__location__ " Failed to get node capabilities\n"));
448 talloc_free(tmp_ctx);
449 return -1;
452 capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
453 if (capp == NULL) {
454 DEBUG(DEBUG_ERR,
455 (__location__
456 " Capabilities don't include current node.\n"));
457 talloc_free(tmp_ctx);
458 return -1;
460 ctdb->capabilities = *capp;
462 TALLOC_FREE(rec->caps);
463 rec->caps = talloc_steal(rec, caps);
465 talloc_free(tmp_ctx);
466 return 0;
469 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
471 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
473 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
474 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
477 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
479 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
481 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
482 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
486 change recovery mode on all nodes
488 static int set_recovery_mode(struct ctdb_context *ctdb,
489 struct ctdb_recoverd *rec,
490 struct ctdb_node_map_old *nodemap,
491 uint32_t rec_mode, bool freeze)
493 TDB_DATA data;
494 uint32_t *nodes;
495 TALLOC_CTX *tmp_ctx;
497 tmp_ctx = talloc_new(ctdb);
498 CTDB_NO_MEMORY(ctdb, tmp_ctx);
500 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
502 data.dsize = sizeof(uint32_t);
503 data.dptr = (unsigned char *)&rec_mode;
505 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
506 nodes, 0,
507 CONTROL_TIMEOUT(),
508 false, data,
509 NULL, NULL,
510 NULL) != 0) {
511 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
512 talloc_free(tmp_ctx);
513 return -1;
516 /* freeze all nodes */
517 if (freeze && rec_mode == CTDB_RECOVERY_ACTIVE) {
518 int i;
520 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
521 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
522 nodes, i,
523 CONTROL_TIMEOUT(),
524 false, tdb_null,
525 NULL,
526 set_recmode_fail_callback,
527 rec) != 0) {
528 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
529 talloc_free(tmp_ctx);
530 return -1;
535 talloc_free(tmp_ctx);
536 return 0;
539 /* update all remote nodes to use the same db priority that we have
540 this can fail if the remote node has not yet been upgraded to
541 support this function, so we always return success and never fail
542 a recovery if this call fails.
544 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
545 struct ctdb_node_map_old *nodemap,
546 uint32_t pnn, struct ctdb_dbid_map_old *dbmap, TALLOC_CTX *mem_ctx)
548 int db;
550 /* step through all local databases */
551 for (db=0; db<dbmap->num;db++) {
552 struct ctdb_db_priority db_prio;
553 int ret;
555 db_prio.db_id = dbmap->dbs[db].db_id;
556 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].db_id, &db_prio.priority);
557 if (ret != 0) {
558 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].db_id));
559 continue;
562 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].db_id, db_prio.priority));
564 ret = ctdb_ctrl_set_db_priority(ctdb, CONTROL_TIMEOUT(),
565 CTDB_CURRENT_NODE, &db_prio);
566 if (ret != 0) {
567 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n",
568 db_prio.db_id));
572 return 0;
576 ensure all other nodes have attached to any databases that we have
578 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
579 uint32_t pnn, struct ctdb_dbid_map_old *dbmap, TALLOC_CTX *mem_ctx)
581 int i, j, db, ret;
582 struct ctdb_dbid_map_old *remote_dbmap;
584 /* verify that all other nodes have all our databases */
585 for (j=0; j<nodemap->num; j++) {
586 /* we don't need to check ourselves */
587 if (nodemap->nodes[j].pnn == pnn) {
588 continue;
590 /* don't check nodes that are unavailable */
591 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
592 continue;
595 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
596 mem_ctx, &remote_dbmap);
597 if (ret != 0) {
598 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
599 return -1;
602 /* step through all local databases */
603 for (db=0; db<dbmap->num;db++) {
604 const char *name;
607 for (i=0;i<remote_dbmap->num;i++) {
608 if (dbmap->dbs[db].db_id == remote_dbmap->dbs[i].db_id) {
609 break;
612 /* the remote node already has this database */
613 if (i!=remote_dbmap->num) {
614 continue;
616 /* ok so we need to create this database */
617 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
618 dbmap->dbs[db].db_id, mem_ctx,
619 &name);
620 if (ret != 0) {
621 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
622 return -1;
624 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
625 nodemap->nodes[j].pnn,
626 mem_ctx, name,
627 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
628 if (ret != 0) {
629 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
630 return -1;
635 return 0;
640 ensure we are attached to any databases that anyone else is attached to
642 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
643 uint32_t pnn, struct ctdb_dbid_map_old **dbmap, TALLOC_CTX *mem_ctx)
645 int i, j, db, ret;
646 struct ctdb_dbid_map_old *remote_dbmap;
648 /* verify that we have all databases any other node has */
649 for (j=0; j<nodemap->num; j++) {
650 /* we don't need to check ourselves */
651 if (nodemap->nodes[j].pnn == pnn) {
652 continue;
654 /* don't check nodes that are unavailable */
655 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
656 continue;
659 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
660 mem_ctx, &remote_dbmap);
661 if (ret != 0) {
662 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
663 return -1;
666 /* step through all databases on the remote node */
667 for (db=0; db<remote_dbmap->num;db++) {
668 const char *name;
670 for (i=0;i<(*dbmap)->num;i++) {
671 if (remote_dbmap->dbs[db].db_id == (*dbmap)->dbs[i].db_id) {
672 break;
675 /* we already have this db locally */
676 if (i!=(*dbmap)->num) {
677 continue;
679 /* ok so we need to create this database and
680 rebuild dbmap
682 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
683 remote_dbmap->dbs[db].db_id, mem_ctx, &name);
684 if (ret != 0) {
685 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
686 nodemap->nodes[j].pnn));
687 return -1;
689 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
690 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
691 if (ret != 0) {
692 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
693 return -1;
695 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
696 if (ret != 0) {
697 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
698 return -1;
703 return 0;
708 pull the remote database contents from one node into the recdb
710 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
711 struct tdb_wrap *recdb, uint32_t dbid)
713 int ret;
714 TDB_DATA outdata;
715 struct ctdb_marshall_buffer *reply;
716 struct ctdb_rec_data_old *recdata;
717 int i;
718 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
720 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
721 CONTROL_TIMEOUT(), &outdata);
722 if (ret != 0) {
723 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
724 talloc_free(tmp_ctx);
725 return -1;
728 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
730 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
731 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
732 talloc_free(tmp_ctx);
733 return -1;
736 recdata = (struct ctdb_rec_data_old *)&reply->data[0];
738 for (i=0;
739 i<reply->count;
740 recdata = (struct ctdb_rec_data_old *)(recdata->length + (uint8_t *)recdata), i++) {
741 TDB_DATA key, data;
742 struct ctdb_ltdb_header *hdr;
743 TDB_DATA existing;
745 key.dptr = &recdata->data[0];
746 key.dsize = recdata->keylen;
747 data.dptr = &recdata->data[key.dsize];
748 data.dsize = recdata->datalen;
750 hdr = (struct ctdb_ltdb_header *)data.dptr;
752 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
753 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
754 talloc_free(tmp_ctx);
755 return -1;
758 /* fetch the existing record, if any */
759 existing = tdb_fetch(recdb->tdb, key);
761 if (existing.dptr != NULL) {
762 struct ctdb_ltdb_header header;
763 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
764 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
765 (unsigned)existing.dsize, srcnode));
766 free(existing.dptr);
767 talloc_free(tmp_ctx);
768 return -1;
770 header = *(struct ctdb_ltdb_header *)existing.dptr;
771 free(existing.dptr);
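/* Only take the pulled copy when it has a strictly higher RSN than the
 * existing record, or an equal RSN while the existing copy's dmaster
 * is not this node; otherwise keep what we already have. */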
772 if (!(header.rsn < hdr->rsn ||
773 (header.dmaster != ctdb_get_pnn(ctdb) &&
774 header.rsn == hdr->rsn))) {
775 continue;
779 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
780 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
781 talloc_free(tmp_ctx);
782 return -1;
786 talloc_free(tmp_ctx);
788 return 0;
792 struct pull_seqnum_cbdata {
793 int failed;
794 uint32_t pnn;
795 uint64_t seqnum;
798 static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
800 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
801 uint64_t seqnum;
803 if (cb_data->failed != 0) {
804 DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
805 return;
808 if (res != 0) {
809 DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
810 cb_data->failed = 1;
811 return;
814 if (outdata.dsize != sizeof(uint64_t)) {
815 DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
816 cb_data->failed = -1;
817 return;
820 seqnum = *((uint64_t *)outdata.dptr);
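/* Track the node reporting the highest sequence number; the first
 * responder is accepted even with seqnum 0 (pnn still -1) so that a
 * source node is always chosen. */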
822 if (seqnum > cb_data->seqnum ||
823 (cb_data->pnn == -1 && seqnum == 0)) {
824 cb_data->seqnum = seqnum;
825 cb_data->pnn = node_pnn;
829 static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
831 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
833 DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
834 cb_data->failed = 1;
837 static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
838 struct ctdb_recoverd *rec,
839 struct ctdb_node_map_old *nodemap,
840 struct tdb_wrap *recdb, uint32_t dbid)
842 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
843 uint32_t *nodes;
844 TDB_DATA data;
845 uint32_t outdata[2];
846 struct pull_seqnum_cbdata *cb_data;
848 DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));
850 outdata[0] = dbid;
851 outdata[1] = 0;
853 data.dsize = sizeof(outdata);
854 data.dptr = (uint8_t *)&outdata[0];
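/* The two uint32_t words above make up the 8-byte db_id payload for
 * CTDB_CONTROL_GET_DB_SEQNUM; the second word is just zero padding. */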
856 cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
857 if (cb_data == NULL) {
858 DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
859 talloc_free(tmp_ctx);
860 return -1;
863 cb_data->failed = 0;
864 cb_data->pnn = -1;
865 cb_data->seqnum = 0;
867 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
868 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
869 nodes, 0,
870 CONTROL_TIMEOUT(), false, data,
871 pull_seqnum_cb,
872 pull_seqnum_fail_cb,
873 cb_data) != 0) {
874 DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));
876 talloc_free(tmp_ctx);
877 return -1;
880 if (cb_data->failed != 0) {
881 DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
882 talloc_free(tmp_ctx);
883 return -1;
886 if (cb_data->pnn == -1) {
887 DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
888 talloc_free(tmp_ctx);
889 return -1;
892 DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum));
894 if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
895 DEBUG(DEBUG_ERR, ("Failed to pull higest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
896 talloc_free(tmp_ctx);
897 return -1;
900 talloc_free(tmp_ctx);
901 return 0;
906 pull all the remote database contents into the recdb
908 static int pull_remote_database(struct ctdb_context *ctdb,
909 struct ctdb_recoverd *rec,
910 struct ctdb_node_map_old *nodemap,
911 struct tdb_wrap *recdb, uint32_t dbid,
912 bool persistent)
914 int j;
916 if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
917 int ret;
918 ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
919 if (ret == 0) {
920 return 0;
924 /* pull all records from all other nodes across onto this node
925 (this merges based on rsn)
927 for (j=0; j<nodemap->num; j++) {
928 /* don't merge from nodes that are unavailable */
929 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
930 continue;
932 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
933 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
934 nodemap->nodes[j].pnn));
935 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
936 return -1;
940 return 0;
945 update flags on all active nodes
947 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap, uint32_t pnn, uint32_t flags)
949 int ret;
951 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
952 if (ret != 0) {
953 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
954 return -1;
957 return 0;
961 ensure all nodes have the same vnnmap we do
963 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
964 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
966 int j, ret;
968 /* push the new vnn map out to all the nodes */
969 for (j=0; j<nodemap->num; j++) {
970 /* don't push to nodes that are unavailable */
971 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
972 continue;
975 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
976 if (ret != 0) {
977 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
978 return -1;
982 return 0;
987 called when a vacuum fetch has completed - just free it and do the next one
989 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
991 talloc_free(state);
996 * Process one element of the vacuum fetch list:
997 * Migrate it over to us with the special flag
998 * CTDB_CALL_FLAG_VACUUM_MIGRATION.
1000 static bool vacuum_fetch_process_one(struct ctdb_db_context *ctdb_db,
1001 uint32_t pnn,
1002 struct ctdb_rec_data_old *r)
1004 struct ctdb_client_call_state *state;
1005 TDB_DATA data;
1006 struct ctdb_ltdb_header *hdr;
1007 struct ctdb_call call;
1009 ZERO_STRUCT(call);
1010 call.call_id = CTDB_NULL_FUNC;
1011 call.flags = CTDB_IMMEDIATE_MIGRATION;
1012 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
1014 call.key.dptr = &r->data[0];
1015 call.key.dsize = r->keylen;
1017 /* ensure we don't block this daemon - just skip a record if we can't get
1018 the chainlock */
1019 if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, call.key) != 0) {
1020 return true;
1023 data = tdb_fetch(ctdb_db->ltdb->tdb, call.key);
1024 if (data.dptr == NULL) {
1025 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
1026 return true;
1029 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
1030 free(data.dptr);
1031 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
1032 return true;
1035 hdr = (struct ctdb_ltdb_header *)data.dptr;
1036 if (hdr->dmaster == pnn) {
1037 /* it's already local */
1038 free(data.dptr);
1039 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
1040 return true;
1043 free(data.dptr);
1045 state = ctdb_call_send(ctdb_db, &call);
1046 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
1047 if (state == NULL) {
1048 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
1049 return false;
1051 state->async.fn = vacuum_fetch_callback;
1052 state->async.private_data = NULL;
1054 return true;
1059 handler for vacuum fetch
1061 static void vacuum_fetch_handler(uint64_t srvid, TDB_DATA data,
1062 void *private_data)
1064 struct ctdb_recoverd *rec = talloc_get_type(
1065 private_data, struct ctdb_recoverd);
1066 struct ctdb_context *ctdb = rec->ctdb;
1067 struct ctdb_marshall_buffer *recs;
1068 int ret, i;
1069 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1070 const char *name;
1071 struct ctdb_dbid_map_old *dbmap=NULL;
1072 bool persistent = false;
1073 struct ctdb_db_context *ctdb_db;
1074 struct ctdb_rec_data_old *r;
1076 recs = (struct ctdb_marshall_buffer *)data.dptr;
1078 if (recs->count == 0) {
1079 goto done;
1082 /* work out if the database is persistent */
1083 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
1084 if (ret != 0) {
1085 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
1086 goto done;
1089 for (i=0;i<dbmap->num;i++) {
1090 if (dbmap->dbs[i].db_id == recs->db_id) {
1091 persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
1092 break;
1095 if (i == dbmap->num) {
1096 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
1097 goto done;
1100 /* find the name of this database */
1101 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
1102 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
1103 goto done;
1106 /* attach to it */
1107 ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
1108 if (ctdb_db == NULL) {
1109 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
1110 goto done;
1113 r = (struct ctdb_rec_data_old *)&recs->data[0];
1114 while (recs->count) {
1115 bool ok;
1117 ok = vacuum_fetch_process_one(ctdb_db, rec->ctdb->pnn, r);
1118 if (!ok) {
1119 break;
1122 r = (struct ctdb_rec_data_old *)(r->length + (uint8_t *)r);
1123 recs->count--;
1126 done:
1127 talloc_free(tmp_ctx);
1132 * handler for database detach
1134 static void detach_database_handler(uint64_t srvid, TDB_DATA data,
1135 void *private_data)
1137 struct ctdb_recoverd *rec = talloc_get_type(
1138 private_data, struct ctdb_recoverd);
1139 struct ctdb_context *ctdb = rec->ctdb;
1140 uint32_t db_id;
1141 struct ctdb_db_context *ctdb_db;
1143 if (data.dsize != sizeof(db_id)) {
1144 return;
1146 db_id = *(uint32_t *)data.dptr;
1148 ctdb_db = find_ctdb_db(ctdb, db_id);
1149 if (ctdb_db == NULL) {
1150 /* database is not attached */
1151 return;
1154 DLIST_REMOVE(ctdb->db_list, ctdb_db);
1156 DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
1157 ctdb_db->db_name));
1158 talloc_free(ctdb_db);
1162 called when ctdb_wait_timeout should finish
1164 static void ctdb_wait_handler(struct tevent_context *ev,
1165 struct tevent_timer *te,
1166 struct timeval yt, void *p)
1168 uint32_t *timed_out = (uint32_t *)p;
1169 (*timed_out) = 1;
1173 wait for a given number of seconds
1175 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
1177 uint32_t timed_out = 0;
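/* Split the timeout into whole seconds and microseconds for
 * timeval_current_ofs() below. */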
1178 time_t usecs = (secs - (time_t)secs) * 1000000;
1179 tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
1180 ctdb_wait_handler, &timed_out);
1181 while (!timed_out) {
1182 tevent_loop_once(ctdb->ev);
1187 called when an election times out (ends)
1189 static void ctdb_election_timeout(struct tevent_context *ev,
1190 struct tevent_timer *te,
1191 struct timeval t, void *p)
1193 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1194 rec->election_timeout = NULL;
1195 fast_start = false;
1197 DEBUG(DEBUG_WARNING,("Election period ended\n"));
1202 wait for an election to finish. It finished election_timeout seconds after
1203 the last election packet is received
1205 static void ctdb_wait_election(struct ctdb_recoverd *rec)
1207 struct ctdb_context *ctdb = rec->ctdb;
1208 while (rec->election_timeout) {
1209 tevent_loop_once(ctdb->ev);
1214 Update our local flags from all remote connected nodes.
1215 This is only run when we are, or believe we are, the recovery master
1217 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
1219 int j;
1220 struct ctdb_context *ctdb = rec->ctdb;
1221 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1223 /* get the nodemap for all active remote nodes and verify
1224 they are the same as for this node
1226 for (j=0; j<nodemap->num; j++) {
1227 struct ctdb_node_map_old *remote_nodemap=NULL;
1228 int ret;
1230 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
1231 continue;
1233 if (nodemap->nodes[j].pnn == ctdb->pnn) {
1234 continue;
1237 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1238 mem_ctx, &remote_nodemap);
1239 if (ret != 0) {
1240 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
1241 nodemap->nodes[j].pnn));
1242 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
1243 talloc_free(mem_ctx);
1244 return -1;
1246 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
1247 /* We should tell our daemon about this so it
1248 updates its flags or else we will log the same
1249 message again in the next iteration of recovery.
1250 Since we are the recovery master we can just as
1251 well update the flags on all nodes.
1253 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
1254 if (ret != 0) {
1255 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
1256 return -1;
1259 /* Update our local copy of the flags in the recovery
1260 daemon.
1262 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1263 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
1264 nodemap->nodes[j].flags));
1265 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1267 talloc_free(remote_nodemap);
1269 talloc_free(mem_ctx);
1270 return 0;
1274 /* Create a new random generation id.
1275 The generation id can not be the INVALID_GENERATION id
1277 static uint32_t new_generation(void)
1279 uint32_t generation;
1281 while (1) {
1282 generation = random();
1284 if (generation != INVALID_GENERATION) {
1285 break;
1289 return generation;
1294 create a temporary working database
1296 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1298 char *name;
1299 struct tdb_wrap *recdb;
1300 unsigned tdb_flags;
1302 /* open up the temporary recovery database */
1303 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1304 ctdb->db_directory_state,
1305 ctdb->pnn);
1306 if (name == NULL) {
1307 return NULL;
1309 unlink(name);
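/* Scratch database used only by this recovery daemon, so TDB_NOLOCK is
 * safe; TDB_NOMMAP under valgrind makes bad accesses easier to catch. */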
1311 tdb_flags = TDB_NOLOCK;
1312 if (ctdb->valgrinding) {
1313 tdb_flags |= TDB_NOMMAP;
1315 tdb_flags |= (TDB_INCOMPATIBLE_HASH | TDB_DISALLOW_NESTING);
1317 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1318 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1319 if (recdb == NULL) {
1320 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1323 talloc_free(name);
1325 return recdb;
1330 a traverse function for pulling all relevant records from recdb
1332 struct recdb_data {
1333 struct ctdb_context *ctdb;
1334 struct ctdb_marshall_buffer *recdata;
1335 uint32_t len;
1336 uint32_t allocated_len;
1337 bool failed;
1338 bool persistent;
1341 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1343 struct recdb_data *params = (struct recdb_data *)p;
1344 struct ctdb_rec_data_old *recdata;
1345 struct ctdb_ltdb_header *hdr;
1348 * skip empty records - but NOT for persistent databases:
1350 * The record-by-record mode of recovery deletes empty records.
1351 * For persistent databases, this can lead to data corruption
1352 * by deleting records that should be there:
1354 * - Assume the cluster has been running for a while.
1356 * - A record R in a persistent database has been created and
1357 * deleted a couple of times, the last operation being deletion,
1358 * leaving an empty record with a high RSN, say 10.
1360 * - Now a node N is turned off.
1362 * - This leaves the local copy of the database on N with the empty
1363 * copy of R and RSN 10. On all other nodes, the recovery has deleted
1364 * the copy of record R.
1366 * - Now the record is created again while node N is turned off.
1367 * This creates R with RSN = 1 on all nodes except for N.
1369 * - Now node N is turned on again. The following recovery will choose
1370 * the older empty copy of R due to RSN 10 > RSN 1.
1372 * ==> Hence the record is gone after the recovery.
1374 * On databases like Samba's registry, this can damage the higher-level
1375 * data structures built from the various tdb-level records.
1377 if (!params->persistent && data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1378 return 0;
1381 /* update the dmaster field to point to us */
1382 hdr = (struct ctdb_ltdb_header *)data.dptr;
1383 if (!params->persistent) {
1384 hdr->dmaster = params->ctdb->pnn;
1385 hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
1388 /* add the record to the blob ready to send to the nodes */
1389 recdata = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1390 if (recdata == NULL) {
1391 params->failed = true;
1392 return -1;
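/* Grow the marshall buffer in pulldb_preallocation_size sized chunks
 * so the traverse does not have to realloc for every record. */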
1394 if (params->len + recdata->length >= params->allocated_len) {
1395 params->allocated_len = recdata->length + params->len + params->ctdb->tunable.pulldb_preallocation_size;
1396 params->recdata = talloc_realloc_size(NULL, params->recdata, params->allocated_len);
1398 if (params->recdata == NULL) {
1399 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u\n",
1400 recdata->length + params->len));
1401 params->failed = true;
1402 return -1;
1404 params->recdata->count++;
1405 memcpy(params->len+(uint8_t *)params->recdata, recdata, recdata->length);
1406 params->len += recdata->length;
1407 talloc_free(recdata);
1409 return 0;
1413 push the recdb database out to all nodes
1415 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1416 bool persistent,
1417 struct tdb_wrap *recdb, struct ctdb_node_map_old *nodemap)
1419 struct recdb_data params;
1420 struct ctdb_marshall_buffer *recdata;
1421 TDB_DATA outdata;
1422 TALLOC_CTX *tmp_ctx;
1423 uint32_t *nodes;
1425 tmp_ctx = talloc_new(ctdb);
1426 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1428 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1429 CTDB_NO_MEMORY(ctdb, recdata);
1431 recdata->db_id = dbid;
1433 params.ctdb = ctdb;
1434 params.recdata = recdata;
1435 params.len = offsetof(struct ctdb_marshall_buffer, data);
1436 params.allocated_len = params.len;
1437 params.failed = false;
1438 params.persistent = persistent;
1440 if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
1441 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1442 talloc_free(params.recdata);
1443 talloc_free(tmp_ctx);
1444 return -1;
1447 if (params.failed) {
1448 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1449 talloc_free(params.recdata);
1450 talloc_free(tmp_ctx);
1451 return -1;
1454 recdata = params.recdata;
1456 outdata.dptr = (void *)recdata;
1457 outdata.dsize = params.len;
1459 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1460 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1461 nodes, 0,
1462 CONTROL_TIMEOUT(), false, outdata,
1463 NULL, NULL,
1464 NULL) != 0) {
1465 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1466 talloc_free(recdata);
1467 talloc_free(tmp_ctx);
1468 return -1;
1471 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1472 dbid, recdata->count));
1474 talloc_free(recdata);
1475 talloc_free(tmp_ctx);
1477 return 0;
1482 go through a full recovery on one database
1484 static int recover_database(struct ctdb_recoverd *rec,
1485 TALLOC_CTX *mem_ctx,
1486 uint32_t dbid,
1487 bool persistent,
1488 uint32_t pnn,
1489 struct ctdb_node_map_old *nodemap,
1490 uint32_t transaction_id)
1492 struct tdb_wrap *recdb;
1493 int ret;
1494 struct ctdb_context *ctdb = rec->ctdb;
1495 TDB_DATA data;
1496 struct ctdb_transdb w;
1497 uint32_t *nodes;
1499 recdb = create_recdb(ctdb, mem_ctx);
1500 if (recdb == NULL) {
1501 return -1;
1504 /* pull all remote databases onto the recdb */
1505 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1506 if (ret != 0) {
1507 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1508 return -1;
1511 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1513 /* wipe all the remote databases. This is safe as we are in a transaction */
1514 w.db_id = dbid;
1515 w.tid = transaction_id;
1517 data.dptr = (void *)&w;
1518 data.dsize = sizeof(w);
1520 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1521 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1522 nodes, 0,
1523 CONTROL_TIMEOUT(), false, data,
1524 NULL, NULL,
1525 NULL) != 0) {
1526 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1527 talloc_free(recdb);
1528 return -1;
1531 /* push out the correct database. This sets the dmaster and skips
1532 the empty records */
1533 ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1534 if (ret != 0) {
1535 talloc_free(recdb);
1536 return -1;
1539 /* all done with this database */
1540 talloc_free(recdb);
1542 return 0;
1545 static bool ctdb_recovery_have_lock(struct ctdb_recoverd *rec)
1547 return (rec->recovery_lock_handle != NULL);
1550 struct hold_reclock_state {
1551 bool done;
1552 bool locked;
1553 double latency;
1556 static void take_reclock_handler(char status,
1557 double latency,
1558 void *private_data)
1560 struct hold_reclock_state *s =
1561 (struct hold_reclock_state *) private_data;
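/* The cluster mutex helper reports a single status character:
 * '0' = lock taken, '1' = contention, anything else = error. */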
1563 switch (status) {
1564 case '0':
1565 s->latency = latency;
1566 break;
1568 case '1':
1569 DEBUG(DEBUG_ERR,
1570 ("Unable to take recovery lock - contention\n"));
1571 break;
1573 default:
1574 DEBUG(DEBUG_ERR, ("ERROR: when taking recovery lock\n"));
1577 s->done = true;
1578 s->locked = (status == '0') ;
1581 static bool ctdb_recovery_lock(struct ctdb_recoverd *rec);
1583 static void lost_reclock_handler(void *private_data)
1585 struct ctdb_recoverd *rec = talloc_get_type_abort(
1586 private_data, struct ctdb_recoverd);
1588 DEBUG(DEBUG_ERR,
1589 ("Recovery lock helper terminated unexpectedly - "
1590 "trying to retake recovery lock\n"));
1591 TALLOC_FREE(rec->recovery_lock_handle);
1592 if (! ctdb_recovery_lock(rec)) {
1593 DEBUG(DEBUG_ERR, ("Failed to take recovery lock\n"));
1597 static bool ctdb_recovery_lock(struct ctdb_recoverd *rec)
1599 struct ctdb_context *ctdb = rec->ctdb;
1600 struct ctdb_cluster_mutex_handle *h;
1601 struct hold_reclock_state s = {
1602 .done = false,
1603 .locked = false,
1604 .latency = 0,
1607 h = ctdb_cluster_mutex(rec, ctdb, ctdb->recovery_lock, 0,
1608 take_reclock_handler, &s,
1609 lost_reclock_handler, rec);
1610 if (h == NULL) {
1611 return false;
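/* Run the event loop until take_reclock_handler() reports a result
 * from the cluster mutex helper. */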
1614 while (!s.done) {
1615 tevent_loop_once(ctdb->ev);
1618 if (! s.locked) {
1619 talloc_free(h);
1620 return false;
1623 rec->recovery_lock_handle = h;
1624 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(),
1625 s.latency);
1627 return true;
1630 static void ctdb_recovery_unlock(struct ctdb_recoverd *rec)
1632 if (rec->recovery_lock_handle != NULL) {
1633 DEBUG(DEBUG_NOTICE, ("Releasing recovery lock\n"));
1634 TALLOC_FREE(rec->recovery_lock_handle);
1638 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
1640 struct ctdb_context *ctdb = rec->ctdb;
1641 int i;
1642 struct ctdb_banning_state *ban_state;
1644 *self_ban = false;
1645 for (i=0; i<ctdb->num_nodes; i++) {
1646 if (ctdb->nodes[i]->ban_state == NULL) {
1647 continue;
1649 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1650 if (ban_state->count < 2*ctdb->num_nodes) {
1651 continue;
1654 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
1655 ctdb->nodes[i]->pnn, ban_state->count,
1656 ctdb->tunable.recovery_ban_period));
1657 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1658 ban_state->count = 0;
1660 /* Banning ourself? */
1661 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
1662 *self_ban = true;
1667 static bool do_takeover_run(struct ctdb_recoverd *rec,
1668 struct ctdb_node_map_old *nodemap)
1670 uint32_t *nodes = NULL;
1671 struct ctdb_disable_message dtr;
1672 TDB_DATA data;
1673 int i;
1674 uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
1675 int ret;
1676 bool ok;
1678 DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
1680 if (ctdb_op_is_in_progress(rec->takeover_run)) {
1681 DEBUG(DEBUG_ERR, (__location__
1682 " takeover run already in progress \n"));
1683 ok = false;
1684 goto done;
1687 if (!ctdb_op_begin(rec->takeover_run)) {
1688 ok = false;
1689 goto done;
1692 /* Disable IP checks (takeover runs, really) on other nodes
1693 * while doing this takeover run. This will stop those other
1694 * nodes from triggering takeover runs when they think they should
1695 * be hosting an IP but it isn't yet on an interface. Don't
1696 * wait for replies since a failure here might cause some
1697 * noise in the logs but will not actually cause a problem.
1699 ZERO_STRUCT(dtr);
1700 dtr.srvid = 0; /* No reply */
1701 dtr.pnn = -1;
1703 data.dptr = (uint8_t*)&dtr;
1704 data.dsize = sizeof(dtr);
1706 nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
1708 /* Disable for 60 seconds. This can be a tunable later if
1709 * necessary.
1711 dtr.timeout = 60;
1712 for (i = 0; i < talloc_array_length(nodes); i++) {
1713 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1714 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1715 data) != 0) {
1716 DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
1720 ret = ctdb_takeover_run(rec->ctdb, nodemap,
1721 rec->force_rebalance_nodes);
1723 /* Reenable takeover runs and IP checks on other nodes */
1724 dtr.timeout = 0;
1725 for (i = 0; i < talloc_array_length(nodes); i++) {
1726 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1727 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1728 data) != 0) {
1729 DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n"));
1733 if (ret != 0) {
1734 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
1735 ok = false;
1736 goto done;
1739 ok = true;
1740 /* Takeover run was successful so clear force rebalance targets */
1741 if (rebalance_nodes == rec->force_rebalance_nodes) {
1742 TALLOC_FREE(rec->force_rebalance_nodes);
1743 } else {
1744 DEBUG(DEBUG_WARNING,
1745 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1747 done:
1748 rec->need_takeover_run = !ok;
1749 talloc_free(nodes);
1750 ctdb_op_end(rec->takeover_run);
1752 DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
1753 return ok;
1756 struct recovery_helper_state {
1757 int fd[2];
1758 pid_t pid;
1759 int result;
1760 bool done;
1763 static void ctdb_recovery_handler(struct tevent_context *ev,
1764 struct tevent_fd *fde,
1765 uint16_t flags, void *private_data)
1767 struct recovery_helper_state *state = talloc_get_type_abort(
1768 private_data, struct recovery_helper_state);
1769 int ret;
1771 ret = sys_read(state->fd[0], &state->result, sizeof(state->result));
1772 if (ret != sizeof(state->result)) {
1773 state->result = EPIPE;
1776 state->done = true;
1780 static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
1782 static char prog[PATH_MAX+1] = "";
1783 const char **args;
1784 struct recovery_helper_state *state;
1785 struct tevent_fd *fde;
1786 int nargs, ret;
1788 if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog),
1789 "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
1790 "ctdb_recovery_helper")) {
1791 ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
1794 state = talloc_zero(mem_ctx, struct recovery_helper_state);
1795 if (state == NULL) {
1796 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1797 return -1;
1800 state->pid = -1;
1802 ret = pipe(state->fd);
1803 if (ret != 0) {
1804 DEBUG(DEBUG_ERR,
1805 ("Failed to create pipe for recovery helper\n"));
1806 goto fail;
1809 set_close_on_exec(state->fd[0]);
1811 nargs = 4;
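/* argv for the recovery helper: [0] = write end of the result pipe,
 * [1] = ctdb daemon socket name, [2] = new generation id, then NULL. */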
1812 args = talloc_array(state, const char *, nargs);
1813 if (args == NULL) {
1814 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1815 goto fail;
1818 args[0] = talloc_asprintf(args, "%d", state->fd[1]);
1819 args[1] = rec->ctdb->daemon.name;
1820 args[2] = talloc_asprintf(args, "%u", new_generation());
1821 args[3] = NULL;
1823 if (args[0] == NULL || args[2] == NULL) {
1824 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1825 goto fail;
1828 setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1);
1830 if (!ctdb_vfork_with_logging(state, rec->ctdb, "recovery", prog, nargs,
1831 args, NULL, NULL, &state->pid)) {
1832 DEBUG(DEBUG_ERR,
1833 ("Failed to create child for recovery helper\n"));
1834 goto fail;
1837 close(state->fd[1]);
1838 state->fd[1] = -1;
1840 state->done = false;
1842 fde = tevent_add_fd(rec->ctdb->ev, rec->ctdb, state->fd[0],
1843 TEVENT_FD_READ, ctdb_recovery_handler, state);
1844 if (fde == NULL) {
1845 goto fail;
1847 tevent_fd_set_auto_close(fde);
1849 while (!state->done) {
1850 tevent_loop_once(rec->ctdb->ev);
1853 close(state->fd[0]);
1854 state->fd[0] = -1;
1856 if (state->result != 0) {
1857 goto fail;
1860 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
1861 talloc_free(state);
1862 return 0;
1864 fail:
1865 if (state->fd[0] != -1) {
1866 close(state->fd[0]);
1868 if (state->fd[1] != -1) {
1869 close(state->fd[1]);
1871 if (state->pid != -1) {
1872 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
1874 talloc_free(state);
1875 return -1;
1878 static int db_recovery_serial(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
1879 uint32_t pnn, struct ctdb_node_map_old *nodemap,
1880 struct ctdb_vnn_map *vnnmap,
1881 struct ctdb_dbid_map_old *dbmap)
1883 struct ctdb_context *ctdb = rec->ctdb;
1884 uint32_t generation;
1885 TDB_DATA data;
1886 uint32_t *nodes;
1887 int ret, i, j;
1889 /* set recovery mode to active on all nodes */
1890 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE, true);
1891 if (ret != 0) {
1892 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1893 return -1;
1896 /* execute the "startrecovery" event script on all nodes */
1897 ret = run_startrecovery_eventscript(rec, nodemap);
1898 if (ret!=0) {
1899 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1900 return -1;
1903 /* pick a new generation number */
1904 generation = new_generation();
1906 /* change the vnnmap on this node to use the new generation
1907 number but not on any other nodes.
1908 this guarantees that if we abort the recovery prematurely
1909 for some reason (a node stops responding?)
1910 that we can just return immediately and we will reenter
1911 recovery shortly again.
1912 I.e. we deliberately leave the cluster with an inconsistent
1913 generation id to allow us to abort recovery at any stage and
1914 just restart it from scratch.
1916 vnnmap->generation = generation;
1917 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1918 if (ret != 0) {
1919 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1920 return -1;
1923 /* Database generations are updated when the transaction is committed to
1924 * the databases. So make sure to use the final generation as the
1925 * transaction id
1927 generation = new_generation();
1929 data.dptr = (void *)&generation;
1930 data.dsize = sizeof(uint32_t);
1932 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1933 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1934 nodes, 0,
1935 CONTROL_TIMEOUT(), false, data,
1936 NULL,
1937 transaction_start_fail_callback,
1938 rec) != 0) {
1939 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1940 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1941 nodes, 0,
1942 CONTROL_TIMEOUT(), false, tdb_null,
1943 NULL,
1944 NULL,
1945 NULL) != 0) {
1946 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1948 return -1;
1951 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
1953 for (i=0;i<dbmap->num;i++) {
1954 ret = recover_database(rec, mem_ctx,
1955 dbmap->dbs[i].db_id,
1956 dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
1957 pnn, nodemap, generation);
1958 if (ret != 0) {
1959 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].db_id));
1960 return -1;
1964 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1966 /* commit all the changes */
1967 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1968 nodes, 0,
1969 CONTROL_TIMEOUT(), false, data,
1970 NULL, NULL,
1971 NULL) != 0) {
1972 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1973 return -1;
1976 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1978 /* build a new vnn map with all the currently active and
1979 unbanned nodes */
1980 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1981 CTDB_NO_MEMORY(ctdb, vnnmap);
1982 vnnmap->generation = generation;
1983 vnnmap->size = 0;
1984 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1985 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1986 for (i=j=0;i<nodemap->num;i++) {
1987 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1988 continue;
1990 if (!ctdb_node_has_capabilities(rec->caps,
1991 ctdb->nodes[i]->pnn,
1992 CTDB_CAP_LMASTER)) {
1993 /* this node can not be an lmaster */
1994 DEBUG(DEBUG_DEBUG, ("Node %d can't be an LMASTER, skipping it\n", i));
1995 continue;
1998 vnnmap->size++;
1999 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
2000 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2001 vnnmap->map[j++] = nodemap->nodes[i].pnn;
2004 if (vnnmap->size == 0) {
2005 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
2006 vnnmap->size++;
2007 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
2008 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2009 vnnmap->map[0] = pnn;
2012 /* update to the new vnnmap on all nodes */
2013 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
2014 if (ret != 0) {
2015 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
2016 return -1;
2019 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
2021 /* disable recovery mode */
2022 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL, false);
2023 if (ret != 0) {
2024 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
2025 return -1;
2028 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
2030 /* execute the "recovered" event script on all nodes */
2031 ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
2032 if (ret!=0) {
2033 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
2034 return -1;
2037 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
2039 return 0;
2043 we are the recmaster, and recovery is needed - start a recovery run
2045 static int do_recovery(struct ctdb_recoverd *rec,
2046 TALLOC_CTX *mem_ctx, uint32_t pnn,
2047 struct ctdb_node_map_old *nodemap, struct ctdb_vnn_map *vnnmap)
2049 struct ctdb_context *ctdb = rec->ctdb;
2050 int i, ret;
2051 struct ctdb_dbid_map_old *dbmap;
2052 bool self_ban;
2053 bool par_recovery;
2055 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
2057 /* Check if the current node is still the recmaster. It's possible that
2058 * re-election has changed the recmaster.
2060 if (pnn != rec->recmaster) {
2061 DEBUG(DEBUG_NOTICE,
2062 ("Recovery master changed to %u, aborting recovery\n",
2063 rec->recmaster));
2064 return -1;
2067 /* if recovery fails, force it again */
2068 rec->need_recovery = true;
2070 if (!ctdb_op_begin(rec->recovery)) {
2071 return -1;
2074 if (rec->election_timeout) {
2075 /* an election is in progress */
2076 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
2077 goto fail;
2080 ban_misbehaving_nodes(rec, &self_ban);
2081 if (self_ban) {
2082 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
2083 goto fail;
2086 if (ctdb->recovery_lock != NULL) {
2087 if (ctdb_recovery_have_lock(rec)) {
2088 DEBUG(DEBUG_NOTICE, ("Already holding recovery lock\n"));
2089 } else {
2090 DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock (%s)\n",
2091 ctdb->recovery_lock));
2092 if (!ctdb_recovery_lock(rec)) {
2093 if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
2094 /* If ctdb is trying first recovery, it's
2095 * possible that the current node does not
2096 * yet know who the recmaster is.
2098 DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
2099 " - retrying recovery\n"));
2100 goto fail;
2103 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
2104 "and ban ourself for %u seconds\n",
2105 ctdb->tunable.recovery_ban_period));
2106 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
2107 goto fail;
2109 DEBUG(DEBUG_NOTICE,
2110 ("Recovery lock taken successfully by recovery daemon\n"));
2114 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
2116 /* get a list of all databases */
2117 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
2118 if (ret != 0) {
2119 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
2120 goto fail;
2123 /* we do the db creation before we set the recovery mode, so the freeze happens
2124 on all databases we will be dealing with. */
2126 /* verify that we have all the databases any other node has */
2127 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
2128 if (ret != 0) {
2129 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
2130 goto fail;
2133 /* verify that all other nodes have all our databases */
2134 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
2135 if (ret != 0) {
2136 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
2137 goto fail;
2139 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
2141 /* update the database priority for all remote databases */
2142 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
2143 if (ret != 0) {
2144 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
2146 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
2149 /* Retrieve capabilities from all connected nodes */
2150 ret = update_capabilities(rec, nodemap);
2151 if (ret!=0) {
2152 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2153 return -1;
2157 update all nodes to have the same flags that we have
2159 for (i=0;i<nodemap->num;i++) {
2160 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2161 continue;
2164 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
2165 if (ret != 0) {
2166 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2167 DEBUG(DEBUG_WARNING, (__location__ " Unable to update flags on inactive node %d\n", i));
2168 } else {
2169 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
2170 return -1;
2175 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
2177 /* Check if all participating nodes have parallel recovery capability */
2178 par_recovery = true;
2179 for (i=0; i<nodemap->num; i++) {
2180 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2181 continue;
2184 if (!(rec->caps[i].capabilities &
2185 CTDB_CAP_PARALLEL_RECOVERY)) {
2186 par_recovery = false;
2187 break;
2191 if (par_recovery) {
2192 ret = db_recovery_parallel(rec, mem_ctx);
2193 } else {
2194 ret = db_recovery_serial(rec, mem_ctx, pnn, nodemap, vnnmap,
2195 dbmap);
2198 if (ret != 0) {
2199 goto fail;
2202 do_takeover_run(rec, nodemap);
2204 /* send a message to all clients telling them that the cluster
2205 has been reconfigured */
2206 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2207 CTDB_SRVID_RECONFIGURE, tdb_null);
2208 if (ret != 0) {
2209 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
2210 goto fail;
2213 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
2215 rec->need_recovery = false;
2216 ctdb_op_end(rec->recovery);
2218 /* we managed to complete a full recovery, make sure to forgive
2219 any past sins by the nodes that could now participate in the
2220 recovery.
2222 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
2223 for (i=0;i<nodemap->num;i++) {
2224 struct ctdb_banning_state *ban_state;
2226 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2227 continue;
2230 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
2231 if (ban_state == NULL) {
2232 continue;
2235 ban_state->count = 0;
2238 /* We just finished a recovery successfully.
2239 We now wait for rerecovery_timeout before we allow
2240 another recovery to take place.
2242 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
2243 ctdb_op_disable(rec->recovery, ctdb->ev,
2244 ctdb->tunable.rerecovery_timeout);
2245 return 0;
2247 fail:
2248 ctdb_op_end(rec->recovery);
2249 return -1;
2254 elections are won by first checking the number of connected nodes, then
2255 the priority time, then the pnn (a stand-alone model of this ordering follows ctdb_election_win() below)
2257 struct election_message {
2258 uint32_t num_connected;
2259 struct timeval priority_time;
2260 uint32_t pnn;
2261 uint32_t node_flags;
2265 form this node's election data
2267 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
2269 int ret, i;
2270 struct ctdb_node_map_old *nodemap;
2271 struct ctdb_context *ctdb = rec->ctdb;
2273 ZERO_STRUCTP(em);
2275 em->pnn = rec->ctdb->pnn;
2276 em->priority_time = rec->priority_time;
2278 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
2279 if (ret != 0) {
2280 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
2281 return;
2284 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
2285 em->node_flags = rec->node_flags;
2287 for (i=0;i<nodemap->num;i++) {
2288 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2289 em->num_connected++;
2293 /* we shouldn't try to win this election if we can't be a recmaster */
2294 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2295 em->num_connected = 0;
2296 em->priority_time = timeval_current();
2299 talloc_free(nodemap);
2303 see if the given election data wins
2305 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
2307 struct election_message myem;
2308 int cmp = 0;
2310 ctdb_election_data(rec, &myem);
2312 /* we can't win if we don't have the recmaster capability */
2313 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2314 return false;
2317 /* we can't win if we are banned */
2318 if (rec->node_flags & NODE_FLAGS_BANNED) {
2319 return false;
2322 /* we can't win if we are stopped */
2323 if (rec->node_flags & NODE_FLAGS_STOPPED) {
2324 return false;
2327 /* we will automatically win if the other node is banned */
2328 if (em->node_flags & NODE_FLAGS_BANNED) {
2329 return true;
2332 /* we will automatically win if the other node is stopped */
2333 if (em->node_flags & NODE_FLAGS_STOPPED) {
2334 return true;
2337 /* then the longest running node */
2338 if (cmp == 0) {
2339 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
2342 if (cmp == 0) {
2343 cmp = (int)myem.pnn - (int)em->pnn;
2346 return cmp > 0;
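/*
 * A stand-alone model (not the daemon's own comparison code) of the
 * election ordering documented above struct election_message and in
 * the comments of ctdb_election_win(): more connected nodes is
 * preferred, then the earlier priority_time (the longest running
 * node), then the lower pnn.  The candidate struct is a local
 * stand-in for struct election_message.
 */
#include <stdbool.h>
#include <stdint.h>
#include <sys/time.h>

struct sketch_candidate {
	uint32_t num_connected;
	struct timeval priority_time;
	uint32_t pnn;
};

/* Returns true if candidate a is preferred over candidate b. */
static bool sketch_election_prefers(const struct sketch_candidate *a,
				    const struct sketch_candidate *b)
{
	if (a->num_connected != b->num_connected) {
		return a->num_connected > b->num_connected;
	}
	if (a->priority_time.tv_sec != b->priority_time.tv_sec) {
		return a->priority_time.tv_sec < b->priority_time.tv_sec;
	}
	if (a->priority_time.tv_usec != b->priority_time.tv_usec) {
		return a->priority_time.tv_usec < b->priority_time.tv_usec;
	}
	return a->pnn < b->pnn;
}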
2350 send out an election request
2352 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
2354 int ret;
2355 TDB_DATA election_data;
2356 struct election_message emsg;
2357 uint64_t srvid;
2358 struct ctdb_context *ctdb = rec->ctdb;
2360 srvid = CTDB_SRVID_ELECTION;
2362 ctdb_election_data(rec, &emsg);
2364 election_data.dsize = sizeof(struct election_message);
2365 election_data.dptr = (unsigned char *)&emsg;
2368 /* first we assume we will win the election and set
2369 the recovery master to be ourselves on the current node
2371 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
2372 CTDB_CURRENT_NODE, pnn);
2373 if (ret != 0) {
2374 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster\n"));
2375 return -1;
2377 rec->recmaster = pnn;
2379 /* send an election message to all active nodes */
2380 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
2381 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
2385 we think we are winning the election - send a broadcast election request
2387 static void election_send_request(struct tevent_context *ev,
2388 struct tevent_timer *te,
2389 struct timeval t, void *p)
2391 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2392 int ret;
2394 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
2395 if (ret != 0) {
2396 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
2399 TALLOC_FREE(rec->send_election_te);
2403 handler for memory dumps
2405 static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2407 struct ctdb_recoverd *rec = talloc_get_type(
2408 private_data, struct ctdb_recoverd);
2409 struct ctdb_context *ctdb = rec->ctdb;
2410 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2411 TDB_DATA *dump;
2412 int ret;
2413 struct ctdb_srvid_message *rd;
2415 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
2416 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2417 talloc_free(tmp_ctx);
2418 return;
2420 rd = (struct ctdb_srvid_message *)data.dptr;
2422 dump = talloc_zero(tmp_ctx, TDB_DATA);
2423 if (dump == NULL) {
2424 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
2425 talloc_free(tmp_ctx);
2426 return;
2428 ret = ctdb_dump_memory(ctdb, dump);
2429 if (ret != 0) {
2430 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
2431 talloc_free(tmp_ctx);
2432 return;
2435 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
2437 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
2438 if (ret != 0) {
2439 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
2440 talloc_free(tmp_ctx);
2441 return;
2444 talloc_free(tmp_ctx);
2448 handler for reload_nodes
2450 static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
2451 void *private_data)
2453 struct ctdb_recoverd *rec = talloc_get_type(
2454 private_data, struct ctdb_recoverd);
2456 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
2458 ctdb_load_nodes_file(rec->ctdb);
2462 static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
2463 void *private_data)
2465 struct ctdb_recoverd *rec = talloc_get_type(
2466 private_data, struct ctdb_recoverd);
2467 struct ctdb_context *ctdb = rec->ctdb;
2468 uint32_t pnn;
2469 uint32_t *t;
2470 int len;
2472 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
2473 return;
2476 if (data.dsize != sizeof(uint32_t)) {
2477 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
2478 return;
2481 pnn = *(uint32_t *)&data.dptr[0];
2483 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
2485 /* Copy any existing list of nodes. A realloc variant might
2486 * do this more simply, but freeing the old array must also
2487 * cancel the timer event for the timeout, and it is not clear
2488 * that realloc would preserve that behaviour, so copy instead.
2490 len = (rec->force_rebalance_nodes != NULL) ?
2491 talloc_array_length(rec->force_rebalance_nodes) : 0;
2494 /* This allows duplicates to be added but they don't cause
2495 * harm. A call to add a duplicate PNN arguably means that
2496 * the timeout should be reset, so this is the simplest
2497 * solution.
2499 t = talloc_zero_array(rec, uint32_t, len+1);
2500 CTDB_NO_MEMORY_VOID(ctdb, t);
2501 if (len > 0) {
2502 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
2504 t[len] = pnn;
2506 talloc_free(rec->force_rebalance_nodes);
2508 rec->force_rebalance_nodes = t;
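/*
 * A simplified stand-alone model of the copy-then-free append used in
 * recd_node_rebalance_handler() above: build a new array, copy the old
 * contents, then free the old array so anything tied to its lifetime
 * (in the daemon, the timeout event hanging off the talloc array) goes
 * away with it.  Plain malloc/free stands in for talloc here.
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Append value to *array of length *len; returns 0 on success. */
static int sketch_append_u32(uint32_t **array, size_t *len, uint32_t value)
{
	uint32_t *t;

	t = calloc(*len + 1, sizeof(uint32_t));
	if (t == NULL) {
		return -1;
	}
	if (*len > 0) {
		memcpy(t, *array, *len * sizeof(uint32_t));
	}
	t[*len] = value;

	free(*array);		/* the old array and its baggage go away */
	*array = t;
	*len += 1;
	return 0;
}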
2513 static void srvid_disable_and_reply(struct ctdb_context *ctdb,
2514 TDB_DATA data,
2515 struct ctdb_op_state *op_state)
2517 struct ctdb_disable_message *r;
2518 uint32_t timeout;
2519 TDB_DATA result;
2520 int32_t ret = 0;
2522 /* Validate input data */
2523 if (data.dsize != sizeof(struct ctdb_disable_message)) {
2524 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2525 "expecting %lu\n", (long unsigned)data.dsize,
2526 (long unsigned)sizeof(struct ctdb_disable_message)));
2527 return;
2529 if (data.dptr == NULL) {
2530 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2531 return;
2534 r = (struct ctdb_disable_message *)data.dptr;
2535 timeout = r->timeout;
2537 ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
2538 if (ret != 0) {
2539 goto done;
2542 /* Returning our PNN tells the caller that we succeeded */
2543 ret = ctdb_get_pnn(ctdb);
2544 done:
2545 result.dsize = sizeof(int32_t);
2546 result.dptr = (uint8_t *)&ret;
2547 srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
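/*
 * Sketch of how a caller might interpret the int32_t reply built by
 * srvid_disable_and_reply() above: a non-negative value is the PNN of
 * the node that performed the disable (success), and - assuming errors
 * are reported as negative values, as elsewhere in this file - a
 * negative value means the request failed.  This helper is purely
 * illustrative and is not part of the ctdb client API.
 */
#include <stdbool.h>
#include <stdint.h>

static bool sketch_disable_reply_ok(int32_t reply, uint32_t *pnn_out)
{
	if (reply < 0) {
		return false;
	}
	*pnn_out = (uint32_t)reply;
	return true;
}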
2550 static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
2551 void *private_data)
2553 struct ctdb_recoverd *rec = talloc_get_type(
2554 private_data, struct ctdb_recoverd);
2556 srvid_disable_and_reply(rec->ctdb, data, rec->takeover_run);
2559 /* Backward compatibility for this SRVID */
2560 static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
2561 void *private_data)
2563 struct ctdb_recoverd *rec = talloc_get_type(
2564 private_data, struct ctdb_recoverd);
2565 uint32_t timeout;
2567 if (data.dsize != sizeof(uint32_t)) {
2568 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2569 "expecting %lu\n", (long unsigned)data.dsize,
2570 (long unsigned)sizeof(uint32_t)));
2571 return;
2573 if (data.dptr == NULL) {
2574 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2575 return;
2578 timeout = *((uint32_t *)data.dptr);
2580 ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout);
2583 static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
2584 void *private_data)
2586 struct ctdb_recoverd *rec = talloc_get_type(
2587 private_data, struct ctdb_recoverd);
2589 srvid_disable_and_reply(rec->ctdb, data, rec->recovery);
2593 handler for ip reallocate; just add it to the list of requests and
2594 handle it later in the monitor_cluster loop so we do not recurse
2595 into takeover_run() with other requests
2597 static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
2598 void *private_data)
2600 struct ctdb_srvid_message *request;
2601 struct ctdb_recoverd *rec = talloc_get_type(
2602 private_data, struct ctdb_recoverd);
2604 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
2605 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2606 return;
2609 request = (struct ctdb_srvid_message *)data.dptr;
2611 srvid_request_add(rec->ctdb, &rec->reallocate_requests, request);
2614 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
2615 struct ctdb_recoverd *rec)
2617 TDB_DATA result;
2618 int32_t ret;
2619 struct srvid_requests *current;
2621 /* Only process requests that are currently pending. More
2622 * might come in while the takeover run is in progress and
2623 * they will need to be processed later since they might
2624 * be in response to flag changes.
2626 current = rec->reallocate_requests;
2627 rec->reallocate_requests = NULL;
2629 if (do_takeover_run(rec, rec->nodemap)) {
2630 ret = ctdb_get_pnn(ctdb);
2631 } else {
2632 ret = -1;
2635 result.dsize = sizeof(int32_t);
2636 result.dptr = (uint8_t *)&ret;
2638 srvid_requests_reply(ctdb, &current, result);
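/*
 * Stand-alone model of the "detach, then process" pattern used in
 * process_ipreallocate_requests() above: move the queued requests into
 * a local variable and clear the shared pointer before doing the slow
 * work, so requests that arrive while the takeover run is in progress
 * are kept for the next pass instead of being answered with a stale
 * result.  The request type and reply callback are stand-ins.
 */
#include <stddef.h>

struct sketch_request {
	struct sketch_request *next;
};

static void sketch_process_pending(struct sketch_request **pending,
				   void (*reply_fn)(struct sketch_request *,
						    int),
				   int result)
{
	struct sketch_request *current = *pending;
	struct sketch_request *r;

	*pending = NULL;	/* new arrivals start a fresh list */

	for (r = current; r != NULL; r = r->next) {
		reply_fn(r, result);
	}
}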
2642 * handler for assigning banning credits
2644 static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2646 struct ctdb_recoverd *rec = talloc_get_type(
2647 private_data, struct ctdb_recoverd);
2648 uint32_t ban_pnn;
2650 /* Ignore if we are not recmaster */
2651 if (rec->ctdb->pnn != rec->recmaster) {
2652 return;
2655 if (data.dsize != sizeof(uint32_t)) {
2656 DEBUG(DEBUG_ERR, (__location__ " invalid data size %zu\n",
2657 data.dsize));
2658 return;
2661 ban_pnn = *(uint32_t *)data.dptr;
2663 ctdb_set_culprit_count(rec, ban_pnn, rec->nodemap->num);
2667 handler for recovery master elections
2669 static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2671 struct ctdb_recoverd *rec = talloc_get_type(
2672 private_data, struct ctdb_recoverd);
2673 struct ctdb_context *ctdb = rec->ctdb;
2674 int ret;
2675 struct election_message *em = (struct election_message *)data.dptr;
2677 /* Ignore election packets from ourself */
2678 if (ctdb->pnn == em->pnn) {
2679 return;
2682 /* we got an election packet - update the timeout for the election */
2683 talloc_free(rec->election_timeout);
2684 rec->election_timeout = tevent_add_timer(
2685 ctdb->ev, ctdb,
2686 fast_start ?
2687 timeval_current_ofs(0, 500000) :
2688 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2689 ctdb_election_timeout, rec);
2691 /* someone called an election. check their election data
2692 and if we disagree and we would rather be the elected node,
2693 send a new election message to all other nodes
2695 if (ctdb_election_win(rec, em)) {
2696 if (!rec->send_election_te) {
2697 rec->send_election_te = tevent_add_timer(
2698 ctdb->ev, rec,
2699 timeval_current_ofs(0, 500000),
2700 election_send_request, rec);
2702 return;
2705 /* we didn't win */
2706 TALLOC_FREE(rec->send_election_te);
2708 /* Release the recovery lock file */
2709 if (ctdb_recovery_have_lock(rec)) {
2710 ctdb_recovery_unlock(rec);
2713 /* ok, let that node become recmaster then */
2714 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
2715 CTDB_CURRENT_NODE, em->pnn);
2716 if (ret != 0) {
2717 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster\n"));
2718 return;
2720 rec->recmaster = em->pnn;
2722 return;
2727 force the start of the election process
2729 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2730 struct ctdb_node_map_old *nodemap)
2732 int ret;
2733 struct ctdb_context *ctdb = rec->ctdb;
2735 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2737 /* set all nodes to recovery mode to stop all internode traffic */
2738 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE, false);
2739 if (ret != 0) {
2740 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
2741 return;
2744 talloc_free(rec->election_timeout);
2745 rec->election_timeout = tevent_add_timer(
2746 ctdb->ev, ctdb,
2747 fast_start ?
2748 timeval_current_ofs(0, 500000) :
2749 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2750 ctdb_election_timeout, rec);
2752 ret = send_election_request(rec, pnn);
2753 if (ret!=0) {
2754 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election\n"));
2755 return;
2758 /* wait for a few seconds to collect all responses */
2759 ctdb_wait_election(rec);
2765 handler for when a node changes its flags
2767 static void monitor_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2769 struct ctdb_recoverd *rec = talloc_get_type(
2770 private_data, struct ctdb_recoverd);
2771 struct ctdb_context *ctdb = rec->ctdb;
2772 int ret;
2773 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2774 struct ctdb_node_map_old *nodemap=NULL;
2775 TALLOC_CTX *tmp_ctx;
2776 int i;
2778 if (data.dsize != sizeof(*c)) {
2779 DEBUG(DEBUG_ERR,(__location__ " Invalid data in ctdb_node_flag_change\n"));
2780 return;
2783 tmp_ctx = talloc_new(ctdb);
2784 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2786 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2787 if (ret != 0) {
2788 DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2789 talloc_free(tmp_ctx);
2790 return;
2794 for (i=0;i<nodemap->num;i++) {
2795 if (nodemap->nodes[i].pnn == c->pnn) break;
2798 if (i == nodemap->num) {
2799 DEBUG(DEBUG_CRIT,(__location__ " Flag change for non-existent node %u\n", c->pnn));
2800 talloc_free(tmp_ctx);
2801 return;
2804 if (c->old_flags != c->new_flags) {
2805 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
2808 nodemap->nodes[i].flags = c->new_flags;
2810 talloc_free(tmp_ctx);
2814 handler for when we need to push out flag changes to all other nodes
2816 static void push_flags_handler(uint64_t srvid, TDB_DATA data,
2817 void *private_data)
2819 struct ctdb_recoverd *rec = talloc_get_type(
2820 private_data, struct ctdb_recoverd);
2821 struct ctdb_context *ctdb = rec->ctdb;
2822 int ret;
2823 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2824 struct ctdb_node_map_old *nodemap=NULL;
2825 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2826 uint32_t *nodes;
2828 /* read the node flags from the recmaster */
2829 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2830 tmp_ctx, &nodemap);
2831 if (ret != 0) {
2832 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recmaster node %u\n", rec->recmaster));
2833 talloc_free(tmp_ctx);
2834 return;
2836 if (c->pnn >= nodemap->num) {
2837 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2838 talloc_free(tmp_ctx);
2839 return;
2842 /* send the flags update to all connected nodes */
2843 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2845 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2846 nodes, 0, CONTROL_TIMEOUT(),
2847 false, data,
2848 NULL, NULL,
2849 NULL) != 0) {
2850 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2852 talloc_free(tmp_ctx);
2853 return;
2856 talloc_free(tmp_ctx);
2860 struct verify_recmode_normal_data {
2861 uint32_t count;
2862 enum monitor_result status;
2865 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2867 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2870 /* one more node has responded with recmode data*/
2871 rmdata->count--;
2873 /* if we failed to get the recmode, then return an error and let
2874 the main loop try again.
2876 if (state->state != CTDB_CONTROL_DONE) {
2877 if (rmdata->status == MONITOR_OK) {
2878 rmdata->status = MONITOR_FAILED;
2880 return;
2883 /* if we got a response, then the recmode will be stored in the
2884 status field
2886 if (state->status != CTDB_RECOVERY_NORMAL) {
2887 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2888 rmdata->status = MONITOR_RECOVERY_NEEDED;
2891 return;
2895 /* verify that all nodes are in normal recovery mode */
2896 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
2898 struct verify_recmode_normal_data *rmdata;
2899 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2900 struct ctdb_client_control_state *state;
2901 enum monitor_result status;
2902 int j;
2904 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2905 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2906 rmdata->count = 0;
2907 rmdata->status = MONITOR_OK;
2909 /* loop over all active nodes and send an async getrecmode call to
2910 them*/
2911 for (j=0; j<nodemap->num; j++) {
2912 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2913 continue;
2915 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2916 CONTROL_TIMEOUT(),
2917 nodemap->nodes[j].pnn);
2918 if (state == NULL) {
2919 /* we failed to send the control, treat this as
2920 an error and try again next iteration
2922 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2923 talloc_free(mem_ctx);
2924 return MONITOR_FAILED;
2927 /* set up the callback functions */
2928 state->async.fn = verify_recmode_normal_callback;
2929 state->async.private_data = rmdata;
2931 /* one more control to wait for to complete */
2932 rmdata->count++;
2936 /* now wait for up to the maximum number of seconds allowed
2937 or until all nodes we expect a response from have replied
2939 while (rmdata->count > 0) {
2940 tevent_loop_once(ctdb->ev);
2943 status = rmdata->status;
2944 talloc_free(mem_ctx);
2945 return status;
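/*
 * verify_recmode() above (and verify_recmaster() below) follow the
 * same fan-out pattern: fire one asynchronous control per node, have
 * each completion callback decrement a counter, and spin the event
 * loop until the counter reaches zero.  A minimal stand-alone model of
 * that shape, with a plain function pointer standing in for
 * tevent_loop_once(), looks like this.
 */
#include <stdint.h>

struct sketch_fanout {
	uint32_t outstanding;	/* controls still waiting for a reply */
	int status;		/* 0 = ok, non-zero = some node failed */
};

/* Completion callback: one more reply has arrived. */
static void sketch_fanout_done(struct sketch_fanout *f, int node_status)
{
	f->outstanding--;
	if (node_status != 0 && f->status == 0) {
		f->status = node_status;
	}
}

/* Wait until every outstanding control has completed. */
static int sketch_fanout_wait(struct sketch_fanout *f,
			      void (*loop_once)(void *), void *loop_ctx)
{
	while (f->outstanding > 0) {
		loop_once(loop_ctx);	/* stands in for tevent_loop_once() */
	}
	return f->status;
}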
2949 struct verify_recmaster_data {
2950 struct ctdb_recoverd *rec;
2951 uint32_t count;
2952 uint32_t pnn;
2953 enum monitor_result status;
2956 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2958 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2961 /* one more node has responded with recmaster data*/
2962 rmdata->count--;
2964 /* if we failed to get the recmaster, then return an error and let
2965 the main loop try again.
2967 if (state->state != CTDB_CONTROL_DONE) {
2968 if (rmdata->status == MONITOR_OK) {
2969 rmdata->status = MONITOR_FAILED;
2971 return;
2974 /* if we got a response, then the recmaster will be stored in the
2975 status field
2977 if (state->status != rmdata->pnn) {
2978 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
2979 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2980 rmdata->status = MONITOR_ELECTION_NEEDED;
2983 return;
2987 /* verify that all nodes agree that we are the recmaster */
2988 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, uint32_t pnn)
2990 struct ctdb_context *ctdb = rec->ctdb;
2991 struct verify_recmaster_data *rmdata;
2992 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2993 struct ctdb_client_control_state *state;
2994 enum monitor_result status;
2995 int j;
2997 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2998 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2999 rmdata->rec = rec;
3000 rmdata->count = 0;
3001 rmdata->pnn = pnn;
3002 rmdata->status = MONITOR_OK;
3004 /* loop over all active nodes and send an async getrecmaster call to
3005 them*/
3006 for (j=0; j<nodemap->num; j++) {
3007 if (nodemap->nodes[j].pnn == rec->recmaster) {
3008 continue;
3010 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3011 continue;
3013 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
3014 CONTROL_TIMEOUT(),
3015 nodemap->nodes[j].pnn);
3016 if (state == NULL) {
3017 /* we failed to send the control, treat this as
3018 an error and try again next iteration
3020 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
3021 talloc_free(mem_ctx);
3022 return MONITOR_FAILED;
3025 /* set up the callback functions */
3026 state->async.fn = verify_recmaster_callback;
3027 state->async.private_data = rmdata;
3029 /* one more control to wait for to complete */
3030 rmdata->count++;
3034 /* now wait for up to the maximum number of seconds allowed
3035 or until all nodes we expect a response from have replied
3037 while (rmdata->count > 0) {
3038 tevent_loop_once(ctdb->ev);
3041 status = rmdata->status;
3042 talloc_free(mem_ctx);
3043 return status;
3046 static bool interfaces_have_changed(struct ctdb_context *ctdb,
3047 struct ctdb_recoverd *rec)
3049 struct ctdb_iface_list_old *ifaces = NULL;
3050 TALLOC_CTX *mem_ctx;
3051 bool ret = false;
3053 mem_ctx = talloc_new(NULL);
3055 /* Read the interfaces from the local node */
3056 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
3057 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
3058 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
3059 /* We could return an error. However, this will be
3060 * rare so we'll decide that the interfaces have
3061 * actually changed, just in case.
3063 talloc_free(mem_ctx);
3064 return true;
3067 if (!rec->ifaces) {
3068 /* We haven't been here before so things have changed */
3069 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
3070 ret = true;
3071 } else if (rec->ifaces->num != ifaces->num) {
3072 /* Number of interfaces has changed */
3073 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
3074 rec->ifaces->num, ifaces->num));
3075 ret = true;
3076 } else {
3077 /* See if interface names or link states have changed */
3078 int i;
3079 for (i = 0; i < rec->ifaces->num; i++) {
3080 struct ctdb_iface * iface = &rec->ifaces->ifaces[i];
3081 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
3082 DEBUG(DEBUG_NOTICE,
3083 ("Interface in slot %d changed: %s => %s\n",
3084 i, iface->name, ifaces->ifaces[i].name));
3085 ret = true;
3086 break;
3088 if (iface->link_state != ifaces->ifaces[i].link_state) {
3089 DEBUG(DEBUG_NOTICE,
3090 ("Interface %s changed state: %d => %d\n",
3091 iface->name, iface->link_state,
3092 ifaces->ifaces[i].link_state));
3093 ret = true;
3094 break;
3099 talloc_free(rec->ifaces);
3100 rec->ifaces = talloc_steal(rec, ifaces);
3102 talloc_free(mem_ctx);
3103 return ret;
3106 /* Check that the local allocation of public IP addresses is correct
3107 * and do some house-keeping */
3108 static int verify_local_ip_allocation(struct ctdb_context *ctdb,
3109 struct ctdb_recoverd *rec,
3110 uint32_t pnn,
3111 struct ctdb_node_map_old *nodemap)
3113 TALLOC_CTX *mem_ctx = talloc_new(NULL);
3114 int ret, j;
3115 bool need_takeover_run = false;
3116 struct ctdb_public_ip_list_old *ips = NULL;
3118 /* If we are not the recmaster then do some housekeeping */
3119 if (rec->recmaster != pnn) {
3120 /* Ignore any IP reallocate requests - only recmaster
3121 * processes them
3123 TALLOC_FREE(rec->reallocate_requests);
3124 /* Clear any nodes that should be force rebalanced in
3125 * the next takeover run. If the recovery master role
3126 * has moved then we don't want to process these some
3127 * time in the future.
3129 TALLOC_FREE(rec->force_rebalance_nodes);
3132 /* Return early if disabled... */
3133 if (ctdb->tunable.disable_ip_failover != 0 ||
3134 ctdb_op_is_disabled(rec->takeover_run)) {
3135 return 0;
3138 if (interfaces_have_changed(ctdb, rec)) {
3139 need_takeover_run = true;
3142 /* If there are unhosted IPs but this node can host them then
3143 * trigger an IP reallocation */
3145 /* Read *available* IPs from local node */
3146 ret = ctdb_ctrl_get_public_ips_flags(
3147 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx,
3148 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
3149 if (ret != 0) {
3150 DEBUG(DEBUG_ERR, ("Unable to retrieve available public IPs\n"));
3151 talloc_free(mem_ctx);
3152 return -1;
3155 for (j=0; j<ips->num; j++) {
3156 if (ips->ips[j].pnn == -1 &&
3157 nodemap->nodes[pnn].flags == 0) {
3158 DEBUG(DEBUG_WARNING,
3159 ("Unassigned IP %s can be served by this node\n",
3160 ctdb_addr_to_str(&ips->ips[j].addr)));
3161 need_takeover_run = true;
3165 talloc_free(ips);
3167 if (!ctdb->do_checkpublicip) {
3168 goto done;
3171 /* Validate the IP addresses that this node has on network
3172 * interfaces. If there is an inconsistency between reality
3173 * and the state expected by CTDB then try to fix it by
3174 * triggering an IP reallocation or releasing extraneous IP
3175 * addresses. */
3177 /* Read *known* IPs from local node */
3178 ret = ctdb_ctrl_get_public_ips_flags(
3179 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
3180 if (ret != 0) {
3181 DEBUG(DEBUG_ERR, ("Unable to retrieve known public IPs\n"));
3182 talloc_free(mem_ctx);
3183 return -1;
3186 for (j=0; j<ips->num; j++) {
3187 if (ips->ips[j].pnn == pnn) {
3188 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
3189 DEBUG(DEBUG_ERR,
3190 ("Assigned IP %s not on an interface\n",
3191 ctdb_addr_to_str(&ips->ips[j].addr)));
3192 need_takeover_run = true;
3194 } else {
3195 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
3196 DEBUG(DEBUG_ERR,
3197 ("IP %s incorrectly on an interface - releasing\n",
3198 ctdb_addr_to_str(&ips->ips[j].addr)));
3199 ret = ctdb_ctrl_release_ip(ctdb,
3200 CONTROL_TIMEOUT(),
3201 CTDB_CURRENT_NODE,
3202 &ips->ips[j]);
3203 if (ret != 0) {
3204 DEBUG(DEBUG_ERR,
3205 ("Failed to release IP address\n"));
3211 done:
3212 if (need_takeover_run) {
3213 struct ctdb_srvid_message rd;
3214 TDB_DATA data;
3216 DEBUG(DEBUG_NOTICE,("Trigger takeoverrun\n"));
3218 ZERO_STRUCT(rd);
3219 rd.pnn = ctdb->pnn;
3220 rd.srvid = 0;
3221 data.dptr = (uint8_t *)&rd;
3222 data.dsize = sizeof(rd);
3224 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
3225 if (ret != 0) {
3226 DEBUG(DEBUG_ERR,
3227 ("Failed to send takeover run request\n"));
3230 talloc_free(mem_ctx);
3231 return 0;
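/*
 * Simplified model of the per-address consistency check done in
 * verify_local_ip_allocation() above.  have_ip is a stand-in for the
 * result of ctdb_sys_have_ip(); the enum tells the caller whether a
 * takeover run or an explicit release is needed for one address.
 */
#include <stdbool.h>
#include <stdint.h>

enum sketch_ip_action {
	SKETCH_IP_OK,
	SKETCH_IP_NEED_TAKEOVER_RUN,	/* assigned to us but not on an interface */
	SKETCH_IP_NEED_RELEASE		/* not assigned to us but present locally */
};

static enum sketch_ip_action sketch_check_ip(uint32_t assigned_pnn,
					     uint32_t my_pnn,
					     bool have_ip)
{
	if (assigned_pnn == my_pnn) {
		return have_ip ? SKETCH_IP_OK : SKETCH_IP_NEED_TAKEOVER_RUN;
	}
	return have_ip ? SKETCH_IP_NEED_RELEASE : SKETCH_IP_OK;
}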
3235 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
3237 struct ctdb_node_map_old **remote_nodemaps = callback_data;
3239 if (node_pnn >= ctdb->num_nodes) {
3240 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
3241 return;
3244 remote_nodemaps[node_pnn] = (struct ctdb_node_map_old *)talloc_steal(remote_nodemaps, outdata.dptr);
3248 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
3249 struct ctdb_node_map_old *nodemap,
3250 struct ctdb_node_map_old **remote_nodemaps)
3252 uint32_t *nodes;
3254 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
3255 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
3256 nodes, 0,
3257 CONTROL_TIMEOUT(), false, tdb_null,
3258 async_getnodemap_callback,
3259 NULL,
3260 remote_nodemaps) != 0) {
3261 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
3263 return -1;
3266 return 0;
3269 static bool validate_recovery_master(struct ctdb_recoverd *rec,
3270 TALLOC_CTX *mem_ctx)
3272 struct ctdb_context *ctdb = rec->ctdb;
3273 uint32_t pnn = ctdb_get_pnn(ctdb);
3274 struct ctdb_node_map_old *nodemap = rec->nodemap;
3275 struct ctdb_node_map_old *recmaster_nodemap = NULL;
3276 int ret;
3278 /* When recovery daemon is started, recmaster is set to
3279 * "unknown" so it knows to start an election.
3281 if (rec->recmaster == CTDB_UNKNOWN_PNN) {
3282 DEBUG(DEBUG_NOTICE,
3283 ("Initial recovery master set - forcing election\n"));
3284 force_election(rec, pnn, nodemap);
3285 return false;
3289 * If the current recmaster does not have CTDB_CAP_RECMASTER,
3290 * but we have, then force an election and try to become the new
3291 * recmaster.
3293 if (!ctdb_node_has_capabilities(rec->caps,
3294 rec->recmaster,
3295 CTDB_CAP_RECMASTER) &&
3296 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
3297 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
3298 DEBUG(DEBUG_ERR,
3299 (" Current recmaster node %u does not have CAP_RECMASTER,"
3300 " but we (node %u) have - force an election\n",
3301 rec->recmaster, pnn));
3302 force_election(rec, pnn, nodemap);
3303 return false;
3306 /* Verify that the master node has not been deleted. This
3307 * should not happen because a node should always be shutdown
3308 * before being deleted, causing a new master to be elected
3309 * before now. However, if something strange has happened
3310 * then checking here will ensure we don't index beyond the
3311 * end of the nodemap array. */
3312 if (rec->recmaster >= nodemap->num) {
3313 DEBUG(DEBUG_ERR,
3314 ("Recmaster node %u has been deleted. Force election\n",
3315 rec->recmaster));
3316 force_election(rec, pnn, nodemap);
3317 return false;
3320 /* if recovery master is disconnected/deleted we must elect a new recmaster */
3321 if (nodemap->nodes[rec->recmaster].flags &
3322 (NODE_FLAGS_DISCONNECTED|NODE_FLAGS_DELETED)) {
3323 DEBUG(DEBUG_NOTICE,
3324 ("Recmaster node %u is disconnected/deleted. Force election\n",
3325 rec->recmaster));
3326 force_election(rec, pnn, nodemap);
3327 return false;
3330 /* get nodemap from the recovery master to check if it is inactive */
3331 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
3332 mem_ctx, &recmaster_nodemap);
3333 if (ret != 0) {
3334 DEBUG(DEBUG_ERR,
3335 (__location__
3336 " Unable to get nodemap from recovery master %u\n",
3337 rec->recmaster));
3338 /* No election, just error */
3339 return false;
3343 if ((recmaster_nodemap->nodes[rec->recmaster].flags & NODE_FLAGS_INACTIVE) &&
3344 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
3345 DEBUG(DEBUG_NOTICE,
3346 ("Recmaster node %u is inactive. Force election\n",
3347 rec->recmaster));
3349 * update our nodemap to carry the recmaster's notion of
3350 * its own flags, so that we don't keep freezing the
3351 * inactive recmaster node...
3353 nodemap->nodes[rec->recmaster].flags =
3354 recmaster_nodemap->nodes[rec->recmaster].flags;
3355 force_election(rec, pnn, nodemap);
3356 return false;
3359 return true;
3362 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
3363 TALLOC_CTX *mem_ctx)
3365 uint32_t pnn;
3366 struct ctdb_node_map_old *nodemap=NULL;
3367 struct ctdb_node_map_old **remote_nodemaps=NULL;
3368 struct ctdb_vnn_map *vnnmap=NULL;
3369 struct ctdb_vnn_map *remote_vnnmap=NULL;
3370 uint32_t num_lmasters;
3371 int32_t debug_level;
3372 int i, j, ret;
3373 bool self_ban;
3376 /* verify that the main daemon is still running */
3377 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
3378 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3379 exit(-1);
3382 /* ping the local daemon to tell it we are alive */
3383 ctdb_ctrl_recd_ping(ctdb);
3385 if (rec->election_timeout) {
3386 /* an election is in progress */
3387 return;
3390 /* read the debug level from the parent and update locally */
3391 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
3392 if (ret !=0) {
3393 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
3394 return;
3396 DEBUGLEVEL = debug_level;
3398 /* get relevant tunables */
3399 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
3400 if (ret != 0) {
3401 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
3402 return;
3405 /* get runstate */
3406 ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
3407 CTDB_CURRENT_NODE, &ctdb->runstate);
3408 if (ret != 0) {
3409 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
3410 return;
3413 pnn = ctdb_get_pnn(ctdb);
3415 /* get nodemap */
3416 TALLOC_FREE(rec->nodemap);
3417 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
3418 if (ret != 0) {
3419 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
3420 return;
3422 nodemap = rec->nodemap;
3424 /* remember our own node flags */
3425 rec->node_flags = nodemap->nodes[pnn].flags;
3427 ban_misbehaving_nodes(rec, &self_ban);
3428 if (self_ban) {
3429 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
3430 return;
3433 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
3434 also frozen and that the recmode is set to active.
3436 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
3437 /* If this node has become inactive then we want to
3438 * reduce the chances of it taking over the recovery
3439 * master role when it becomes active again. This
3440 * helps to stabilise the recovery master role so that
3441 * it stays on the most stable node.
3443 rec->priority_time = timeval_current();
3445 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3446 if (ret != 0) {
3447 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3449 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3450 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
3452 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3453 if (ret != 0) {
3454 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
3456 return;
3459 if (! rec->frozen_on_inactive) {
3460 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(),
3461 CTDB_CURRENT_NODE);
3462 if (ret != 0) {
3463 DEBUG(DEBUG_ERR,
3464 (__location__ " Failed to freeze node "
3465 "in STOPPED or BANNED state\n"));
3466 return;
3469 rec->frozen_on_inactive = true;
3472 /* If this node is stopped or banned then it is not the recovery
3473 * master, so don't do anything. This prevents a stopped or banned
3474 * node from starting an election and sending unnecessary controls.
3476 return;
3479 rec->frozen_on_inactive = false;
3481 /* Retrieve capabilities from all connected nodes */
3482 ret = update_capabilities(rec, nodemap);
3483 if (ret != 0) {
3484 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
3485 return;
3488 if (! validate_recovery_master(rec, mem_ctx)) {
3489 return;
3492 /* Check if an IP takeover run is needed and trigger one if
3493 * necessary */
3494 verify_local_ip_allocation(ctdb, rec, pnn, nodemap);
3496 /* if we are not the recmaster then we do not need to check
3497 if recovery is needed
3499 if (pnn != rec->recmaster) {
3500 return;
3504 /* ensure our local copies of flags are right */
3505 ret = update_local_flags(rec, nodemap);
3506 if (ret != 0) {
3507 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3508 return;
3511 if (ctdb->num_nodes != nodemap->num) {
3512 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3513 ctdb_load_nodes_file(ctdb);
3514 return;
3517 /* verify that all active nodes agree that we are the recmaster */
3518 switch (verify_recmaster(rec, nodemap, pnn)) {
3519 case MONITOR_RECOVERY_NEEDED:
3520 /* can not happen */
3521 return;
3522 case MONITOR_ELECTION_NEEDED:
3523 force_election(rec, pnn, nodemap);
3524 return;
3525 case MONITOR_OK:
3526 break;
3527 case MONITOR_FAILED:
3528 return;
3532 /* get the vnnmap */
3533 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
3534 if (ret != 0) {
3535 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
3536 return;
3539 if (rec->need_recovery) {
3540 /* a previous recovery didn't finish */
3541 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3542 return;
3545 /* verify that all active nodes are in normal mode
3546 and not in recovery mode
3548 switch (verify_recmode(ctdb, nodemap)) {
3549 case MONITOR_RECOVERY_NEEDED:
3550 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3551 return;
3552 case MONITOR_FAILED:
3553 return;
3554 case MONITOR_ELECTION_NEEDED:
3555 /* can not happen */
3556 case MONITOR_OK:
3557 break;
3561 if (ctdb->recovery_lock != NULL) {
3562 /* We must already hold the recovery lock */
3563 if (!ctdb_recovery_have_lock(rec)) {
3564 DEBUG(DEBUG_ERR,("Failed recovery lock sanity check. Force a recovery\n"));
3565 ctdb_set_culprit(rec, ctdb->pnn);
3566 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3567 return;
3572 /* If recoveries are disabled then there is no use doing any
3573 * nodemap or flags checks. Recoveries might be disabled due
3574 * to "reloadnodes", so doing these checks might cause an
3575 * unnecessary recovery. */
3576 if (ctdb_op_is_disabled(rec->recovery)) {
3577 goto takeover_run_checks;
3580 /* get the nodemap for all active remote nodes
3582 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map_old *, nodemap->num);
3583 if (remote_nodemaps == NULL) {
3584 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3585 return;
3587 for(i=0; i<nodemap->num; i++) {
3588 remote_nodemaps[i] = NULL;
3590 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3591 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3592 return;
3595 /* verify that all other nodes have the same nodemap as we have (a simplified model of this check follows main_loop below)
3597 for (j=0; j<nodemap->num; j++) {
3598 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3599 continue;
3602 if (remote_nodemaps[j] == NULL) {
3603 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3604 ctdb_set_culprit(rec, j);
3606 return;
3609 /* if the nodes disagree on how many nodes there are
3610 then this is a good reason to try recovery
3612 if (remote_nodemaps[j]->num != nodemap->num) {
3613 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3614 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3615 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3616 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3617 return;
3620 /* if the nodes disagree on which nodes exist and are
3621 active, then that is also a good reason to do recovery
3623 for (i=0;i<nodemap->num;i++) {
3624 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3625 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3626 nodemap->nodes[j].pnn, i,
3627 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3628 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3629 do_recovery(rec, mem_ctx, pnn, nodemap,
3630 vnnmap);
3631 return;
3637 * Update node flags obtained from each active node. This ensures we have
3638 * up-to-date information for all the nodes.
3640 for (j=0; j<nodemap->num; j++) {
3641 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3642 continue;
3644 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
3647 for (j=0; j<nodemap->num; j++) {
3648 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3649 continue;
3652 /* verify the flags are consistent
3654 for (i=0; i<nodemap->num; i++) {
3655 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3656 continue;
3659 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3660 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3661 nodemap->nodes[j].pnn,
3662 nodemap->nodes[i].pnn,
3663 remote_nodemaps[j]->nodes[i].flags,
3664 nodemap->nodes[i].flags));
3665 if (i == j) {
3666 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3667 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3668 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3669 do_recovery(rec, mem_ctx, pnn, nodemap,
3670 vnnmap);
3671 return;
3672 } else {
3673 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3674 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3675 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3676 do_recovery(rec, mem_ctx, pnn, nodemap,
3677 vnnmap);
3678 return;
3686 /* count how many active lmaster-capable nodes there are */
3686 num_lmasters = 0;
3687 for (i=0; i<nodemap->num; i++) {
3688 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
3689 if (ctdb_node_has_capabilities(rec->caps,
3690 ctdb->nodes[i]->pnn,
3691 CTDB_CAP_LMASTER)) {
3692 num_lmasters++;
3698 /* There must be the same number of lmasters in the vnn map as
3699 * there are active nodes with the lmaster capability... or
3700 * do a recovery.
3702 if (vnnmap->size != num_lmasters) {
3703 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
3704 vnnmap->size, num_lmasters));
3705 ctdb_set_culprit(rec, ctdb->pnn);
3706 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3707 return;
3710 /* verify that all active nodes in the nodemap also exist in
3711 the vnnmap.
3713 for (j=0; j<nodemap->num; j++) {
3714 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3715 continue;
3717 if (nodemap->nodes[j].pnn == pnn) {
3718 continue;
3721 for (i=0; i<vnnmap->size; i++) {
3722 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3723 break;
3726 if (i == vnnmap->size) {
3727 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3728 nodemap->nodes[j].pnn));
3729 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3730 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3731 return;
3736 /* verify that all other nodes have the same vnnmap
3737 and are from the same generation
3739 for (j=0; j<nodemap->num; j++) {
3740 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3741 continue;
3743 if (nodemap->nodes[j].pnn == pnn) {
3744 continue;
3747 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3748 mem_ctx, &remote_vnnmap);
3749 if (ret != 0) {
3750 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3751 nodemap->nodes[j].pnn));
3752 return;
3755 /* verify the vnnmap generation is the same */
3756 if (vnnmap->generation != remote_vnnmap->generation) {
3757 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3758 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3759 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3760 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3761 return;
3764 /* verify the vnnmap size is the same */
3765 if (vnnmap->size != remote_vnnmap->size) {
3766 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3767 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3768 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3769 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3770 return;
3773 /* verify the vnnmap is the same */
3774 for (i=0;i<vnnmap->size;i++) {
3775 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3776 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3777 nodemap->nodes[j].pnn));
3778 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3779 do_recovery(rec, mem_ctx, pnn, nodemap,
3780 vnnmap);
3781 return;
3786 /* FIXME: Add remote public IP checking to ensure that nodes
3787 * have the IP addresses that are allocated to them. */
3789 takeover_run_checks:
3791 /* If there are IP takeover runs requested or the previous one
3792 * failed then perform one and notify the waiters */
3793 if (!ctdb_op_is_disabled(rec->takeover_run) &&
3794 (rec->reallocate_requests || rec->need_takeover_run)) {
3795 process_ipreallocate_requests(ctdb, rec);
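/*
 * Simplified model of the cross-node nodemap consistency check done in
 * main_loop() above: every remote nodemap must list the same number of
 * nodes, the same pnns in the same order, and the same flags as the
 * local view, otherwise a recovery is triggered.  The types below are
 * stand-ins for the real ctdb nodemap structures.
 */
#include <stdbool.h>
#include <stdint.h>

struct sketch_nm_node {
	uint32_t pnn;
	uint32_t flags;
};

struct sketch_nodemap {
	uint32_t num;
	struct sketch_nm_node *nodes;
};

static bool sketch_nodemaps_agree(const struct sketch_nodemap *local,
				  const struct sketch_nodemap *remote)
{
	uint32_t i;

	if (local->num != remote->num) {
		return false;
	}
	for (i = 0; i < local->num; i++) {
		if (local->nodes[i].pnn != remote->nodes[i].pnn ||
		    local->nodes[i].flags != remote->nodes[i].flags) {
			return false;
		}
	}
	return true;
}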
3799 static void recd_sig_term_handler(struct tevent_context *ev,
3800 struct tevent_signal *se, int signum,
3801 int count, void *dont_care,
3802 void *private_data)
3804 struct ctdb_recoverd *rec = talloc_get_type_abort(
3805 private_data, struct ctdb_recoverd);
3807 ctdb_recovery_unlock(rec);
3808 exit(0);
3813 the main monitoring loop
3815 static void monitor_cluster(struct ctdb_context *ctdb)
3817 struct tevent_signal *se;
3818 struct ctdb_recoverd *rec;
3820 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
3822 rec = talloc_zero(ctdb, struct ctdb_recoverd);
3823 CTDB_NO_MEMORY_FATAL(ctdb, rec);
3825 rec->ctdb = ctdb;
3826 rec->recmaster = CTDB_UNKNOWN_PNN;
3827 rec->recovery_lock_handle = NULL;
3829 rec->takeover_run = ctdb_op_init(rec, "takeover runs");
3830 CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);
3832 rec->recovery = ctdb_op_init(rec, "recoveries");
3833 CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);
3835 rec->priority_time = timeval_current();
3836 rec->frozen_on_inactive = false;
3838 se = tevent_add_signal(ctdb->ev, ctdb, SIGTERM, 0,
3839 recd_sig_term_handler, rec);
3840 if (se == NULL) {
3841 DEBUG(DEBUG_ERR, ("Failed to install SIGTERM handler\n"));
3842 exit(1);
3845 /* register a message port for sending memory dumps */
3846 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
3848 /* when a node is assigned banning credits */
3849 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING,
3850 banning_handler, rec);
3852 /* register a message port for recovery elections */
3853 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec);
3855 /* when nodes are disabled/enabled */
3856 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
3858 /* when we are asked to push out a flag change */
3859 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
3861 /* register a message port for vacuum fetch */
3862 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
3864 /* register a message port for reloadnodes */
3865 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
3867 /* register a message port for performing a takeover run */
3868 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
3870 /* register a message port for disabling the ip check for a short while */
3871 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
3873 /* register a message port for forcing a rebalance of a node at the
3874 next reallocation */
3875 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
3877 /* Register a message port for disabling takeover runs */
3878 ctdb_client_set_message_handler(ctdb,
3879 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
3880 disable_takeover_runs_handler, rec);
3882 /* Register a message port for disabling recoveries */
3883 ctdb_client_set_message_handler(ctdb,
3884 CTDB_SRVID_DISABLE_RECOVERIES,
3885 disable_recoveries_handler, rec);
3887 /* register a message port for detaching database */
3888 ctdb_client_set_message_handler(ctdb,
3889 CTDB_SRVID_DETACH_DATABASE,
3890 detach_database_handler, rec);
3892 for (;;) {
3893 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3894 struct timeval start;
3895 double elapsed;
3897 if (!mem_ctx) {
3898 DEBUG(DEBUG_CRIT,(__location__
3899 " Failed to create temp context\n"));
3900 exit(-1);
3903 start = timeval_current();
3904 main_loop(ctdb, rec, mem_ctx);
3905 talloc_free(mem_ctx);
3907 /* we only check for recovery at most once per recover_interval */
3908 elapsed = timeval_elapsed(&start);
3909 if (elapsed < ctdb->tunable.recover_interval) {
3910 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
3911 - elapsed);
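/*
 * Minimal model of the loop pacing in monitor_cluster() above: measure
 * how long one iteration took and sleep for the remainder of the
 * configured interval, so the body runs at most once per interval.
 * Plain POSIX clock and sleep calls stand in for the ctdb/tevent
 * helpers (timeval_elapsed(), ctdb_wait_timeout()).
 */
#include <time.h>

static void sketch_paced_loop(double interval_secs,
			      void (*body)(void *), void *ctx)
{
	for (;;) {
		struct timespec start, end, rem;
		double elapsed, left;

		clock_gettime(CLOCK_MONOTONIC, &start);
		body(ctx);
		clock_gettime(CLOCK_MONOTONIC, &end);

		elapsed = (end.tv_sec - start.tv_sec) +
			  (end.tv_nsec - start.tv_nsec) / 1e9;
		if (elapsed >= interval_secs) {
			continue;
		}
		left = interval_secs - elapsed;
		rem.tv_sec = (time_t)left;
		rem.tv_nsec = (long)((left - (double)rem.tv_sec) * 1e9);
		nanosleep(&rem, NULL);
	}
}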
3917 event handler for when the main ctdbd dies
3919 static void ctdb_recoverd_parent(struct tevent_context *ev,
3920 struct tevent_fd *fde,
3921 uint16_t flags, void *private_data)
3923 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3924 _exit(1);
3928 called regularly to verify that the recovery daemon is still running
3930 static void ctdb_check_recd(struct tevent_context *ev,
3931 struct tevent_timer *te,
3932 struct timeval yt, void *p)
3934 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
3936 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
3937 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
3939 tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
3940 ctdb_restart_recd, ctdb);
3942 return;
3945 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3946 timeval_current_ofs(30, 0),
3947 ctdb_check_recd, ctdb);
3950 static void recd_sig_child_handler(struct tevent_context *ev,
3951 struct tevent_signal *se, int signum,
3952 int count, void *dont_care,
3953 void *private_data)
3955 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3956 int status;
3957 pid_t pid = -1;
3959 while (pid != 0) {
3960 pid = waitpid(-1, &status, WNOHANG);
3961 if (pid == -1) {
3962 if (errno != ECHILD) {
3963 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3965 return;
3967 if (pid > 0) {
3968 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3974 start up the recovery daemon as a child of the main ctdb daemon
3976 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3978 int fd[2];
3979 struct tevent_signal *se;
3980 struct tevent_fd *fde;
3982 if (pipe(fd) != 0) {
3983 return -1;
3986 ctdb->recoverd_pid = ctdb_fork(ctdb);
3987 if (ctdb->recoverd_pid == -1) {
3988 return -1;
3991 if (ctdb->recoverd_pid != 0) {
3992 talloc_free(ctdb->recd_ctx);
3993 ctdb->recd_ctx = talloc_new(ctdb);
3994 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
3996 close(fd[0]);
3997 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3998 timeval_current_ofs(30, 0),
3999 ctdb_check_recd, ctdb);
4000 return 0;
4003 close(fd[1]);
4005 srandom(getpid() ^ time(NULL));
4007 prctl_set_comment("ctdb_recoverd");
4008 if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
4009 DEBUG(DEBUG_CRIT, (__location__ " ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
4010 exit(1);
4013 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
4015 fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
4016 ctdb_recoverd_parent, &fd[0]);
4017 tevent_fd_set_auto_close(fde);
4019 /* set up a handler to pick up sigchld */
4020 se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
4021 recd_sig_child_handler, ctdb);
4022 if (se == NULL) {
4023 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
4024 exit(1);
4027 monitor_cluster(ctdb);
4029 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
4030 return -1;
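/*
 * Stand-alone sketch of the parent-death trick wired up above: the
 * parent keeps the write end of a pipe open and never writes to it,
 * while the child watches the read end.  When the parent exits, its
 * write end is closed, the read end signals hangup/EOF, and the child
 * knows the main daemon is gone.  Plain poll() is used here instead of
 * the tevent fd handler.
 */
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

static void sketch_wait_for_parent_exit(int pipe_read_fd)
{
	struct pollfd pfd = { .fd = pipe_read_fd, .events = POLLIN };

	for (;;) {
		/* Blocks until the parent's write end is closed */
		if (poll(&pfd, 1, -1) == 1) {
			fprintf(stderr, "parent died - exiting\n");
			_exit(1);
		}
		/* poll() < 0 is EINTR or similar; just retry */
	}
}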
4034 shut down the recovery daemon
4036 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
4038 if (ctdb->recoverd_pid == 0) {
4039 return;
4042 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
4043 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
4045 TALLOC_FREE(ctdb->recd_ctx);
4046 TALLOC_FREE(ctdb->recd_ping_count);
4049 static void ctdb_restart_recd(struct tevent_context *ev,
4050 struct tevent_timer *te,
4051 struct timeval t, void *private_data)
4053 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4055 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
4056 ctdb_stop_recoverd(ctdb);
4057 ctdb_start_recoverd(ctdb);