ctdb/server/ctdb_recoverd.c
1 /*
2 ctdb recovery daemon
4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
20 #include "replace.h"
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
26 #include <popt.h>
27 #include <talloc.h>
28 #include <tevent.h>
29 #include <tdb.h>
31 #include "lib/tdb_wrap/tdb_wrap.h"
32 #include "lib/util/dlinklist.h"
33 #include "lib/util/debug.h"
34 #include "lib/util/samba_util.h"
35 #include "lib/util/sys_rw.h"
36 #include "lib/util/util_process.h"
38 #include "ctdb_private.h"
39 #include "ctdb_client.h"
41 #include "common/system.h"
42 #include "common/common.h"
43 #include "common/logging.h"
45 #include "ctdb_cluster_mutex.h"
47 /* List of SRVID requests that need to be processed */
48 struct srvid_list {
49 struct srvid_list *next, *prev;
50 struct ctdb_srvid_message *request;
53 struct srvid_requests {
54 struct srvid_list *requests;
57 static void srvid_request_reply(struct ctdb_context *ctdb,
58 struct ctdb_srvid_message *request,
59 TDB_DATA result)
61 /* Someone that sent srvid==0 does not want a reply */
62 if (request->srvid == 0) {
63 talloc_free(request);
64 return;
67 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
68 result) == 0) {
69 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
70 (unsigned)request->pnn,
71 (unsigned long long)request->srvid));
72 } else {
73 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
74 (unsigned)request->pnn,
75 (unsigned long long)request->srvid));
78 talloc_free(request);
81 static void srvid_requests_reply(struct ctdb_context *ctdb,
82 struct srvid_requests **requests,
83 TDB_DATA result)
85 struct srvid_list *r;
87 if (*requests == NULL) {
88 return;
91 for (r = (*requests)->requests; r != NULL; r = r->next) {
92 srvid_request_reply(ctdb, r->request, result);
95 /* Free the list structure... */
96 TALLOC_FREE(*requests);
99 static void srvid_request_add(struct ctdb_context *ctdb,
100 struct srvid_requests **requests,
101 struct ctdb_srvid_message *request)
103 struct srvid_list *t;
104 int32_t ret;
105 TDB_DATA result;
107 if (*requests == NULL) {
108 *requests = talloc_zero(ctdb, struct srvid_requests);
109 if (*requests == NULL) {
110 goto nomem;
114 t = talloc_zero(*requests, struct srvid_list);
115 if (t == NULL) {
116 /* If *requests was just allocated above then free it */
117 if ((*requests)->requests == NULL) {
118 TALLOC_FREE(*requests);
120 goto nomem;
123 t->request = (struct ctdb_srvid_message *)talloc_steal(t, request);
124 DLIST_ADD((*requests)->requests, t);
126 return;
128 nomem:
129 /* Failed to add the request to the list. Send a fail. */
130 DEBUG(DEBUG_ERR, (__location__
131 " Out of memory, failed to queue SRVID request\n"));
132 ret = -ENOMEM;
133 result.dsize = sizeof(ret);
134 result.dptr = (uint8_t *)&ret;
135 srvid_request_reply(ctdb, request, result);
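/*
 * Illustrative sketch (for exposition only, not part of the upstream
 * source): a message handler can queue incoming requests with
 * srvid_request_add() and answer all of them in one go once the
 * deferred work has completed, e.g.
 *
 *	srvid_request_add(ctdb, &rec->reallocate_requests, request);
 *	...
 *	int32_t ret = ctdb_get_pnn(ctdb);
 *	TDB_DATA result = { .dptr = (uint8_t *)&ret, .dsize = sizeof(ret) };
 *	srvid_requests_reply(ctdb, &rec->reallocate_requests, result);
 *
 * This is the pattern used by ip_reallocate_handler() and
 * process_ipreallocate_requests() further down in this file.
 */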
138 /* An abstraction to allow an operation (takeover runs, recoveries,
139 * ...) to be disabled for a given timeout */
140 struct ctdb_op_state {
141 struct tevent_timer *timer;
142 bool in_progress;
143 const char *name;
146 static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
148 struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
150 if (state != NULL) {
151 state->in_progress = false;
152 state->name = name;
155 return state;
158 static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
160 return state->timer != NULL;
163 static bool ctdb_op_begin(struct ctdb_op_state *state)
165 if (ctdb_op_is_disabled(state)) {
166 DEBUG(DEBUG_NOTICE,
167 ("Unable to begin - %s are disabled\n", state->name));
168 return false;
171 state->in_progress = true;
172 return true;
175 static bool ctdb_op_end(struct ctdb_op_state *state)
177 return state->in_progress = false;
180 static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
182 return state->in_progress;
185 static void ctdb_op_enable(struct ctdb_op_state *state)
187 TALLOC_FREE(state->timer);
190 static void ctdb_op_timeout_handler(struct tevent_context *ev,
191 struct tevent_timer *te,
192 struct timeval yt, void *p)
194 struct ctdb_op_state *state =
195 talloc_get_type(p, struct ctdb_op_state);
197 DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
198 ctdb_op_enable(state);
201 static int ctdb_op_disable(struct ctdb_op_state *state,
202 struct tevent_context *ev,
203 uint32_t timeout)
205 if (timeout == 0) {
206 DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
207 ctdb_op_enable(state);
208 return 0;
211 if (state->in_progress) {
212 DEBUG(DEBUG_ERR,
213 ("Unable to disable %s - in progress\n", state->name));
214 return -EAGAIN;
217 DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
218 state->name, timeout));
220 /* Clear any old timers */
221 talloc_free(state->timer);
223 /* Arrange for the timeout to occur */
224 state->timer = tevent_add_timer(ev, state,
225 timeval_current_ofs(timeout, 0),
226 ctdb_op_timeout_handler, state);
227 if (state->timer == NULL) {
228 DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
229 return -ENOMEM;
232 return 0;
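/*
 * Illustrative sketch (for exposition only, not part of the upstream
 * source): typical lifecycle of a ctdb_op_state, e.g. for takeover runs:
 *
 *	struct ctdb_op_state *op = ctdb_op_init(rec, "takeover runs");
 *
 *	if (ctdb_op_begin(op)) {
 *		... do the gated work ...
 *		ctdb_op_end(op);
 *	}
 *
 *	ctdb_op_disable(op, ctdb->ev, 60);	// refuse new runs for 60s
 *	ctdb_op_disable(op, ctdb->ev, 0);	// re-enable immediately
 */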
235 struct ctdb_banning_state {
236 uint32_t count;
237 struct timeval last_reported_time;
241 private state of recovery daemon
243 struct ctdb_recoverd {
244 struct ctdb_context *ctdb;
245 uint32_t recmaster;
246 uint32_t last_culprit_node;
247 struct ctdb_node_map_old *nodemap;
248 struct timeval priority_time;
249 bool need_takeover_run;
250 bool need_recovery;
251 uint32_t node_flags;
252 struct tevent_timer *send_election_te;
253 struct tevent_timer *election_timeout;
254 struct srvid_requests *reallocate_requests;
255 struct ctdb_op_state *takeover_run;
256 struct ctdb_op_state *recovery;
257 struct ctdb_iface_list_old *ifaces;
258 uint32_t *force_rebalance_nodes;
259 struct ctdb_node_capabilities *caps;
260 bool frozen_on_inactive;
261 struct ctdb_cluster_mutex_handle *recovery_lock_handle;
264 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
265 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
267 static void ctdb_restart_recd(struct tevent_context *ev,
268 struct tevent_timer *te, struct timeval t,
269 void *private_data);
272 ban a node for a period of time
274 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
276 int ret;
277 struct ctdb_context *ctdb = rec->ctdb;
278 struct ctdb_ban_state bantime;
280 if (!ctdb_validate_pnn(ctdb, pnn)) {
281 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
282 return;
285 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
287 bantime.pnn = pnn;
288 bantime.time = ban_time;
290 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
291 if (ret != 0) {
292 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
293 return;
298 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
 302 remember the troublemaker
304 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
306 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
307 struct ctdb_banning_state *ban_state;
309 if (culprit > ctdb->num_nodes) {
310 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
311 return;
314 /* If we are banned or stopped, do not set other nodes as culprits */
315 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
316 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
317 return;
320 if (ctdb->nodes[culprit]->ban_state == NULL) {
321 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
322 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
326 ban_state = ctdb->nodes[culprit]->ban_state;
327 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
328 /* this was the first time in a long while this node
329 misbehaved so we will forgive any old transgressions.
331 ban_state->count = 0;
334 ban_state->count += count;
335 ban_state->last_reported_time = timeval_current();
336 rec->last_culprit_node = culprit;
 340 remember the troublemaker
342 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
344 ctdb_set_culprit_count(rec, culprit, 1);
348 Retrieve capabilities from all connected nodes
350 static int update_capabilities(struct ctdb_recoverd *rec,
351 struct ctdb_node_map_old *nodemap)
353 uint32_t *capp;
354 TALLOC_CTX *tmp_ctx;
355 struct ctdb_node_capabilities *caps;
356 struct ctdb_context *ctdb = rec->ctdb;
358 tmp_ctx = talloc_new(rec);
359 CTDB_NO_MEMORY(ctdb, tmp_ctx);
361 caps = ctdb_get_capabilities(ctdb, tmp_ctx,
362 CONTROL_TIMEOUT(), nodemap);
364 if (caps == NULL) {
365 DEBUG(DEBUG_ERR,
366 (__location__ " Failed to get node capabilities\n"));
367 talloc_free(tmp_ctx);
368 return -1;
371 capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
372 if (capp == NULL) {
373 DEBUG(DEBUG_ERR,
374 (__location__
375 " Capabilities don't include current node.\n"));
376 talloc_free(tmp_ctx);
377 return -1;
379 ctdb->capabilities = *capp;
381 TALLOC_FREE(rec->caps);
382 rec->caps = talloc_steal(rec, caps);
384 talloc_free(tmp_ctx);
385 return 0;
389 change recovery mode on all nodes
391 static int set_recovery_mode(struct ctdb_context *ctdb,
392 struct ctdb_recoverd *rec,
393 struct ctdb_node_map_old *nodemap,
394 uint32_t rec_mode)
396 TDB_DATA data;
397 uint32_t *nodes;
398 TALLOC_CTX *tmp_ctx;
400 tmp_ctx = talloc_new(ctdb);
401 CTDB_NO_MEMORY(ctdb, tmp_ctx);
403 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
405 data.dsize = sizeof(uint32_t);
406 data.dptr = (unsigned char *)&rec_mode;
408 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
409 nodes, 0,
410 CONTROL_TIMEOUT(),
411 false, data,
412 NULL, NULL,
413 NULL) != 0) {
414 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
415 talloc_free(tmp_ctx);
416 return -1;
419 talloc_free(tmp_ctx);
420 return 0;
424 ensure all other nodes have attached to any databases that we have
426 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
427 uint32_t pnn, struct ctdb_dbid_map_old *dbmap, TALLOC_CTX *mem_ctx)
429 int i, j, db, ret;
430 struct ctdb_dbid_map_old *remote_dbmap;
432 /* verify that all other nodes have all our databases */
433 for (j=0; j<nodemap->num; j++) {
 434 /* we don't need to check ourselves */
435 if (nodemap->nodes[j].pnn == pnn) {
436 continue;
438 /* don't check nodes that are unavailable */
439 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
440 continue;
443 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
444 mem_ctx, &remote_dbmap);
445 if (ret != 0) {
446 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
447 return -1;
450 /* step through all local databases */
451 for (db=0; db<dbmap->num;db++) {
452 const char *name;
455 for (i=0;i<remote_dbmap->num;i++) {
456 if (dbmap->dbs[db].db_id == remote_dbmap->dbs[i].db_id) {
457 break;
 460 /* the remote node already has this database */
461 if (i!=remote_dbmap->num) {
462 continue;
464 /* ok so we need to create this database */
465 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
466 dbmap->dbs[db].db_id, mem_ctx,
467 &name);
468 if (ret != 0) {
469 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
470 return -1;
472 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
473 nodemap->nodes[j].pnn,
474 mem_ctx, name,
475 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
476 if (ret != 0) {
477 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
478 return -1;
483 return 0;
488 ensure we are attached to any databases that anyone else is attached to
490 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
491 uint32_t pnn, struct ctdb_dbid_map_old **dbmap, TALLOC_CTX *mem_ctx)
493 int i, j, db, ret;
494 struct ctdb_dbid_map_old *remote_dbmap;
 496 /* verify that we have all databases any other node has */
497 for (j=0; j<nodemap->num; j++) {
 499 /* we don't need to check ourselves */
499 if (nodemap->nodes[j].pnn == pnn) {
500 continue;
502 /* don't check nodes that are unavailable */
503 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
504 continue;
507 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
508 mem_ctx, &remote_dbmap);
509 if (ret != 0) {
510 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
511 return -1;
514 /* step through all databases on the remote node */
515 for (db=0; db<remote_dbmap->num;db++) {
516 const char *name;
518 for (i=0;i<(*dbmap)->num;i++) {
519 if (remote_dbmap->dbs[db].db_id == (*dbmap)->dbs[i].db_id) {
520 break;
523 /* we already have this db locally */
524 if (i!=(*dbmap)->num) {
525 continue;
527 /* ok so we need to create this database and
528 rebuild dbmap
 530 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
531 remote_dbmap->dbs[db].db_id, mem_ctx, &name);
532 if (ret != 0) {
533 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
534 nodemap->nodes[j].pnn));
535 return -1;
 537 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
538 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
539 if (ret != 0) {
540 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
541 return -1;
543 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
544 if (ret != 0) {
545 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
546 return -1;
551 return 0;
555 update flags on all active nodes
557 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap, uint32_t pnn, uint32_t flags)
559 int ret;
561 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
562 if (ret != 0) {
563 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
564 return -1;
567 return 0;
571 called when a vacuum fetch has completed - just free it and do the next one
573 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
575 talloc_free(state);
 580 * Process one element of the vacuum fetch list:
581 * Migrate it over to us with the special flag
582 * CTDB_CALL_FLAG_VACUUM_MIGRATION.
584 static bool vacuum_fetch_process_one(struct ctdb_db_context *ctdb_db,
585 uint32_t pnn,
586 struct ctdb_rec_data_old *r)
588 struct ctdb_client_call_state *state;
589 TDB_DATA data;
590 struct ctdb_ltdb_header *hdr;
591 struct ctdb_call call;
593 ZERO_STRUCT(call);
594 call.call_id = CTDB_NULL_FUNC;
595 call.flags = CTDB_IMMEDIATE_MIGRATION;
596 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
598 call.key.dptr = &r->data[0];
599 call.key.dsize = r->keylen;
601 /* ensure we don't block this daemon - just skip a record if we can't get
602 the chainlock */
603 if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, call.key) != 0) {
604 return true;
607 data = tdb_fetch(ctdb_db->ltdb->tdb, call.key);
608 if (data.dptr == NULL) {
609 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
610 return true;
613 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
614 free(data.dptr);
615 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
616 return true;
619 hdr = (struct ctdb_ltdb_header *)data.dptr;
620 if (hdr->dmaster == pnn) {
 621 /* it's already local */
622 free(data.dptr);
623 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
624 return true;
627 free(data.dptr);
629 state = ctdb_call_send(ctdb_db, &call);
630 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
631 if (state == NULL) {
632 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
633 return false;
635 state->async.fn = vacuum_fetch_callback;
636 state->async.private_data = NULL;
638 return true;
643 handler for vacuum fetch
645 static void vacuum_fetch_handler(uint64_t srvid, TDB_DATA data,
646 void *private_data)
648 struct ctdb_recoverd *rec = talloc_get_type(
649 private_data, struct ctdb_recoverd);
650 struct ctdb_context *ctdb = rec->ctdb;
651 struct ctdb_marshall_buffer *recs;
652 int ret, i;
653 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
654 const char *name;
655 struct ctdb_dbid_map_old *dbmap=NULL;
656 bool persistent = false;
657 struct ctdb_db_context *ctdb_db;
658 struct ctdb_rec_data_old *r;
660 recs = (struct ctdb_marshall_buffer *)data.dptr;
662 if (recs->count == 0) {
663 goto done;
666 /* work out if the database is persistent */
667 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
668 if (ret != 0) {
669 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
670 goto done;
673 for (i=0;i<dbmap->num;i++) {
674 if (dbmap->dbs[i].db_id == recs->db_id) {
675 persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
676 break;
679 if (i == dbmap->num) {
680 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
681 goto done;
684 /* find the name of this database */
685 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
686 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
687 goto done;
690 /* attach to it */
691 ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
692 if (ctdb_db == NULL) {
693 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
694 goto done;
697 r = (struct ctdb_rec_data_old *)&recs->data[0];
698 while (recs->count) {
699 bool ok;
701 ok = vacuum_fetch_process_one(ctdb_db, rec->ctdb->pnn, r);
702 if (!ok) {
703 break;
706 r = (struct ctdb_rec_data_old *)(r->length + (uint8_t *)r);
707 recs->count--;
710 done:
711 talloc_free(tmp_ctx);
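/*
 * Illustrative note (added for exposition, not part of the upstream
 * source): the records in the marshall buffer are packed back to back,
 * so the loop above advances with
 * r = (struct ctdb_rec_data_old *)(r->length + (uint8_t *)r) and counts
 * recs->count down until every record has been offered to
 * vacuum_fetch_process_one().
 */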
716 * handler for database detach
718 static void detach_database_handler(uint64_t srvid, TDB_DATA data,
719 void *private_data)
721 struct ctdb_recoverd *rec = talloc_get_type(
722 private_data, struct ctdb_recoverd);
723 struct ctdb_context *ctdb = rec->ctdb;
724 uint32_t db_id;
725 struct ctdb_db_context *ctdb_db;
727 if (data.dsize != sizeof(db_id)) {
728 return;
730 db_id = *(uint32_t *)data.dptr;
732 ctdb_db = find_ctdb_db(ctdb, db_id);
733 if (ctdb_db == NULL) {
734 /* database is not attached */
735 return;
738 DLIST_REMOVE(ctdb->db_list, ctdb_db);
740 DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
741 ctdb_db->db_name));
742 talloc_free(ctdb_db);
746 called when ctdb_wait_timeout should finish
748 static void ctdb_wait_handler(struct tevent_context *ev,
749 struct tevent_timer *te,
750 struct timeval yt, void *p)
752 uint32_t *timed_out = (uint32_t *)p;
753 (*timed_out) = 1;
757 wait for a given number of seconds
759 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
761 uint32_t timed_out = 0;
762 time_t usecs = (secs - (time_t)secs) * 1000000;
763 tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
764 ctdb_wait_handler, &timed_out);
765 while (!timed_out) {
766 tevent_loop_once(ctdb->ev);
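/*
 * Illustrative note (added for exposition, not part of the upstream
 * source): the fractional part of 'secs' is converted to microseconds
 * above, so e.g. ctdb_wait_timeout(ctdb, 0.5) arms a timer for
 * 0 seconds / 500000 microseconds and then spins tevent_loop_once()
 * until ctdb_wait_handler() sets timed_out.
 */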
771 called when an election times out (ends)
773 static void ctdb_election_timeout(struct tevent_context *ev,
774 struct tevent_timer *te,
775 struct timeval t, void *p)
777 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
778 rec->election_timeout = NULL;
779 fast_start = false;
781 DEBUG(DEBUG_WARNING,("Election period ended\n"));
 786 wait for an election to finish. It finishes election_timeout seconds after
787 the last election packet is received
789 static void ctdb_wait_election(struct ctdb_recoverd *rec)
791 struct ctdb_context *ctdb = rec->ctdb;
792 while (rec->election_timeout) {
793 tevent_loop_once(ctdb->ev);
798 Update our local flags from all remote connected nodes.
 799 This is only run when we are or we believe we are the recovery master
801 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
803 int j;
804 struct ctdb_context *ctdb = rec->ctdb;
805 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
807 /* get the nodemap for all active remote nodes and verify
808 they are the same as for this node
810 for (j=0; j<nodemap->num; j++) {
811 struct ctdb_node_map_old *remote_nodemap=NULL;
812 int ret;
814 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
815 continue;
817 if (nodemap->nodes[j].pnn == ctdb->pnn) {
818 continue;
821 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
822 mem_ctx, &remote_nodemap);
823 if (ret != 0) {
824 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
825 nodemap->nodes[j].pnn));
826 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
827 talloc_free(mem_ctx);
828 return -1;
830 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
831 /* We should tell our daemon about this so it
832 updates its flags or else we will log the same
833 message again in the next iteration of recovery.
834 Since we are the recovery master we can just as
835 well update the flags on all nodes.
837 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
838 if (ret != 0) {
839 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
840 return -1;
843 /* Update our local copy of the flags in the recovery
844 daemon.
846 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
847 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
848 nodemap->nodes[j].flags));
849 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
851 talloc_free(remote_nodemap);
853 talloc_free(mem_ctx);
854 return 0;
858 /* Create a new random generation id.
 859 The generation id cannot be the INVALID_GENERATION id
861 static uint32_t new_generation(void)
863 uint32_t generation;
865 while (1) {
866 generation = random();
868 if (generation != INVALID_GENERATION) {
869 break;
873 return generation;
876 static bool ctdb_recovery_have_lock(struct ctdb_recoverd *rec)
878 return (rec->recovery_lock_handle != NULL);
881 struct hold_reclock_state {
882 bool done;
883 bool locked;
884 double latency;
887 static void take_reclock_handler(char status,
888 double latency,
889 void *private_data)
891 struct hold_reclock_state *s =
892 (struct hold_reclock_state *) private_data;
894 switch (status) {
895 case '0':
896 s->latency = latency;
897 break;
899 case '1':
900 DEBUG(DEBUG_ERR,
901 ("Unable to take recovery lock - contention\n"));
902 break;
904 default:
905 DEBUG(DEBUG_ERR, ("ERROR: when taking recovery lock\n"));
908 s->done = true;
909 s->locked = (status == '0') ;
912 static bool ctdb_recovery_lock(struct ctdb_recoverd *rec);
914 static void lost_reclock_handler(void *private_data)
916 struct ctdb_recoverd *rec = talloc_get_type_abort(
917 private_data, struct ctdb_recoverd);
919 DEBUG(DEBUG_ERR,
920 ("Recovery lock helper terminated unexpectedly - "
921 "trying to retake recovery lock\n"));
922 TALLOC_FREE(rec->recovery_lock_handle);
923 if (! ctdb_recovery_lock(rec)) {
924 DEBUG(DEBUG_ERR, ("Failed to take recovery lock\n"));
928 static bool ctdb_recovery_lock(struct ctdb_recoverd *rec)
930 struct ctdb_context *ctdb = rec->ctdb;
931 struct ctdb_cluster_mutex_handle *h;
932 struct hold_reclock_state s = {
933 .done = false,
934 .locked = false,
935 .latency = 0,
938 h = ctdb_cluster_mutex(rec, ctdb, ctdb->recovery_lock, 0,
939 take_reclock_handler, &s,
940 lost_reclock_handler, rec);
941 if (h == NULL) {
942 return false;
945 while (!s.done) {
946 tevent_loop_once(ctdb->ev);
949 if (! s.locked) {
950 talloc_free(h);
951 return false;
954 rec->recovery_lock_handle = h;
955 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(),
956 s.latency);
958 return true;
961 static void ctdb_recovery_unlock(struct ctdb_recoverd *rec)
963 if (rec->recovery_lock_handle != NULL) {
964 DEBUG(DEBUG_NOTICE, ("Releasing recovery lock\n"));
965 TALLOC_FREE(rec->recovery_lock_handle);
969 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
971 struct ctdb_context *ctdb = rec->ctdb;
972 int i;
973 struct ctdb_banning_state *ban_state;
975 *self_ban = false;
976 for (i=0; i<ctdb->num_nodes; i++) {
977 if (ctdb->nodes[i]->ban_state == NULL) {
978 continue;
980 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
981 if (ban_state->count < 2*ctdb->num_nodes) {
982 continue;
985 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
986 ctdb->nodes[i]->pnn, ban_state->count,
987 ctdb->tunable.recovery_ban_period));
988 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
989 ban_state->count = 0;
991 /* Banning ourself? */
992 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
993 *self_ban = true;
998 struct helper_state {
999 int fd[2];
1000 pid_t pid;
1001 int result;
1002 bool done;
1005 static void helper_handler(struct tevent_context *ev,
1006 struct tevent_fd *fde,
1007 uint16_t flags, void *private_data)
1009 struct helper_state *state = talloc_get_type_abort(
1010 private_data, struct helper_state);
1011 int ret;
1013 ret = sys_read(state->fd[0], &state->result, sizeof(state->result));
1014 if (ret != sizeof(state->result)) {
1015 state->result = EPIPE;
1018 state->done = true;
1021 static int helper_run(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
1022 const char *prog, const char *arg, const char *type)
1024 struct helper_state *state;
1025 struct tevent_fd *fde;
1026 const char **args;
1027 int nargs, ret;
1029 state = talloc_zero(mem_ctx, struct helper_state);
1030 if (state == NULL) {
1031 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1032 return -1;
1035 state->pid = -1;
1037 ret = pipe(state->fd);
1038 if (ret != 0) {
1039 DEBUG(DEBUG_ERR,
1040 ("Failed to create pipe for %s helper\n", type));
1041 goto fail;
1044 set_close_on_exec(state->fd[0]);
1046 nargs = 4;
1047 args = talloc_array(state, const char *, nargs);
1048 if (args == NULL) {
1049 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1050 goto fail;
1053 args[0] = talloc_asprintf(args, "%d", state->fd[1]);
1054 if (args[0] == NULL) {
1055 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1056 goto fail;
1058 args[1] = rec->ctdb->daemon.name;
1059 args[2] = arg;
1060 args[3] = NULL;
1062 if (args[2] == NULL) {
1063 nargs = 3;
1066 state->pid = ctdb_vfork_exec(state, rec->ctdb, prog, nargs, args);
1067 if (state->pid == -1) {
1068 DEBUG(DEBUG_ERR,
1069 ("Failed to create child for %s helper\n", type));
1070 goto fail;
1073 close(state->fd[1]);
1074 state->fd[1] = -1;
1076 state->done = false;
1078 fde = tevent_add_fd(rec->ctdb->ev, rec->ctdb, state->fd[0],
1079 TEVENT_FD_READ, helper_handler, state);
1080 if (fde == NULL) {
1081 goto fail;
1083 tevent_fd_set_auto_close(fde);
1085 while (!state->done) {
1086 tevent_loop_once(rec->ctdb->ev);
1089 close(state->fd[0]);
1090 state->fd[0] = -1;
1092 if (state->result != 0) {
1093 goto fail;
1096 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
1097 talloc_free(state);
1098 return 0;
1100 fail:
1101 if (state->fd[0] != -1) {
1102 close(state->fd[0]);
1104 if (state->fd[1] != -1) {
1105 close(state->fd[1]);
1107 if (state->pid != -1) {
1108 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
1110 talloc_free(state);
1111 return -1;
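/*
 * Illustrative note (added for exposition, not part of the upstream
 * source): helper_run() hands the helper the write end of the pipe as
 * its first argument, followed by the daemon socket name and the
 * optional extra argument.  It then waits for the helper to write an
 * int status back over that fd; helper_handler() stores it in
 * state->result, and a short read is mapped to EPIPE.
 */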
1115 static int ctdb_takeover(struct ctdb_recoverd *rec,
1116 uint32_t *force_rebalance_nodes)
1118 static char prog[PATH_MAX+1] = "";
1119 char *arg;
1120 int i;
1122 if (!ctdb_set_helper("takeover_helper", prog, sizeof(prog),
1123 "CTDB_TAKEOVER_HELPER", CTDB_HELPER_BINDIR,
1124 "ctdb_takeover_helper")) {
1125 ctdb_die(rec->ctdb, "Unable to set takeover helper\n");
1128 arg = NULL;
1129 for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1130 uint32_t pnn = force_rebalance_nodes[i];
1131 if (arg == NULL) {
1132 arg = talloc_asprintf(rec, "%u", pnn);
1133 } else {
1134 arg = talloc_asprintf_append(arg, ",%u", pnn);
1136 if (arg == NULL) {
1137 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1138 return -1;
1142 return helper_run(rec, rec, prog, arg, "takeover");
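/*
 * Illustrative note (added for exposition, not part of the upstream
 * source): the argument built above is a comma-separated list of PNNs
 * to force-rebalance (e.g. "1,3" for two queued nodes), or NULL when no
 * rebalance targets are queued; it is passed straight through to the
 * takeover helper by helper_run().
 */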
1145 static bool do_takeover_run(struct ctdb_recoverd *rec,
1146 struct ctdb_node_map_old *nodemap)
1148 uint32_t *nodes = NULL;
1149 struct ctdb_disable_message dtr;
1150 TDB_DATA data;
1151 int i;
1152 uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
1153 int ret;
1154 bool ok;
1156 DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
1158 if (ctdb_op_is_in_progress(rec->takeover_run)) {
1159 DEBUG(DEBUG_ERR, (__location__
1160 " takeover run already in progress \n"));
1161 ok = false;
1162 goto done;
1165 if (!ctdb_op_begin(rec->takeover_run)) {
1166 ok = false;
1167 goto done;
1170 /* Disable IP checks (takeover runs, really) on other nodes
1171 * while doing this takeover run. This will stop those other
 1172 * nodes from triggering takeover runs when they think they should
1173 * be hosting an IP but it isn't yet on an interface. Don't
1174 * wait for replies since a failure here might cause some
1175 * noise in the logs but will not actually cause a problem.
1177 ZERO_STRUCT(dtr);
1178 dtr.srvid = 0; /* No reply */
1179 dtr.pnn = -1;
1181 data.dptr = (uint8_t*)&dtr;
1182 data.dsize = sizeof(dtr);
1184 nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
1186 /* Disable for 60 seconds. This can be a tunable later if
1187 * necessary.
1189 dtr.timeout = 60;
1190 for (i = 0; i < talloc_array_length(nodes); i++) {
1191 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1192 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1193 data) != 0) {
1194 DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
1198 ret = ctdb_takeover(rec, rec->force_rebalance_nodes);
1200 /* Reenable takeover runs and IP checks on other nodes */
1201 dtr.timeout = 0;
1202 for (i = 0; i < talloc_array_length(nodes); i++) {
1203 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1204 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1205 data) != 0) {
1206 DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n"));
1210 if (ret != 0) {
1211 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
1212 ok = false;
1213 goto done;
1216 ok = true;
1217 /* Takeover run was successful so clear force rebalance targets */
1218 if (rebalance_nodes == rec->force_rebalance_nodes) {
1219 TALLOC_FREE(rec->force_rebalance_nodes);
1220 } else {
1221 DEBUG(DEBUG_WARNING,
1222 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1224 done:
1225 rec->need_takeover_run = !ok;
1226 talloc_free(nodes);
1227 ctdb_op_end(rec->takeover_run);
1229 DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
1230 return ok;
1233 static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
1235 static char prog[PATH_MAX+1] = "";
1236 const char *arg;
1238 if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog),
1239 "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
1240 "ctdb_recovery_helper")) {
1241 ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
1244 arg = talloc_asprintf(mem_ctx, "%u", new_generation());
1245 if (arg == NULL) {
1246 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1247 return -1;
1250 setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1);
1252 return helper_run(rec, mem_ctx, prog, arg, "recovery");
1256 we are the recmaster, and recovery is needed - start a recovery run
1258 static int do_recovery(struct ctdb_recoverd *rec,
1259 TALLOC_CTX *mem_ctx, uint32_t pnn,
1260 struct ctdb_node_map_old *nodemap, struct ctdb_vnn_map *vnnmap)
1262 struct ctdb_context *ctdb = rec->ctdb;
1263 int i, ret;
1264 struct ctdb_dbid_map_old *dbmap;
1265 bool self_ban;
1267 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1269 /* Check if the current node is still the recmaster. It's possible that
1270 * re-election has changed the recmaster.
1272 if (pnn != rec->recmaster) {
1273 DEBUG(DEBUG_NOTICE,
1274 ("Recovery master changed to %u, aborting recovery\n",
1275 rec->recmaster));
1276 return -1;
1279 /* if recovery fails, force it again */
1280 rec->need_recovery = true;
1282 if (!ctdb_op_begin(rec->recovery)) {
1283 return -1;
1286 if (rec->election_timeout) {
1287 /* an election is in progress */
1288 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
1289 goto fail;
1292 ban_misbehaving_nodes(rec, &self_ban);
1293 if (self_ban) {
1294 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
1295 goto fail;
1298 if (ctdb->recovery_lock != NULL) {
1299 if (ctdb_recovery_have_lock(rec)) {
1300 DEBUG(DEBUG_NOTICE, ("Already holding recovery lock\n"));
1301 } else {
1302 DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock (%s)\n",
1303 ctdb->recovery_lock));
1304 if (!ctdb_recovery_lock(rec)) {
1305 if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
1306 /* If ctdb is trying first recovery, it's
1307 * possible that current node does not know
1308 * yet who the recmaster is.
1310 DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
1311 " - retrying recovery\n"));
1312 goto fail;
1315 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
1316 "and ban ourself for %u seconds\n",
1317 ctdb->tunable.recovery_ban_period));
1318 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
1319 goto fail;
1321 DEBUG(DEBUG_NOTICE,
1322 ("Recovery lock taken successfully by recovery daemon\n"));
1326 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1328 /* get a list of all databases */
1329 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1330 if (ret != 0) {
1331 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1332 goto fail;
1335 /* we do the db creation before we set the recovery mode, so the freeze happens
1336 on all databases we will be dealing with. */
1338 /* verify that we have all the databases any other node has */
1339 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1340 if (ret != 0) {
1341 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1342 goto fail;
1345 /* verify that all other nodes have all our databases */
1346 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1347 if (ret != 0) {
1348 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1349 goto fail;
1351 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1354 /* Retrieve capabilities from all connected nodes */
1355 ret = update_capabilities(rec, nodemap);
1356 if (ret!=0) {
1357 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1358 return -1;
1362 update all nodes to have the same flags that we have
1364 for (i=0;i<nodemap->num;i++) {
1365 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1366 continue;
1369 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1370 if (ret != 0) {
1371 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1372 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
1373 } else {
1374 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1375 return -1;
1380 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1382 ret = db_recovery_parallel(rec, mem_ctx);
1383 if (ret != 0) {
1384 goto fail;
1387 do_takeover_run(rec, nodemap);
1389 /* send a message to all clients telling them that the cluster
1390 has been reconfigured */
1391 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
1392 CTDB_SRVID_RECONFIGURE, tdb_null);
1393 if (ret != 0) {
1394 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
1395 goto fail;
1398 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1400 rec->need_recovery = false;
1401 ctdb_op_end(rec->recovery);
1403 /* we managed to complete a full recovery, make sure to forgive
1404 any past sins by the nodes that could now participate in the
1405 recovery.
1407 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1408 for (i=0;i<nodemap->num;i++) {
1409 struct ctdb_banning_state *ban_state;
1411 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1412 continue;
1415 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1416 if (ban_state == NULL) {
1417 continue;
1420 ban_state->count = 0;
1423 /* We just finished a recovery successfully.
1424 We now wait for rerecovery_timeout before we allow
1425 another recovery to take place.
1427 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
1428 ctdb_op_disable(rec->recovery, ctdb->ev,
1429 ctdb->tunable.rerecovery_timeout);
1430 return 0;
1432 fail:
1433 ctdb_op_end(rec->recovery);
1434 return -1;
1439 elections are won by first checking the number of connected nodes, then
1440 the priority time, then the pnn
1442 struct election_message {
1443 uint32_t num_connected;
1444 struct timeval priority_time;
1445 uint32_t pnn;
1446 uint32_t node_flags;
 1450 form this node's election data
1452 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1454 int ret, i;
1455 struct ctdb_node_map_old *nodemap;
1456 struct ctdb_context *ctdb = rec->ctdb;
1458 ZERO_STRUCTP(em);
1460 em->pnn = rec->ctdb->pnn;
1461 em->priority_time = rec->priority_time;
1463 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1464 if (ret != 0) {
1465 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
1466 return;
1469 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1470 em->node_flags = rec->node_flags;
1472 for (i=0;i<nodemap->num;i++) {
1473 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1474 em->num_connected++;
 1478 /* we shouldn't try to win this election if we can't be a recmaster */
1479 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1480 em->num_connected = 0;
1481 em->priority_time = timeval_current();
1484 talloc_free(nodemap);
1488 see if the given election data wins
1490 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1492 struct election_message myem;
1493 int cmp = 0;
1495 ctdb_election_data(rec, &myem);
 1497 /* we can't win if we don't have the recmaster capability */
1498 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1499 return false;
 1502 /* we can't win if we are banned */
1503 if (rec->node_flags & NODE_FLAGS_BANNED) {
1504 return false;
 1507 /* we can't win if we are stopped */
1508 if (rec->node_flags & NODE_FLAGS_STOPPED) {
1509 return false;
1512 /* we will automatically win if the other node is banned */
1513 if (em->node_flags & NODE_FLAGS_BANNED) {
1514 return true;
 1517 /* we will automatically win if the other node is stopped */
1518 if (em->node_flags & NODE_FLAGS_STOPPED) {
1519 return true;
1522 /* then the longest running node */
1523 if (cmp == 0) {
1524 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
1527 if (cmp == 0) {
1528 cmp = (int)myem.pnn - (int)em->pnn;
1531 return cmp > 0;
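/*
 * Illustrative note (added for exposition, not part of the upstream
 * source): with the comparison above, a banned or stopped challenger
 * always loses, an otherwise equal pair is decided by priority_time
 * (the longer-running node wins) and then by PNN, and
 * ctdb_election_win() only returns true when this node strictly
 * out-compares the sender (cmp > 0).
 */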
1535 send out an election request
1537 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
1539 int ret;
1540 TDB_DATA election_data;
1541 struct election_message emsg;
1542 uint64_t srvid;
1543 struct ctdb_context *ctdb = rec->ctdb;
1545 srvid = CTDB_SRVID_ELECTION;
1547 ctdb_election_data(rec, &emsg);
1549 election_data.dsize = sizeof(struct election_message);
1550 election_data.dptr = (unsigned char *)&emsg;
 1553 /* first we assume we will win the election and set
 1554 the recovery master to be ourselves on the current node
1556 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1557 CTDB_CURRENT_NODE, pnn);
1558 if (ret != 0) {
1559 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster\n"));
1560 return -1;
1562 rec->recmaster = pnn;
1564 /* send an election message to all active nodes */
1565 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1566 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1570 we think we are winning the election - send a broadcast election request
1572 static void election_send_request(struct tevent_context *ev,
1573 struct tevent_timer *te,
1574 struct timeval t, void *p)
1576 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1577 int ret;
1579 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
1580 if (ret != 0) {
1581 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
1584 TALLOC_FREE(rec->send_election_te);
1588 handler for memory dumps
1590 static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1592 struct ctdb_recoverd *rec = talloc_get_type(
1593 private_data, struct ctdb_recoverd);
1594 struct ctdb_context *ctdb = rec->ctdb;
1595 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1596 TDB_DATA *dump;
1597 int ret;
1598 struct ctdb_srvid_message *rd;
1600 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1601 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1602 talloc_free(tmp_ctx);
1603 return;
1605 rd = (struct ctdb_srvid_message *)data.dptr;
1607 dump = talloc_zero(tmp_ctx, TDB_DATA);
1608 if (dump == NULL) {
1609 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1610 talloc_free(tmp_ctx);
1611 return;
1613 ret = ctdb_dump_memory(ctdb, dump);
1614 if (ret != 0) {
1615 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1616 talloc_free(tmp_ctx);
1617 return;
1620 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
1622 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1623 if (ret != 0) {
1624 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1625 talloc_free(tmp_ctx);
1626 return;
1629 talloc_free(tmp_ctx);
1633 handler for reload_nodes
1635 static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
1636 void *private_data)
1638 struct ctdb_recoverd *rec = talloc_get_type(
1639 private_data, struct ctdb_recoverd);
1641 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1643 ctdb_load_nodes_file(rec->ctdb);
1647 static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
1648 void *private_data)
1650 struct ctdb_recoverd *rec = talloc_get_type(
1651 private_data, struct ctdb_recoverd);
1652 struct ctdb_context *ctdb = rec->ctdb;
1653 uint32_t pnn;
1654 uint32_t *t;
1655 int len;
1657 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
1658 return;
1661 if (data.dsize != sizeof(uint32_t)) {
1662 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
1663 return;
1666 pnn = *(uint32_t *)&data.dptr[0];
1668 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
1670 /* Copy any existing list of nodes. There's probably some
1671 * sort of realloc variant that will do this but we need to
1672 * make sure that freeing the old array also cancels the timer
1673 * event for the timeout... not sure if realloc will do that.
1675 len = (rec->force_rebalance_nodes != NULL) ?
1676 talloc_array_length(rec->force_rebalance_nodes) :
1679 /* This allows duplicates to be added but they don't cause
1680 * harm. A call to add a duplicate PNN arguably means that
1681 * the timeout should be reset, so this is the simplest
1682 * solution.
1684 t = talloc_zero_array(rec, uint32_t, len+1);
1685 CTDB_NO_MEMORY_VOID(ctdb, t);
1686 if (len > 0) {
1687 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
1689 t[len] = pnn;
1691 talloc_free(rec->force_rebalance_nodes);
1693 rec->force_rebalance_nodes = t;
1698 static void srvid_disable_and_reply(struct ctdb_context *ctdb,
1699 TDB_DATA data,
1700 struct ctdb_op_state *op_state)
1702 struct ctdb_disable_message *r;
1703 uint32_t timeout;
1704 TDB_DATA result;
1705 int32_t ret = 0;
1707 /* Validate input data */
1708 if (data.dsize != sizeof(struct ctdb_disable_message)) {
1709 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1710 "expecting %lu\n", (long unsigned)data.dsize,
 1711 (long unsigned)sizeof(struct ctdb_disable_message)));
1712 return;
1714 if (data.dptr == NULL) {
1715 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1716 return;
1719 r = (struct ctdb_disable_message *)data.dptr;
1720 timeout = r->timeout;
1722 ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
1723 if (ret != 0) {
1724 goto done;
1727 /* Returning our PNN tells the caller that we succeeded */
1728 ret = ctdb_get_pnn(ctdb);
1729 done:
1730 result.dsize = sizeof(int32_t);
1731 result.dptr = (uint8_t *)&ret;
1732 srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
1735 static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
1736 void *private_data)
1738 struct ctdb_recoverd *rec = talloc_get_type(
1739 private_data, struct ctdb_recoverd);
1741 srvid_disable_and_reply(rec->ctdb, data, rec->takeover_run);
1744 /* Backward compatibility for this SRVID */
1745 static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
1746 void *private_data)
1748 struct ctdb_recoverd *rec = talloc_get_type(
1749 private_data, struct ctdb_recoverd);
1750 uint32_t timeout;
1752 if (data.dsize != sizeof(uint32_t)) {
1753 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1754 "expecting %lu\n", (long unsigned)data.dsize,
1755 (long unsigned)sizeof(uint32_t)));
1756 return;
1758 if (data.dptr == NULL) {
1759 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1760 return;
1763 timeout = *((uint32_t *)data.dptr);
1765 ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout);
1768 static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
1769 void *private_data)
1771 struct ctdb_recoverd *rec = talloc_get_type(
1772 private_data, struct ctdb_recoverd);
1774 srvid_disable_and_reply(rec->ctdb, data, rec->recovery);
 1778 handler for ip reallocate: just add it to the list of requests and
 1779 handle it later in the monitor_cluster loop so we do not recurse
 1780 with other requests to takeover_run()
1782 static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
1783 void *private_data)
1785 struct ctdb_srvid_message *request;
1786 struct ctdb_recoverd *rec = talloc_get_type(
1787 private_data, struct ctdb_recoverd);
1789 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1790 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1791 return;
1794 request = (struct ctdb_srvid_message *)data.dptr;
1796 srvid_request_add(rec->ctdb, &rec->reallocate_requests, request);
1799 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
1800 struct ctdb_recoverd *rec)
1802 TDB_DATA result;
1803 int32_t ret;
1804 struct srvid_requests *current;
1806 /* Only process requests that are currently pending. More
1807 * might come in while the takeover run is in progress and
1808 * they will need to be processed later since they might
 1809 * be in response to flag changes.
1811 current = rec->reallocate_requests;
1812 rec->reallocate_requests = NULL;
1814 if (do_takeover_run(rec, rec->nodemap)) {
1815 ret = ctdb_get_pnn(ctdb);
1816 } else {
1817 ret = -1;
1820 result.dsize = sizeof(int32_t);
1821 result.dptr = (uint8_t *)&ret;
1823 srvid_requests_reply(ctdb, &current, result);
1827 * handler for assigning banning credits
1829 static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1831 struct ctdb_recoverd *rec = talloc_get_type(
1832 private_data, struct ctdb_recoverd);
1833 uint32_t ban_pnn;
1835 /* Ignore if we are not recmaster */
1836 if (rec->ctdb->pnn != rec->recmaster) {
1837 return;
1840 if (data.dsize != sizeof(uint32_t)) {
1841 DEBUG(DEBUG_ERR, (__location__ "invalid data size %zu\n",
1842 data.dsize));
1843 return;
1846 ban_pnn = *(uint32_t *)data.dptr;
1848 ctdb_set_culprit_count(rec, ban_pnn, rec->nodemap->num);
1852 handler for recovery master elections
1854 static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1856 struct ctdb_recoverd *rec = talloc_get_type(
1857 private_data, struct ctdb_recoverd);
1858 struct ctdb_context *ctdb = rec->ctdb;
1859 int ret;
1860 struct election_message *em = (struct election_message *)data.dptr;
1862 /* Ignore election packets from ourself */
1863 if (ctdb->pnn == em->pnn) {
1864 return;
1867 /* we got an election packet - update the timeout for the election */
1868 talloc_free(rec->election_timeout);
1869 rec->election_timeout = tevent_add_timer(
1870 ctdb->ev, ctdb,
1871 fast_start ?
1872 timeval_current_ofs(0, 500000) :
1873 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1874 ctdb_election_timeout, rec);
1876 /* someone called an election. check their election data
1877 and if we disagree and we would rather be the elected node,
1878 send a new election message to all other nodes
1880 if (ctdb_election_win(rec, em)) {
1881 if (!rec->send_election_te) {
1882 rec->send_election_te = tevent_add_timer(
1883 ctdb->ev, rec,
1884 timeval_current_ofs(0, 500000),
1885 election_send_request, rec);
1887 return;
1890 /* we didn't win */
1891 TALLOC_FREE(rec->send_election_te);
1893 /* Release the recovery lock file */
1894 if (ctdb_recovery_have_lock(rec)) {
1895 ctdb_recovery_unlock(rec);
1898 /* ok, let that guy become recmaster then */
1899 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1900 CTDB_CURRENT_NODE, em->pnn);
1901 if (ret != 0) {
1902 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster"));
1903 return;
1905 rec->recmaster = em->pnn;
1907 return;
1912 force the start of the election process
1914 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
1915 struct ctdb_node_map_old *nodemap)
1917 int ret;
1918 struct ctdb_context *ctdb = rec->ctdb;
1920 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
1922 /* set all nodes to recovery mode to stop all internode traffic */
1923 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1924 if (ret != 0) {
1925 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1926 return;
1929 talloc_free(rec->election_timeout);
1930 rec->election_timeout = tevent_add_timer(
1931 ctdb->ev, ctdb,
1932 fast_start ?
1933 timeval_current_ofs(0, 500000) :
1934 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1935 ctdb_election_timeout, rec);
1937 ret = send_election_request(rec, pnn);
1938 if (ret!=0) {
1939 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
1940 return;
1943 /* wait for a few seconds to collect all responses */
1944 ctdb_wait_election(rec);
1950 handler for when a node changes its flags
1952 static void monitor_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1954 struct ctdb_recoverd *rec = talloc_get_type(
1955 private_data, struct ctdb_recoverd);
1956 struct ctdb_context *ctdb = rec->ctdb;
1957 int ret;
1958 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1959 struct ctdb_node_map_old *nodemap=NULL;
1960 TALLOC_CTX *tmp_ctx;
1961 int i;
1963 if (data.dsize != sizeof(*c)) {
1964 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
1965 return;
1968 tmp_ctx = talloc_new(ctdb);
1969 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
1971 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1972 if (ret != 0) {
1973 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
1974 talloc_free(tmp_ctx);
1975 return;
1979 for (i=0;i<nodemap->num;i++) {
1980 if (nodemap->nodes[i].pnn == c->pnn) break;
1983 if (i == nodemap->num) {
 1984 DEBUG(DEBUG_CRIT,(__location__ " Flag change for non-existent node %u\n", c->pnn));
1985 talloc_free(tmp_ctx);
1986 return;
1989 if (c->old_flags != c->new_flags) {
1990 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
1993 nodemap->nodes[i].flags = c->new_flags;
1995 talloc_free(tmp_ctx);
 1999 handler for when we need to push out flag changes to all other nodes
2001 static void push_flags_handler(uint64_t srvid, TDB_DATA data,
2002 void *private_data)
2004 struct ctdb_recoverd *rec = talloc_get_type(
2005 private_data, struct ctdb_recoverd);
2006 struct ctdb_context *ctdb = rec->ctdb;
2007 int ret;
2008 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2009 struct ctdb_node_map_old *nodemap=NULL;
2010 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2011 uint32_t *nodes;
2013 /* read the node flags from the recmaster */
2014 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2015 tmp_ctx, &nodemap);
2016 if (ret != 0) {
2017 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2018 talloc_free(tmp_ctx);
2019 return;
2021 if (c->pnn >= nodemap->num) {
2022 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2023 talloc_free(tmp_ctx);
2024 return;
2027 /* send the flags update to all connected nodes */
2028 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2030 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2031 nodes, 0, CONTROL_TIMEOUT(),
2032 false, data,
2033 NULL, NULL,
2034 NULL) != 0) {
2035 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2037 talloc_free(tmp_ctx);
2038 return;
2041 talloc_free(tmp_ctx);
2045 struct verify_recmode_normal_data {
2046 uint32_t count;
2047 enum monitor_result status;
2050 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2052 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2055 /* one more node has responded with recmode data*/
2056 rmdata->count--;
2058 /* if we failed to get the recmode, then return an error and let
2059 the main loop try again.
2061 if (state->state != CTDB_CONTROL_DONE) {
2062 if (rmdata->status == MONITOR_OK) {
2063 rmdata->status = MONITOR_FAILED;
2065 return;
2068 /* if we got a response, then the recmode will be stored in the
2069 status field
2071 if (state->status != CTDB_RECOVERY_NORMAL) {
2072 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2073 rmdata->status = MONITOR_RECOVERY_NEEDED;
2076 return;
2080 /* verify that all nodes are in normal recovery mode */
2081 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
2083 struct verify_recmode_normal_data *rmdata;
2084 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2085 struct ctdb_client_control_state *state;
2086 enum monitor_result status;
2087 int j;
2089 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2090 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2091 rmdata->count = 0;
2092 rmdata->status = MONITOR_OK;
2094 /* loop over all active nodes and send an async getrecmode call to
2095 them*/
2096 for (j=0; j<nodemap->num; j++) {
2097 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2098 continue;
2100 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2101 CONTROL_TIMEOUT(),
2102 nodemap->nodes[j].pnn);
2103 if (state == NULL) {
2104 /* we failed to send the control, treat this as
2105 an error and try again next iteration
2107 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2108 talloc_free(mem_ctx);
2109 return MONITOR_FAILED;
2112 /* set up the callback functions */
2113 state->async.fn = verify_recmode_normal_callback;
2114 state->async.private_data = rmdata;
2116 /* one more control to wait for to complete */
2117 rmdata->count++;
2121 /* now wait for up to the maximum number of seconds allowed
 2122 or until all nodes we expect a response from have replied
2124 while (rmdata->count > 0) {
2125 tevent_loop_once(ctdb->ev);
2128 status = rmdata->status;
2129 talloc_free(mem_ctx);
2130 return status;
2134 struct verify_recmaster_data {
2135 struct ctdb_recoverd *rec;
2136 uint32_t count;
2137 uint32_t pnn;
2138 enum monitor_result status;
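/* Completion callback for one node's GET_RECMASTER control.  A node
 * reporting a different recmaster than expected is recorded as a
 * culprit and the overall status becomes MONITOR_ELECTION_NEEDED.
 */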
2141 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2143 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2146 /* one more node has responded with recmaster data */
2147 rmdata->count--;
2149 /* if we failed to get the recmaster, then return an error and let
2150 the main loop try again.
2152 if (state->state != CTDB_CONTROL_DONE) {
2153 if (rmdata->status == MONITOR_OK) {
2154 rmdata->status = MONITOR_FAILED;
2156 return;
2159 /* if we got a response, then the recmaster will be stored in the
2160 status field
2162 if (state->status != rmdata->pnn) {
2163 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
2164 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2165 rmdata->status = MONITOR_ELECTION_NEEDED;
2168 return;
2172 /* verify that all nodes agree that we are the recmaster */
2173 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, uint32_t pnn)
2175 struct ctdb_context *ctdb = rec->ctdb;
2176 struct verify_recmaster_data *rmdata;
2177 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2178 struct ctdb_client_control_state *state;
2179 enum monitor_result status;
2180 int j;
2182 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2183 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2184 rmdata->rec = rec;
2185 rmdata->count = 0;
2186 rmdata->pnn = pnn;
2187 rmdata->status = MONITOR_OK;
2189 /* loop over all active nodes and send an async getrecmaster call to
2190 them */
2191 for (j=0; j<nodemap->num; j++) {
2192 if (nodemap->nodes[j].pnn == rec->recmaster) {
2193 continue;
2195 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2196 continue;
2198 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2199 CONTROL_TIMEOUT(),
2200 nodemap->nodes[j].pnn);
2201 if (state == NULL) {
2202 /* we failed to send the control, treat this as
2203 an error and try again next iteration
2205 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2206 talloc_free(mem_ctx);
2207 return MONITOR_FAILED;
2210 /* set up the callback functions */
2211 state->async.fn = verify_recmaster_callback;
2212 state->async.private_data = rmdata;
2214 /* one more control to wait for to complete */
2215 rmdata->count++;
2219 /* now wait for up to the maximum number of seconds allowed
2220 or until all nodes we expect a response from have replied
2222 while (rmdata->count > 0) {
2223 tevent_loop_once(ctdb->ev);
2226 status = rmdata->status;
2227 talloc_free(mem_ctx);
2228 return status;
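/* Compare the interface list reported by the local node with the copy
 * cached in rec->ifaces (which is then refreshed).  Returns true if
 * the interface count, any interface name or any link state has
 * changed; a fetch failure is also treated as a change.
 */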
2231 static bool interfaces_have_changed(struct ctdb_context *ctdb,
2232 struct ctdb_recoverd *rec)
2234 struct ctdb_iface_list_old *ifaces = NULL;
2235 TALLOC_CTX *mem_ctx;
2236 bool ret = false;
2238 mem_ctx = talloc_new(NULL);
2240 /* Read the interfaces from the local node */
2241 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
2242 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
2243 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
2244 /* We could return an error. However, this will be
2245 * rare so we'll decide that the interfaces have
2246 * actually changed, just in case.
2248 talloc_free(mem_ctx);
2249 return true;
2252 if (!rec->ifaces) {
2253 /* We haven't been here before so things have changed */
2254 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
2255 ret = true;
2256 } else if (rec->ifaces->num != ifaces->num) {
2257 /* Number of interfaces has changed */
2258 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
2259 rec->ifaces->num, ifaces->num));
2260 ret = true;
2261 } else {
2262 /* See if interface names or link states have changed */
2263 int i;
2264 for (i = 0; i < rec->ifaces->num; i++) {
2265 struct ctdb_iface * iface = &rec->ifaces->ifaces[i];
2266 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
2267 DEBUG(DEBUG_NOTICE,
2268 ("Interface in slot %d changed: %s => %s\n",
2269 i, iface->name, ifaces->ifaces[i].name));
2270 ret = true;
2271 break;
2273 if (iface->link_state != ifaces->ifaces[i].link_state) {
2274 DEBUG(DEBUG_NOTICE,
2275 ("Interface %s changed state: %d => %d\n",
2276 iface->name, iface->link_state,
2277 ifaces->ifaces[i].link_state));
2278 ret = true;
2279 break;
2284 talloc_free(rec->ifaces);
2285 rec->ifaces = talloc_steal(rec, ifaces);
2287 talloc_free(mem_ctx);
2288 return ret;
2291 /* Check that the local allocation of public IP addresses is correct
2292 * and do some house-keeping */
2293 static int verify_local_ip_allocation(struct ctdb_context *ctdb,
2294 struct ctdb_recoverd *rec,
2295 uint32_t pnn,
2296 struct ctdb_node_map_old *nodemap)
2298 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2299 int ret, j;
2300 bool need_takeover_run = false;
2301 struct ctdb_public_ip_list_old *ips = NULL;
2303 /* If we are not the recmaster then do some housekeeping */
2304 if (rec->recmaster != pnn) {
2305 /* Ignore any IP reallocate requests - only recmaster
2306 * processes them
2308 TALLOC_FREE(rec->reallocate_requests);
2309 /* Clear any nodes that should be force rebalanced in
2310 * the next takeover run. If the recovery master role
2311 * has moved then we don't want to process these some
2312 * time in the future.
2314 TALLOC_FREE(rec->force_rebalance_nodes);
2317 /* Return early if disabled... */
2318 if (ctdb->tunable.disable_ip_failover != 0 ||
2319 ctdb_op_is_disabled(rec->takeover_run)) {
2320 return 0;
2323 if (interfaces_have_changed(ctdb, rec)) {
2324 need_takeover_run = true;
2327 /* If there are unhosted IPs but this node can host them then
2328 * trigger an IP reallocation */
2330 /* Read *available* IPs from local node */
2331 ret = ctdb_ctrl_get_public_ips_flags(
2332 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx,
2333 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
2334 if (ret != 0) {
2335 DEBUG(DEBUG_ERR, ("Unable to retrieve available public IPs\n"));
2336 talloc_free(mem_ctx);
2337 return -1;
2340 for (j=0; j<ips->num; j++) {
2341 if (ips->ips[j].pnn == -1 &&
2342 nodemap->nodes[pnn].flags == 0) {
2343 DEBUG(DEBUG_WARNING,
2344 ("Unassigned IP %s can be served by this node\n",
2345 ctdb_addr_to_str(&ips->ips[j].addr)));
2346 need_takeover_run = true;
2350 talloc_free(ips);
2352 if (!ctdb->do_checkpublicip) {
2353 goto done;
2356 /* Validate the IP addresses that this node has on network
2357 * interfaces. If there is an inconsistency between reality
2358 * and the state expected by CTDB then try to fix it by
2359 * triggering an IP reallocation or releasing extraneous IP
2360 * addresses. */
2362 /* Read *known* IPs from local node */
2363 ret = ctdb_ctrl_get_public_ips_flags(
2364 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
2365 if (ret != 0) {
2366 DEBUG(DEBUG_ERR, ("Unable to retrieve known public IPs\n"));
2367 talloc_free(mem_ctx);
2368 return -1;
2371 for (j=0; j<ips->num; j++) {
2372 if (ips->ips[j].pnn == pnn) {
2373 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2374 DEBUG(DEBUG_ERR,
2375 ("Assigned IP %s not on an interface\n",
2376 ctdb_addr_to_str(&ips->ips[j].addr)));
2377 need_takeover_run = true;
2379 } else {
2380 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2381 DEBUG(DEBUG_ERR,
2382 ("IP %s incorrectly on an interface\n",
2383 ctdb_addr_to_str(&ips->ips[j].addr)));
2384 need_takeover_run = true;
2389 done:
2390 if (need_takeover_run) {
2391 struct ctdb_srvid_message rd;
2392 TDB_DATA data;
2394 DEBUG(DEBUG_NOTICE,("Trigger takeoverrun\n"));
2396 ZERO_STRUCT(rd);
2397 rd.pnn = ctdb->pnn;
2398 rd.srvid = 0;
2399 data.dptr = (uint8_t *)&rd;
2400 data.dsize = sizeof(rd);
2402 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2403 if (ret != 0) {
2404 DEBUG(DEBUG_ERR,
2405 ("Failed to send takeover run request\n"));
2408 talloc_free(mem_ctx);
2409 return 0;
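/* Callback for the async GET_NODEMAP controls sent by
 * get_remote_nodemaps(): store each remote node's reply in the slot of
 * the caller's array that matches the replying node's PNN.
 */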
2413 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2415 struct ctdb_node_map_old **remote_nodemaps = callback_data;
2417 if (node_pnn >= ctdb->num_nodes) {
2418 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2419 return;
2422 remote_nodemaps[node_pnn] = (struct ctdb_node_map_old *)talloc_steal(remote_nodemaps, outdata.dptr);
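/* Fetch the nodemap from every active node in parallel.  The caller
 * supplies a remote_nodemaps array with one slot per node; slots for
 * nodes that did not reply are left untouched (NULL, as initialised by
 * the caller).
 */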
2426 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2427 struct ctdb_node_map_old *nodemap,
2428 struct ctdb_node_map_old **remote_nodemaps)
2430 uint32_t *nodes;
2432 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2433 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2434 nodes, 0,
2435 CONTROL_TIMEOUT(), false, tdb_null,
2436 async_getnodemap_callback,
2437 NULL,
2438 remote_nodemaps) != 0) {
2439 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
2441 return -1;
2444 return 0;
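/* Check that the current recovery master is still usable from this
 * node's point of view.  Returns false (normally after forcing an
 * election) if the recmaster is unknown, lacks CTDB_CAP_RECMASTER
 * while we have it, has been deleted, is disconnected or is inactive,
 * or if its nodemap cannot be fetched.
 */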
2447 static bool validate_recovery_master(struct ctdb_recoverd *rec,
2448 TALLOC_CTX *mem_ctx)
2450 struct ctdb_context *ctdb = rec->ctdb;
2451 uint32_t pnn = ctdb_get_pnn(ctdb);
2452 struct ctdb_node_map_old *nodemap = rec->nodemap;
2453 struct ctdb_node_map_old *recmaster_nodemap = NULL;
2454 int ret;
2456 /* When recovery daemon is started, recmaster is set to
2457 * "unknown" so it knows to start an election.
2459 if (rec->recmaster == CTDB_UNKNOWN_PNN) {
2460 DEBUG(DEBUG_NOTICE,
2461 ("Initial recovery master set - forcing election\n"));
2462 force_election(rec, pnn, nodemap);
2463 return false;
2467 * If the current recmaster does not have CTDB_CAP_RECMASTER,
2468 * but we have, then force an election and try to become the new
2469 * recmaster.
2471 if (!ctdb_node_has_capabilities(rec->caps,
2472 rec->recmaster,
2473 CTDB_CAP_RECMASTER) &&
2474 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
2475 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
2476 DEBUG(DEBUG_ERR,
2477 (" Current recmaster node %u does not have CAP_RECMASTER,"
2478 " but we (node %u) have - force an election\n",
2479 rec->recmaster, pnn));
2480 force_election(rec, pnn, nodemap);
2481 return false;
2484 /* Verify that the master node has not been deleted. This
2485 * should not happen because a node should always be shutdown
2486 * before being deleted, causing a new master to be elected
2487 * before now. However, if something strange has happened
2488 * then checking here will ensure we don't index beyond the
2489 * end of the nodemap array. */
2490 if (rec->recmaster >= nodemap->num) {
2491 DEBUG(DEBUG_ERR,
2492 ("Recmaster node %u has been deleted. Force election\n",
2493 rec->recmaster));
2494 force_election(rec, pnn, nodemap);
2495 return false;
2498 /* if recovery master is disconnected/deleted we must elect a new recmaster */
2499 if (nodemap->nodes[rec->recmaster].flags &
2500 (NODE_FLAGS_DISCONNECTED|NODE_FLAGS_DELETED)) {
2501 DEBUG(DEBUG_NOTICE,
2502 ("Recmaster node %u is disconnected/deleted. Force election\n",
2503 rec->recmaster));
2504 force_election(rec, pnn, nodemap);
2505 return false;
2508 /* get nodemap from the recovery master to check if it is inactive */
2509 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2510 mem_ctx, &recmaster_nodemap);
2511 if (ret != 0) {
2512 DEBUG(DEBUG_ERR,
2513 (__location__
2514 " Unable to get nodemap from recovery master %u\n",
2515 rec->recmaster));
2516 /* No election, just error */
2517 return false;
2521 if ((recmaster_nodemap->nodes[rec->recmaster].flags & NODE_FLAGS_INACTIVE) &&
2522 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
2523 DEBUG(DEBUG_NOTICE,
2524 ("Recmaster node %u is inactive. Force election\n",
2525 rec->recmaster));
2527 * update our nodemap to carry the recmaster's notion of
2528 * its own flags, so that we don't keep freezing the
2529 * inactive recmaster node...
2531 nodemap->nodes[rec->recmaster].flags =
2532 recmaster_nodemap->nodes[rec->recmaster].flags;
2533 force_election(rec, pnn, nodemap);
2534 return false;
2537 return true;
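/* One pass of the recovery daemon's monitoring work: check that the
 * main daemon is alive, refresh tunables, runstate, nodemap and flags,
 * handle the stopped/banned case, validate the recovery master and,
 * if this node is the recmaster, compare nodemaps, node flags and
 * vnnmaps across the cluster, triggering a recovery or an IP takeover
 * run when inconsistencies are found.
 */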
2540 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
2541 TALLOC_CTX *mem_ctx)
2543 uint32_t pnn;
2544 struct ctdb_node_map_old *nodemap=NULL;
2545 struct ctdb_node_map_old **remote_nodemaps=NULL;
2546 struct ctdb_vnn_map *vnnmap=NULL;
2547 struct ctdb_vnn_map *remote_vnnmap=NULL;
2548 uint32_t num_lmasters;
2549 int32_t debug_level;
2550 int i, j, ret;
2551 bool self_ban;
2554 /* verify that the main daemon is still running */
2555 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
2556 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2557 exit(-1);
2560 /* ping the local daemon to tell it we are alive */
2561 ctdb_ctrl_recd_ping(ctdb);
2563 if (rec->election_timeout) {
2564 /* an election is in progress */
2565 return;
2568 /* read the debug level from the parent and update locally */
2569 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2570 if (ret != 0) {
2571 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2572 return;
2574 DEBUGLEVEL = debug_level;
2576 /* get relevant tunables */
2577 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2578 if (ret != 0) {
2579 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2580 return;
2583 /* get runstate */
2584 ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
2585 CTDB_CURRENT_NODE, &ctdb->runstate);
2586 if (ret != 0) {
2587 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
2588 return;
2591 pnn = ctdb_get_pnn(ctdb);
2593 /* get nodemap */
2594 TALLOC_FREE(rec->nodemap);
2595 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
2596 if (ret != 0) {
2597 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
2598 return;
2600 nodemap = rec->nodemap;
2602 /* remember our own node flags */
2603 rec->node_flags = nodemap->nodes[pnn].flags;
2605 ban_misbehaving_nodes(rec, &self_ban);
2606 if (self_ban) {
2607 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
2608 return;
2611 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2612 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2613 if (ret != 0) {
2614 D_ERR("Failed to read recmode from local node\n");
2615 return;
2618 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
2619 also frozen and that the recmode is set to active.
2621 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
2622 /* If this node has become inactive then we want to
2623 * reduce the chances of it taking over the recovery
2624 * master role when it becomes active again. This
2625 * helps to stabilise the recovery master role so that
2626 * it stays on the most stable node.
2628 rec->priority_time = timeval_current();
2630 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2631 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
2633 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2634 if (ret != 0) {
2635 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
2637 return;
2640 if (! rec->frozen_on_inactive) {
2641 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(),
2642 CTDB_CURRENT_NODE);
2643 if (ret != 0) {
2644 DEBUG(DEBUG_ERR,
2645 (__location__ " Failed to freeze node "
2646 "in STOPPED or BANNED state\n"));
2647 return;
2650 rec->frozen_on_inactive = true;
2653 /* If this node is stopped or banned then it is not the recovery
2654 * master, so don't do anything. This prevents a stopped or banned
2655 * node from starting an election and sending unnecessary controls.
2657 return;
2660 rec->frozen_on_inactive = false;
2662 /* Retrieve capabilities from all connected nodes */
2663 ret = update_capabilities(rec, nodemap);
2664 if (ret != 0) {
2665 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2666 return;
2669 if (! validate_recovery_master(rec, mem_ctx)) {
2670 return;
2673 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2674 /* Check if an IP takeover run is needed and trigger one if
2675 * necessary */
2676 verify_local_ip_allocation(ctdb, rec, pnn, nodemap);
2679 /* if we are not the recmaster then we do not need to check
2680 if recovery is needed
2682 if (pnn != rec->recmaster) {
2683 return;
2687 /* ensure our local copies of flags are right */
2688 ret = update_local_flags(rec, nodemap);
2689 if (ret != 0) {
2690 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
2691 return;
2694 if (ctdb->num_nodes != nodemap->num) {
2695 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
2696 ctdb_load_nodes_file(ctdb);
2697 return;
2700 /* verify that all active nodes agree that we are the recmaster */
2701 switch (verify_recmaster(rec, nodemap, pnn)) {
2702 case MONITOR_RECOVERY_NEEDED:
2703 /* can not happen */
2704 return;
2705 case MONITOR_ELECTION_NEEDED:
2706 force_election(rec, pnn, nodemap);
2707 return;
2708 case MONITOR_OK:
2709 break;
2710 case MONITOR_FAILED:
2711 return;
2715 /* get the vnnmap */
2716 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2717 if (ret != 0) {
2718 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2719 return;
2722 if (rec->need_recovery) {
2723 /* a previous recovery didn't finish */
2724 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2725 return;
2728 /* verify that all active nodes are in normal mode
2729 and not in recovery mode
2731 switch (verify_recmode(ctdb, nodemap)) {
2732 case MONITOR_RECOVERY_NEEDED:
2733 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2734 return;
2735 case MONITOR_FAILED:
2736 return;
2737 case MONITOR_ELECTION_NEEDED:
2738 /* can not happen */
2739 case MONITOR_OK:
2740 break;
2744 if (ctdb->recovery_lock != NULL) {
2745 /* We must already hold the recovery lock */
2746 if (!ctdb_recovery_have_lock(rec)) {
2747 DEBUG(DEBUG_ERR,("Failed recovery lock sanity check. Force a recovery\n"));
2748 ctdb_set_culprit(rec, ctdb->pnn);
2749 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2750 return;
2755 /* If recoveries are disabled then there is no use doing any
2756 * nodemap or flags checks. Recoveries might be disabled due
2757 * to "reloadnodes", so doing these checks might cause an
2758 * unnecessary recovery. */
2759 if (ctdb_op_is_disabled(rec->recovery)) {
2760 goto takeover_run_checks;
2763 /* get the nodemap for all active remote nodes
2765 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map_old *, nodemap->num);
2766 if (remote_nodemaps == NULL) {
2767 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
2768 return;
2770 for(i=0; i<nodemap->num; i++) {
2771 remote_nodemaps[i] = NULL;
2773 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
2774 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
2775 return;
2778 /* verify that all other nodes have the same nodemap as we have
2780 for (j=0; j<nodemap->num; j++) {
2781 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2782 continue;
2785 if (remote_nodemaps[j] == NULL) {
2786 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
2787 ctdb_set_culprit(rec, j);
2789 return;
2792 /* if the nodes disagree on how many nodes there are
2793 then this is a good reason to try recovery
2795 if (remote_nodemaps[j]->num != nodemap->num) {
2796 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
2797 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
2798 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2799 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2800 return;
2803 /* if the nodes disagree on which nodes exist and are
2804 active, then that is also a good reason to do recovery
2806 for (i=0;i<nodemap->num;i++) {
2807 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
2808 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
2809 nodemap->nodes[j].pnn, i,
2810 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
2811 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2812 do_recovery(rec, mem_ctx, pnn, nodemap,
2813 vnnmap);
2814 return;
2820 * Update node flags obtained from each active node. This ensures we have
2821 * up-to-date information for all the nodes.
2823 for (j=0; j<nodemap->num; j++) {
2824 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2825 continue;
2827 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
2830 for (j=0; j<nodemap->num; j++) {
2831 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2832 continue;
2835 /* verify the flags are consistent
2837 for (i=0; i<nodemap->num; i++) {
2838 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2839 continue;
2842 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
2843 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
2844 nodemap->nodes[j].pnn,
2845 nodemap->nodes[i].pnn,
2846 remote_nodemaps[j]->nodes[i].flags,
2847 nodemap->nodes[i].flags));
2848 if (i == j) {
2849 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
2850 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
2851 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2852 do_recovery(rec, mem_ctx, pnn, nodemap,
2853 vnnmap);
2854 return;
2855 } else {
2856 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
2857 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
2858 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2859 do_recovery(rec, mem_ctx, pnn, nodemap,
2860 vnnmap);
2861 return;
2868 /* count how many active lmaster-capable nodes there are */
2869 num_lmasters = 0;
2870 for (i=0; i<nodemap->num; i++) {
2871 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2872 if (ctdb_node_has_capabilities(rec->caps,
2873 ctdb->nodes[i]->pnn,
2874 CTDB_CAP_LMASTER)) {
2875 num_lmasters++;
2881 /* There must be the same number of lmasters in the vnn map as
2882 * there are active nodes with the lmaster capability... or
2883 * do a recovery.
2885 if (vnnmap->size != num_lmasters) {
2886 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
2887 vnnmap->size, num_lmasters));
2888 ctdb_set_culprit(rec, ctdb->pnn);
2889 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2890 return;
2893 /* verify that all active nodes in the nodemap also exist in
2894 the vnnmap.
2896 for (j=0; j<nodemap->num; j++) {
2897 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2898 continue;
2900 if (nodemap->nodes[j].pnn == pnn) {
2901 continue;
2904 for (i=0; i<vnnmap->size; i++) {
2905 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
2906 break;
2909 if (i == vnnmap->size) {
2910 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
2911 nodemap->nodes[j].pnn));
2912 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2913 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2914 return;
2919 /* verify that all other nodes have the same vnnmap
2920 and are from the same generation
2922 for (j=0; j<nodemap->num; j++) {
2923 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2924 continue;
2926 if (nodemap->nodes[j].pnn == pnn) {
2927 continue;
2930 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2931 mem_ctx, &remote_vnnmap);
2932 if (ret != 0) {
2933 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
2934 nodemap->nodes[j].pnn));
2935 return;
2938 /* verify the vnnmap generation is the same */
2939 if (vnnmap->generation != remote_vnnmap->generation) {
2940 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
2941 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
2942 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2943 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2944 return;
2947 /* verify the vnnmap size is the same */
2948 if (vnnmap->size != remote_vnnmap->size) {
2949 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
2950 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
2951 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2952 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2953 return;
2956 /* verify the vnnmap is the same */
2957 for (i=0;i<vnnmap->size;i++) {
2958 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
2959 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
2960 nodemap->nodes[j].pnn));
2961 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2962 do_recovery(rec, mem_ctx, pnn, nodemap,
2963 vnnmap);
2964 return;
2969 /* FIXME: Add remote public IP checking to ensure that nodes
2970 * have the IP addresses that are allocated to them. */
2972 takeover_run_checks:
2974 /* If there are IP takeover runs requested or the previous one
2975 * failed then perform one and notify the waiters */
2976 if (!ctdb_op_is_disabled(rec->takeover_run) &&
2977 (rec->reallocate_requests || rec->need_takeover_run)) {
2978 process_ipreallocate_requests(ctdb, rec);
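/* SIGTERM handler: release the recovery lock (if held) and exit. */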
2982 static void recd_sig_term_handler(struct tevent_context *ev,
2983 struct tevent_signal *se, int signum,
2984 int count, void *dont_care,
2985 void *private_data)
2987 struct ctdb_recoverd *rec = talloc_get_type_abort(
2988 private_data, struct ctdb_recoverd);
2990 DEBUG(DEBUG_ERR, ("Received SIGTERM, exiting\n"));
2991 ctdb_recovery_unlock(rec);
2992 exit(0);
2997 the main monitoring loop
2999 static void monitor_cluster(struct ctdb_context *ctdb)
3001 struct tevent_signal *se;
3002 struct ctdb_recoverd *rec;
3004 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
3006 rec = talloc_zero(ctdb, struct ctdb_recoverd);
3007 CTDB_NO_MEMORY_FATAL(ctdb, rec);
3009 rec->ctdb = ctdb;
3010 rec->recmaster = CTDB_UNKNOWN_PNN;
3011 rec->recovery_lock_handle = NULL;
3013 rec->takeover_run = ctdb_op_init(rec, "takeover runs");
3014 CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);
3016 rec->recovery = ctdb_op_init(rec, "recoveries");
3017 CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);
3019 rec->priority_time = timeval_current();
3020 rec->frozen_on_inactive = false;
3022 se = tevent_add_signal(ctdb->ev, ctdb, SIGTERM, 0,
3023 recd_sig_term_handler, rec);
3024 if (se == NULL) {
3025 DEBUG(DEBUG_ERR, ("Failed to install SIGTERM handler\n"));
3026 exit(1);
3029 /* register a message port for sending memory dumps */
3030 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
3032 /* when a node is assigned banning credits */
3033 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING,
3034 banning_handler, rec);
3036 /* register a message port for recovery elections */
3037 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec);
3039 /* when nodes are disabled/enabled */
3040 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
3042 /* when we are asked to push out a flag change */
3043 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
3045 /* register a message port for vacuum fetch */
3046 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
3048 /* register a message port for reloadnodes */
3049 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
3051 /* register a message port for performing a takeover run */
3052 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
3054 /* register a message port for disabling the ip check for a short while */
3055 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
3057 /* register a message port for forcing a rebalance of a node at the next
3058 reallocation */
3059 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
3061 /* Register a message port for disabling takeover runs */
3062 ctdb_client_set_message_handler(ctdb,
3063 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
3064 disable_takeover_runs_handler, rec);
3066 /* Register a message port for disabling recoveries */
3067 ctdb_client_set_message_handler(ctdb,
3068 CTDB_SRVID_DISABLE_RECOVERIES,
3069 disable_recoveries_handler, rec);
3071 /* register a message port for detaching database */
3072 ctdb_client_set_message_handler(ctdb,
3073 CTDB_SRVID_DETACH_DATABASE,
3074 detach_database_handler, rec);
3076 for (;;) {
3077 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3078 struct timeval start;
3079 double elapsed;
3081 if (!mem_ctx) {
3082 DEBUG(DEBUG_CRIT,(__location__
3083 " Failed to create temp context\n"));
3084 exit(-1);
3087 start = timeval_current();
3088 main_loop(ctdb, rec, mem_ctx);
3089 talloc_free(mem_ctx);
3091 /* we only check for recovery once every second */
3092 elapsed = timeval_elapsed(&start);
3093 if (elapsed < ctdb->tunable.recover_interval) {
3094 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
3095 - elapsed);
3101 event handler for when the main ctdbd dies
3103 static void ctdb_recoverd_parent(struct tevent_context *ev,
3104 struct tevent_fd *fde,
3105 uint16_t flags, void *private_data)
3107 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3108 _exit(1);
3112 called regularly to verify that the recovery daemon is still running
3114 static void ctdb_check_recd(struct tevent_context *ev,
3115 struct tevent_timer *te,
3116 struct timeval yt, void *p)
3118 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
3120 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
3121 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
3123 tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
3124 ctdb_restart_recd, ctdb);
3126 return;
3129 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3130 timeval_current_ofs(30, 0),
3131 ctdb_check_recd, ctdb);
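/* SIGCHLD handler: reap any exited children so the recovery daemon
 * does not leave zombies behind.
 */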
3134 static void recd_sig_child_handler(struct tevent_context *ev,
3135 struct tevent_signal *se, int signum,
3136 int count, void *dont_care,
3137 void *private_data)
3139 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3140 int status;
3141 pid_t pid = -1;
3143 while (pid != 0) {
3144 pid = waitpid(-1, &status, WNOHANG);
3145 if (pid == -1) {
3146 if (errno != ECHILD) {
3147 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3149 return;
3151 if (pid > 0) {
3152 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3158 start up the recovery daemon as a child of the main ctdb daemon
3160 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3162 int fd[2];
3163 struct tevent_signal *se;
3164 struct tevent_fd *fde;
3165 int ret;
3167 if (pipe(fd) != 0) {
3168 return -1;
3171 ctdb->recoverd_pid = ctdb_fork(ctdb);
3172 if (ctdb->recoverd_pid == -1) {
3173 return -1;
3176 if (ctdb->recoverd_pid != 0) {
3177 talloc_free(ctdb->recd_ctx);
3178 ctdb->recd_ctx = talloc_new(ctdb);
3179 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
3181 close(fd[0]);
3182 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3183 timeval_current_ofs(30, 0),
3184 ctdb_check_recd, ctdb);
3185 return 0;
3188 close(fd[1]);
3190 srandom(getpid() ^ time(NULL));
3192 ret = logging_init(ctdb, NULL, NULL, "ctdb-recoverd");
3193 if (ret != 0) {
3194 return -1;
3197 prctl_set_comment("ctdb_recoverd");
3198 if (switch_from_server_to_client(ctdb) != 0) {
3199 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3200 exit(1);
3203 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
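/* The parent keeps fd[1] open while this child watches fd[0]: when
 * the main daemon exits, fd[0] sees EOF, becomes readable and
 * ctdb_recoverd_parent() terminates this recovery daemon.
 */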
3205 fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
3206 ctdb_recoverd_parent, &fd[0]);
3207 tevent_fd_set_auto_close(fde);
3209 /* set up a handler to pick up sigchld */
3210 se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
3211 recd_sig_child_handler, ctdb);
3212 if (se == NULL) {
3213 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3214 exit(1);
3217 monitor_cluster(ctdb);
3219 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3220 return -1;
3224 shutdown the recovery daemon
3226 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3228 if (ctdb->recoverd_pid == 0) {
3229 return;
3232 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3233 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
3235 TALLOC_FREE(ctdb->recd_ctx);
3236 TALLOC_FREE(ctdb->recd_ping_count);
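/* Timer callback used by ctdb_check_recd(): stop any stale recovery
 * daemon and start a fresh one.
 */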
3239 static void ctdb_restart_recd(struct tevent_context *ev,
3240 struct tevent_timer *te,
3241 struct timeval t, void *private_data)
3243 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3245 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
3246 ctdb_stop_recoverd(ctdb);
3247 ctdb_start_recoverd(ctdb);