ctdb-recoverd: Process all the records for vacuum fetch in a loop
[Samba.git] / ctdb / server / ctdb_recoverd.c
blobf86f57e570d8a29e1592b7ef38b64f79475d3880
1 /*
2 ctdb recovery daemon
4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
20 #include "includes.h"
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
25 #include "popt.h"
26 #include "cmdline.h"
27 #include "../include/ctdb_client.h"
28 #include "../include/ctdb_private.h"
29 #include "lib/tdb_wrap/tdb_wrap.h"
30 #include "lib/util/dlinklist.h"
/* One queued SRVID request; nodes form a doubly-linked list. */
struct srvid_list {
	struct srvid_list *next, *prev;
	struct srvid_request *request;
};

/* Head of the queue of SRVID requests still to be answered. */
struct srvid_requests {
	struct srvid_list *requests;
};
43 static void srvid_request_reply(struct ctdb_context *ctdb,
44 struct srvid_request *request,
45 TDB_DATA result)
47 /* Someone that sent srvid==0 does not want a reply */
48 if (request->srvid == 0) {
49 talloc_free(request);
50 return;
53 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
54 result) == 0) {
55 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
56 (unsigned)request->pnn,
57 (unsigned long long)request->srvid));
58 } else {
59 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
60 (unsigned)request->pnn,
61 (unsigned long long)request->srvid));
64 talloc_free(request);
67 static void srvid_requests_reply(struct ctdb_context *ctdb,
68 struct srvid_requests **requests,
69 TDB_DATA result)
71 struct srvid_list *r;
73 for (r = (*requests)->requests; r != NULL; r = r->next) {
74 srvid_request_reply(ctdb, r->request, result);
77 /* Free the list structure... */
78 TALLOC_FREE(*requests);
81 static void srvid_request_add(struct ctdb_context *ctdb,
82 struct srvid_requests **requests,
83 struct srvid_request *request)
85 struct srvid_list *t;
86 int32_t ret;
87 TDB_DATA result;
89 if (*requests == NULL) {
90 *requests = talloc_zero(ctdb, struct srvid_requests);
91 if (*requests == NULL) {
92 goto nomem;
96 t = talloc_zero(*requests, struct srvid_list);
97 if (t == NULL) {
98 /* If *requests was just allocated above then free it */
99 if ((*requests)->requests == NULL) {
100 TALLOC_FREE(*requests);
102 goto nomem;
105 t->request = (struct srvid_request *)talloc_steal(t, request);
106 DLIST_ADD((*requests)->requests, t);
108 return;
110 nomem:
111 /* Failed to add the request to the list. Send a fail. */
112 DEBUG(DEBUG_ERR, (__location__
113 " Out of memory, failed to queue SRVID request\n"));
114 ret = -ENOMEM;
115 result.dsize = sizeof(ret);
116 result.dptr = (uint8_t *)&ret;
117 srvid_request_reply(ctdb, request, result);
120 struct ctdb_banning_state {
121 uint32_t count;
122 struct timeval last_reported_time;
126 private state of recovery daemon
128 struct ctdb_recoverd {
129 struct ctdb_context *ctdb;
130 uint32_t recmaster;
131 uint32_t num_active;
132 uint32_t num_lmasters;
133 uint32_t num_connected;
134 uint32_t last_culprit_node;
135 struct ctdb_node_map *nodemap;
136 struct timeval priority_time;
137 bool need_takeover_run;
138 bool need_recovery;
139 uint32_t node_flags;
140 struct timed_event *send_election_te;
141 struct timed_event *election_timeout;
142 struct vacuum_info *vacuum_info;
143 struct srvid_requests *reallocate_requests;
144 bool takeover_run_in_progress;
145 TALLOC_CTX *takeover_runs_disable_ctx;
146 struct ctdb_control_get_ifaces *ifaces;
147 uint32_t *force_rebalance_nodes;
/* Timeouts for cluster controls, driven by the recovery tunables. */
#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
#define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)

static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data);
156 ban a node for a period of time
158 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
160 int ret;
161 struct ctdb_context *ctdb = rec->ctdb;
162 struct ctdb_ban_time bantime;
164 if (!ctdb_validate_pnn(ctdb, pnn)) {
165 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
166 return;
169 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
171 bantime.pnn = pnn;
172 bantime.time = ban_time;
174 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
175 if (ret != 0) {
176 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
177 return;
182 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
186 remember the trouble maker
188 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
190 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
191 struct ctdb_banning_state *ban_state;
193 if (culprit > ctdb->num_nodes) {
194 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
195 return;
198 /* If we are banned or stopped, do not set other nodes as culprits */
199 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
200 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
201 return;
204 if (ctdb->nodes[culprit]->ban_state == NULL) {
205 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
206 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
210 ban_state = ctdb->nodes[culprit]->ban_state;
211 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
212 /* this was the first time in a long while this node
213 misbehaved so we will forgive any old transgressions.
215 ban_state->count = 0;
218 ban_state->count += count;
219 ban_state->last_reported_time = timeval_current();
220 rec->last_culprit_node = culprit;
224 remember the trouble maker
226 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
228 ctdb_set_culprit_count(rec, culprit, 1);
232 /* this callback is called for every node that failed to execute the
233 recovered event
235 static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
237 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
239 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));
241 ctdb_set_culprit(rec, node_pnn);
245 run the "recovered" eventscript on all nodes
247 static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, const char *caller)
249 TALLOC_CTX *tmp_ctx;
250 uint32_t *nodes;
251 struct ctdb_context *ctdb = rec->ctdb;
253 tmp_ctx = talloc_new(ctdb);
254 CTDB_NO_MEMORY(ctdb, tmp_ctx);
256 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
257 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
258 nodes, 0,
259 CONTROL_TIMEOUT(), false, tdb_null,
260 NULL, recovered_fail_callback,
261 rec) != 0) {
262 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
264 talloc_free(tmp_ctx);
265 return -1;
268 talloc_free(tmp_ctx);
269 return 0;
272 /* this callback is called for every node that failed to execute the
273 start recovery event
275 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
277 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
279 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
281 ctdb_set_culprit(rec, node_pnn);
285 run the "startrecovery" eventscript on all nodes
287 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
289 TALLOC_CTX *tmp_ctx;
290 uint32_t *nodes;
291 struct ctdb_context *ctdb = rec->ctdb;
293 tmp_ctx = talloc_new(ctdb);
294 CTDB_NO_MEMORY(ctdb, tmp_ctx);
296 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
297 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
298 nodes, 0,
299 CONTROL_TIMEOUT(), false, tdb_null,
300 NULL,
301 startrecovery_fail_callback,
302 rec) != 0) {
303 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
304 talloc_free(tmp_ctx);
305 return -1;
308 talloc_free(tmp_ctx);
309 return 0;
312 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
314 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
315 DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
316 return;
318 if (node_pnn < ctdb->num_nodes) {
319 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
322 if (node_pnn == ctdb->pnn) {
323 ctdb->capabilities = ctdb->nodes[node_pnn]->capabilities;
328 update the node capabilities for all connected nodes
330 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
332 uint32_t *nodes;
333 TALLOC_CTX *tmp_ctx;
335 tmp_ctx = talloc_new(ctdb);
336 CTDB_NO_MEMORY(ctdb, tmp_ctx);
338 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
339 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
340 nodes, 0,
341 CONTROL_TIMEOUT(),
342 false, tdb_null,
343 async_getcap_callback, NULL,
344 NULL) != 0) {
345 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
346 talloc_free(tmp_ctx);
347 return -1;
350 talloc_free(tmp_ctx);
351 return 0;
354 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
356 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
358 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
359 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
362 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
364 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
366 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
367 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
371 change recovery mode on all nodes
373 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
375 TDB_DATA data;
376 uint32_t *nodes;
377 TALLOC_CTX *tmp_ctx;
379 tmp_ctx = talloc_new(ctdb);
380 CTDB_NO_MEMORY(ctdb, tmp_ctx);
382 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
384 data.dsize = sizeof(uint32_t);
385 data.dptr = (unsigned char *)&rec_mode;
387 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
388 nodes, 0,
389 CONTROL_TIMEOUT(),
390 false, data,
391 NULL, NULL,
392 NULL) != 0) {
393 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
394 talloc_free(tmp_ctx);
395 return -1;
398 /* freeze all nodes */
399 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
400 int i;
402 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
403 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
404 nodes, i,
405 CONTROL_TIMEOUT(),
406 false, tdb_null,
407 NULL,
408 set_recmode_fail_callback,
409 rec) != 0) {
410 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
411 talloc_free(tmp_ctx);
412 return -1;
417 talloc_free(tmp_ctx);
418 return 0;
422 change recovery master on all node
424 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
426 TDB_DATA data;
427 TALLOC_CTX *tmp_ctx;
428 uint32_t *nodes;
430 tmp_ctx = talloc_new(ctdb);
431 CTDB_NO_MEMORY(ctdb, tmp_ctx);
433 data.dsize = sizeof(uint32_t);
434 data.dptr = (unsigned char *)&pnn;
436 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
437 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
438 nodes, 0,
439 CONTROL_TIMEOUT(), false, data,
440 NULL, NULL,
441 NULL) != 0) {
442 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
443 talloc_free(tmp_ctx);
444 return -1;
447 talloc_free(tmp_ctx);
448 return 0;
451 /* update all remote nodes to use the same db priority that we have
452 this can fail if the remove node has not yet been upgraded to
453 support this function, so we always return success and never fail
454 a recovery if this call fails.
456 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
457 struct ctdb_node_map *nodemap,
458 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
460 int db;
462 /* step through all local databases */
463 for (db=0; db<dbmap->num;db++) {
464 struct ctdb_db_priority db_prio;
465 int ret;
467 db_prio.db_id = dbmap->dbs[db].dbid;
468 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
469 if (ret != 0) {
470 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
471 continue;
474 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
476 ret = ctdb_ctrl_set_db_priority(ctdb, CONTROL_TIMEOUT(),
477 CTDB_CURRENT_NODE, &db_prio);
478 if (ret != 0) {
479 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n",
480 db_prio.db_id));
484 return 0;
488 ensure all other nodes have attached to any databases that we have
490 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
491 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
493 int i, j, db, ret;
494 struct ctdb_dbid_map *remote_dbmap;
496 /* verify that all other nodes have all our databases */
497 for (j=0; j<nodemap->num; j++) {
498 /* we dont need to ourself ourselves */
499 if (nodemap->nodes[j].pnn == pnn) {
500 continue;
502 /* dont check nodes that are unavailable */
503 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
504 continue;
507 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
508 mem_ctx, &remote_dbmap);
509 if (ret != 0) {
510 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
511 return -1;
514 /* step through all local databases */
515 for (db=0; db<dbmap->num;db++) {
516 const char *name;
519 for (i=0;i<remote_dbmap->num;i++) {
520 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
521 break;
524 /* the remote node already have this database */
525 if (i!=remote_dbmap->num) {
526 continue;
528 /* ok so we need to create this database */
529 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
530 dbmap->dbs[db].dbid, mem_ctx,
531 &name);
532 if (ret != 0) {
533 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
534 return -1;
536 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
537 nodemap->nodes[j].pnn,
538 mem_ctx, name,
539 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
540 if (ret != 0) {
541 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
542 return -1;
547 return 0;
552 ensure we are attached to any databases that anyone else is attached to
554 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
555 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
557 int i, j, db, ret;
558 struct ctdb_dbid_map *remote_dbmap;
560 /* verify that we have all database any other node has */
561 for (j=0; j<nodemap->num; j++) {
562 /* we dont need to ourself ourselves */
563 if (nodemap->nodes[j].pnn == pnn) {
564 continue;
566 /* dont check nodes that are unavailable */
567 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
568 continue;
571 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
572 mem_ctx, &remote_dbmap);
573 if (ret != 0) {
574 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
575 return -1;
578 /* step through all databases on the remote node */
579 for (db=0; db<remote_dbmap->num;db++) {
580 const char *name;
582 for (i=0;i<(*dbmap)->num;i++) {
583 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
584 break;
587 /* we already have this db locally */
588 if (i!=(*dbmap)->num) {
589 continue;
591 /* ok so we need to create this database and
592 rebuild dbmap
594 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
595 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
596 if (ret != 0) {
597 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
598 nodemap->nodes[j].pnn));
599 return -1;
601 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
602 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
603 if (ret != 0) {
604 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
605 return -1;
607 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
608 if (ret != 0) {
609 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
610 return -1;
615 return 0;
620 pull the remote database contents from one node into the recdb
622 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
623 struct tdb_wrap *recdb, uint32_t dbid)
625 int ret;
626 TDB_DATA outdata;
627 struct ctdb_marshall_buffer *reply;
628 struct ctdb_rec_data *rec;
629 int i;
630 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
632 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
633 CONTROL_TIMEOUT(), &outdata);
634 if (ret != 0) {
635 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
636 talloc_free(tmp_ctx);
637 return -1;
640 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
642 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
643 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
644 talloc_free(tmp_ctx);
645 return -1;
648 rec = (struct ctdb_rec_data *)&reply->data[0];
650 for (i=0;
651 i<reply->count;
652 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
653 TDB_DATA key, data;
654 struct ctdb_ltdb_header *hdr;
655 TDB_DATA existing;
657 key.dptr = &rec->data[0];
658 key.dsize = rec->keylen;
659 data.dptr = &rec->data[key.dsize];
660 data.dsize = rec->datalen;
662 hdr = (struct ctdb_ltdb_header *)data.dptr;
664 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
665 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
666 talloc_free(tmp_ctx);
667 return -1;
670 /* fetch the existing record, if any */
671 existing = tdb_fetch(recdb->tdb, key);
673 if (existing.dptr != NULL) {
674 struct ctdb_ltdb_header header;
675 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
676 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
677 (unsigned)existing.dsize, srcnode));
678 free(existing.dptr);
679 talloc_free(tmp_ctx);
680 return -1;
682 header = *(struct ctdb_ltdb_header *)existing.dptr;
683 free(existing.dptr);
684 if (!(header.rsn < hdr->rsn ||
685 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
686 continue;
690 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
691 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
692 talloc_free(tmp_ctx);
693 return -1;
697 talloc_free(tmp_ctx);
699 return 0;
703 struct pull_seqnum_cbdata {
704 int failed;
705 uint32_t pnn;
706 uint64_t seqnum;
709 static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
711 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
712 uint64_t seqnum;
714 if (cb_data->failed != 0) {
715 DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
716 return;
719 if (res != 0) {
720 DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
721 cb_data->failed = 1;
722 return;
725 if (outdata.dsize != sizeof(uint64_t)) {
726 DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
727 cb_data->failed = -1;
728 return;
731 seqnum = *((uint64_t *)outdata.dptr);
733 if (seqnum > cb_data->seqnum ||
734 (cb_data->pnn == -1 && seqnum == 0)) {
735 cb_data->seqnum = seqnum;
736 cb_data->pnn = node_pnn;
740 static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
742 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
744 DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
745 cb_data->failed = 1;
748 static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
749 struct ctdb_recoverd *rec,
750 struct ctdb_node_map *nodemap,
751 struct tdb_wrap *recdb, uint32_t dbid)
753 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
754 uint32_t *nodes;
755 TDB_DATA data;
756 uint32_t outdata[2];
757 struct pull_seqnum_cbdata *cb_data;
759 DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));
761 outdata[0] = dbid;
762 outdata[1] = 0;
764 data.dsize = sizeof(outdata);
765 data.dptr = (uint8_t *)&outdata[0];
767 cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
768 if (cb_data == NULL) {
769 DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
770 talloc_free(tmp_ctx);
771 return -1;
774 cb_data->failed = 0;
775 cb_data->pnn = -1;
776 cb_data->seqnum = 0;
778 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
779 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
780 nodes, 0,
781 CONTROL_TIMEOUT(), false, data,
782 pull_seqnum_cb,
783 pull_seqnum_fail_cb,
784 cb_data) != 0) {
785 DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));
787 talloc_free(tmp_ctx);
788 return -1;
791 if (cb_data->failed != 0) {
792 DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
793 talloc_free(tmp_ctx);
794 return -1;
797 if (cb_data->pnn == -1) {
798 DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
799 talloc_free(tmp_ctx);
800 return -1;
803 DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum));
805 if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
806 DEBUG(DEBUG_ERR, ("Failed to pull higest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
807 talloc_free(tmp_ctx);
808 return -1;
811 talloc_free(tmp_ctx);
812 return 0;
817 pull all the remote database contents into the recdb
819 static int pull_remote_database(struct ctdb_context *ctdb,
820 struct ctdb_recoverd *rec,
821 struct ctdb_node_map *nodemap,
822 struct tdb_wrap *recdb, uint32_t dbid,
823 bool persistent)
825 int j;
827 if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
828 int ret;
829 ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
830 if (ret == 0) {
831 return 0;
835 /* pull all records from all other nodes across onto this node
836 (this merges based on rsn)
838 for (j=0; j<nodemap->num; j++) {
839 /* dont merge from nodes that are unavailable */
840 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
841 continue;
843 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
844 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
845 nodemap->nodes[j].pnn));
846 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
847 return -1;
851 return 0;
856 update flags on all active nodes
858 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
860 int ret;
862 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
863 if (ret != 0) {
864 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
865 return -1;
868 return 0;
872 ensure all nodes have the same vnnmap we do
874 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
875 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
877 int j, ret;
879 /* push the new vnn map out to all the nodes */
880 for (j=0; j<nodemap->num; j++) {
881 /* dont push to nodes that are unavailable */
882 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
883 continue;
886 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
887 if (ret != 0) {
888 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
889 return -1;
893 return 0;
897 struct vacuum_info {
898 struct vacuum_info *next, *prev;
899 struct ctdb_recoverd *rec;
900 uint32_t srcnode;
901 struct ctdb_db_context *ctdb_db;
902 struct ctdb_marshall_buffer *recs;
903 struct ctdb_rec_data *r;
906 static void vacuum_fetch_next(struct vacuum_info *v);
/*
  called when a vacuum fetch has completed - just free it and do the next one
 */
static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
{
	talloc_free(state);
}
918 process the next element from the vacuum list
920 static void vacuum_fetch_next(struct vacuum_info *v)
922 struct ctdb_call call;
923 struct ctdb_rec_data *r;
925 while (v->recs->count) {
926 struct ctdb_client_call_state *state;
927 TDB_DATA data;
928 struct ctdb_ltdb_header *hdr;
930 ZERO_STRUCT(call);
931 call.call_id = CTDB_NULL_FUNC;
932 call.flags = CTDB_IMMEDIATE_MIGRATION;
933 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
935 r = v->r;
936 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
937 v->recs->count--;
939 call.key.dptr = &r->data[0];
940 call.key.dsize = r->keylen;
942 /* ensure we don't block this daemon - just skip a record if we can't get
943 the chainlock */
944 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
945 continue;
948 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
949 if (data.dptr == NULL) {
950 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
951 continue;
954 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
955 free(data.dptr);
956 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
957 continue;
960 hdr = (struct ctdb_ltdb_header *)data.dptr;
961 if (hdr->dmaster == v->rec->ctdb->pnn) {
962 /* its already local */
963 free(data.dptr);
964 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
965 continue;
968 free(data.dptr);
970 state = ctdb_call_send(v->ctdb_db, &call);
971 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
972 if (state == NULL) {
973 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
974 talloc_free(v);
975 return;
977 state->async.fn = vacuum_fetch_callback;
978 state->async.private_data = NULL;
981 talloc_free(v);
986 destroy a vacuum info structure
988 static int vacuum_info_destructor(struct vacuum_info *v)
990 DLIST_REMOVE(v->rec->vacuum_info, v);
991 return 0;
996 handler for vacuum fetch
998 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
999 TDB_DATA data, void *private_data)
1001 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1002 struct ctdb_marshall_buffer *recs;
1003 int ret, i;
1004 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1005 const char *name;
1006 struct ctdb_dbid_map *dbmap=NULL;
1007 bool persistent = false;
1008 struct ctdb_db_context *ctdb_db;
1009 struct ctdb_rec_data *r;
1010 uint32_t srcnode;
1011 struct vacuum_info *v;
1013 recs = (struct ctdb_marshall_buffer *)data.dptr;
1014 r = (struct ctdb_rec_data *)&recs->data[0];
1016 if (recs->count == 0) {
1017 talloc_free(tmp_ctx);
1018 return;
1021 srcnode = r->reqid;
1023 for (v=rec->vacuum_info;v;v=v->next) {
1024 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
1025 /* we're already working on records from this node */
1026 talloc_free(tmp_ctx);
1027 return;
1031 /* work out if the database is persistent */
1032 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
1033 if (ret != 0) {
1034 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
1035 talloc_free(tmp_ctx);
1036 return;
1039 for (i=0;i<dbmap->num;i++) {
1040 if (dbmap->dbs[i].dbid == recs->db_id) {
1041 persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
1042 break;
1045 if (i == dbmap->num) {
1046 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
1047 talloc_free(tmp_ctx);
1048 return;
1051 /* find the name of this database */
1052 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
1053 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
1054 talloc_free(tmp_ctx);
1055 return;
1058 /* attach to it */
1059 ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
1060 if (ctdb_db == NULL) {
1061 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
1062 talloc_free(tmp_ctx);
1063 return;
1066 v = talloc_zero(rec, struct vacuum_info);
1067 if (v == NULL) {
1068 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
1069 talloc_free(tmp_ctx);
1070 return;
1073 v->rec = rec;
1074 v->srcnode = srcnode;
1075 v->ctdb_db = ctdb_db;
1076 v->recs = talloc_memdup(v, recs, data.dsize);
1077 if (v->recs == NULL) {
1078 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
1079 talloc_free(v);
1080 talloc_free(tmp_ctx);
1081 return;
1083 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
1085 DLIST_ADD(rec->vacuum_info, v);
1087 talloc_set_destructor(v, vacuum_info_destructor);
1089 vacuum_fetch_next(v);
1090 talloc_free(tmp_ctx);
1095 * handler for database detach
1097 static void detach_database_handler(struct ctdb_context *ctdb, uint64_t srvid,
1098 TDB_DATA data, void *private_data)
1100 struct ctdb_recoverd *rec = talloc_get_type(private_data,
1101 struct ctdb_recoverd);
1102 uint32_t db_id;
1103 struct vacuum_info *v, *vnext;
1104 struct ctdb_db_context *ctdb_db;
1106 if (data.dsize != sizeof(db_id)) {
1107 return;
1109 db_id = *(uint32_t *)data.dptr;
1111 ctdb_db = find_ctdb_db(ctdb, db_id);
1112 if (ctdb_db == NULL) {
1113 /* database is not attached */
1114 return;
1117 /* Stop any active vacuum fetch */
1118 v = rec->vacuum_info;
1119 while (v != NULL) {
1120 vnext = v->next;
1122 if (v->ctdb_db->db_id == db_id) {
1123 talloc_free(v);
1125 v = vnext;
1128 DLIST_REMOVE(ctdb->db_list, ctdb_db);
1130 DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
1131 ctdb_db->db_name));
1132 talloc_free(ctdb_db);
1136 called when ctdb_wait_timeout should finish
1138 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
1139 struct timeval yt, void *p)
1141 uint32_t *timed_out = (uint32_t *)p;
1142 (*timed_out) = 1;
1146 wait for a given number of seconds
1148 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
1150 uint32_t timed_out = 0;
1151 time_t usecs = (secs - (time_t)secs) * 1000000;
1152 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), ctdb_wait_handler, &timed_out);
1153 while (!timed_out) {
1154 event_loop_once(ctdb->ev);
1159 called when an election times out (ends)
1161 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
1162 struct timeval t, void *p)
1164 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1165 rec->election_timeout = NULL;
1166 fast_start = false;
1168 DEBUG(DEBUG_WARNING,("Election period ended\n"));
1173 wait for an election to finish. It finished election_timeout seconds after
1174 the last election packet is received
1176 static void ctdb_wait_election(struct ctdb_recoverd *rec)
1178 struct ctdb_context *ctdb = rec->ctdb;
1179 while (rec->election_timeout) {
1180 event_loop_once(ctdb->ev);
1185 Update our local flags from all remote connected nodes.
1186 This is only run when we are or we belive we are the recovery master
1188 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
1190 int j;
1191 struct ctdb_context *ctdb = rec->ctdb;
1192 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1194 /* get the nodemap for all active remote nodes and verify
1195 they are the same as for this node
1197 for (j=0; j<nodemap->num; j++) {
1198 struct ctdb_node_map *remote_nodemap=NULL;
1199 int ret;
1201 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
1202 continue;
1204 if (nodemap->nodes[j].pnn == ctdb->pnn) {
1205 continue;
1208 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1209 mem_ctx, &remote_nodemap);
1210 if (ret != 0) {
1211 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
1212 nodemap->nodes[j].pnn));
1213 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
1214 talloc_free(mem_ctx);
1215 return MONITOR_FAILED;
1217 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
1218 /* We should tell our daemon about this so it
1219 updates its flags or else we will log the same
1220 message again in the next iteration of recovery.
1221 Since we are the recovery master we can just as
1222 well update the flags on all nodes.
1224 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
1225 if (ret != 0) {
1226 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
1227 return -1;
1230 /* Update our local copy of the flags in the recovery
1231 daemon.
1233 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1234 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
1235 nodemap->nodes[j].flags));
1236 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1238 talloc_free(remote_nodemap);
1240 talloc_free(mem_ctx);
1241 return MONITOR_OK;
1245 /* Create a new random generation ip.
1246 The generation id can not be the INVALID_GENERATION id
1248 static uint32_t new_generation(void)
1250 uint32_t generation;
1252 while (1) {
1253 generation = random();
1255 if (generation != INVALID_GENERATION) {
1256 break;
1260 return generation;
1265 create a temporary working database
1267 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1269 char *name;
1270 struct tdb_wrap *recdb;
1271 unsigned tdb_flags;
1273 /* open up the temporary recovery database */
1274 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1275 ctdb->db_directory_state,
1276 ctdb->pnn);
1277 if (name == NULL) {
1278 return NULL;
1280 unlink(name);
1282 tdb_flags = TDB_NOLOCK;
1283 if (ctdb->valgrinding) {
1284 tdb_flags |= TDB_NOMMAP;
1286 tdb_flags |= (TDB_INCOMPATIBLE_HASH | TDB_DISALLOW_NESTING);
1288 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1289 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1290 if (recdb == NULL) {
1291 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1294 talloc_free(name);
1296 return recdb;
1301 a traverse function for pulling all relevant records from recdb
1303 struct recdb_data {
1304 struct ctdb_context *ctdb;
1305 struct ctdb_marshall_buffer *recdata;
1306 uint32_t len;
1307 uint32_t allocated_len;
1308 bool failed;
1309 bool persistent;
1312 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1314 struct recdb_data *params = (struct recdb_data *)p;
1315 struct ctdb_rec_data *rec;
1316 struct ctdb_ltdb_header *hdr;
1319 * skip empty records - but NOT for persistent databases:
1321 * The record-by-record mode of recovery deletes empty records.
1322 * For persistent databases, this can lead to data corruption
1323 * by deleting records that should be there:
1325 * - Assume the cluster has been running for a while.
1327 * - A record R in a persistent database has been created and
1328 * deleted a couple of times, the last operation being deletion,
1329 * leaving an empty record with a high RSN, say 10.
1331 * - Now a node N is turned off.
1333 * - This leaves the local database copy of D on N with the empty
1334 * copy of R and RSN 10. On all other nodes, the recovery has deleted
1335 * the copy of record R.
1337 * - Now the record is created again while node N is turned off.
1338 * This creates R with RSN = 1 on all nodes except for N.
1340 * - Now node N is turned on again. The following recovery will chose
1341 * the older empty copy of R due to RSN 10 > RSN 1.
1343 * ==> Hence the record is gone after the recovery.
1345 * On databases like Samba's registry, this can damage the higher-level
1346 * data structures built from the various tdb-level records.
1348 if (!params->persistent && data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1349 return 0;
1352 /* update the dmaster field to point to us */
1353 hdr = (struct ctdb_ltdb_header *)data.dptr;
1354 if (!params->persistent) {
1355 hdr->dmaster = params->ctdb->pnn;
1356 hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
1359 /* add the record to the blob ready to send to the nodes */
1360 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1361 if (rec == NULL) {
1362 params->failed = true;
1363 return -1;
1365 if (params->len + rec->length >= params->allocated_len) {
1366 params->allocated_len = rec->length + params->len + params->ctdb->tunable.pulldb_preallocation_size;
1367 params->recdata = talloc_realloc_size(NULL, params->recdata, params->allocated_len);
1369 if (params->recdata == NULL) {
1370 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u\n",
1371 rec->length + params->len));
1372 params->failed = true;
1373 return -1;
1375 params->recdata->count++;
1376 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1377 params->len += rec->length;
1378 talloc_free(rec);
1380 return 0;
1384 push the recdb database out to all nodes
1386 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1387 bool persistent,
1388 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1390 struct recdb_data params;
1391 struct ctdb_marshall_buffer *recdata;
1392 TDB_DATA outdata;
1393 TALLOC_CTX *tmp_ctx;
1394 uint32_t *nodes;
1396 tmp_ctx = talloc_new(ctdb);
1397 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1399 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1400 CTDB_NO_MEMORY(ctdb, recdata);
1402 recdata->db_id = dbid;
1404 params.ctdb = ctdb;
1405 params.recdata = recdata;
1406 params.len = offsetof(struct ctdb_marshall_buffer, data);
1407 params.allocated_len = params.len;
1408 params.failed = false;
1409 params.persistent = persistent;
1411 if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
1412 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1413 talloc_free(params.recdata);
1414 talloc_free(tmp_ctx);
1415 return -1;
1418 if (params.failed) {
1419 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1420 talloc_free(params.recdata);
1421 talloc_free(tmp_ctx);
1422 return -1;
1425 recdata = params.recdata;
1427 outdata.dptr = (void *)recdata;
1428 outdata.dsize = params.len;
1430 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1431 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1432 nodes, 0,
1433 CONTROL_TIMEOUT(), false, outdata,
1434 NULL, NULL,
1435 NULL) != 0) {
1436 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1437 talloc_free(recdata);
1438 talloc_free(tmp_ctx);
1439 return -1;
1442 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1443 dbid, recdata->count));
1445 talloc_free(recdata);
1446 talloc_free(tmp_ctx);
1448 return 0;
1453 go through a full recovery on one database
1455 static int recover_database(struct ctdb_recoverd *rec,
1456 TALLOC_CTX *mem_ctx,
1457 uint32_t dbid,
1458 bool persistent,
1459 uint32_t pnn,
1460 struct ctdb_node_map *nodemap,
1461 uint32_t transaction_id)
1463 struct tdb_wrap *recdb;
1464 int ret;
1465 struct ctdb_context *ctdb = rec->ctdb;
1466 TDB_DATA data;
1467 struct ctdb_control_wipe_database w;
1468 uint32_t *nodes;
1470 recdb = create_recdb(ctdb, mem_ctx);
1471 if (recdb == NULL) {
1472 return -1;
1475 /* pull all remote databases onto the recdb */
1476 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1477 if (ret != 0) {
1478 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1479 return -1;
1482 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1484 /* wipe all the remote databases. This is safe as we are in a transaction */
1485 w.db_id = dbid;
1486 w.transaction_id = transaction_id;
1488 data.dptr = (void *)&w;
1489 data.dsize = sizeof(w);
1491 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1492 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1493 nodes, 0,
1494 CONTROL_TIMEOUT(), false, data,
1495 NULL, NULL,
1496 NULL) != 0) {
1497 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1498 talloc_free(recdb);
1499 return -1;
1502 /* push out the correct database. This sets the dmaster and skips
1503 the empty records */
1504 ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1505 if (ret != 0) {
1506 talloc_free(recdb);
1507 return -1;
1510 /* all done with this database */
1511 talloc_free(recdb);
1513 return 0;
1516 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1517 struct ctdb_recoverd *rec,
1518 struct ctdb_node_map *nodemap,
1519 uint32_t *culprit)
1521 int j;
1522 int ret;
1524 if (ctdb->num_nodes != nodemap->num) {
1525 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1526 ctdb->num_nodes, nodemap->num));
1527 if (culprit) {
1528 *culprit = ctdb->pnn;
1530 return -1;
1533 for (j=0; j<nodemap->num; j++) {
1534 /* For readability */
1535 struct ctdb_node *node = ctdb->nodes[j];
1537 /* release any existing data */
1538 if (node->known_public_ips) {
1539 talloc_free(node->known_public_ips);
1540 node->known_public_ips = NULL;
1542 if (node->available_public_ips) {
1543 talloc_free(node->available_public_ips);
1544 node->available_public_ips = NULL;
1547 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1548 continue;
1551 /* Retrieve the list of known public IPs from the node */
1552 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1553 CONTROL_TIMEOUT(),
1554 node->pnn,
1555 ctdb->nodes,
1557 &node->known_public_ips);
1558 if (ret != 0) {
1559 DEBUG(DEBUG_ERR,
1560 ("Failed to read known public IPs from node: %u\n",
1561 node->pnn));
1562 if (culprit) {
1563 *culprit = node->pnn;
1565 return -1;
1568 if (ctdb->do_checkpublicip &&
1569 rec->takeover_runs_disable_ctx == NULL &&
1570 verify_remote_ip_allocation(ctdb,
1571 node->known_public_ips,
1572 node->pnn)) {
1573 DEBUG(DEBUG_ERR,("Trigger IP reallocation\n"));
1574 rec->need_takeover_run = true;
1577 /* Retrieve the list of available public IPs from the node */
1578 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1579 CONTROL_TIMEOUT(),
1580 node->pnn,
1581 ctdb->nodes,
1582 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1583 &node->available_public_ips);
1584 if (ret != 0) {
1585 DEBUG(DEBUG_ERR,
1586 ("Failed to read available public IPs from node: %u\n",
1587 node->pnn));
1588 if (culprit) {
1589 *culprit = node->pnn;
1591 return -1;
1595 return 0;
1598 /* when we start a recovery, make sure all nodes use the same reclock file
1599 setting
1601 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
1603 struct ctdb_context *ctdb = rec->ctdb;
1604 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
1605 TDB_DATA data;
1606 uint32_t *nodes;
1608 if (ctdb->recovery_lock_file == NULL) {
1609 data.dptr = NULL;
1610 data.dsize = 0;
1611 } else {
1612 data.dsize = strlen(ctdb->recovery_lock_file) + 1;
1613 data.dptr = (uint8_t *)ctdb->recovery_lock_file;
1616 nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
1617 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
1618 nodes, 0,
1619 CONTROL_TIMEOUT(),
1620 false, data,
1621 NULL, NULL,
1622 rec) != 0) {
1623 DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
1624 talloc_free(tmp_ctx);
1625 return -1;
1628 talloc_free(tmp_ctx);
1629 return 0;
1634 * this callback is called for every node that failed to execute ctdb_takeover_run()
1635 * and set flag to re-run takeover run.
1637 static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
1639 DEBUG(DEBUG_ERR, ("Node %u failed the takeover run\n", node_pnn));
1641 if (callback_data != NULL) {
1642 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
1644 DEBUG(DEBUG_ERR, ("Setting node %u as recovery fail culprit\n", node_pnn));
1646 ctdb_set_culprit(rec, node_pnn);
1651 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
1653 struct ctdb_context *ctdb = rec->ctdb;
1654 int i;
1655 struct ctdb_banning_state *ban_state;
1657 *self_ban = false;
1658 for (i=0; i<ctdb->num_nodes; i++) {
1659 if (ctdb->nodes[i]->ban_state == NULL) {
1660 continue;
1662 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1663 if (ban_state->count < 2*ctdb->num_nodes) {
1664 continue;
1667 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
1668 ctdb->nodes[i]->pnn, ban_state->count,
1669 ctdb->tunable.recovery_ban_period));
1670 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1671 ban_state->count = 0;
1673 /* Banning ourself? */
1674 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
1675 *self_ban = true;
1680 static bool do_takeover_run(struct ctdb_recoverd *rec,
1681 struct ctdb_node_map *nodemap,
1682 bool banning_credits_on_fail)
1684 uint32_t *nodes = NULL;
1685 struct srvid_request_data dtr;
1686 TDB_DATA data;
1687 int i;
1688 uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
1689 int ret;
1690 bool ok;
1692 DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
1694 if (rec->takeover_run_in_progress) {
1695 DEBUG(DEBUG_ERR, (__location__
1696 " takeover run already in progress \n"));
1697 ok = false;
1698 goto done;
1701 rec->takeover_run_in_progress = true;
1703 /* If takeover runs are in disabled then fail... */
1704 if (rec->takeover_runs_disable_ctx != NULL) {
1705 DEBUG(DEBUG_ERR,
1706 ("Takeover runs are disabled so refusing to run one\n"));
1707 ok = false;
1708 goto done;
1711 /* Disable IP checks (takeover runs, really) on other nodes
1712 * while doing this takeover run. This will stop those other
1713 * nodes from triggering takeover runs when think they should
1714 * be hosting an IP but it isn't yet on an interface. Don't
1715 * wait for replies since a failure here might cause some
1716 * noise in the logs but will not actually cause a problem.
1718 dtr.srvid = 0; /* No reply */
1719 dtr.pnn = -1;
1721 data.dptr = (uint8_t*)&dtr;
1722 data.dsize = sizeof(dtr);
1724 nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
1726 /* Disable for 60 seconds. This can be a tunable later if
1727 * necessary.
1729 dtr.data = 60;
1730 for (i = 0; i < talloc_array_length(nodes); i++) {
1731 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1732 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1733 data) != 0) {
1734 DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
1738 ret = ctdb_takeover_run(rec->ctdb, nodemap,
1739 rec->force_rebalance_nodes,
1740 takeover_fail_callback,
1741 banning_credits_on_fail ? rec : NULL);
1743 /* Reenable takeover runs and IP checks on other nodes */
1744 dtr.data = 0;
1745 for (i = 0; i < talloc_array_length(nodes); i++) {
1746 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1747 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1748 data) != 0) {
1749 DEBUG(DEBUG_INFO,("Failed to reenable takeover runs\n"));
1753 if (ret != 0) {
1754 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
1755 ok = false;
1756 goto done;
1759 ok = true;
1760 /* Takeover run was successful so clear force rebalance targets */
1761 if (rebalance_nodes == rec->force_rebalance_nodes) {
1762 TALLOC_FREE(rec->force_rebalance_nodes);
1763 } else {
1764 DEBUG(DEBUG_WARNING,
1765 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1767 done:
1768 rec->need_takeover_run = !ok;
1769 talloc_free(nodes);
1770 rec->takeover_run_in_progress = false;
1772 DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
1773 return ok;
1778 we are the recmaster, and recovery is needed - start a recovery run
1780 static int do_recovery(struct ctdb_recoverd *rec,
1781 TALLOC_CTX *mem_ctx, uint32_t pnn,
1782 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1784 struct ctdb_context *ctdb = rec->ctdb;
1785 int i, j, ret;
1786 uint32_t generation;
1787 struct ctdb_dbid_map *dbmap;
1788 TDB_DATA data;
1789 uint32_t *nodes;
1790 struct timeval start_time;
1791 uint32_t culprit = (uint32_t)-1;
1792 bool self_ban;
1794 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1796 /* if recovery fails, force it again */
1797 rec->need_recovery = true;
1799 if (rec->election_timeout) {
1800 /* an election is in progress */
1801 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
1802 return -1;
1805 ban_misbehaving_nodes(rec, &self_ban);
1806 if (self_ban) {
1807 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
1808 return -1;
1811 if (ctdb->tunable.verify_recovery_lock != 0) {
1812 DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1813 start_time = timeval_current();
1814 if (!ctdb_recovery_lock(ctdb, true)) {
1815 if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
1816 /* If ctdb is trying first recovery, it's
1817 * possible that current node does not know yet
1818 * who the recmaster is.
1820 DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
1821 " - retrying recovery\n"));
1822 return -1;
1825 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
1826 "and ban ourself for %u seconds\n",
1827 ctdb->tunable.recovery_ban_period));
1828 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
1829 return -1;
1831 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1832 DEBUG(DEBUG_NOTICE,("Recovery lock taken successfully by recovery daemon\n"));
1835 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1837 /* get a list of all databases */
1838 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1839 if (ret != 0) {
1840 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1841 return -1;
1844 /* we do the db creation before we set the recovery mode, so the freeze happens
1845 on all databases we will be dealing with. */
1847 /* verify that we have all the databases any other node has */
1848 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1849 if (ret != 0) {
1850 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1851 return -1;
1854 /* verify that all other nodes have all our databases */
1855 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1856 if (ret != 0) {
1857 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1858 return -1;
1860 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1862 /* update the database priority for all remote databases */
1863 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1864 if (ret != 0) {
1865 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1867 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1870 /* update all other nodes to use the same setting for reclock files
1871 as the local recovery master.
1873 sync_recovery_lock_file_across_cluster(rec);
1875 /* set recovery mode to active on all nodes */
1876 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1877 if (ret != 0) {
1878 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1879 return -1;
1882 /* execute the "startrecovery" event script on all nodes */
1883 ret = run_startrecovery_eventscript(rec, nodemap);
1884 if (ret!=0) {
1885 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1886 return -1;
1890 update all nodes to have the same flags that we have
1892 for (i=0;i<nodemap->num;i++) {
1893 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1894 continue;
1897 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1898 if (ret != 0) {
1899 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1900 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
1901 } else {
1902 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1903 return -1;
1908 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1910 /* pick a new generation number */
1911 generation = new_generation();
1913 /* change the vnnmap on this node to use the new generation
1914 number but not on any other nodes.
1915 this guarantees that if we abort the recovery prematurely
1916 for some reason (a node stops responding?)
1917 that we can just return immediately and we will reenter
1918 recovery shortly again.
1919 I.e. we deliberately leave the cluster with an inconsistent
1920 generation id to allow us to abort recovery at any stage and
1921 just restart it from scratch.
1923 vnnmap->generation = generation;
1924 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1925 if (ret != 0) {
1926 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1927 return -1;
1930 data.dptr = (void *)&generation;
1931 data.dsize = sizeof(uint32_t);
1933 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1934 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1935 nodes, 0,
1936 CONTROL_TIMEOUT(), false, data,
1937 NULL,
1938 transaction_start_fail_callback,
1939 rec) != 0) {
1940 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1941 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1942 nodes, 0,
1943 CONTROL_TIMEOUT(), false, tdb_null,
1944 NULL,
1945 NULL,
1946 NULL) != 0) {
1947 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1949 return -1;
1952 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
1954 for (i=0;i<dbmap->num;i++) {
1955 ret = recover_database(rec, mem_ctx,
1956 dbmap->dbs[i].dbid,
1957 dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
1958 pnn, nodemap, generation);
1959 if (ret != 0) {
1960 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1961 return -1;
1965 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1967 /* commit all the changes */
1968 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1969 nodes, 0,
1970 CONTROL_TIMEOUT(), false, data,
1971 NULL, NULL,
1972 NULL) != 0) {
1973 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1974 return -1;
1977 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1980 /* update the capabilities for all nodes */
1981 ret = update_capabilities(ctdb, nodemap);
1982 if (ret!=0) {
1983 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1984 return -1;
1987 /* build a new vnn map with all the currently active and
1988 unbanned nodes */
1989 generation = new_generation();
1990 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1991 CTDB_NO_MEMORY(ctdb, vnnmap);
1992 vnnmap->generation = generation;
1993 vnnmap->size = 0;
1994 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1995 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1996 for (i=j=0;i<nodemap->num;i++) {
1997 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1998 continue;
2000 if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
2001 /* this node can not be an lmaster */
2002 DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
2003 continue;
2006 vnnmap->size++;
2007 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
2008 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2009 vnnmap->map[j++] = nodemap->nodes[i].pnn;
2012 if (vnnmap->size == 0) {
2013 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
2014 vnnmap->size++;
2015 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
2016 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2017 vnnmap->map[0] = pnn;
2020 /* update to the new vnnmap on all nodes */
2021 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
2022 if (ret != 0) {
2023 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
2024 return -1;
2027 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
2029 /* update recmaster to point to us for all nodes */
2030 ret = set_recovery_master(ctdb, nodemap, pnn);
2031 if (ret!=0) {
2032 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
2033 return -1;
2036 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
2038 /* disable recovery mode */
2039 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
2040 if (ret != 0) {
2041 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
2042 return -1;
2045 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
2047 /* Fetch known/available public IPs from each active node */
2048 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
2049 if (ret != 0) {
2050 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2051 culprit));
2052 rec->need_takeover_run = true;
2053 return -1;
2056 do_takeover_run(rec, nodemap, false);
2058 /* execute the "recovered" event script on all nodes */
2059 ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
2060 if (ret!=0) {
2061 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
2062 return -1;
2065 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
2067 /* send a message to all clients telling them that the cluster
2068 has been reconfigured */
2069 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2070 CTDB_SRVID_RECONFIGURE, tdb_null);
2071 if (ret != 0) {
2072 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
2073 return -1;
2076 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
2078 rec->need_recovery = false;
2080 /* we managed to complete a full recovery, make sure to forgive
2081 any past sins by the nodes that could now participate in the
2082 recovery.
2084 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
2085 for (i=0;i<nodemap->num;i++) {
2086 struct ctdb_banning_state *ban_state;
2088 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2089 continue;
2092 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
2093 if (ban_state == NULL) {
2094 continue;
2097 ban_state->count = 0;
2101 /* We just finished a recovery successfully.
2102 We now wait for rerecovery_timeout before we allow
2103 another recovery to take place.
2105 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
2106 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
2107 DEBUG(DEBUG_NOTICE, ("The rerecovery timeout has elapsed. We now allow recoveries to trigger again.\n"));
2109 return 0;
2114 elections are won by first checking the number of connected nodes, then
2115 the priority time, then the pnn
2117 struct election_message {
2118 uint32_t num_connected;
2119 struct timeval priority_time;
2120 uint32_t pnn;
2121 uint32_t node_flags;
2125 form this nodes election data
2127 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
2129 int ret, i;
2130 struct ctdb_node_map *nodemap;
2131 struct ctdb_context *ctdb = rec->ctdb;
2133 ZERO_STRUCTP(em);
2135 em->pnn = rec->ctdb->pnn;
2136 em->priority_time = rec->priority_time;
2138 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
2139 if (ret != 0) {
2140 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
2141 return;
2144 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
2145 em->node_flags = rec->node_flags;
2147 for (i=0;i<nodemap->num;i++) {
2148 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2149 em->num_connected++;
2153 /* we shouldnt try to win this election if we cant be a recmaster */
2154 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2155 em->num_connected = 0;
2156 em->priority_time = timeval_current();
2159 talloc_free(nodemap);
2163 see if the given election data wins
2165 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
2167 struct election_message myem;
2168 int cmp = 0;
2170 ctdb_election_data(rec, &myem);
2172 /* we cant win if we dont have the recmaster capability */
2173 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2174 return false;
2177 /* we cant win if we are banned */
2178 if (rec->node_flags & NODE_FLAGS_BANNED) {
2179 return false;
2182 /* we cant win if we are stopped */
2183 if (rec->node_flags & NODE_FLAGS_STOPPED) {
2184 return false;
2187 /* we will automatically win if the other node is banned */
2188 if (em->node_flags & NODE_FLAGS_BANNED) {
2189 return true;
2192 /* we will automatically win if the other node is banned */
2193 if (em->node_flags & NODE_FLAGS_STOPPED) {
2194 return true;
2197 /* try to use the most connected node */
2198 if (cmp == 0) {
2199 cmp = (int)myem.num_connected - (int)em->num_connected;
2202 /* then the longest running node */
2203 if (cmp == 0) {
2204 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
2207 if (cmp == 0) {
2208 cmp = (int)myem.pnn - (int)em->pnn;
2211 return cmp > 0;
2215 send out an election request
2217 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
2219 int ret;
2220 TDB_DATA election_data;
2221 struct election_message emsg;
2222 uint64_t srvid;
2223 struct ctdb_context *ctdb = rec->ctdb;
2225 srvid = CTDB_SRVID_RECOVERY;
2227 ctdb_election_data(rec, &emsg);
2229 election_data.dsize = sizeof(struct election_message);
2230 election_data.dptr = (unsigned char *)&emsg;
2233 /* first we assume we will win the election and set
2234 recoverymaster to be ourself on the current node
2236 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
2237 if (ret != 0) {
2238 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
2239 return -1;
2243 /* send an election message to all active nodes */
2244 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
2245 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
2249 this function will unban all nodes in the cluster
2251 static void unban_all_nodes(struct ctdb_context *ctdb)
2253 int ret, i;
2254 struct ctdb_node_map *nodemap;
2255 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2257 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2258 if (ret != 0) {
2259 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
2260 return;
2263 for (i=0;i<nodemap->num;i++) {
2264 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
2265 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
2266 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(),
2267 nodemap->nodes[i].pnn, 0,
2268 NODE_FLAGS_BANNED);
2269 if (ret != 0) {
2270 DEBUG(DEBUG_ERR, (__location__ " failed to reset ban state\n"));
2275 talloc_free(tmp_ctx);
2280 we think we are winning the election - send a broadcast election request
2282 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
2284 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2285 int ret;
2287 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
2288 if (ret != 0) {
2289 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
2292 talloc_free(rec->send_election_te);
2293 rec->send_election_te = NULL;
2297 handler for memory dumps
2299 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
2300 TDB_DATA data, void *private_data)
2302 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2303 TDB_DATA *dump;
2304 int ret;
2305 struct srvid_request *rd;
2307 if (data.dsize != sizeof(struct srvid_request)) {
2308 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2309 talloc_free(tmp_ctx);
2310 return;
2312 rd = (struct srvid_request *)data.dptr;
2314 dump = talloc_zero(tmp_ctx, TDB_DATA);
2315 if (dump == NULL) {
2316 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
2317 talloc_free(tmp_ctx);
2318 return;
2320 ret = ctdb_dump_memory(ctdb, dump);
2321 if (ret != 0) {
2322 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
2323 talloc_free(tmp_ctx);
2324 return;
2327 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
2329 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
2330 if (ret != 0) {
2331 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
2332 talloc_free(tmp_ctx);
2333 return;
2336 talloc_free(tmp_ctx);
2340 handler for reload_nodes
2342 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
2343 TDB_DATA data, void *private_data)
2345 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2347 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
2349 ctdb_load_nodes_file(rec->ctdb);
2353 static void ctdb_rebalance_timeout(struct event_context *ev,
2354 struct timed_event *te,
2355 struct timeval t, void *p)
2357 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2359 if (rec->force_rebalance_nodes == NULL) {
2360 DEBUG(DEBUG_ERR,
2361 ("Rebalance timeout occurred - no nodes to rebalance\n"));
2362 return;
2365 DEBUG(DEBUG_NOTICE,
2366 ("Rebalance timeout occurred - do takeover run\n"));
2367 do_takeover_run(rec, rec->nodemap, false);
/*
  SRVID handler: another node asks the recovery master to rebalance
  public IPs onto the given node (typically after a node is added).

  The target PNN is appended to rec->force_rebalance_nodes; if the
  DeferredRebalanceOnNodeAdd tunable is non-zero, a timer is armed so
  that a takeover run is forced after that many seconds unless one
  happens earlier.  Only the recovery master acts on this message.
 */
static void recd_node_rebalance_handler(struct ctdb_context *ctdb,
					uint64_t srvid,
					TDB_DATA data, void *private_data)
{
	uint32_t pnn;
	uint32_t *t;
	int len;
	uint32_t deferred_rebalance;
	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);

	/* ignore unless we are the recovery master */
	if (rec->recmaster != ctdb_get_pnn(ctdb)) {
		return;
	}

	if (data.dsize != sizeof(uint32_t)) {
		DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
		return;
	}

	pnn = *(uint32_t *)&data.dptr[0];

	DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));

	/* Copy any existing list of nodes.  There's probably some
	 * sort of realloc variant that will do this but we need to
	 * make sure that freeing the old array also cancels the timer
	 * event for the timeout... not sure if realloc will do that.
	 */
	len = (rec->force_rebalance_nodes != NULL) ?
		talloc_array_length(rec->force_rebalance_nodes) :
		0;

	/* This allows duplicates to be added but they don't cause
	 * harm.  A call to add a duplicate PNN arguably means that
	 * the timeout should be reset, so this is the simplest
	 * solution.
	 */
	t = talloc_zero_array(rec, uint32_t, len+1);
	CTDB_NO_MEMORY_VOID(ctdb, t);
	if (len > 0) {
		memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
	}
	t[len] = pnn;

	/* freeing the old array also frees (cancels) any timer
	 * parented to it - see event_add_timed() below */
	talloc_free(rec->force_rebalance_nodes);

	rec->force_rebalance_nodes = t;

	/* If configured, setup a deferred takeover run to make sure
	 * that certain nodes get IPs rebalanced to them.  This will
	 * be cancelled if a successful takeover run happens before
	 * the timeout.  Assign tunable value to variable for
	 * readability.
	 */
	deferred_rebalance = ctdb->tunable.deferred_rebalance_on_node_add;
	if (deferred_rebalance != 0) {
		/* NOTE: the timer is parented to the node list so that
		 * dropping the list cancels the pending timeout */
		event_add_timed(ctdb->ev, rec->force_rebalance_nodes,
				timeval_current_ofs(deferred_rebalance, 0),
				ctdb_rebalance_timeout, rec);
	}
}
2435 static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid,
2436 TDB_DATA data, void *private_data)
2438 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2439 struct ctdb_public_ip *ip;
2441 if (rec->recmaster != rec->ctdb->pnn) {
2442 DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
2443 return;
2446 if (data.dsize != sizeof(struct ctdb_public_ip)) {
2447 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
2448 return;
2451 ip = (struct ctdb_public_ip *)data.dptr;
2453 update_ip_assignment_tree(rec->ctdb, ip);
2457 static void clear_takeover_runs_disable(struct ctdb_recoverd *rec)
2459 TALLOC_FREE(rec->takeover_runs_disable_ctx);
2462 static void reenable_takeover_runs(struct event_context *ev,
2463 struct timed_event *te,
2464 struct timeval yt, void *p)
2466 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2468 DEBUG(DEBUG_NOTICE,("Reenabling takeover runs after timeout\n"));
2469 clear_takeover_runs_disable(rec);
2472 static void disable_takeover_runs_handler(struct ctdb_context *ctdb,
2473 uint64_t srvid, TDB_DATA data,
2474 void *private_data)
2476 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2477 struct ctdb_recoverd);
2478 struct srvid_request_data *r;
2479 uint32_t timeout;
2480 TDB_DATA result;
2481 int32_t ret = 0;
2483 /* Validate input data */
2484 if (data.dsize != sizeof(struct srvid_request_data)) {
2485 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2486 "expecting %lu\n", (long unsigned)data.dsize,
2487 (long unsigned)sizeof(struct srvid_request)));
2488 return;
2490 if (data.dptr == NULL) {
2491 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2492 return;
2495 r = (struct srvid_request_data *)data.dptr;
2496 timeout = r->data;
2498 if (timeout == 0) {
2499 DEBUG(DEBUG_NOTICE,("Reenabling takeover runs\n"));
2500 clear_takeover_runs_disable(rec);
2501 ret = ctdb_get_pnn(ctdb);
2502 goto done;
2505 if (rec->takeover_run_in_progress) {
2506 DEBUG(DEBUG_ERR,
2507 ("Unable to disable takeover runs - in progress\n"));
2508 ret = -EAGAIN;
2509 goto done;
2512 DEBUG(DEBUG_NOTICE,("Disabling takeover runs for %u seconds\n", timeout));
2514 /* Clear any old timers */
2515 clear_takeover_runs_disable(rec);
2517 /* When this is non-NULL it indicates that takeover runs are
2518 * disabled. This context also holds the timeout timer.
2520 rec->takeover_runs_disable_ctx = talloc_new(rec);
2521 if (rec->takeover_runs_disable_ctx == NULL) {
2522 DEBUG(DEBUG_ERR,(__location__ " Unable to allocate memory\n"));
2523 ret = -ENOMEM;
2524 goto done;
2527 /* Arrange for the timeout to occur */
2528 event_add_timed(ctdb->ev, rec->takeover_runs_disable_ctx,
2529 timeval_current_ofs(timeout, 0),
2530 reenable_takeover_runs,
2531 rec);
2533 /* Returning our PNN tells the caller that we succeeded */
2534 ret = ctdb_get_pnn(ctdb);
2535 done:
2536 result.dsize = sizeof(int32_t);
2537 result.dptr = (uint8_t *)&ret;
2538 srvid_request_reply(ctdb, (struct srvid_request *)r, result);
2541 /* Backward compatibility for this SRVID - call
2542 * disable_takeover_runs_handler() instead
2544 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
2545 TDB_DATA data, void *private_data)
2547 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2548 struct ctdb_recoverd);
2549 TDB_DATA data2;
2550 struct srvid_request_data *req;
2552 if (data.dsize != sizeof(uint32_t)) {
2553 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2554 "expecting %lu\n", (long unsigned)data.dsize,
2555 (long unsigned)sizeof(uint32_t)));
2556 return;
2558 if (data.dptr == NULL) {
2559 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2560 return;
2563 req = talloc(ctdb, struct srvid_request_data);
2564 CTDB_NO_MEMORY_VOID(ctdb, req);
2566 req->srvid = 0; /* No reply */
2567 req->pnn = -1;
2568 req->data = *((uint32_t *)data.dptr); /* Timeout */
2570 data2.dsize = sizeof(*req);
2571 data2.dptr = (uint8_t *)req;
2573 disable_takeover_runs_handler(rec->ctdb,
2574 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
2575 data2, rec);
2579 handler for ip reallocate, just add it to the list of requests and
2580 handle this later in the monitor_cluster loop so we do not recurse
2581 with other requests to takeover_run()
2583 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
2584 TDB_DATA data, void *private_data)
2586 struct srvid_request *request;
2587 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2588 struct ctdb_recoverd);
2590 if (data.dsize != sizeof(struct srvid_request)) {
2591 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2592 return;
2595 request = (struct srvid_request *)data.dptr;
2597 srvid_request_add(ctdb, &rec->reallocate_requests, request);
2600 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
2601 struct ctdb_recoverd *rec)
2603 TDB_DATA result;
2604 int32_t ret;
2605 uint32_t culprit;
2606 struct srvid_requests *current;
2608 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
2610 /* Only process requests that are currently pending. More
2611 * might come in while the takeover run is in progress and
2612 * they will need to be processed later since they might
2613 * be in response flag changes.
2615 current = rec->reallocate_requests;
2616 rec->reallocate_requests = NULL;
2618 /* update the list of public ips that a node can handle for
2619 all connected nodes
2621 ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
2622 if (ret != 0) {
2623 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2624 culprit));
2625 rec->need_takeover_run = true;
2627 if (ret == 0) {
2628 if (do_takeover_run(rec, rec->nodemap, false)) {
2629 ret = ctdb_get_pnn(ctdb);
2630 } else {
2631 ret = -1;
2635 result.dsize = sizeof(int32_t);
2636 result.dptr = (uint8_t *)&ret;
2638 srvid_requests_reply(ctdb, &current, result);
/*
  Handler for recovery master elections.

  An incoming election packet resets the election timeout.  If our own
  election data beats the sender's we schedule a (re)broadcast of our
  own claim; otherwise we concede: drop any pending broadcast, release
  the recovery lock if we hold it, and record the sender as recmaster.
 */
static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
			     TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
	int ret;
	struct election_message *em = (struct election_message *)data.dptr;
	TALLOC_CTX *mem_ctx;

	/* Ignore election packets from ourself */
	if (ctdb->pnn == em->pnn) {
		return;
	}

	/* we got an election packet - update the timeout for the election */
	talloc_free(rec->election_timeout);
	rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
						fast_start ?
						timeval_current_ofs(0, 500000) :
						timeval_current_ofs(ctdb->tunable.election_timeout, 0),
						ctdb_election_timeout, rec);

	mem_ctx = talloc_new(ctdb);

	/* someone called an election. check their election data
	   and if we disagree and we would rather be the elected node,
	   send a new election message to all other nodes
	 */
	if (ctdb_election_win(rec, em)) {
		/* delay the rebroadcast slightly so claims can settle */
		if (!rec->send_election_te) {
			rec->send_election_te = event_add_timed(ctdb->ev, rec,
								timeval_current_ofs(0, 500000),
								election_send_request, rec);
		}
		talloc_free(mem_ctx);
		/*unban_all_nodes(ctdb);*/
		return;
	}

	/* we didn't win */
	talloc_free(rec->send_election_te);
	rec->send_election_te = NULL;

	if (ctdb->tunable.verify_recovery_lock != 0) {
		/* release the recmaster lock */
		if (em->pnn != ctdb->pnn &&
		    ctdb->recovery_lock_fd != -1) {
			DEBUG(DEBUG_NOTICE, ("Release the recovery lock\n"));
			close(ctdb->recovery_lock_fd);
			ctdb->recovery_lock_fd = -1;
			unban_all_nodes(ctdb);
		}
	}

	/* ok, let that guy become recmaster then */
	ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
		talloc_free(mem_ctx);
		return;
	}

	talloc_free(mem_ctx);
	return;
}
2712 force the start of the election process
2714 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2715 struct ctdb_node_map *nodemap)
2717 int ret;
2718 struct ctdb_context *ctdb = rec->ctdb;
2720 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2722 /* set all nodes to recovery mode to stop all internode traffic */
2723 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
2724 if (ret != 0) {
2725 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
2726 return;
2729 talloc_free(rec->election_timeout);
2730 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2731 fast_start ?
2732 timeval_current_ofs(0, 500000) :
2733 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2734 ctdb_election_timeout, rec);
2736 ret = send_election_request(rec, pnn);
2737 if (ret!=0) {
2738 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2739 return;
2742 /* wait for a few seconds to collect all responses */
2743 ctdb_wait_election(rec);
/*
  Handler for when a node changes its flags.

  Re-reads the local nodemap, records the node's new flags and - when
  we are the recovery master and the cluster is in normal mode -
  schedules a takeover run if the DISABLED state changed (other flag
  changes are handled by recovery instead).
 */
static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
			    TDB_DATA data, void *private_data)
{
	int ret;
	struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
	struct ctdb_node_map *nodemap=NULL;
	TALLOC_CTX *tmp_ctx;
	int i;
	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
	int disabled_flag_changed;

	if (data.dsize != sizeof(*c)) {
		DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
		return;
	}

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);

	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
		talloc_free(tmp_ctx);
		return;
	}

	/* locate the slot for the node whose flags changed */
	for (i=0;i<nodemap->num;i++) {
		if (nodemap->nodes[i].pnn == c->pnn) break;
	}

	if (i == nodemap->num) {
		DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
		talloc_free(tmp_ctx);
		return;
	}

	if (c->old_flags != c->new_flags) {
		DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
	}

	/* compare against our cached view, not old_flags, so we notice
	   changes we have not yet seen */
	disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;

	nodemap->nodes[i].flags = c->new_flags;

	ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
				     CTDB_CURRENT_NODE, &ctdb->recovery_master);

	if (ret == 0) {
		ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
					   CTDB_CURRENT_NODE, &ctdb->recovery_mode);
	}

	if (ret == 0 &&
	    ctdb->recovery_master == ctdb->pnn &&
	    ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
		/* Only do the takeover run if the perm disabled or unhealthy
		   flags changed since these will cause an ip failover but not
		   a recovery.
		   If the node became disconnected or banned this will also
		   lead to an ip address failover but that is handled
		   during recovery
		*/
		if (disabled_flag_changed) {
			rec->need_takeover_run = true;
		}
	}

	talloc_free(tmp_ctx);
}
2823 handler for when we need to push out flag changes ot all other nodes
2825 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2826 TDB_DATA data, void *private_data)
2828 int ret;
2829 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2830 struct ctdb_node_map *nodemap=NULL;
2831 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2832 uint32_t recmaster;
2833 uint32_t *nodes;
2835 /* find the recovery master */
2836 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
2837 if (ret != 0) {
2838 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
2839 talloc_free(tmp_ctx);
2840 return;
2843 /* read the node flags from the recmaster */
2844 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
2845 if (ret != 0) {
2846 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2847 talloc_free(tmp_ctx);
2848 return;
2850 if (c->pnn >= nodemap->num) {
2851 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2852 talloc_free(tmp_ctx);
2853 return;
2856 /* send the flags update to all connected nodes */
2857 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2859 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2860 nodes, 0, CONTROL_TIMEOUT(),
2861 false, data,
2862 NULL, NULL,
2863 NULL) != 0) {
2864 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2866 talloc_free(tmp_ctx);
2867 return;
2870 talloc_free(tmp_ctx);
/* Shared state between verify_recmode() and its async callback. */
struct verify_recmode_normal_data {
	uint32_t count;			/* outstanding GETRECMODE replies */
	enum monitor_result status;	/* aggregated result so far */
};
2879 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2881 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2884 /* one more node has responded with recmode data*/
2885 rmdata->count--;
2887 /* if we failed to get the recmode, then return an error and let
2888 the main loop try again.
2890 if (state->state != CTDB_CONTROL_DONE) {
2891 if (rmdata->status == MONITOR_OK) {
2892 rmdata->status = MONITOR_FAILED;
2894 return;
2897 /* if we got a response, then the recmode will be stored in the
2898 status field
2900 if (state->status != CTDB_RECOVERY_NORMAL) {
2901 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2902 rmdata->status = MONITOR_RECOVERY_NEEDED;
2905 return;
2909 /* verify that all nodes are in normal recovery mode */
2910 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2912 struct verify_recmode_normal_data *rmdata;
2913 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2914 struct ctdb_client_control_state *state;
2915 enum monitor_result status;
2916 int j;
2918 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2919 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2920 rmdata->count = 0;
2921 rmdata->status = MONITOR_OK;
2923 /* loop over all active nodes and send an async getrecmode call to
2924 them*/
2925 for (j=0; j<nodemap->num; j++) {
2926 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2927 continue;
2929 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2930 CONTROL_TIMEOUT(),
2931 nodemap->nodes[j].pnn);
2932 if (state == NULL) {
2933 /* we failed to send the control, treat this as
2934 an error and try again next iteration
2936 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2937 talloc_free(mem_ctx);
2938 return MONITOR_FAILED;
2941 /* set up the callback functions */
2942 state->async.fn = verify_recmode_normal_callback;
2943 state->async.private_data = rmdata;
2945 /* one more control to wait for to complete */
2946 rmdata->count++;
2950 /* now wait for up to the maximum number of seconds allowed
2951 or until all nodes we expect a response from has replied
2953 while (rmdata->count > 0) {
2954 event_loop_once(ctdb->ev);
2957 status = rmdata->status;
2958 talloc_free(mem_ctx);
2959 return status;
/* Shared state between verify_recmaster() and its async callback. */
struct verify_recmaster_data {
	struct ctdb_recoverd *rec;	/* for marking culprit nodes */
	uint32_t count;			/* outstanding GETRECMASTER replies */
	uint32_t pnn;			/* recmaster PNN all nodes should report */
	enum monitor_result status;	/* aggregated result so far */
};
2970 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2972 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2975 /* one more node has responded with recmaster data*/
2976 rmdata->count--;
2978 /* if we failed to get the recmaster, then return an error and let
2979 the main loop try again.
2981 if (state->state != CTDB_CONTROL_DONE) {
2982 if (rmdata->status == MONITOR_OK) {
2983 rmdata->status = MONITOR_FAILED;
2985 return;
2988 /* if we got a response, then the recmaster will be stored in the
2989 status field
2991 if (state->status != rmdata->pnn) {
2992 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
2993 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2994 rmdata->status = MONITOR_ELECTION_NEEDED;
2997 return;
3001 /* verify that all nodes agree that we are the recmaster */
3002 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
3004 struct ctdb_context *ctdb = rec->ctdb;
3005 struct verify_recmaster_data *rmdata;
3006 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3007 struct ctdb_client_control_state *state;
3008 enum monitor_result status;
3009 int j;
3011 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
3012 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
3013 rmdata->rec = rec;
3014 rmdata->count = 0;
3015 rmdata->pnn = pnn;
3016 rmdata->status = MONITOR_OK;
3018 /* loop over all active nodes and send an async getrecmaster call to
3019 them*/
3020 for (j=0; j<nodemap->num; j++) {
3021 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3022 continue;
3024 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
3025 CONTROL_TIMEOUT(),
3026 nodemap->nodes[j].pnn);
3027 if (state == NULL) {
3028 /* we failed to send the control, treat this as
3029 an error and try again next iteration
3031 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
3032 talloc_free(mem_ctx);
3033 return MONITOR_FAILED;
3036 /* set up the callback functions */
3037 state->async.fn = verify_recmaster_callback;
3038 state->async.private_data = rmdata;
3040 /* one more control to wait for to complete */
3041 rmdata->count++;
3045 /* now wait for up to the maximum number of seconds allowed
3046 or until all nodes we expect a response from has replied
3048 while (rmdata->count > 0) {
3049 event_loop_once(ctdb->ev);
3052 status = rmdata->status;
3053 talloc_free(mem_ctx);
3054 return status;
3057 static bool interfaces_have_changed(struct ctdb_context *ctdb,
3058 struct ctdb_recoverd *rec)
3060 struct ctdb_control_get_ifaces *ifaces = NULL;
3061 TALLOC_CTX *mem_ctx;
3062 bool ret = false;
3064 mem_ctx = talloc_new(NULL);
3066 /* Read the interfaces from the local node */
3067 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
3068 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
3069 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
3070 /* We could return an error. However, this will be
3071 * rare so we'll decide that the interfaces have
3072 * actually changed, just in case.
3074 talloc_free(mem_ctx);
3075 return true;
3078 if (!rec->ifaces) {
3079 /* We haven't been here before so things have changed */
3080 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
3081 ret = true;
3082 } else if (rec->ifaces->num != ifaces->num) {
3083 /* Number of interfaces has changed */
3084 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
3085 rec->ifaces->num, ifaces->num));
3086 ret = true;
3087 } else {
3088 /* See if interface names or link states have changed */
3089 int i;
3090 for (i = 0; i < rec->ifaces->num; i++) {
3091 struct ctdb_control_iface_info * iface = &rec->ifaces->ifaces[i];
3092 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
3093 DEBUG(DEBUG_NOTICE,
3094 ("Interface in slot %d changed: %s => %s\n",
3095 i, iface->name, ifaces->ifaces[i].name));
3096 ret = true;
3097 break;
3099 if (iface->link_state != ifaces->ifaces[i].link_state) {
3100 DEBUG(DEBUG_NOTICE,
3101 ("Interface %s changed state: %d => %d\n",
3102 iface->name, iface->link_state,
3103 ifaces->ifaces[i].link_state));
3104 ret = true;
3105 break;
3110 talloc_free(rec->ifaces);
3111 rec->ifaces = talloc_steal(rec, ifaces);
3113 talloc_free(mem_ctx);
3114 return ret;
/* called to check that the local allocation of public ip addresses is ok.
 *
 * Returns 0 on success (including when the check is skipped because a
 * recovery started/finished while we were looking), -1 on error.  If
 * an inconsistency is found a takeover run is requested from the
 * recovery master rather than fixed locally (except for IPs we host
 * but should not, which are released immediately).
 */
static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
{
	TALLOC_CTX *mem_ctx = talloc_new(NULL);
	struct ctdb_uptime *uptime1 = NULL;
	struct ctdb_uptime *uptime2 = NULL;
	int ret, j;
	bool need_takeover_run = false;

	/* snapshot the recovery timestamps before the check... */
	ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
			       CTDB_CURRENT_NODE, &uptime1);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
		talloc_free(mem_ctx);
		return -1;
	}

	if (interfaces_have_changed(ctdb, rec)) {
		DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
				     "local node %u - force takeover run\n",
				     pnn));
		need_takeover_run = true;
	}

	/* ...and again afterwards, so we can detect a concurrent recovery */
	ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
			       CTDB_CURRENT_NODE, &uptime2);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
		talloc_free(mem_ctx);
		return -1;
	}

	/* skip the check if the startrecovery time has changed */
	if (timeval_compare(&uptime1->last_recovery_started,
			    &uptime2->last_recovery_started) != 0) {
		DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
		talloc_free(mem_ctx);
		return 0;
	}

	/* skip the check if the endrecovery time has changed */
	if (timeval_compare(&uptime1->last_recovery_finished,
			    &uptime2->last_recovery_finished) != 0) {
		DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
		talloc_free(mem_ctx);
		return 0;
	}

	/* skip the check if we have started but not finished recovery */
	if (timeval_compare(&uptime1->last_recovery_finished,
			    &uptime1->last_recovery_started) != 1) {
		DEBUG(DEBUG_INFO, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
		talloc_free(mem_ctx);

		return 0;
	}

	/* verify that we have the ip addresses we should have
	   and we dont have ones we shouldnt have.
	   if we find an inconsistency we set recmode to
	   active on the local node and wait for the recmaster
	   to do a full blown recovery.
	   also if the pnn is -1 and we are healthy and can host the ip
	   we also request a ip reallocation.
	*/
	if (ctdb->tunable.disable_ip_failover == 0) {
		struct ctdb_all_public_ips *ips = NULL;

		/* read the *available* IPs from the local node */
		ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, ("Unable to get available public IPs from local node %u\n", pnn));
			talloc_free(mem_ctx);
			return -1;
		}

		/* an unassigned IP we could host warrants a takeover run */
		for (j=0; j<ips->num; j++) {
			if (ips->ips[j].pnn == -1 &&
			    nodemap->nodes[pnn].flags == 0) {
				DEBUG(DEBUG_CRIT,("Public IP '%s' is not assigned and we could serve it\n",
						  ctdb_addr_to_str(&ips->ips[j].addr)));
				need_takeover_run = true;
			}
		}

		talloc_free(ips);

		/* read the *known* IPs from the local node */
		ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, ("Unable to get known public IPs from local node %u\n", pnn));
			talloc_free(mem_ctx);
			return -1;
		}

		for (j=0; j<ips->num; j++) {
			if (ips->ips[j].pnn == pnn) {
				/* assigned to us - make sure it is really up */
				if (ctdb->do_checkpublicip && !ctdb_sys_have_ip(&ips->ips[j].addr)) {
					DEBUG(DEBUG_CRIT,("Public IP '%s' is assigned to us but not on an interface\n",
						ctdb_addr_to_str(&ips->ips[j].addr)));
					need_takeover_run = true;
				}
			} else {
				/* not ours - release it if we still hold it */
				if (ctdb->do_checkpublicip &&
				    ctdb_sys_have_ip(&ips->ips[j].addr)) {

					DEBUG(DEBUG_CRIT,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
						ctdb_addr_to_str(&ips->ips[j].addr)));

					if (ctdb_ctrl_release_ip(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ips->ips[j]) != 0) {
						DEBUG(DEBUG_ERR,("Failed to release local IP address\n"));
					}
				}
			}
		}
	}

	if (need_takeover_run) {
		struct srvid_request rd;
		TDB_DATA data;

		DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));

		/* srvid 0 means we do not expect a reply */
		rd.pnn = ctdb->pnn;
		rd.srvid = 0;
		data.dptr = (uint8_t *)&rd;
		data.dsize = sizeof(rd);

		ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
		}
	}

	talloc_free(mem_ctx);
	return 0;
}
3256 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
3258 struct ctdb_node_map **remote_nodemaps = callback_data;
3260 if (node_pnn >= ctdb->num_nodes) {
3261 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
3262 return;
3265 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
3269 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
3270 struct ctdb_node_map *nodemap,
3271 struct ctdb_node_map **remote_nodemaps)
3273 uint32_t *nodes;
3275 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
3276 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
3277 nodes, 0,
3278 CONTROL_TIMEOUT(), false, tdb_null,
3279 async_getnodemap_callback,
3280 NULL,
3281 remote_nodemaps) != 0) {
3282 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
3284 return -1;
3287 return 0;
/* Result states for the async recovery-lock check. */
enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};

/* State for the child process that verifies we still hold the
 * recovery lock; its destructor kills the child and closes the pipe. */
struct ctdb_check_reclock_state {
	struct ctdb_context *ctdb;
	struct timeval start_time;	/* for lock-latency reporting */
	int fd[2];			/* pipe the child reports through */
	pid_t child;			/* pid of the checker child */
	struct timed_event *te;		/* hang-detection timeout */
	struct fd_event *fde;		/* read event on fd[0] */
	enum reclock_child_status status;
};
3301 /* when we free the reclock state we must kill any child process.
3303 static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
3305 struct ctdb_context *ctdb = state->ctdb;
3307 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
3309 if (state->fd[0] != -1) {
3310 close(state->fd[0]);
3311 state->fd[0] = -1;
3313 if (state->fd[1] != -1) {
3314 close(state->fd[1]);
3315 state->fd[1] = -1;
3317 ctdb_kill(ctdb, state->child, SIGKILL);
3318 return 0;
3322 called if our check_reclock child times out. this would happen if
3323 i/o to the reclock file blocks.
3325 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
3326 struct timeval t, void *private_data)
3328 struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
3329 struct ctdb_check_reclock_state);
3331 DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timedout CFS slow to grant locks?\n"));
3332 state->status = RECLOCK_TIMEOUT;
3335 /* this is called when the child process has completed checking the reclock
3336 file and has written data back to us through the pipe.
3338 static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
3339 uint16_t flags, void *private_data)
3341 struct ctdb_check_reclock_state *state= talloc_get_type(private_data,
3342 struct ctdb_check_reclock_state);
3343 char c = 0;
3344 int ret;
3346 /* we got a response from our child process so we can abort the
3347 timeout.
3349 talloc_free(state->te);
3350 state->te = NULL;
3352 ret = sys_read(state->fd[0], &c, 1);
3353 if (ret != 1 || c != RECLOCK_OK) {
3354 DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
3355 state->status = RECLOCK_FAILED;
3357 return;
3360 state->status = RECLOCK_OK;
3361 return;
/*
  Verify that the recovery lock file is still accessible.  A child
  process is forked to pread() the file (so a hung cluster filesystem
  cannot block the recovery daemon); we then spin in event_loop_once()
  until the child reports a verdict through a pipe or the 15 second
  timeout fires.

  Returns 0 if the lock file could be read, -1 otherwise.  On failure
  the cached recovery_lock_fd is closed so it will be re-opened later.
 */
static int check_recovery_lock(struct ctdb_context *ctdb)
{
	int ret;
	struct ctdb_check_reclock_state *state;
	pid_t parent = getpid();

	if (ctdb->recovery_lock_fd == -1) {
		DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
		return -1;
	}

	state = talloc(ctdb, struct ctdb_check_reclock_state);
	CTDB_NO_MEMORY(ctdb, state);

	state->ctdb = ctdb;
	state->start_time = timeval_current();
	state->status = RECLOCK_CHECKING;
	state->fd[0] = -1;
	state->fd[1] = -1;

	ret = pipe(state->fd);
	if (ret != 0) {
		talloc_free(state);
		DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
		return -1;
	}

	state->child = ctdb_fork(ctdb);
	if (state->child == (pid_t)-1) {
		DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
		close(state->fd[0]);
		state->fd[0] = -1;
		close(state->fd[1]);
		state->fd[1] = -1;
		talloc_free(state);
		return -1;
	}

	if (state->child == 0) {
		/* child: read one byte from the reclock file and report
		   the verdict back through the pipe */
		char cc = RECLOCK_OK;
		close(state->fd[0]);
		state->fd[0] = -1;

		ctdb_set_process_name("ctdb_rec_reclock");
		debug_extra = talloc_asprintf(NULL, "recovery-lock:");
		if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
			DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
			cc = RECLOCK_FAILED;
		}

		sys_write(state->fd[1], &cc, 1);
		/* make sure we die when our parent dies */
		while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
			sleep(5);
		}
		_exit(0);
	}
	close(state->fd[1]);
	state->fd[1] = -1;
	set_close_on_exec(state->fd[0]);

	DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for check_recovery_lock\n", state->fd[0]));

	/* from here on the destructor closes the pipe and kills the child */
	talloc_set_destructor(state, check_reclock_destructor);

	state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
				    ctdb_check_reclock_timeout, state);
	if (state->te == NULL) {
		DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
		talloc_free(state);
		return -1;
	}

	state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
				EVENT_FD_READ,
				reclock_child_handler,
				(void *)state);

	if (state->fde == NULL) {
		DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
		talloc_free(state);
		return -1;
	}
	tevent_fd_set_auto_close(state->fde);

	/* block here until the child answers or the timer fires */
	while (state->status == RECLOCK_CHECKING) {
		event_loop_once(ctdb->ev);
	}

	if (state->status == RECLOCK_FAILED) {
		DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
		close(ctdb->recovery_lock_fd);
		ctdb->recovery_lock_fd = -1;
		talloc_free(state);
		return -1;
	}

	/* NOTE(review): RECLOCK_TIMEOUT falls through to success here -
	   presumably deliberate (a slow CFS is not treated as a lost
	   lock), but confirm before relying on it */
	talloc_free(state);
	return 0;
}
3465 static int update_recovery_lock_file(struct ctdb_context *ctdb)
3467 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
3468 const char *reclockfile;
3470 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
3471 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
3472 talloc_free(tmp_ctx);
3473 return -1;
3476 if (reclockfile == NULL) {
3477 if (ctdb->recovery_lock_file != NULL) {
3478 DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
3479 talloc_free(ctdb->recovery_lock_file);
3480 ctdb->recovery_lock_file = NULL;
3481 if (ctdb->recovery_lock_fd != -1) {
3482 close(ctdb->recovery_lock_fd);
3483 ctdb->recovery_lock_fd = -1;
3486 ctdb->tunable.verify_recovery_lock = 0;
3487 talloc_free(tmp_ctx);
3488 return 0;
3491 if (ctdb->recovery_lock_file == NULL) {
3492 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3493 if (ctdb->recovery_lock_fd != -1) {
3494 close(ctdb->recovery_lock_fd);
3495 ctdb->recovery_lock_fd = -1;
3497 talloc_free(tmp_ctx);
3498 return 0;
3502 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
3503 talloc_free(tmp_ctx);
3504 return 0;
3507 talloc_free(ctdb->recovery_lock_file);
3508 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3509 ctdb->tunable.verify_recovery_lock = 0;
3510 if (ctdb->recovery_lock_fd != -1) {
3511 close(ctdb->recovery_lock_fd);
3512 ctdb->recovery_lock_fd = -1;
3515 talloc_free(tmp_ctx);
3516 return 0;
/*
  One iteration of the recovery daemon's monitoring logic.

  Verifies the local daemon is alive, refreshes tunables/runstate and
  the reclock configuration, makes sure a valid, capable recovery
  master exists (forcing an election otherwise) and - only when this
  node IS the recovery master - verifies cluster-wide consistency of
  nodemaps, node flags and the vnnmap, starting a recovery or a
  takeover run when anything disagrees.

  Any early "return" simply ends this iteration; monitor_cluster()
  calls us again after recover_interval seconds.
 */
static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
		      TALLOC_CTX *mem_ctx)
{
	uint32_t pnn;
	struct ctdb_node_map *nodemap=NULL;
	struct ctdb_node_map *recmaster_nodemap=NULL;
	struct ctdb_node_map **remote_nodemaps=NULL;
	struct ctdb_vnn_map *vnnmap=NULL;
	struct ctdb_vnn_map *remote_vnnmap=NULL;
	int32_t debug_level;
	int i, j, ret;
	bool self_ban;


	/* verify that the main daemon is still running */
	if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
		DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
		exit(-1);
	}

	/* ping the local daemon to tell it we are alive */
	ctdb_ctrl_recd_ping(ctdb);

	if (rec->election_timeout) {
		/* an election is in progress */
		return;
	}

	/* read the debug level from the parent and update locally */
	ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
	if (ret !=0) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
		return;
	}
	DEBUGLEVEL = debug_level;

	/* get relevant tunables */
	ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
		return;
	}

	/* get runstate */
	ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
				     CTDB_CURRENT_NODE, &ctdb->runstate);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
		return;
	}

	/* get the current recovery lock file from the server */
	if (update_recovery_lock_file(ctdb) != 0) {
		DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
		return;
	}

	/* Make sure that if recovery lock verification becomes disabled when
	   we close the file
	*/
	if (ctdb->tunable.verify_recovery_lock == 0) {
		if (ctdb->recovery_lock_fd != -1) {
			close(ctdb->recovery_lock_fd);
			ctdb->recovery_lock_fd = -1;
		}
	}

	pnn = ctdb_get_pnn(ctdb);

	/* get the vnnmap */
	ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
		return;
	}


	/* get number of nodes */
	if (rec->nodemap) {
		talloc_free(rec->nodemap);
		rec->nodemap = NULL;
		nodemap=NULL;
	}
	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
		return;
	}
	nodemap = rec->nodemap;

	/* remember our own node flags */
	rec->node_flags = nodemap->nodes[pnn].flags;

	ban_misbehaving_nodes(rec, &self_ban);
	if (self_ban) {
		DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
		return;
	}

	/* if the local daemon is STOPPED or BANNED, we verify that the databases are
	   also frozen and that the recmode is set to active.
	*/
	if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
		/* If this node has become inactive then we want to
		 * reduce the chances of it taking over the recovery
		 * master role when it becomes active again.  This
		 * helps to stabilise the recovery master role so that
		 * it stays on the most stable node.
		 */
		rec->priority_time = timeval_current();

		ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
		}
		if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
			DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));

			ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
			if (ret != 0) {
				DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));

				return;
			}
			ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
			if (ret != 0) {
				DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node in STOPPED or BANNED state\n"));
				return;
			}
		}

		/* If this node is stopped or banned then it is not the recovery
		 * master, so don't do anything. This prevents stopped or banned
		 * node from starting election and sending unnecessary controls.
		 */
		return;
	}

	/* check which node is the recovery master */
	ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
		return;
	}

	/* If we are not the recmaster then do some housekeeping */
	if (rec->recmaster != pnn) {
		/* Ignore any IP reallocate requests - only recmaster
		 * processes them
		 */
		TALLOC_FREE(rec->reallocate_requests);
		/* Clear any nodes that should be force rebalanced in
		 * the next takeover run.  If the recovery master role
		 * has moved then we don't want to process these some
		 * time in the future.
		 */
		TALLOC_FREE(rec->force_rebalance_nodes);
	}

	/* This is a special case.  When recovery daemon is started, recmaster
	 * is set to -1.  If a node is not started in stopped state, then
	 * start election to decide recovery master
	 */
	if (rec->recmaster == (uint32_t)-1) {
		DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
		force_election(rec, pnn, nodemap);
		return;
	}

	/* update the capabilities for all nodes */
	ret = update_capabilities(ctdb, nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
		return;
	}

	/*
	 * If the current recmaster does not have CTDB_CAP_RECMASTER,
	 * but we have, then force an election and try to become the new
	 * recmaster.
	 */
	if ((rec->ctdb->nodes[rec->recmaster]->capabilities & CTDB_CAP_RECMASTER) == 0 &&
	    (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
	    !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
		DEBUG(DEBUG_ERR, (__location__ " Current recmaster node %u does not have CAP_RECMASTER,"
				  " but we (node %u) have - force an election\n",
				  rec->recmaster, pnn));
		force_election(rec, pnn, nodemap);
		return;
	}

	/* count how many active nodes there are */
	rec->num_active    = 0;
	rec->num_lmasters  = 0;
	rec->num_connected = 0;
	for (i=0; i<nodemap->num; i++) {
		if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
			rec->num_active++;
			if (rec->ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER) {
				rec->num_lmasters++;
			}
		}
		if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
			rec->num_connected++;
		}
	}


	/* verify that the recmaster node is still active */
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].pnn==rec->recmaster) {
			break;
		}
	}

	if (j == nodemap->num) {
		DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
		force_election(rec, pnn, nodemap);
		return;
	}

	/* if recovery master is disconnected we must elect a new recmaster */
	if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
		DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
		force_election(rec, pnn, nodemap);
		return;
	}

	/* get nodemap from the recovery master to check if it is inactive */
	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
				   mem_ctx, &recmaster_nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
			  nodemap->nodes[j].pnn));
		return;
	}


	if ((recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) &&
	    (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
		DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
		/*
		 * update our nodemap to carry the recmaster's notion of
		 * its own flags, so that we don't keep freezing the
		 * inactive recmaster node...
		 */
		nodemap->nodes[j].flags = recmaster_nodemap->nodes[j].flags;
		force_election(rec, pnn, nodemap);
		return;
	}

	/* verify that we have all ip addresses we should have and we dont
	 * have addresses we shouldnt have.
	 */
	if (ctdb->tunable.disable_ip_failover == 0 &&
	    rec->takeover_runs_disable_ctx == NULL) {
		if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
		}
	}


	/* if we are not the recmaster then we do not need to check
	   if recovery is needed
	 */
	if (pnn != rec->recmaster) {
		return;
	}


	/* ensure our local copies of flags are right */
	ret = update_local_flags(rec, nodemap);
	if (ret == MONITOR_ELECTION_NEEDED) {
		DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
		force_election(rec, pnn, nodemap);
		return;
	}
	if (ret != MONITOR_OK) {
		DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
		return;
	}

	if (ctdb->num_nodes != nodemap->num) {
		DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
		ctdb_load_nodes_file(ctdb);
		return;
	}

	/* verify that all active nodes agree that we are the recmaster */
	switch (verify_recmaster(rec, nodemap, pnn)) {
	case MONITOR_RECOVERY_NEEDED:
		/* can not happen */
		return;
	case MONITOR_ELECTION_NEEDED:
		force_election(rec, pnn, nodemap);
		return;
	case MONITOR_OK:
		break;
	case MONITOR_FAILED:
		return;
	}


	if (rec->need_recovery) {
		/* a previous recovery didn't finish */
		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		return;
	}

	/* verify that all active nodes are in normal mode
	   and not in recovery mode
	*/
	switch (verify_recmode(ctdb, nodemap)) {
	case MONITOR_RECOVERY_NEEDED:
		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		return;
	case MONITOR_FAILED:
		return;
	case MONITOR_ELECTION_NEEDED:
		/* can not happen */
	case MONITOR_OK:
		break;
	}


	if (ctdb->tunable.verify_recovery_lock != 0) {
		/* we should have the reclock - check its not stale */
		ret = check_recovery_lock(ctdb);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
			ctdb_set_culprit(rec, ctdb->pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}
	}


	/* if there are takeovers requested, perform it and notify the waiters */
	if (rec->takeover_runs_disable_ctx == NULL &&
	    rec->reallocate_requests) {
		process_ipreallocate_requests(ctdb, rec);
	}

	/* get the nodemap for all active remote nodes
	 */
	remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
	if (remote_nodemaps == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
		return;
	}
	for(i=0; i<nodemap->num; i++) {
		remote_nodemaps[i] = NULL;
	}
	if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
		return;
	}

	/* verify that all other nodes have the same nodemap as we have
	*/
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		if (remote_nodemaps[j] == NULL) {
			DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
			ctdb_set_culprit(rec, j);

			return;
		}

		/* if the nodes disagree on how many nodes there are
		   then this is a good reason to try recovery
		 */
		if (remote_nodemaps[j]->num != nodemap->num) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
				  nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* if the nodes disagree on which nodes exist and are
		   active, then that is also a good reason to do recovery
		 */
		for (i=0;i<nodemap->num;i++) {
			if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
				DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
					  nodemap->nodes[j].pnn, i,
					  remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
				do_recovery(rec, mem_ctx, pnn, nodemap,
					    vnnmap);
				return;
			}
		}
	}

	/*
	 * Update node flags obtained from each active node. This ensure we have
	 * up-to-date information for all the nodes.
	 */
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
	}

	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		/* verify the flags are consistent
		*/
		for (i=0; i<nodemap->num; i++) {
			if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
				continue;
			}

			if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
				DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
				  nodemap->nodes[j].pnn,
				  nodemap->nodes[i].pnn,
				  remote_nodemaps[j]->nodes[i].flags,
				  nodemap->nodes[i].flags));
				if (i == j) {
					DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
					update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
					ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
					do_recovery(rec, mem_ctx, pnn, nodemap,
						    vnnmap);
					return;
				} else {
					DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
					update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
					ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
					do_recovery(rec, mem_ctx, pnn, nodemap,
						    vnnmap);
					return;
				}
			}
		}
	}


	/* There must be the same number of lmasters in the vnn map as
	 * there are active nodes with the lmaster capability...  or
	 * do a recovery.
	 */
	if (vnnmap->size != rec->num_lmasters) {
		DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
			  vnnmap->size, rec->num_lmasters));
		ctdb_set_culprit(rec, ctdb->pnn);
		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		return;
	}

	/* verify that all active nodes in the nodemap also exist in
	   the vnnmap.
	 */
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}

		for (i=0; i<vnnmap->size; i++) {
			if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
				break;
			}
		}
		if (i == vnnmap->size) {
			DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
				  nodemap->nodes[j].pnn));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}
	}


	/* verify that all other nodes have the same vnnmap
	   and are from the same generation
	 */
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}

		ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					  mem_ctx, &remote_vnnmap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
				  nodemap->nodes[j].pnn));
			return;
		}

		/* verify the vnnmap generation is the same */
		if (vnnmap->generation != remote_vnnmap->generation) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
				  nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* verify the vnnmap size is the same */
		if (vnnmap->size != remote_vnnmap->size) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
				  nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* verify the vnnmap is the same */
		for (i=0;i<vnnmap->size;i++) {
			if (remote_vnnmap->map[i] != vnnmap->map[i]) {
				DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
					  nodemap->nodes[j].pnn));
				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
				do_recovery(rec, mem_ctx, pnn, nodemap,
					    vnnmap);
				return;
			}
		}
	}

	/* we might need to change who has what IP assigned */
	if (rec->need_takeover_run) {
		uint32_t culprit = (uint32_t)-1;

		rec->need_takeover_run = false;

		/* update the list of public ips that a node can handle for
		   all connected nodes
		*/
		ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
					 culprit));
			rec->need_takeover_run = true;
			return;
		}

		/* execute the "startrecovery" event script on all nodes */
		ret = run_startrecovery_eventscript(rec, nodemap);
		if (ret!=0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
			ctdb_set_culprit(rec, ctdb->pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* If takeover run fails, then the offending nodes are
		 * assigned ban culprit counts. And we re-try takeover.
		 * If takeover run fails repeatedly, the node would get
		 * banned.
		 *
		 * If rec->need_takeover_run is not set to true at this
		 * failure, monitoring is disabled cluster-wide (via
		 * startrecovery eventscript) and will not get enabled.
		 */
		if (!do_takeover_run(rec, nodemap, true)) {
			return;
		}

		/* execute the "recovered" event script on all nodes */
		ret = run_recovered_eventscript(rec, nodemap, "monitor_cluster");
#if 0
		// we cant check whether the event completed successfully
		// since this script WILL fail if the node is in recovery mode
		// and if that race happens, the code here would just cause a second
		// cascading recovery.
		if (ret!=0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
			ctdb_set_culprit(rec, ctdb->pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		}
#endif
	}
}
4111 the main monitoring loop
4113 static void monitor_cluster(struct ctdb_context *ctdb)
4115 struct ctdb_recoverd *rec;
4117 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
4119 rec = talloc_zero(ctdb, struct ctdb_recoverd);
4120 CTDB_NO_MEMORY_FATAL(ctdb, rec);
4122 rec->ctdb = ctdb;
4124 rec->takeover_run_in_progress = false;
4126 rec->priority_time = timeval_current();
4128 /* register a message port for sending memory dumps */
4129 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
4131 /* register a message port for recovery elections */
4132 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
4134 /* when nodes are disabled/enabled */
4135 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
4137 /* when we are asked to puch out a flag change */
4138 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
4140 /* register a message port for vacuum fetch */
4141 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
4143 /* register a message port for reloadnodes */
4144 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
4146 /* register a message port for performing a takeover run */
4147 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
4149 /* register a message port for disabling the ip check for a short while */
4150 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
4152 /* register a message port for updating the recovery daemons node assignment for an ip */
4153 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);
4155 /* register a message port for forcing a rebalance of a node next
4156 reallocation */
4157 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
4159 /* Register a message port for disabling takeover runs */
4160 ctdb_client_set_message_handler(ctdb,
4161 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
4162 disable_takeover_runs_handler, rec);
4164 /* register a message port for detaching database */
4165 ctdb_client_set_message_handler(ctdb,
4166 CTDB_SRVID_DETACH_DATABASE,
4167 detach_database_handler, rec);
4169 for (;;) {
4170 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
4171 struct timeval start;
4172 double elapsed;
4174 if (!mem_ctx) {
4175 DEBUG(DEBUG_CRIT,(__location__
4176 " Failed to create temp context\n"));
4177 exit(-1);
4180 start = timeval_current();
4181 main_loop(ctdb, rec, mem_ctx);
4182 talloc_free(mem_ctx);
4184 /* we only check for recovery once every second */
4185 elapsed = timeval_elapsed(&start);
4186 if (elapsed < ctdb->tunable.recover_interval) {
4187 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
4188 - elapsed);
4194 event handler for when the main ctdbd dies
4196 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
4197 uint16_t flags, void *private_data)
4199 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
4200 _exit(1);
4204 called regularly to verify that the recovery daemon is still running
4206 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
4207 struct timeval yt, void *p)
4209 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
4211 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
4212 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
4214 event_add_timed(ctdb->ev, ctdb, timeval_zero(),
4215 ctdb_restart_recd, ctdb);
4217 return;
4220 event_add_timed(ctdb->ev, ctdb->recd_ctx,
4221 timeval_current_ofs(30, 0),
4222 ctdb_check_recd, ctdb);
4225 static void recd_sig_child_handler(struct event_context *ev,
4226 struct signal_event *se, int signum, int count,
4227 void *dont_care,
4228 void *private_data)
4230 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4231 int status;
4232 pid_t pid = -1;
4234 while (pid != 0) {
4235 pid = waitpid(-1, &status, WNOHANG);
4236 if (pid == -1) {
4237 if (errno != ECHILD) {
4238 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
4240 return;
4242 if (pid > 0) {
4243 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
/*
  startup the recovery daemon as a child of the main ctdb daemon

  Forks; the parent arms a 30 second liveness check (ctdb_check_recd)
  and returns 0, while the child switches into client mode, registers
  parent-death and SIGCHLD handlers, and enters monitor_cluster()
  which never returns.  Returns -1 on pipe/fork failure (or, in the
  child, if monitor_cluster() ever comes back).
 */
int ctdb_start_recoverd(struct ctdb_context *ctdb)
{
	int fd[2];
	struct signal_event *se;
	struct tevent_fd *fde;

	/* pipe used by the child to detect the death of the parent:
	   the parent holds fd[1] open, the child watches fd[0] */
	if (pipe(fd) != 0) {
		return -1;
	}

	ctdb->recoverd_pid = ctdb_fork(ctdb);
	if (ctdb->recoverd_pid == -1) {
		return -1;
	}

	if (ctdb->recoverd_pid != 0) {
		/* parent: periodically verify the recovery daemon is alive */
		talloc_free(ctdb->recd_ctx);
		ctdb->recd_ctx = talloc_new(ctdb);
		CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);

		close(fd[0]);
		event_add_timed(ctdb->ev, ctdb->recd_ctx,
				timeval_current_ofs(30, 0),
				ctdb_check_recd, ctdb);
		return 0;
	}

	close(fd[1]);

	srandom(getpid() ^ time(NULL));

	/* NOTE(review): process name reads "ctdb_recovered" while the
	   daemon is elsewhere called "recoverd" - looks like a typo, but
	   confirm nothing matches on this string before changing it */
	ctdb_set_process_name("ctdb_recovered");
	if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
		DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
		exit(1);
	}

	DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));

	/* exit as soon as the parent's end of the pipe closes */
	fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
			   ctdb_recoverd_parent, &fd[0]);
	tevent_fd_set_auto_close(fde);

	/* set up a handler to pick up sigchld */
	se = event_add_signal(ctdb->ev, ctdb,
			      SIGCHLD, 0,
			      recd_sig_child_handler,
			      ctdb);
	if (se == NULL) {
		DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
		exit(1);
	}

	monitor_cluster(ctdb);

	DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
	return -1;
}
4311 shutdown the recovery daemon
4313 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
4315 if (ctdb->recoverd_pid == 0) {
4316 return;
4319 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
4320 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
4322 TALLOC_FREE(ctdb->recd_ctx);
4323 TALLOC_FREE(ctdb->recd_ping_count);
4326 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te,
4327 struct timeval t, void *private_data)
4329 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4331 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
4332 ctdb_stop_recoverd(ctdb);
4333 ctdb_start_recoverd(ctdb);