ctdb-daemon: Do not allow database detach if AllowClientDBAttach=1
[Samba.git] / ctdb / server / ctdb_recoverd.c
blob ac692ec5419f1309625ab505295bd5b898df550f
1 /*
2 ctdb recovery daemon
4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
20 #include "includes.h"
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
25 #include "popt.h"
26 #include "cmdline.h"
27 #include "../include/ctdb_client.h"
28 #include "../include/ctdb_private.h"
29 #include "db_wrap.h"
30 #include "dlinklist.h"
33 /* List of SRVID requests that need to be processed */
34 struct srvid_list {
35 struct srvid_list *next, *prev;
36 struct srvid_request *request;
39 struct srvid_requests {
40 struct srvid_list *requests;
43 static void srvid_request_reply(struct ctdb_context *ctdb,
44 struct srvid_request *request,
45 TDB_DATA result)
47 /* Someone that sent srvid==0 does not want a reply */
48 if (request->srvid == 0) {
49 talloc_free(request);
50 return;
53 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
54 result) == 0) {
55 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
56 (unsigned)request->pnn,
57 (unsigned long long)request->srvid));
58 } else {
59 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
60 (unsigned)request->pnn,
61 (unsigned long long)request->srvid));
64 talloc_free(request);
67 static void srvid_requests_reply(struct ctdb_context *ctdb,
68 struct srvid_requests **requests,
69 TDB_DATA result)
71 struct srvid_list *r;
73 for (r = (*requests)->requests; r != NULL; r = r->next) {
74 srvid_request_reply(ctdb, r->request, result);
77 /* Free the list structure... */
78 TALLOC_FREE(*requests);
81 static void srvid_request_add(struct ctdb_context *ctdb,
82 struct srvid_requests **requests,
83 struct srvid_request *request)
85 struct srvid_list *t;
86 int32_t ret;
87 TDB_DATA result;
89 if (*requests == NULL) {
90 *requests = talloc_zero(ctdb, struct srvid_requests);
91 if (*requests == NULL) {
92 goto nomem;
96 t = talloc_zero(*requests, struct srvid_list);
97 if (t == NULL) {
98 /* If *requests was just allocated above then free it */
99 if ((*requests)->requests == NULL) {
100 TALLOC_FREE(*requests);
102 goto nomem;
105 t->request = (struct srvid_request *)talloc_steal(t, request);
106 DLIST_ADD((*requests)->requests, t);
108 return;
110 nomem:
111 /* Failed to add the request to the list. Send a fail. */
112 DEBUG(DEBUG_ERR, (__location__
113 " Out of memory, failed to queue SRVID request\n"));
114 ret = -ENOMEM;
115 result.dsize = sizeof(ret);
116 result.dptr = (uint8_t *)&ret;
117 srvid_request_reply(ctdb, request, result);
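/* Per-node banning state: how many banning credits a node has accumulated
   and when it last misbehaved. ban_misbehaving_nodes() bans a node once its
   count reaches twice the number of nodes in the cluster. */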
120 struct ctdb_banning_state {
121 uint32_t count;
122 struct timeval last_reported_time;
126 private state of recovery daemon
128 struct ctdb_recoverd {
129 struct ctdb_context *ctdb;
130 uint32_t recmaster;
131 uint32_t num_active;
132 uint32_t num_lmasters;
133 uint32_t num_connected;
134 uint32_t last_culprit_node;
135 struct ctdb_node_map *nodemap;
136 struct timeval priority_time;
137 bool need_takeover_run;
138 bool need_recovery;
139 uint32_t node_flags;
140 struct timed_event *send_election_te;
141 struct timed_event *election_timeout;
142 struct vacuum_info *vacuum_info;
143 struct srvid_requests *reallocate_requests;
144 bool takeover_run_in_progress;
145 TALLOC_CTX *takeover_runs_disable_ctx;
146 struct ctdb_control_get_ifaces *ifaces;
147 uint32_t *force_rebalance_nodes;
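/* Timeouts used for controls and for the main monitoring loop, derived from
   the recover_timeout and recover_interval tunables. */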
150 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
151 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
153 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data);
156 ban a node for a period of time
158 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
160 int ret;
161 struct ctdb_context *ctdb = rec->ctdb;
162 struct ctdb_ban_time bantime;
164 if (!ctdb_validate_pnn(ctdb, pnn)) {
165 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
166 return;
169 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
171 bantime.pnn = pnn;
172 bantime.time = ban_time;
174 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
175 if (ret != 0) {
176 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
177 return;
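/* Possible outcomes when the recovery daemon checks the cluster state. */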
182 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
186 remember the troublemaker
188 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
190 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
191 struct ctdb_banning_state *ban_state;
193 if (culprit >= ctdb->num_nodes) {
194 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
195 return;
198 /* If we are banned or stopped, do not set other nodes as culprits */
199 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
200 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
201 return;
204 if (ctdb->nodes[culprit]->ban_state == NULL) {
205 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
206 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
210 ban_state = ctdb->nodes[culprit]->ban_state;
211 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
212 /* this was the first time in a long while this node
213 misbehaved so we will forgive any old transgressions.
215 ban_state->count = 0;
218 ban_state->count += count;
219 ban_state->last_reported_time = timeval_current();
220 rec->last_culprit_node = culprit;
224 remember the troublemaker
226 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
228 ctdb_set_culprit_count(rec, culprit, 1);
232 /* this callback is called for every node that failed to execute the
233 recovered event
235 static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
237 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
239 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));
241 ctdb_set_culprit(rec, node_pnn);
245 run the "recovered" eventscript on all nodes
247 static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, const char *caller)
249 TALLOC_CTX *tmp_ctx;
250 uint32_t *nodes;
251 struct ctdb_context *ctdb = rec->ctdb;
253 tmp_ctx = talloc_new(ctdb);
254 CTDB_NO_MEMORY(ctdb, tmp_ctx);
256 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
257 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
258 nodes, 0,
259 CONTROL_TIMEOUT(), false, tdb_null,
260 NULL, recovered_fail_callback,
261 rec) != 0) {
262 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
264 talloc_free(tmp_ctx);
265 return -1;
268 talloc_free(tmp_ctx);
269 return 0;
272 /* this callback is called for every node that failed to execute the
273 start recovery event
275 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
277 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
279 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
281 ctdb_set_culprit(rec, node_pnn);
285 run the "startrecovery" eventscript on all nodes
287 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
289 TALLOC_CTX *tmp_ctx;
290 uint32_t *nodes;
291 struct ctdb_context *ctdb = rec->ctdb;
293 tmp_ctx = talloc_new(ctdb);
294 CTDB_NO_MEMORY(ctdb, tmp_ctx);
296 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
297 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
298 nodes, 0,
299 CONTROL_TIMEOUT(), false, tdb_null,
300 NULL,
301 startrecovery_fail_callback,
302 rec) != 0) {
303 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
304 talloc_free(tmp_ctx);
305 return -1;
308 talloc_free(tmp_ctx);
309 return 0;
312 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
314 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
315 DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
316 return;
318 if (node_pnn < ctdb->num_nodes) {
319 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
322 if (node_pnn == ctdb->pnn) {
323 ctdb->capabilities = ctdb->nodes[node_pnn]->capabilities;
328 update the node capabilities for all connected nodes
330 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
332 uint32_t *nodes;
333 TALLOC_CTX *tmp_ctx;
335 tmp_ctx = talloc_new(ctdb);
336 CTDB_NO_MEMORY(ctdb, tmp_ctx);
338 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
339 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
340 nodes, 0,
341 CONTROL_TIMEOUT(),
342 false, tdb_null,
343 async_getcap_callback, NULL,
344 NULL) != 0) {
345 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
346 talloc_free(tmp_ctx);
347 return -1;
350 talloc_free(tmp_ctx);
351 return 0;
354 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
356 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
358 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
359 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
362 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
364 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
366 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
367 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
371 change recovery mode on all nodes
373 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
375 TDB_DATA data;
376 uint32_t *nodes;
377 TALLOC_CTX *tmp_ctx;
379 tmp_ctx = talloc_new(ctdb);
380 CTDB_NO_MEMORY(ctdb, tmp_ctx);
382 /* freeze all nodes */
383 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
384 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
385 int i;
387 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
388 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
389 nodes, i,
390 CONTROL_TIMEOUT(),
391 false, tdb_null,
392 NULL,
393 set_recmode_fail_callback,
394 rec) != 0) {
395 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
396 talloc_free(tmp_ctx);
397 return -1;
403 data.dsize = sizeof(uint32_t);
404 data.dptr = (unsigned char *)&rec_mode;
406 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
407 nodes, 0,
408 CONTROL_TIMEOUT(),
409 false, data,
410 NULL, NULL,
411 NULL) != 0) {
412 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
413 talloc_free(tmp_ctx);
414 return -1;
417 talloc_free(tmp_ctx);
418 return 0;
422 change recovery master on all nodes
424 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
426 TDB_DATA data;
427 TALLOC_CTX *tmp_ctx;
428 uint32_t *nodes;
430 tmp_ctx = talloc_new(ctdb);
431 CTDB_NO_MEMORY(ctdb, tmp_ctx);
433 data.dsize = sizeof(uint32_t);
434 data.dptr = (unsigned char *)&pnn;
436 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
437 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
438 nodes, 0,
439 CONTROL_TIMEOUT(), false, data,
440 NULL, NULL,
441 NULL) != 0) {
442 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
443 talloc_free(tmp_ctx);
444 return -1;
447 talloc_free(tmp_ctx);
448 return 0;
451 /* update all remote nodes to use the same db priority that we have
452 this can fail if the remote node has not yet been upgraded to
453 support this function, so we always return success and never fail
454 a recovery if this call fails.
456 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
457 struct ctdb_node_map *nodemap,
458 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
460 int db;
462 /* step through all local databases */
463 for (db=0; db<dbmap->num;db++) {
464 struct ctdb_db_priority db_prio;
465 int ret;
467 db_prio.db_id = dbmap->dbs[db].dbid;
468 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
469 if (ret != 0) {
470 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
471 continue;
474 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
476 ret = ctdb_ctrl_set_db_priority(ctdb, CONTROL_TIMEOUT(),
477 CTDB_CURRENT_NODE, &db_prio);
478 if (ret != 0) {
479 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n",
480 db_prio.db_id));
484 return 0;
488 ensure all other nodes have attached to any databases that we have
490 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
491 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
493 int i, j, db, ret;
494 struct ctdb_dbid_map *remote_dbmap;
496 /* verify that all other nodes have all our databases */
497 for (j=0; j<nodemap->num; j++) {
498 /* we don't need to check ourselves */
499 if (nodemap->nodes[j].pnn == pnn) {
500 continue;
502 /* don't check nodes that are unavailable */
503 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
504 continue;
507 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
508 mem_ctx, &remote_dbmap);
509 if (ret != 0) {
510 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
511 return -1;
514 /* step through all local databases */
515 for (db=0; db<dbmap->num;db++) {
516 const char *name;
519 for (i=0;i<remote_dbmap->num;i++) {
520 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
521 break;
524 /* the remote node already has this database */
525 if (i!=remote_dbmap->num) {
526 continue;
528 /* ok so we need to create this database */
529 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
530 dbmap->dbs[db].dbid, mem_ctx,
531 &name);
532 if (ret != 0) {
533 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
534 return -1;
536 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
537 nodemap->nodes[j].pnn,
538 mem_ctx, name,
539 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
540 if (ret != 0) {
541 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
542 return -1;
547 return 0;
552 ensure we are attached to any databases that anyone else is attached to
554 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
555 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
557 int i, j, db, ret;
558 struct ctdb_dbid_map *remote_dbmap;
560 /* verify that we have all databases any other node has */
561 for (j=0; j<nodemap->num; j++) {
562 /* we don't need to check ourselves */
563 if (nodemap->nodes[j].pnn == pnn) {
564 continue;
567 /* don't check nodes that are unavailable */
567 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
568 continue;
571 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
572 mem_ctx, &remote_dbmap);
573 if (ret != 0) {
574 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
575 return -1;
578 /* step through all databases on the remote node */
579 for (db=0; db<remote_dbmap->num;db++) {
580 const char *name;
582 for (i=0;i<(*dbmap)->num;i++) {
583 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
584 break;
587 /* we already have this db locally */
588 if (i!=(*dbmap)->num) {
589 continue;
591 /* ok so we need to create this database and
592 rebuild dbmap
594 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
595 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
596 if (ret != 0) {
597 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
598 nodemap->nodes[j].pnn));
599 return -1;
601 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
602 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
603 if (ret != 0) {
604 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
605 return -1;
607 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
608 if (ret != 0) {
609 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
610 return -1;
615 return 0;
620 pull the remote database contents from one node into the recdb
622 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
623 struct tdb_wrap *recdb, uint32_t dbid)
625 int ret;
626 TDB_DATA outdata;
627 struct ctdb_marshall_buffer *reply;
628 struct ctdb_rec_data *rec;
629 int i;
630 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
632 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
633 CONTROL_TIMEOUT(), &outdata);
634 if (ret != 0) {
635 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
636 talloc_free(tmp_ctx);
637 return -1;
640 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
642 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
643 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
644 talloc_free(tmp_ctx);
645 return -1;
648 rec = (struct ctdb_rec_data *)&reply->data[0];
650 for (i=0;
651 i<reply->count;
652 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
653 TDB_DATA key, data;
654 struct ctdb_ltdb_header *hdr;
655 TDB_DATA existing;
657 key.dptr = &rec->data[0];
658 key.dsize = rec->keylen;
659 data.dptr = &rec->data[key.dsize];
660 data.dsize = rec->datalen;
662 hdr = (struct ctdb_ltdb_header *)data.dptr;
664 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
665 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
666 talloc_free(tmp_ctx);
667 return -1;
670 /* fetch the existing record, if any */
671 existing = tdb_fetch(recdb->tdb, key);
673 if (existing.dptr != NULL) {
674 struct ctdb_ltdb_header header;
675 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
676 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
677 (unsigned)existing.dsize, srcnode));
678 free(existing.dptr);
679 talloc_free(tmp_ctx);
680 return -1;
682 header = *(struct ctdb_ltdb_header *)existing.dptr;
683 free(existing.dptr);
684 if (!(header.rsn < hdr->rsn ||
685 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
686 continue;
690 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
691 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
692 talloc_free(tmp_ctx);
693 return -1;
697 talloc_free(tmp_ctx);
699 return 0;
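/* Callback state used when querying all active nodes to find the one holding
   the highest sequence number for a persistent database. */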
703 struct pull_seqnum_cbdata {
704 int failed;
705 uint32_t pnn;
706 uint64_t seqnum;
709 static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
711 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
712 uint64_t seqnum;
714 if (cb_data->failed != 0) {
715 DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
716 return;
719 if (res != 0) {
720 DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
721 cb_data->failed = 1;
722 return;
725 if (outdata.dsize != sizeof(uint64_t)) {
726 DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
727 cb_data->failed = -1;
728 return;
731 seqnum = *((uint64_t *)outdata.dptr);
733 if (seqnum > cb_data->seqnum ||
734 (cb_data->pnn == -1 && seqnum == 0)) {
735 cb_data->seqnum = seqnum;
736 cb_data->pnn = node_pnn;
740 static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
742 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
744 DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
745 cb_data->failed = 1;
748 static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
749 struct ctdb_recoverd *rec,
750 struct ctdb_node_map *nodemap,
751 struct tdb_wrap *recdb, uint32_t dbid)
753 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
754 uint32_t *nodes;
755 TDB_DATA data;
756 uint32_t outdata[2];
757 struct pull_seqnum_cbdata *cb_data;
759 DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));
761 outdata[0] = dbid;
762 outdata[1] = 0;
764 data.dsize = sizeof(outdata);
765 data.dptr = (uint8_t *)&outdata[0];
767 cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
768 if (cb_data == NULL) {
769 DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
770 talloc_free(tmp_ctx);
771 return -1;
774 cb_data->failed = 0;
775 cb_data->pnn = -1;
776 cb_data->seqnum = 0;
778 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
779 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
780 nodes, 0,
781 CONTROL_TIMEOUT(), false, data,
782 pull_seqnum_cb,
783 pull_seqnum_fail_cb,
784 cb_data) != 0) {
785 DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));
787 talloc_free(tmp_ctx);
788 return -1;
791 if (cb_data->failed != 0) {
792 DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
793 talloc_free(tmp_ctx);
794 return -1;
797 if (cb_data->pnn == -1) {
798 DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
799 talloc_free(tmp_ctx);
800 return -1;
803 DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum));
805 if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
806 DEBUG(DEBUG_ERR, ("Failed to pull highest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
807 talloc_free(tmp_ctx);
808 return -1;
811 talloc_free(tmp_ctx);
812 return 0;
817 pull all the remote database contents into the recdb
819 static int pull_remote_database(struct ctdb_context *ctdb,
820 struct ctdb_recoverd *rec,
821 struct ctdb_node_map *nodemap,
822 struct tdb_wrap *recdb, uint32_t dbid,
823 bool persistent)
825 int j;
827 if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
828 int ret;
829 ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
830 if (ret == 0) {
831 return 0;
835 /* pull all records from all other nodes across onto this node
836 (this merges based on rsn)
838 for (j=0; j<nodemap->num; j++) {
839 /* don't merge from nodes that are unavailable */
840 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
841 continue;
843 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
844 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
845 nodemap->nodes[j].pnn));
846 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
847 return -1;
851 return 0;
856 update flags on all active nodes
858 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
860 int ret;
862 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
863 if (ret != 0) {
864 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
865 return -1;
868 return 0;
872 ensure all nodes have the same vnnmap we do
874 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
875 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
877 int j, ret;
879 /* push the new vnn map out to all the nodes */
880 for (j=0; j<nodemap->num; j++) {
881 /* don't push to nodes that are unavailable */
882 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
883 continue;
886 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
887 if (ret != 0) {
888 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
889 return -1;
893 return 0;
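/* State for one vacuum-fetch operation: a batch of records received from
   srcnode that vacuum_fetch_next() migrates to this node one at a time. */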
897 struct vacuum_info {
898 struct vacuum_info *next, *prev;
899 struct ctdb_recoverd *rec;
900 uint32_t srcnode;
901 struct ctdb_db_context *ctdb_db;
902 struct ctdb_marshall_buffer *recs;
903 struct ctdb_rec_data *r;
906 static void vacuum_fetch_next(struct vacuum_info *v);
909 called when a vacuum fetch has completed - just free it and do the next one
911 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
913 struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
914 talloc_free(state);
915 vacuum_fetch_next(v);
920 process the next element from the vacuum list
922 static void vacuum_fetch_next(struct vacuum_info *v)
924 struct ctdb_call call;
925 struct ctdb_rec_data *r;
927 while (v->recs->count) {
928 struct ctdb_client_call_state *state;
929 TDB_DATA data;
930 struct ctdb_ltdb_header *hdr;
932 ZERO_STRUCT(call);
933 call.call_id = CTDB_NULL_FUNC;
934 call.flags = CTDB_IMMEDIATE_MIGRATION;
935 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
937 r = v->r;
938 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
939 v->recs->count--;
941 call.key.dptr = &r->data[0];
942 call.key.dsize = r->keylen;
944 /* ensure we don't block this daemon - just skip a record if we can't get
945 the chainlock */
946 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
947 continue;
950 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
951 if (data.dptr == NULL) {
952 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
953 continue;
956 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
957 free(data.dptr);
958 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
959 continue;
962 hdr = (struct ctdb_ltdb_header *)data.dptr;
963 if (hdr->dmaster == v->rec->ctdb->pnn) {
964 /* it's already local */
965 free(data.dptr);
966 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
967 continue;
970 free(data.dptr);
972 state = ctdb_call_send(v->ctdb_db, &call);
973 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
974 if (state == NULL) {
975 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
976 talloc_free(v);
977 return;
979 state->async.fn = vacuum_fetch_callback;
980 state->async.private_data = v;
981 return;
984 talloc_free(v);
989 destroy a vacuum info structure
991 static int vacuum_info_destructor(struct vacuum_info *v)
993 DLIST_REMOVE(v->rec->vacuum_info, v);
994 return 0;
999 handler for vacuum fetch
1001 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
1002 TDB_DATA data, void *private_data)
1004 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1005 struct ctdb_marshall_buffer *recs;
1006 int ret, i;
1007 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1008 const char *name;
1009 struct ctdb_dbid_map *dbmap=NULL;
1010 bool persistent = false;
1011 struct ctdb_db_context *ctdb_db;
1012 struct ctdb_rec_data *r;
1013 uint32_t srcnode;
1014 struct vacuum_info *v;
1016 recs = (struct ctdb_marshall_buffer *)data.dptr;
1017 r = (struct ctdb_rec_data *)&recs->data[0];
1019 if (recs->count == 0) {
1020 talloc_free(tmp_ctx);
1021 return;
1024 srcnode = r->reqid;
1026 for (v=rec->vacuum_info;v;v=v->next) {
1027 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
1028 /* we're already working on records from this node */
1029 talloc_free(tmp_ctx);
1030 return;
1034 /* work out if the database is persistent */
1035 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
1036 if (ret != 0) {
1037 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
1038 talloc_free(tmp_ctx);
1039 return;
1042 for (i=0;i<dbmap->num;i++) {
1043 if (dbmap->dbs[i].dbid == recs->db_id) {
1044 persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
1045 break;
1048 if (i == dbmap->num) {
1049 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
1050 talloc_free(tmp_ctx);
1051 return;
1054 /* find the name of this database */
1055 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
1056 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
1057 talloc_free(tmp_ctx);
1058 return;
1061 /* attach to it */
1062 ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
1063 if (ctdb_db == NULL) {
1064 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
1065 talloc_free(tmp_ctx);
1066 return;
1069 v = talloc_zero(rec, struct vacuum_info);
1070 if (v == NULL) {
1071 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
1072 talloc_free(tmp_ctx);
1073 return;
1076 v->rec = rec;
1077 v->srcnode = srcnode;
1078 v->ctdb_db = ctdb_db;
1079 v->recs = talloc_memdup(v, recs, data.dsize);
1080 if (v->recs == NULL) {
1081 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
1082 talloc_free(v);
1083 talloc_free(tmp_ctx);
1084 return;
1086 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
1088 DLIST_ADD(rec->vacuum_info, v);
1090 talloc_set_destructor(v, vacuum_info_destructor);
1092 vacuum_fetch_next(v);
1093 talloc_free(tmp_ctx);
1098 called when ctdb_wait_timeout should finish
1100 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
1101 struct timeval yt, void *p)
1103 uint32_t *timed_out = (uint32_t *)p;
1104 (*timed_out) = 1;
1108 wait for a given number of seconds
1110 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
1112 uint32_t timed_out = 0;
1113 time_t usecs = (secs - (time_t)secs) * 1000000;
1114 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), ctdb_wait_handler, &timed_out);
1115 while (!timed_out) {
1116 event_loop_once(ctdb->ev);
1121 called when an election times out (ends)
1123 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
1124 struct timeval t, void *p)
1126 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1127 rec->election_timeout = NULL;
1128 fast_start = false;
1130 DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
1135 wait for an election to finish. It finishes election_timeout seconds after
1136 the last election packet is received
1138 static void ctdb_wait_election(struct ctdb_recoverd *rec)
1140 struct ctdb_context *ctdb = rec->ctdb;
1141 while (rec->election_timeout) {
1142 event_loop_once(ctdb->ev);
1147 Update our local flags from all remote connected nodes.
1148 This is only run when we are, or believe we are, the recovery master
1150 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
1152 int j;
1153 struct ctdb_context *ctdb = rec->ctdb;
1154 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1156 /* get the nodemap for all active remote nodes and verify
1157 they are the same as for this node
1159 for (j=0; j<nodemap->num; j++) {
1160 struct ctdb_node_map *remote_nodemap=NULL;
1161 int ret;
1163 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
1164 continue;
1166 if (nodemap->nodes[j].pnn == ctdb->pnn) {
1167 continue;
1170 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1171 mem_ctx, &remote_nodemap);
1172 if (ret != 0) {
1173 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
1174 nodemap->nodes[j].pnn));
1175 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
1176 talloc_free(mem_ctx);
1177 return MONITOR_FAILED;
1179 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
1180 /* We should tell our daemon about this so it
1181 updates its flags or else we will log the same
1182 message again in the next iteration of recovery.
1183 Since we are the recovery master we can just as
1184 well update the flags on all nodes.
1186 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
1187 if (ret != 0) {
1188 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
1189 return -1;
1192 /* Update our local copy of the flags in the recovery
1193 daemon.
1195 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1196 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
1197 nodemap->nodes[j].flags));
1198 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1200 talloc_free(remote_nodemap);
1202 talloc_free(mem_ctx);
1203 return MONITOR_OK;
1207 /* Create a new random generation id.
1208 The generation id cannot be the INVALID_GENERATION id
1210 static uint32_t new_generation(void)
1212 uint32_t generation;
1214 while (1) {
1215 generation = random();
1217 if (generation != INVALID_GENERATION) {
1218 break;
1222 return generation;
1227 create a temporary working database
1229 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1231 char *name;
1232 struct tdb_wrap *recdb;
1233 unsigned tdb_flags;
1235 /* open up the temporary recovery database */
1236 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1237 ctdb->db_directory_state,
1238 ctdb->pnn);
1239 if (name == NULL) {
1240 return NULL;
1242 unlink(name);
1244 tdb_flags = TDB_NOLOCK;
1245 if (ctdb->valgrinding) {
1246 tdb_flags |= TDB_NOMMAP;
1248 tdb_flags |= (TDB_INCOMPATIBLE_HASH | TDB_DISALLOW_NESTING);
1250 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1251 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1252 if (recdb == NULL) {
1253 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1256 talloc_free(name);
1258 return recdb;
1263 a traverse function for pulling all relevant records from recdb
1265 struct recdb_data {
1266 struct ctdb_context *ctdb;
1267 struct ctdb_marshall_buffer *recdata;
1268 uint32_t len;
1269 uint32_t allocated_len;
1270 bool failed;
1271 bool persistent;
1274 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1276 struct recdb_data *params = (struct recdb_data *)p;
1277 struct ctdb_rec_data *rec;
1278 struct ctdb_ltdb_header *hdr;
1281 * skip empty records - but NOT for persistent databases:
1283 * The record-by-record mode of recovery deletes empty records.
1284 * For persistent databases, this can lead to data corruption
1285 * by deleting records that should be there:
1287 * - Assume the cluster has been running for a while.
1289 * - A record R in a persistent database has been created and
1290 * deleted a couple of times, the last operation being deletion,
1291 * leaving an empty record with a high RSN, say 10.
1293 * - Now a node N is turned off.
1295 * - This leaves the local copy of the database on N with the empty
1296 * copy of R and RSN 10. On all other nodes, the recovery has deleted
1297 * the copy of record R.
1299 * - Now the record is created again while node N is turned off.
1300 * This creates R with RSN = 1 on all nodes except for N.
1302 * - Now node N is turned on again. The following recovery will choose
1303 * the older empty copy of R due to RSN 10 > RSN 1.
1305 * ==> Hence the record is gone after the recovery.
1307 * On databases like Samba's registry, this can damage the higher-level
1308 * data structures built from the various tdb-level records.
1310 if (!params->persistent && data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1311 return 0;
1314 /* update the dmaster field to point to us */
1315 hdr = (struct ctdb_ltdb_header *)data.dptr;
1316 if (!params->persistent) {
1317 hdr->dmaster = params->ctdb->pnn;
1318 hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
1321 /* add the record to the blob ready to send to the nodes */
1322 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1323 if (rec == NULL) {
1324 params->failed = true;
1325 return -1;
1327 if (params->len + rec->length >= params->allocated_len) {
1328 params->allocated_len = rec->length + params->len + params->ctdb->tunable.pulldb_preallocation_size;
1329 params->recdata = talloc_realloc_size(NULL, params->recdata, params->allocated_len);
1331 if (params->recdata == NULL) {
1332 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u\n",
1333 rec->length + params->len));
1334 params->failed = true;
1335 return -1;
1337 params->recdata->count++;
1338 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1339 params->len += rec->length;
1340 talloc_free(rec);
1342 return 0;
1346 push the recdb database out to all nodes
1348 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1349 bool persistent,
1350 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1352 struct recdb_data params;
1353 struct ctdb_marshall_buffer *recdata;
1354 TDB_DATA outdata;
1355 TALLOC_CTX *tmp_ctx;
1356 uint32_t *nodes;
1358 tmp_ctx = talloc_new(ctdb);
1359 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1361 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1362 CTDB_NO_MEMORY(ctdb, recdata);
1364 recdata->db_id = dbid;
1366 params.ctdb = ctdb;
1367 params.recdata = recdata;
1368 params.len = offsetof(struct ctdb_marshall_buffer, data);
1369 params.allocated_len = params.len;
1370 params.failed = false;
1371 params.persistent = persistent;
1373 if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
1374 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1375 talloc_free(params.recdata);
1376 talloc_free(tmp_ctx);
1377 return -1;
1380 if (params.failed) {
1381 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1382 talloc_free(params.recdata);
1383 talloc_free(tmp_ctx);
1384 return -1;
1387 recdata = params.recdata;
1389 outdata.dptr = (void *)recdata;
1390 outdata.dsize = params.len;
1392 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1393 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1394 nodes, 0,
1395 CONTROL_TIMEOUT(), false, outdata,
1396 NULL, NULL,
1397 NULL) != 0) {
1398 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1399 talloc_free(recdata);
1400 talloc_free(tmp_ctx);
1401 return -1;
1404 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1405 dbid, recdata->count));
1407 talloc_free(recdata);
1408 talloc_free(tmp_ctx);
1410 return 0;
1415 go through a full recovery on one database
1417 static int recover_database(struct ctdb_recoverd *rec,
1418 TALLOC_CTX *mem_ctx,
1419 uint32_t dbid,
1420 bool persistent,
1421 uint32_t pnn,
1422 struct ctdb_node_map *nodemap,
1423 uint32_t transaction_id)
1425 struct tdb_wrap *recdb;
1426 int ret;
1427 struct ctdb_context *ctdb = rec->ctdb;
1428 TDB_DATA data;
1429 struct ctdb_control_wipe_database w;
1430 uint32_t *nodes;
1432 recdb = create_recdb(ctdb, mem_ctx);
1433 if (recdb == NULL) {
1434 return -1;
1437 /* pull all remote databases onto the recdb */
1438 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1439 if (ret != 0) {
1440 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1441 return -1;
1444 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1446 /* wipe all the remote databases. This is safe as we are in a transaction */
1447 w.db_id = dbid;
1448 w.transaction_id = transaction_id;
1450 data.dptr = (void *)&w;
1451 data.dsize = sizeof(w);
1453 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1454 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1455 nodes, 0,
1456 CONTROL_TIMEOUT(), false, data,
1457 NULL, NULL,
1458 NULL) != 0) {
1459 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1460 talloc_free(recdb);
1461 return -1;
1464 /* push out the correct database. This sets the dmaster and skips
1465 the empty records */
1466 ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1467 if (ret != 0) {
1468 talloc_free(recdb);
1469 return -1;
1472 /* all done with this database */
1473 talloc_free(recdb);
1475 return 0;
1478 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1479 struct ctdb_recoverd *rec,
1480 struct ctdb_node_map *nodemap,
1481 uint32_t *culprit)
1483 int j;
1484 int ret;
1486 if (ctdb->num_nodes != nodemap->num) {
1487 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1488 ctdb->num_nodes, nodemap->num));
1489 if (culprit) {
1490 *culprit = ctdb->pnn;
1492 return -1;
1495 for (j=0; j<nodemap->num; j++) {
1496 /* For readability */
1497 struct ctdb_node *node = ctdb->nodes[j];
1499 /* release any existing data */
1500 if (node->known_public_ips) {
1501 talloc_free(node->known_public_ips);
1502 node->known_public_ips = NULL;
1504 if (node->available_public_ips) {
1505 talloc_free(node->available_public_ips);
1506 node->available_public_ips = NULL;
1509 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1510 continue;
1513 /* Retrieve the list of known public IPs from the node */
1514 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1515 CONTROL_TIMEOUT(),
1516 node->pnn,
1517 ctdb->nodes,
1519 &node->known_public_ips);
1520 if (ret != 0) {
1521 DEBUG(DEBUG_ERR,
1522 ("Failed to read known public IPs from node: %u\n",
1523 node->pnn));
1524 if (culprit) {
1525 *culprit = node->pnn;
1527 return -1;
1530 if (ctdb->do_checkpublicip &&
1531 rec->takeover_runs_disable_ctx == NULL &&
1532 verify_remote_ip_allocation(ctdb,
1533 node->known_public_ips,
1534 node->pnn)) {
1535 DEBUG(DEBUG_ERR,("Trigger IP reallocation\n"));
1536 rec->need_takeover_run = true;
1539 /* Retrieve the list of available public IPs from the node */
1540 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1541 CONTROL_TIMEOUT(),
1542 node->pnn,
1543 ctdb->nodes,
1544 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1545 &node->available_public_ips);
1546 if (ret != 0) {
1547 DEBUG(DEBUG_ERR,
1548 ("Failed to read available public IPs from node: %u\n",
1549 node->pnn));
1550 if (culprit) {
1551 *culprit = node->pnn;
1553 return -1;
1557 return 0;
1560 /* when we start a recovery, make sure all nodes use the same reclock file
1561 setting
1563 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
1565 struct ctdb_context *ctdb = rec->ctdb;
1566 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
1567 TDB_DATA data;
1568 uint32_t *nodes;
1570 if (ctdb->recovery_lock_file == NULL) {
1571 data.dptr = NULL;
1572 data.dsize = 0;
1573 } else {
1574 data.dsize = strlen(ctdb->recovery_lock_file) + 1;
1575 data.dptr = (uint8_t *)ctdb->recovery_lock_file;
1578 nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
1579 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
1580 nodes, 0,
1581 CONTROL_TIMEOUT(),
1582 false, data,
1583 NULL, NULL,
1584 rec) != 0) {
1585 DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
1586 talloc_free(tmp_ctx);
1587 return -1;
1590 talloc_free(tmp_ctx);
1591 return 0;
1596 * this callback is called for every node that failed to execute ctdb_takeover_run()
1597 * and sets a flag to re-run the takeover run.
1599 static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
1601 DEBUG(DEBUG_ERR, ("Node %u failed the takeover run\n", node_pnn));
1603 if (callback_data != NULL) {
1604 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
1606 DEBUG(DEBUG_ERR, ("Setting node %u as recovery fail culprit\n", node_pnn));
1608 ctdb_set_culprit(rec, node_pnn);
1613 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
1615 struct ctdb_context *ctdb = rec->ctdb;
1616 int i;
1617 struct ctdb_banning_state *ban_state;
1619 *self_ban = false;
1620 for (i=0; i<ctdb->num_nodes; i++) {
1621 if (ctdb->nodes[i]->ban_state == NULL) {
1622 continue;
1624 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1625 if (ban_state->count < 2*ctdb->num_nodes) {
1626 continue;
1629 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
1630 ctdb->nodes[i]->pnn, ban_state->count,
1631 ctdb->tunable.recovery_ban_period));
1632 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1633 ban_state->count = 0;
1635 /* Banning ourself? */
1636 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
1637 *self_ban = true;
1642 static bool do_takeover_run(struct ctdb_recoverd *rec,
1643 struct ctdb_node_map *nodemap,
1644 bool banning_credits_on_fail)
1646 uint32_t *nodes = NULL;
1647 struct srvid_request_data dtr;
1648 TDB_DATA data;
1649 int i;
1650 uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
1651 int ret;
1652 bool ok;
1654 DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
1656 if (rec->takeover_run_in_progress) {
1657 DEBUG(DEBUG_ERR, (__location__
1658 " takeover run already in progress \n"));
1659 ok = false;
1660 goto done;
1663 rec->takeover_run_in_progress = true;
1665 /* If takeover runs are disabled then fail... */
1666 if (rec->takeover_runs_disable_ctx != NULL) {
1667 DEBUG(DEBUG_ERR,
1668 ("Takeover runs are disabled so refusing to run one\n"));
1669 ok = false;
1670 goto done;
1673 /* Disable IP checks (takeover runs, really) on other nodes
1674 * while doing this takeover run. This will stop those other
1675 * nodes from triggering takeover runs when they think they should
1676 * be hosting an IP but it isn't yet on an interface. Don't
1677 * wait for replies since a failure here might cause some
1678 * noise in the logs but will not actually cause a problem.
1680 dtr.srvid = 0; /* No reply */
1681 dtr.pnn = -1;
1683 data.dptr = (uint8_t*)&dtr;
1684 data.dsize = sizeof(dtr);
1686 nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
1688 /* Disable for 60 seconds. This can be a tunable later if
1689 * necessary.
1691 dtr.data = 60;
1692 for (i = 0; i < talloc_array_length(nodes); i++) {
1693 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1694 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1695 data) != 0) {
1696 DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
1700 ret = ctdb_takeover_run(rec->ctdb, nodemap,
1701 rec->force_rebalance_nodes,
1702 takeover_fail_callback,
1703 banning_credits_on_fail ? rec : NULL);
1705 /* Reenable takeover runs and IP checks on other nodes */
1706 dtr.data = 0;
1707 for (i = 0; i < talloc_array_length(nodes); i++) {
1708 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1709 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1710 data) != 0) {
1711 DEBUG(DEBUG_INFO,("Failed to reenable takeover runs\n"));
1715 if (ret != 0) {
1716 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
1717 ok = false;
1718 goto done;
1721 ok = true;
1722 /* Takeover run was successful so clear force rebalance targets */
1723 if (rebalance_nodes == rec->force_rebalance_nodes) {
1724 TALLOC_FREE(rec->force_rebalance_nodes);
1725 } else {
1726 DEBUG(DEBUG_WARNING,
1727 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1729 done:
1730 rec->need_takeover_run = !ok;
1731 talloc_free(nodes);
1732 rec->takeover_run_in_progress = false;
1734 DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
1735 return ok;
1740 we are the recmaster, and recovery is needed - start a recovery run
1742 static int do_recovery(struct ctdb_recoverd *rec,
1743 TALLOC_CTX *mem_ctx, uint32_t pnn,
1744 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1746 struct ctdb_context *ctdb = rec->ctdb;
1747 int i, j, ret;
1748 uint32_t generation;
1749 struct ctdb_dbid_map *dbmap;
1750 TDB_DATA data;
1751 uint32_t *nodes;
1752 struct timeval start_time;
1753 uint32_t culprit = (uint32_t)-1;
1754 bool self_ban;
1756 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1758 /* if recovery fails, force it again */
1759 rec->need_recovery = true;
1761 ban_misbehaving_nodes(rec, &self_ban);
1762 if (self_ban) {
1763 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
1764 return -1;
1767 if (ctdb->tunable.verify_recovery_lock != 0) {
1768 DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1769 start_time = timeval_current();
1770 if (!ctdb_recovery_lock(ctdb, true)) {
1771 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
1772 "and ban ourself for %u seconds\n",
1773 ctdb->tunable.recovery_ban_period));
1774 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
1775 return -1;
1777 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1778 DEBUG(DEBUG_NOTICE,("Recovery lock taken successfully by recovery daemon\n"));
1781 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1783 /* get a list of all databases */
1784 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1785 if (ret != 0) {
1786 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1787 return -1;
1790 /* we do the db creation before we set the recovery mode, so the freeze happens
1791 on all databases we will be dealing with. */
1793 /* verify that we have all the databases any other node has */
1794 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1795 if (ret != 0) {
1796 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1797 return -1;
1800 /* verify that all other nodes have all our databases */
1801 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1802 if (ret != 0) {
1803 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1804 return -1;
1806 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1808 /* update the database priority for all remote databases */
1809 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1810 if (ret != 0) {
1811 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1813 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1816 /* update all other nodes to use the same setting for reclock files
1817 as the local recovery master.
1819 sync_recovery_lock_file_across_cluster(rec);
1821 /* set recovery mode to active on all nodes */
1822 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1823 if (ret != 0) {
1824 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1825 return -1;
1828 /* execute the "startrecovery" event script on all nodes */
1829 ret = run_startrecovery_eventscript(rec, nodemap);
1830 if (ret!=0) {
1831 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1832 return -1;
1836 update all nodes to have the same flags that we have
1838 for (i=0;i<nodemap->num;i++) {
1839 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1840 continue;
1843 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1844 if (ret != 0) {
1845 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1846 DEBUG(DEBUG_WARNING, (__location__ " Unable to update flags on inactive node %d\n", i));
1847 } else {
1848 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1849 return -1;
1854 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1856 /* pick a new generation number */
1857 generation = new_generation();
1859 /* change the vnnmap on this node to use the new generation
1860 number but not on any other nodes.
1861 this guarantees that if we abort the recovery prematurely
1862 for some reason (a node stops responding?)
1863 that we can just return immediately and we will reenter
1864 recovery shortly again.
1865 I.e. we deliberately leave the cluster with an inconsistent
1866 generation id to allow us to abort recovery at any stage and
1867 just restart it from scratch.
1869 vnnmap->generation = generation;
1870 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1871 if (ret != 0) {
1872 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1873 return -1;
1876 data.dptr = (void *)&generation;
1877 data.dsize = sizeof(uint32_t);
1879 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1880 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1881 nodes, 0,
1882 CONTROL_TIMEOUT(), false, data,
1883 NULL,
1884 transaction_start_fail_callback,
1885 rec) != 0) {
1886 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1887 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1888 nodes, 0,
1889 CONTROL_TIMEOUT(), false, tdb_null,
1890 NULL,
1891 NULL,
1892 NULL) != 0) {
1893 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1895 return -1;
1898 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
1900 for (i=0;i<dbmap->num;i++) {
1901 ret = recover_database(rec, mem_ctx,
1902 dbmap->dbs[i].dbid,
1903 dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
1904 pnn, nodemap, generation);
1905 if (ret != 0) {
1906 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1907 return -1;
1911 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1913 /* commit all the changes */
1914 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1915 nodes, 0,
1916 CONTROL_TIMEOUT(), false, data,
1917 NULL, NULL,
1918 NULL) != 0) {
1919 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1920 return -1;
1923 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1926 /* update the capabilities for all nodes */
1927 ret = update_capabilities(ctdb, nodemap);
1928 if (ret!=0) {
1929 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1930 return -1;
1933 /* build a new vnn map with all the currently active and
1934 unbanned nodes */
1935 generation = new_generation();
1936 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1937 CTDB_NO_MEMORY(ctdb, vnnmap);
1938 vnnmap->generation = generation;
1939 vnnmap->size = 0;
1940 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1941 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1942 for (i=j=0;i<nodemap->num;i++) {
1943 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1944 continue;
1946 if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1947 /* this node can not be an lmaster */
1948 DEBUG(DEBUG_DEBUG, ("Node %d can't be an LMASTER, skipping it\n", i));
1949 continue;
1952 vnnmap->size++;
1953 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1954 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1955 vnnmap->map[j++] = nodemap->nodes[i].pnn;
1958 if (vnnmap->size == 0) {
1959 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1960 vnnmap->size++;
1961 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1962 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1963 vnnmap->map[0] = pnn;
1966 /* update to the new vnnmap on all nodes */
1967 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1968 if (ret != 0) {
1969 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1970 return -1;
1973 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1975 /* update recmaster to point to us for all nodes */
1976 ret = set_recovery_master(ctdb, nodemap, pnn);
1977 if (ret!=0) {
1978 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1979 return -1;
1982 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
1984 /* disable recovery mode */
1985 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
1986 if (ret != 0) {
1987 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1988 return -1;
1991 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
1993 /* Fetch known/available public IPs from each active node */
1994 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
1995 if (ret != 0) {
1996 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
1997 culprit));
1998 rec->need_takeover_run = true;
1999 return -1;
2002 do_takeover_run(rec, nodemap, false);
2004 /* execute the "recovered" event script on all nodes */
2005 ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
2006 if (ret!=0) {
2007 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
2008 return -1;
2011 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
2013 /* send a message to all clients telling them that the cluster
2014 has been reconfigured */
2015 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2016 CTDB_SRVID_RECONFIGURE, tdb_null);
2017 if (ret != 0) {
2018 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
2019 return -1;
2022 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
2024 rec->need_recovery = false;
2026 /* we managed to complete a full recovery, make sure to forgive
2027 any past sins by the nodes that could now participate in the
2028 recovery.
2030 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
2031 for (i=0;i<nodemap->num;i++) {
2032 struct ctdb_banning_state *ban_state;
2034 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2035 continue;
2038 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
2039 if (ban_state == NULL) {
2040 continue;
2043 ban_state->count = 0;
2047 /* We just finished a recovery successfully.
2048 We now wait for rerecovery_timeout before we allow
2049 another recovery to take place.
2051 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
2052 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
2053 DEBUG(DEBUG_NOTICE, ("The rerecovery timeout has elapsed. We now allow recoveries to trigger again.\n"));
2055 return 0;
2060 elections are won by first checking the number of connected nodes, then
2061 the priority time, then the pnn
2063 struct election_message {
2064 uint32_t num_connected;
2065 struct timeval priority_time;
2066 uint32_t pnn;
2067 uint32_t node_flags;
2071 form this node's election data
2073 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
2075 int ret, i;
2076 struct ctdb_node_map *nodemap;
2077 struct ctdb_context *ctdb = rec->ctdb;
2079 ZERO_STRUCTP(em);
2081 em->pnn = rec->ctdb->pnn;
2082 em->priority_time = rec->priority_time;
2084 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
2085 if (ret != 0) {
2086 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
2087 return;
2090 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
2091 em->node_flags = rec->node_flags;
2093 for (i=0;i<nodemap->num;i++) {
2094 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2095 em->num_connected++;
2099 /* we shouldn't try to win this election if we can't be a recmaster */
2100 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2101 em->num_connected = 0;
2102 em->priority_time = timeval_current();
2105 talloc_free(nodemap);
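/* Zeroing num_connected and resetting priority_time to "now" above
 * makes a node without the recmaster capability compare as the least
 * attractive candidate possible, so other nodes will not elect it
 * either; ctdb_election_win() below additionally refuses to win
 * locally in that case.
 */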
2109 see if the given election data wins
2111 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
2113 struct election_message myem;
2114 int cmp = 0;
2116 ctdb_election_data(rec, &myem);
2118 /* we can't win if we don't have the recmaster capability */
2119 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2120 return false;
2123 /* we can't win if we are banned */
2124 if (rec->node_flags & NODE_FLAGS_BANNED) {
2125 return false;
2128 /* we can't win if we are stopped */
2129 if (rec->node_flags & NODE_FLAGS_STOPPED) {
2130 return false;
2133 /* we will automatically win if the other node is banned */
2134 if (em->node_flags & NODE_FLAGS_BANNED) {
2135 return true;
2138 /* we will automatically win if the other node is stopped */
2139 if (em->node_flags & NODE_FLAGS_STOPPED) {
2140 return true;
2143 /* try to use the most connected node */
2144 if (cmp == 0) {
2145 cmp = (int)myem.num_connected - (int)em->num_connected;
2148 /* then the longest running node */
2149 if (cmp == 0) {
2150 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
2153 if (cmp == 0) {
2154 cmp = (int)myem.pnn - (int)em->pnn;
2157 return cmp > 0;
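/* A worked example with purely hypothetical values: the local node has
 * num_connected=3, priority_time=100.0s, pnn=2 and the remote
 * election_message carries num_connected=3, priority_time=250.5s,
 * pnn=0.  The connected counts tie, so priority_time decides:
 * timeval_compare() favours the earlier (longest running) timestamp,
 * here the local node, and we return true.  Only if that also ties
 * does the larger pnn win the final comparison.
 */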
2161 send out an election request
2163 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
2165 int ret;
2166 TDB_DATA election_data;
2167 struct election_message emsg;
2168 uint64_t srvid;
2169 struct ctdb_context *ctdb = rec->ctdb;
2171 srvid = CTDB_SRVID_RECOVERY;
2173 ctdb_election_data(rec, &emsg);
2175 election_data.dsize = sizeof(struct election_message);
2176 election_data.dptr = (unsigned char *)&emsg;
2179 /* first we assume we will win the election and set the
2180 recovery master to be ourselves on the current node
2182 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
2183 if (ret != 0) {
2184 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
2185 return -1;
2189 /* send an election message to all active nodes */
2190 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
2191 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
2195 this function will unban all nodes in the cluster
2197 static void unban_all_nodes(struct ctdb_context *ctdb)
2199 int ret, i;
2200 struct ctdb_node_map *nodemap;
2201 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2203 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2204 if (ret != 0) {
2205 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
2206 return;
2209 for (i=0;i<nodemap->num;i++) {
2210 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
2211 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
2212 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(),
2213 nodemap->nodes[i].pnn, 0,
2214 NODE_FLAGS_BANNED);
2215 if (ret != 0) {
2216 DEBUG(DEBUG_ERR, (__location__ " failed to reset ban state\n"));
2221 talloc_free(tmp_ctx);
2226 we think we are winning the election - send a broadcast election request
2228 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
2230 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2231 int ret;
2233 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
2234 if (ret != 0) {
2235 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
2238 talloc_free(rec->send_election_te);
2239 rec->send_election_te = NULL;
2243 handler for memory dumps
2245 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
2246 TDB_DATA data, void *private_data)
2248 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2249 TDB_DATA *dump;
2250 int ret;
2251 struct srvid_request *rd;
2253 if (data.dsize != sizeof(struct srvid_request)) {
2254 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2255 talloc_free(tmp_ctx);
2256 return;
2258 rd = (struct srvid_request *)data.dptr;
2260 dump = talloc_zero(tmp_ctx, TDB_DATA);
2261 if (dump == NULL) {
2262 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
2263 talloc_free(tmp_ctx);
2264 return;
2266 ret = ctdb_dump_memory(ctdb, dump);
2267 if (ret != 0) {
2268 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
2269 talloc_free(tmp_ctx);
2270 return;
2273 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
2275 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
2276 if (ret != 0) {
2277 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
2278 talloc_free(tmp_ctx);
2279 return;
2282 talloc_free(tmp_ctx);
2286 handler for getlog
2288 static void getlog_handler(struct ctdb_context *ctdb, uint64_t srvid,
2289 TDB_DATA data, void *private_data)
2291 struct ctdb_get_log_addr *log_addr;
2292 pid_t child;
2294 if (data.dsize != sizeof(struct ctdb_get_log_addr)) {
2295 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2296 return;
2298 log_addr = (struct ctdb_get_log_addr *)data.dptr;
2300 child = ctdb_fork_no_free_ringbuffer(ctdb);
2301 if (child == (pid_t)-1) {
2302 DEBUG(DEBUG_ERR,("Failed to fork a log collector child\n"));
2303 return;
2306 if (child == 0) {
2307 ctdb_set_process_name("ctdb_rec_log_collector");
2308 if (switch_from_server_to_client(ctdb, "recoverd-log-collector") != 0) {
2309 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch log collector child into client mode.\n"));
2310 _exit(1);
2312 ctdb_collect_log(ctdb, log_addr);
2313 _exit(0);
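/* Only the forked child collects and sends the log; the parent returns
 * immediately, so a large or slow log dump does not stall the recovery
 * daemon's event loop.
 */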
2318 handler for clearlog
2320 static void clearlog_handler(struct ctdb_context *ctdb, uint64_t srvid,
2321 TDB_DATA data, void *private_data)
2323 ctdb_clear_log(ctdb);
2327 handler for reload_nodes
2329 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
2330 TDB_DATA data, void *private_data)
2332 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2334 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
2336 ctdb_load_nodes_file(rec->ctdb);
2340 static void ctdb_rebalance_timeout(struct event_context *ev,
2341 struct timed_event *te,
2342 struct timeval t, void *p)
2344 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2346 if (rec->force_rebalance_nodes == NULL) {
2347 DEBUG(DEBUG_ERR,
2348 ("Rebalance timeout occurred - no nodes to rebalance\n"));
2349 return;
2352 DEBUG(DEBUG_NOTICE,
2353 ("Rebalance timeout occurred - do takeover run\n"));
2354 do_takeover_run(rec, rec->nodemap, false);
2358 static void recd_node_rebalance_handler(struct ctdb_context *ctdb,
2359 uint64_t srvid,
2360 TDB_DATA data, void *private_data)
2362 uint32_t pnn;
2363 uint32_t *t;
2364 int len;
2365 uint32_t deferred_rebalance;
2366 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2368 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
2369 return;
2372 if (data.dsize != sizeof(uint32_t)) {
2373 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
2374 return;
2377 pnn = *(uint32_t *)&data.dptr[0];
2379 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
2381 /* Copy any existing list of nodes into a new array rather than
2382 * reallocating in place: the old array is the talloc parent of any
2383 * pending rebalance timer, so freeing it below also cancels that
2384 * timeout, which a plain realloc would not guarantee.
2386 len = (rec->force_rebalance_nodes != NULL) ?
2387 talloc_array_length(rec->force_rebalance_nodes) :
2390 /* This allows duplicates to be added but they don't cause
2391 * harm. A call to add a duplicate PNN arguably means that
2392 * the timeout should be reset, so this is the simplest
2393 * solution.
2395 t = talloc_zero_array(rec, uint32_t, len+1);
2396 CTDB_NO_MEMORY_VOID(ctdb, t);
2397 if (len > 0) {
2398 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
2400 t[len] = pnn;
2402 talloc_free(rec->force_rebalance_nodes);
2404 rec->force_rebalance_nodes = t;
2406 /* If configured, setup a deferred takeover run to make sure
2407 * that certain nodes get IPs rebalanced to them. This will
2408 * be cancelled if a successful takeover run happens before
2409 * the timeout. Assign tunable value to variable for
2410 * readability.
2412 deferred_rebalance = ctdb->tunable.deferred_rebalance_on_node_add;
2413 if (deferred_rebalance != 0) {
2414 event_add_timed(ctdb->ev, rec->force_rebalance_nodes,
2415 timeval_current_ofs(deferred_rebalance, 0),
2416 ctdb_rebalance_timeout, rec);
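/* The timer above is allocated on rec->force_rebalance_nodes, so
 * talloc_free()ing that list - which a successful takeover run or a
 * later rebalance message does - also removes any pending rebalance
 * timeout.
 */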
2422 static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid,
2423 TDB_DATA data, void *private_data)
2425 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2426 struct ctdb_public_ip *ip;
2428 if (rec->recmaster != rec->ctdb->pnn) {
2429 DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
2430 return;
2433 if (data.dsize != sizeof(struct ctdb_public_ip)) {
2434 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
2435 return;
2438 ip = (struct ctdb_public_ip *)data.dptr;
2440 update_ip_assignment_tree(rec->ctdb, ip);
2444 static void clear_takeover_runs_disable(struct ctdb_recoverd *rec)
2446 TALLOC_FREE(rec->takeover_runs_disable_ctx);
2449 static void reenable_takeover_runs(struct event_context *ev,
2450 struct timed_event *te,
2451 struct timeval yt, void *p)
2453 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2455 DEBUG(DEBUG_NOTICE,("Reenabling takeover runs after timeout\n"));
2456 clear_takeover_runs_disable(rec);
2459 static void disable_takeover_runs_handler(struct ctdb_context *ctdb,
2460 uint64_t srvid, TDB_DATA data,
2461 void *private_data)
2463 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2464 struct ctdb_recoverd);
2465 struct srvid_request_data *r;
2466 uint32_t timeout;
2467 TDB_DATA result;
2468 int32_t ret = 0;
2470 /* Validate input data */
2471 if (data.dsize != sizeof(struct srvid_request_data)) {
2472 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2473 "expecting %lu\n", (long unsigned)data.dsize,
2474 (long unsigned)sizeof(struct srvid_request_data)));
2475 return;
2477 if (data.dptr == NULL) {
2478 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2479 return;
2482 r = (struct srvid_request_data *)data.dptr;
2483 timeout = r->data;
2485 if (timeout == 0) {
2486 DEBUG(DEBUG_NOTICE,("Reenabling takeover runs\n"));
2487 clear_takeover_runs_disable(rec);
2488 ret = ctdb_get_pnn(ctdb);
2489 goto done;
2492 if (rec->takeover_run_in_progress) {
2493 DEBUG(DEBUG_ERR,
2494 ("Unable to disable takeover runs - in progress\n"));
2495 ret = -EAGAIN;
2496 goto done;
2499 DEBUG(DEBUG_NOTICE,("Disabling takeover runs for %u seconds\n", timeout));
2501 /* Clear any old timers */
2502 clear_takeover_runs_disable(rec);
2504 /* When this is non-NULL it indicates that takeover runs are
2505 * disabled. This context also holds the timeout timer.
2507 rec->takeover_runs_disable_ctx = talloc_new(rec);
2508 if (rec->takeover_runs_disable_ctx == NULL) {
2509 DEBUG(DEBUG_ERR,(__location__ " Unable to allocate memory\n"));
2510 ret = -ENOMEM;
2511 goto done;
2514 /* Arrange for the timeout to occur */
2515 event_add_timed(ctdb->ev, rec->takeover_runs_disable_ctx,
2516 timeval_current_ofs(timeout, 0),
2517 reenable_takeover_runs,
2518 rec);
2520 /* Returning our PNN tells the caller that we succeeded */
2521 ret = ctdb_get_pnn(ctdb);
2522 done:
2523 result.dsize = sizeof(int32_t);
2524 result.dptr = (uint8_t *)&ret;
2525 srvid_request_reply(ctdb, (struct srvid_request *)r, result);
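/* The reply convention used above: a non-negative value (this node's
 * PNN) means takeover runs were disabled or re-enabled as requested,
 * while a negative errno (-EAGAIN, -ENOMEM) reports failure.  As a
 * purely hypothetical illustration, a client could ask for a 60 second
 * disable roughly like this (my_reply_srvid and recmaster_pnn are
 * placeholders, not names from this file):
 *
 *   struct srvid_request_data req = {
 *       .pnn   = ctdb_get_pnn(ctdb),
 *       .srvid = my_reply_srvid,
 *       .data  = 60,
 *   };
 *   TDB_DATA d = { .dptr = (uint8_t *)&req, .dsize = sizeof(req) };
 *   ctdb_client_send_message(ctdb, recmaster_pnn,
 *                            CTDB_SRVID_DISABLE_TAKEOVER_RUNS, d);
 */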
2528 /* Backward compatibility for this SRVID - call
2529 * disable_takeover_runs_handler() instead
2531 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
2532 TDB_DATA data, void *private_data)
2534 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2535 struct ctdb_recoverd);
2536 TDB_DATA data2;
2537 struct srvid_request_data *req;
2539 if (data.dsize != sizeof(uint32_t)) {
2540 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2541 "expecting %lu\n", (long unsigned)data.dsize,
2542 (long unsigned)sizeof(uint32_t)));
2543 return;
2545 if (data.dptr == NULL) {
2546 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2547 return;
2550 req = talloc(ctdb, struct srvid_request_data);
2551 CTDB_NO_MEMORY_VOID(ctdb, req);
2553 req->srvid = 0; /* No reply */
2554 req->pnn = -1;
2555 req->data = *((uint32_t *)data.dptr); /* Timeout */
2557 data2.dsize = sizeof(*req);
2558 data2.dptr = (uint8_t *)req;
2560 disable_takeover_runs_handler(rec->ctdb,
2561 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
2562 data2, rec);
2566 handler for ip reallocate, just add it to the list of requests and
2567 handle this later in the monitor_cluster loop so we do not recurse
2568 with other requests to takeover_run()
2570 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
2571 TDB_DATA data, void *private_data)
2573 struct srvid_request *request;
2574 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2575 struct ctdb_recoverd);
2577 if (data.dsize != sizeof(struct srvid_request)) {
2578 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2579 return;
2582 request = (struct srvid_request *)data.dptr;
2584 srvid_request_add(ctdb, &rec->reallocate_requests, request);
2587 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
2588 struct ctdb_recoverd *rec)
2590 TDB_DATA result;
2591 int32_t ret;
2592 uint32_t culprit;
2593 struct srvid_requests *current;
2595 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
2597 /* Only process requests that are currently pending. More
2598 * might come in while the takeover run is in progress and
2599 * they will need to be processed later since they might
2600 * be in response to flag changes.
2602 current = rec->reallocate_requests;
2603 rec->reallocate_requests = NULL;
2605 /* update the list of public ips that a node can handle for
2606 all connected nodes
2608 ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
2609 if (ret != 0) {
2610 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2611 culprit));
2612 rec->need_takeover_run = true;
2614 if (ret == 0) {
2615 if (do_takeover_run(rec, rec->nodemap, false)) {
2616 ret = ctdb_get_pnn(ctdb);
2617 } else {
2618 ret = -1;
2622 result.dsize = sizeof(int32_t);
2623 result.dptr = (uint8_t *)&ret;
2625 srvid_requests_reply(ctdb, &current, result);
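/* All requests captured above receive the same reply: this node's PNN
 * if the takeover run succeeded, and an error value otherwise.
 * Requests queued while the run was in progress were left on
 * rec->reallocate_requests and will be answered on the next pass
 * through this function.
 */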
2630 handler for recovery master elections
2632 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
2633 TDB_DATA data, void *private_data)
2635 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2636 int ret;
2637 struct election_message *em = (struct election_message *)data.dptr;
2638 TALLOC_CTX *mem_ctx;
2640 /* Ignore election packets from ourselves */
2641 if (ctdb->pnn == em->pnn) {
2642 return;
2645 /* we got an election packet - update the timeout for the election */
2646 talloc_free(rec->election_timeout);
2647 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2648 fast_start ?
2649 timeval_current_ofs(0, 500000) :
2650 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2651 ctdb_election_timeout, rec);
2653 mem_ctx = talloc_new(ctdb);
2655 /* someone called an election. check their election data
2656 and if we disagree and we would rather be the elected node,
2657 send a new election message to all other nodes
2659 if (ctdb_election_win(rec, em)) {
2660 if (!rec->send_election_te) {
2661 rec->send_election_te = event_add_timed(ctdb->ev, rec,
2662 timeval_current_ofs(0, 500000),
2663 election_send_request, rec);
2665 talloc_free(mem_ctx);
2666 /*unban_all_nodes(ctdb);*/
2667 return;
2670 /* we didn't win */
2671 talloc_free(rec->send_election_te);
2672 rec->send_election_te = NULL;
2674 if (ctdb->tunable.verify_recovery_lock != 0) {
2675 /* release the recmaster lock */
2676 if (em->pnn != ctdb->pnn &&
2677 ctdb->recovery_lock_fd != -1) {
2678 close(ctdb->recovery_lock_fd);
2679 ctdb->recovery_lock_fd = -1;
2680 unban_all_nodes(ctdb);
2684 /* ok, let that guy become recmaster then */
2685 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
2686 if (ret != 0) {
2687 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
2688 talloc_free(mem_ctx);
2689 return;
2692 talloc_free(mem_ctx);
2693 return;
2698 force the start of the election process
2700 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2701 struct ctdb_node_map *nodemap)
2703 int ret;
2704 struct ctdb_context *ctdb = rec->ctdb;
2706 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2708 /* set all nodes to recovery mode to stop all internode traffic */
2709 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
2710 if (ret != 0) {
2711 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
2712 return;
2715 talloc_free(rec->election_timeout);
2716 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2717 fast_start ?
2718 timeval_current_ofs(0, 500000) :
2719 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2720 ctdb_election_timeout, rec);
2722 ret = send_election_request(rec, pnn);
2723 if (ret!=0) {
2724 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2725 return;
2728 /* wait for a few seconds to collect all responses */
2729 ctdb_wait_election(rec);
2735 handler for when a node changes its flags
2737 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
2738 TDB_DATA data, void *private_data)
2740 int ret;
2741 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2742 struct ctdb_node_map *nodemap=NULL;
2743 TALLOC_CTX *tmp_ctx;
2744 int i;
2745 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2746 int disabled_flag_changed;
2748 if (data.dsize != sizeof(*c)) {
2749 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2750 return;
2753 tmp_ctx = talloc_new(ctdb);
2754 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2756 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2757 if (ret != 0) {
2758 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2759 talloc_free(tmp_ctx);
2760 return;
2764 for (i=0;i<nodemap->num;i++) {
2765 if (nodemap->nodes[i].pnn == c->pnn) break;
2768 if (i == nodemap->num) {
2769 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2770 talloc_free(tmp_ctx);
2771 return;
2774 if (c->old_flags != c->new_flags) {
2775 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
2778 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2780 nodemap->nodes[i].flags = c->new_flags;
2782 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2783 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2785 if (ret == 0) {
2786 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2787 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2790 if (ret == 0 &&
2791 ctdb->recovery_master == ctdb->pnn &&
2792 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2793 /* Only do the takeover run if the permanently disabled or unhealthy
2794 flags changed, since these cause an ip failover but not
2795 a recovery.
2796 If the node became disconnected or banned this also
2797 leads to an ip address failover, but that case is handled
2798 during recovery
2800 if (disabled_flag_changed) {
2801 rec->need_takeover_run = true;
2805 talloc_free(tmp_ctx);
2809 handler for when we need to push out flag changes to all other nodes
2811 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2812 TDB_DATA data, void *private_data)
2814 int ret;
2815 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2816 struct ctdb_node_map *nodemap=NULL;
2817 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2818 uint32_t recmaster;
2819 uint32_t *nodes;
2821 /* find the recovery master */
2822 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
2823 if (ret != 0) {
2824 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
2825 talloc_free(tmp_ctx);
2826 return;
2829 /* read the node flags from the recmaster */
2830 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
2831 if (ret != 0) {
2832 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2833 talloc_free(tmp_ctx);
2834 return;
2836 if (c->pnn >= nodemap->num) {
2837 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2838 talloc_free(tmp_ctx);
2839 return;
2842 /* send the flags update to all connected nodes */
2843 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2845 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2846 nodes, 0, CONTROL_TIMEOUT(),
2847 false, data,
2848 NULL, NULL,
2849 NULL) != 0) {
2850 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2852 talloc_free(tmp_ctx);
2853 return;
2856 talloc_free(tmp_ctx);
2860 struct verify_recmode_normal_data {
2861 uint32_t count;
2862 enum monitor_result status;
2865 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2867 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2870 /* one more node has responded with recmode data*/
2871 rmdata->count--;
2873 /* if we failed to get the recmode, then return an error and let
2874 the main loop try again.
2876 if (state->state != CTDB_CONTROL_DONE) {
2877 if (rmdata->status == MONITOR_OK) {
2878 rmdata->status = MONITOR_FAILED;
2880 return;
2883 /* if we got a response, then the recmode will be stored in the
2884 status field
2886 if (state->status != CTDB_RECOVERY_NORMAL) {
2887 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2888 rmdata->status = MONITOR_RECOVERY_NEEDED;
2891 return;
2895 /* verify that all nodes are in normal recovery mode */
2896 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2898 struct verify_recmode_normal_data *rmdata;
2899 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2900 struct ctdb_client_control_state *state;
2901 enum monitor_result status;
2902 int j;
2904 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2905 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2906 rmdata->count = 0;
2907 rmdata->status = MONITOR_OK;
2909 /* loop over all active nodes and send an async getrecmode call to
2910 them*/
2911 for (j=0; j<nodemap->num; j++) {
2912 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2913 continue;
2915 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2916 CONTROL_TIMEOUT(),
2917 nodemap->nodes[j].pnn);
2918 if (state == NULL) {
2919 /* we failed to send the control, treat this as
2920 an error and try again next iteration
2922 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2923 talloc_free(mem_ctx);
2924 return MONITOR_FAILED;
2927 /* set up the callback functions */
2928 state->async.fn = verify_recmode_normal_callback;
2929 state->async.private_data = rmdata;
2931 /* one more control to wait for to complete */
2932 rmdata->count++;
2936 /* now wait for up to the maximum number of seconds allowed
2937 or until all nodes we expect a response from have replied
2939 while (rmdata->count > 0) {
2940 event_loop_once(ctdb->ev);
2943 status = rmdata->status;
2944 talloc_free(mem_ctx);
2945 return status;
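/* verify_recmode() above and verify_recmaster() below share the same
 * async fan-out pattern: send a control to every active node, count
 * the outstanding replies in rmdata->count, and spin event_loop_once()
 * until every callback has decremented the counter.  The callbacks
 * never reset rmdata->status back to MONITOR_OK, so a single bad reply
 * is enough to trigger a recovery or an election in the main loop.
 */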
2949 struct verify_recmaster_data {
2950 struct ctdb_recoverd *rec;
2951 uint32_t count;
2952 uint32_t pnn;
2953 enum monitor_result status;
2956 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2958 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2961 /* one more node has responded with recmaster data*/
2962 rmdata->count--;
2964 /* if we failed to get the recmaster, then return an error and let
2965 the main loop try again.
2967 if (state->state != CTDB_CONTROL_DONE) {
2968 if (rmdata->status == MONITOR_OK) {
2969 rmdata->status = MONITOR_FAILED;
2971 return;
2974 /* if we got a response, then the recmaster will be stored in the
2975 status field
2977 if (state->status != rmdata->pnn) {
2978 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
2979 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2980 rmdata->status = MONITOR_ELECTION_NEEDED;
2983 return;
2987 /* verify that all nodes agree that we are the recmaster */
2988 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2990 struct ctdb_context *ctdb = rec->ctdb;
2991 struct verify_recmaster_data *rmdata;
2992 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2993 struct ctdb_client_control_state *state;
2994 enum monitor_result status;
2995 int j;
2997 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2998 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2999 rmdata->rec = rec;
3000 rmdata->count = 0;
3001 rmdata->pnn = pnn;
3002 rmdata->status = MONITOR_OK;
3004 /* loop over all active nodes and send an async getrecmaster call to
3005 them*/
3006 for (j=0; j<nodemap->num; j++) {
3007 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3008 continue;
3010 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
3011 CONTROL_TIMEOUT(),
3012 nodemap->nodes[j].pnn);
3013 if (state == NULL) {
3014 /* we failed to send the control, treat this as
3015 an error and try again next iteration
3017 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
3018 talloc_free(mem_ctx);
3019 return MONITOR_FAILED;
3022 /* set up the callback functions */
3023 state->async.fn = verify_recmaster_callback;
3024 state->async.private_data = rmdata;
3026 /* one more control to wait for to complete */
3027 rmdata->count++;
3031 /* now wait for up to the maximum number of seconds allowed
3032 or until all nodes we expect a response from have replied
3034 while (rmdata->count > 0) {
3035 event_loop_once(ctdb->ev);
3038 status = rmdata->status;
3039 talloc_free(mem_ctx);
3040 return status;
3043 static bool interfaces_have_changed(struct ctdb_context *ctdb,
3044 struct ctdb_recoverd *rec)
3046 struct ctdb_control_get_ifaces *ifaces = NULL;
3047 TALLOC_CTX *mem_ctx;
3048 bool ret = false;
3050 mem_ctx = talloc_new(NULL);
3052 /* Read the interfaces from the local node */
3053 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
3054 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
3055 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
3056 /* We could return an error. However, this will be
3057 * rare so we'll decide that the interfaces have
3058 * actually changed, just in case.
3060 talloc_free(mem_ctx);
3061 return true;
3064 if (!rec->ifaces) {
3065 /* We haven't been here before so things have changed */
3066 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
3067 ret = true;
3068 } else if (rec->ifaces->num != ifaces->num) {
3069 /* Number of interfaces has changed */
3070 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
3071 rec->ifaces->num, ifaces->num));
3072 ret = true;
3073 } else {
3074 /* See if interface names or link states have changed */
3075 int i;
3076 for (i = 0; i < rec->ifaces->num; i++) {
3077 struct ctdb_control_iface_info * iface = &rec->ifaces->ifaces[i];
3078 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
3079 DEBUG(DEBUG_NOTICE,
3080 ("Interface in slot %d changed: %s => %s\n",
3081 i, iface->name, ifaces->ifaces[i].name));
3082 ret = true;
3083 break;
3085 if (iface->link_state != ifaces->ifaces[i].link_state) {
3086 DEBUG(DEBUG_NOTICE,
3087 ("Interface %s changed state: %d => %d\n",
3088 iface->name, iface->link_state,
3089 ifaces->ifaces[i].link_state));
3090 ret = true;
3091 break;
3096 talloc_free(rec->ifaces);
3097 rec->ifaces = talloc_steal(rec, ifaces);
3099 talloc_free(mem_ctx);
3100 return ret;
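/* Whatever the outcome, rec->ifaces is replaced with the freshly read
 * list above, so each call reports only changes since the previous
 * poll from verify_local_ip_allocation().
 */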
3103 /* called to check that the local allocation of public ip addresses is ok.
3105 static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
3107 TALLOC_CTX *mem_ctx = talloc_new(NULL);
3108 struct ctdb_uptime *uptime1 = NULL;
3109 struct ctdb_uptime *uptime2 = NULL;
3110 int ret, j;
3111 bool need_takeover_run = false;
3113 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
3114 CTDB_CURRENT_NODE, &uptime1);
3115 if (ret != 0) {
3116 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
3117 talloc_free(mem_ctx);
3118 return -1;
3121 if (interfaces_have_changed(ctdb, rec)) {
3122 DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
3123 "local node %u - force takeover run\n",
3124 pnn));
3125 need_takeover_run = true;
3128 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
3129 CTDB_CURRENT_NODE, &uptime2);
3130 if (ret != 0) {
3131 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
3132 talloc_free(mem_ctx);
3133 return -1;
3136 /* skip the check if the startrecovery time has changed */
3137 if (timeval_compare(&uptime1->last_recovery_started,
3138 &uptime2->last_recovery_started) != 0) {
3139 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3140 talloc_free(mem_ctx);
3141 return 0;
3144 /* skip the check if the endrecovery time has changed */
3145 if (timeval_compare(&uptime1->last_recovery_finished,
3146 &uptime2->last_recovery_finished) != 0) {
3147 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3148 talloc_free(mem_ctx);
3149 return 0;
3152 /* skip the check if we have started but not finished recovery */
3153 if (timeval_compare(&uptime1->last_recovery_finished,
3154 &uptime1->last_recovery_started) != 1) {
3155 DEBUG(DEBUG_INFO, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
3156 talloc_free(mem_ctx);
3158 return 0;
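/* The two uptime snapshots above bracket the interface check: if a
 * recovery started or finished in between, or one is still in
 * progress, the public ip layout may legitimately be in flux, so the
 * address verification below is skipped for this iteration and
 * retried on the next monitoring pass.
 */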
3161 /* verify that we have the ip addresses we should have
3162 and we don't have ones we shouldn't have.
3163 if we find an inconsistency we ask the recovery
3164 master to do a takeover run to fix up the
3165 ip assignments.
3166 also, if the pnn is -1 and we are healthy and can host the ip,
3167 we request an ip reallocation.
3169 if (ctdb->tunable.disable_ip_failover == 0) {
3170 struct ctdb_all_public_ips *ips = NULL;
3172 /* read the *available* IPs from the local node */
3173 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
3174 if (ret != 0) {
3175 DEBUG(DEBUG_ERR, ("Unable to get available public IPs from local node %u\n", pnn));
3176 talloc_free(mem_ctx);
3177 return -1;
3180 for (j=0; j<ips->num; j++) {
3181 if (ips->ips[j].pnn == -1 &&
3182 nodemap->nodes[pnn].flags == 0) {
3183 DEBUG(DEBUG_CRIT,("Public IP '%s' is not assigned and we could serve it\n",
3184 ctdb_addr_to_str(&ips->ips[j].addr)));
3185 need_takeover_run = true;
3189 talloc_free(ips);
3191 /* read the *known* IPs from the local node */
3192 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
3193 if (ret != 0) {
3194 DEBUG(DEBUG_ERR, ("Unable to get known public IPs from local node %u\n", pnn));
3195 talloc_free(mem_ctx);
3196 return -1;
3199 for (j=0; j<ips->num; j++) {
3200 if (ips->ips[j].pnn == pnn) {
3201 if (ctdb->do_checkpublicip && !ctdb_sys_have_ip(&ips->ips[j].addr)) {
3202 DEBUG(DEBUG_CRIT,("Public IP '%s' is assigned to us but not on an interface\n",
3203 ctdb_addr_to_str(&ips->ips[j].addr)));
3204 need_takeover_run = true;
3206 } else {
3207 if (ctdb->do_checkpublicip &&
3208 ctdb_sys_have_ip(&ips->ips[j].addr)) {
3210 DEBUG(DEBUG_CRIT,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
3211 ctdb_addr_to_str(&ips->ips[j].addr)));
3213 if (ctdb_ctrl_release_ip(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ips->ips[j]) != 0) {
3214 DEBUG(DEBUG_ERR,("Failed to release local IP address\n"));
3221 if (need_takeover_run) {
3222 struct srvid_request rd;
3223 TDB_DATA data;
3225 DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
3227 rd.pnn = ctdb->pnn;
3228 rd.srvid = 0;
3229 data.dptr = (uint8_t *)&rd;
3230 data.dsize = sizeof(rd);
3232 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
3233 if (ret != 0) {
3234 DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
3237 talloc_free(mem_ctx);
3238 return 0;
3242 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
3244 struct ctdb_node_map **remote_nodemaps = callback_data;
3246 if (node_pnn >= ctdb->num_nodes) {
3247 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
3248 return;
3251 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
3255 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
3256 struct ctdb_node_map *nodemap,
3257 struct ctdb_node_map **remote_nodemaps)
3259 uint32_t *nodes;
3261 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
3262 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
3263 nodes, 0,
3264 CONTROL_TIMEOUT(), false, tdb_null,
3265 async_getnodemap_callback,
3266 NULL,
3267 remote_nodemaps) != 0) {
3268 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
3270 return -1;
3273 return 0;
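/* async_getnodemap_callback() above fills remote_nodemaps[] indexed by
 * the replying node's PNN; entries for nodes that never answer stay
 * NULL, which main_loop() treats as a reason to mark that node as a
 * culprit and restart monitoring.
 */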
3276 enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};
3277 struct ctdb_check_reclock_state {
3278 struct ctdb_context *ctdb;
3279 struct timeval start_time;
3280 int fd[2];
3281 pid_t child;
3282 struct timed_event *te;
3283 struct fd_event *fde;
3284 enum reclock_child_status status;
3287 /* when we free the reclock state we must kill any child process.
3289 static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
3291 struct ctdb_context *ctdb = state->ctdb;
3293 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
3295 if (state->fd[0] != -1) {
3296 close(state->fd[0]);
3297 state->fd[0] = -1;
3299 if (state->fd[1] != -1) {
3300 close(state->fd[1]);
3301 state->fd[1] = -1;
3303 ctdb_kill(ctdb, state->child, SIGKILL);
3304 return 0;
3308 called if our check_reclock child times out. this would happen if
3309 i/o to the reclock file blocks.
3311 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
3312 struct timeval t, void *private_data)
3314 struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
3315 struct ctdb_check_reclock_state);
3317 DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timedout CFS slow to grant locks?\n"));
3318 state->status = RECLOCK_TIMEOUT;
3321 /* this is called when the child process has completed checking the reclock
3322 file and has written data back to us through the pipe.
3324 static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
3325 uint16_t flags, void *private_data)
3327 struct ctdb_check_reclock_state *state= talloc_get_type(private_data,
3328 struct ctdb_check_reclock_state);
3329 char c = 0;
3330 int ret;
3332 /* we got a response from our child process so we can abort the
3333 timeout.
3335 talloc_free(state->te);
3336 state->te = NULL;
3338 ret = read(state->fd[0], &c, 1);
3339 if (ret != 1 || c != RECLOCK_OK) {
3340 DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
3341 state->status = RECLOCK_FAILED;
3343 return;
3346 state->status = RECLOCK_OK;
3347 return;
3350 static int check_recovery_lock(struct ctdb_context *ctdb)
3352 int ret;
3353 struct ctdb_check_reclock_state *state;
3354 pid_t parent = getpid();
3356 if (ctdb->recovery_lock_fd == -1) {
3357 DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
3358 return -1;
3361 state = talloc(ctdb, struct ctdb_check_reclock_state);
3362 CTDB_NO_MEMORY(ctdb, state);
3364 state->ctdb = ctdb;
3365 state->start_time = timeval_current();
3366 state->status = RECLOCK_CHECKING;
3367 state->fd[0] = -1;
3368 state->fd[1] = -1;
3370 ret = pipe(state->fd);
3371 if (ret != 0) {
3372 talloc_free(state);
3373 DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
3374 return -1;
3377 state->child = ctdb_fork(ctdb);
3378 if (state->child == (pid_t)-1) {
3379 DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
3380 close(state->fd[0]);
3381 state->fd[0] = -1;
3382 close(state->fd[1]);
3383 state->fd[1] = -1;
3384 talloc_free(state);
3385 return -1;
3388 if (state->child == 0) {
3389 char cc = RECLOCK_OK;
3390 close(state->fd[0]);
3391 state->fd[0] = -1;
3393 ctdb_set_process_name("ctdb_rec_reclock");
3394 debug_extra = talloc_asprintf(NULL, "recovery-lock:");
3395 if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
3396 DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
3397 cc = RECLOCK_FAILED;
3400 write(state->fd[1], &cc, 1);
3401 /* make sure we die when our parent dies */
3402 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
3403 sleep(5);
3405 _exit(0);
3407 close(state->fd[1]);
3408 state->fd[1] = -1;
3409 set_close_on_exec(state->fd[0]);
3411 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for check_recovery_lock\n", state->fd[0]));
3413 talloc_set_destructor(state, check_reclock_destructor);
3415 state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
3416 ctdb_check_reclock_timeout, state);
3417 if (state->te == NULL) {
3418 DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
3419 talloc_free(state);
3420 return -1;
3423 state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
3424 EVENT_FD_READ,
3425 reclock_child_handler,
3426 (void *)state);
3428 if (state->fde == NULL) {
3429 DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
3430 talloc_free(state);
3431 return -1;
3433 tevent_fd_set_auto_close(state->fde);
3435 while (state->status == RECLOCK_CHECKING) {
3436 event_loop_once(ctdb->ev);
3439 if (state->status == RECLOCK_FAILED) {
3440 DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
3441 close(ctdb->recovery_lock_fd);
3442 ctdb->recovery_lock_fd = -1;
3443 talloc_free(state);
3444 return -1;
3447 talloc_free(state);
3448 return 0;
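/* The pread() probe runs in a throw-away child connected back through
 * a pipe, so if the cluster filesystem hangs only the child blocks;
 * the 15 second timer above just flags RECLOCK_TIMEOUT and lets the
 * main loop continue, while the destructor SIGKILLs the child once the
 * state is freed.  Only an explicit RECLOCK_FAILED answer drops the
 * lock fd and makes the caller force a recovery.
 */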
3451 static int update_recovery_lock_file(struct ctdb_context *ctdb)
3453 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
3454 const char *reclockfile;
3456 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
3457 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
3458 talloc_free(tmp_ctx);
3459 return -1;
3462 if (reclockfile == NULL) {
3463 if (ctdb->recovery_lock_file != NULL) {
3464 DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
3465 talloc_free(ctdb->recovery_lock_file);
3466 ctdb->recovery_lock_file = NULL;
3467 if (ctdb->recovery_lock_fd != -1) {
3468 close(ctdb->recovery_lock_fd);
3469 ctdb->recovery_lock_fd = -1;
3472 ctdb->tunable.verify_recovery_lock = 0;
3473 talloc_free(tmp_ctx);
3474 return 0;
3477 if (ctdb->recovery_lock_file == NULL) {
3478 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3479 if (ctdb->recovery_lock_fd != -1) {
3480 close(ctdb->recovery_lock_fd);
3481 ctdb->recovery_lock_fd = -1;
3483 talloc_free(tmp_ctx);
3484 return 0;
3488 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
3489 talloc_free(tmp_ctx);
3490 return 0;
3493 talloc_free(ctdb->recovery_lock_file);
3494 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3495 ctdb->tunable.verify_recovery_lock = 0;
3496 if (ctdb->recovery_lock_fd != -1) {
3497 close(ctdb->recovery_lock_fd);
3498 ctdb->recovery_lock_fd = -1;
3501 talloc_free(tmp_ctx);
3502 return 0;
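/* In every case where the reclock path goes away or changes, the old
 * fd is closed so that a lock held on the previous file is not
 * mistaken for holding the current one; verify_recovery_lock is also
 * cleared when the file is disabled or replaced.
 */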
3505 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
3506 TALLOC_CTX *mem_ctx)
3508 uint32_t pnn;
3509 struct ctdb_node_map *nodemap=NULL;
3510 struct ctdb_node_map *recmaster_nodemap=NULL;
3511 struct ctdb_node_map **remote_nodemaps=NULL;
3512 struct ctdb_vnn_map *vnnmap=NULL;
3513 struct ctdb_vnn_map *remote_vnnmap=NULL;
3514 int32_t debug_level;
3515 int i, j, ret;
3516 bool self_ban;
3519 /* verify that the main daemon is still running */
3520 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
3521 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3522 exit(-1);
3525 /* ping the local daemon to tell it we are alive */
3526 ctdb_ctrl_recd_ping(ctdb);
3528 if (rec->election_timeout) {
3529 /* an election is in progress */
3530 return;
3533 /* read the debug level from the parent and update locally */
3534 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
3535 if (ret !=0) {
3536 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
3537 return;
3539 LogLevel = debug_level;
3541 /* get relevant tunables */
3542 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
3543 if (ret != 0) {
3544 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
3545 return;
3548 /* get the current recovery lock file from the server */
3549 if (update_recovery_lock_file(ctdb) != 0) {
3550 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
3551 return;
3554 /* Make sure that if recovery lock verification becomes disabled,
3555 we close the file
3557 if (ctdb->tunable.verify_recovery_lock == 0) {
3558 if (ctdb->recovery_lock_fd != -1) {
3559 close(ctdb->recovery_lock_fd);
3560 ctdb->recovery_lock_fd = -1;
3564 pnn = ctdb_get_pnn(ctdb);
3566 /* get the vnnmap */
3567 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
3568 if (ret != 0) {
3569 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
3570 return;
3574 /* get number of nodes */
3575 if (rec->nodemap) {
3576 talloc_free(rec->nodemap);
3577 rec->nodemap = NULL;
3578 nodemap=NULL;
3580 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
3581 if (ret != 0) {
3582 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
3583 return;
3585 nodemap = rec->nodemap;
3587 /* remember our own node flags */
3588 rec->node_flags = nodemap->nodes[pnn].flags;
3590 ban_misbehaving_nodes(rec, &self_ban);
3591 if (self_ban) {
3592 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
3593 return;
3596 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
3597 also frozen and that the recmode is set to active.
3599 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
3600 /* If this node has become inactive then we want to
3601 * reduce the chances of it taking over the recovery
3602 * master role when it becomes active again. This
3603 * helps to stabilise the recovery master role so that
3604 * it stays on the most stable node.
3606 rec->priority_time = timeval_current();
3608 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3609 if (ret != 0) {
3610 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3612 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3613 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
3615 ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
3616 if (ret != 0) {
3617 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node in STOPPED or BANNED state\n"));
3618 return;
3620 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3621 if (ret != 0) {
3622 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
3624 return;
3628 /* If this node is stopped or banned then it is not the recovery
3629 * master, so don't do anything. This prevents a stopped or banned
3630 * node from starting an election and sending unnecessary controls.
3632 return;
3635 /* check which node is the recovery master */
3636 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
3637 if (ret != 0) {
3638 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
3639 return;
3642 /* If we are not the recmaster then do some housekeeping */
3643 if (rec->recmaster != pnn) {
3644 /* Ignore any IP reallocate requests - only recmaster
3645 * processes them
3647 TALLOC_FREE(rec->reallocate_requests);
3648 /* Clear any nodes that should be force rebalanced in
3649 * the next takeover run. If the recovery master role
3650 * has moved then we don't want to process these some
3651 * time in the future.
3653 TALLOC_FREE(rec->force_rebalance_nodes);
3656 /* This is a special case. When the recovery daemon is started, recmaster
3657 * is set to -1. If the node was not started in the stopped state, then
3658 * start an election to decide the recovery master
3660 if (rec->recmaster == (uint32_t)-1) {
3661 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
3662 force_election(rec, pnn, nodemap);
3663 return;
3666 /* update the capabilities for all nodes */
3667 ret = update_capabilities(ctdb, nodemap);
3668 if (ret != 0) {
3669 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
3670 return;
3674 * If the current recmaster does not have CTDB_CAP_RECMASTER,
3675 * but we have, then force an election and try to become the new
3676 * recmaster.
3678 if ((rec->ctdb->nodes[rec->recmaster]->capabilities & CTDB_CAP_RECMASTER) == 0 &&
3679 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
3680 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
3681 DEBUG(DEBUG_ERR, (__location__ " Current recmaster node %u does not have CAP_RECMASTER,"
3682 " but we (node %u) have - force an election\n",
3683 rec->recmaster, pnn));
3684 force_election(rec, pnn, nodemap);
3685 return;
3688 /* count how many active nodes there are */
3689 rec->num_active = 0;
3690 rec->num_lmasters = 0;
3691 rec->num_connected = 0;
3692 for (i=0; i<nodemap->num; i++) {
3693 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
3694 rec->num_active++;
3695 if (rec->ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER) {
3696 rec->num_lmasters++;
3699 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
3700 rec->num_connected++;
3705 /* verify that the recmaster node is still active */
3706 for (j=0; j<nodemap->num; j++) {
3707 if (nodemap->nodes[j].pnn==rec->recmaster) {
3708 break;
3712 if (j == nodemap->num) {
3713 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
3714 force_election(rec, pnn, nodemap);
3715 return;
3718 /* if recovery master is disconnected we must elect a new recmaster */
3719 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
3720 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
3721 force_election(rec, pnn, nodemap);
3722 return;
3725 /* get nodemap from the recovery master to check if it is inactive */
3726 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3727 mem_ctx, &recmaster_nodemap);
3728 if (ret != 0) {
3729 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
3730 nodemap->nodes[j].pnn));
3731 return;
3735 if ((recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) &&
3736 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
3737 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
3739 * update our nodemap to carry the recmaster's notion of
3740 * its own flags, so that we don't keep freezing the
3741 * inactive recmaster node...
3743 nodemap->nodes[j].flags = recmaster_nodemap->nodes[j].flags;
3744 force_election(rec, pnn, nodemap);
3745 return;
3748 /* verify that we have all ip addresses we should have and we don't
3749 * have addresses we shouldn't have.
3751 if (ctdb->tunable.disable_ip_failover == 0 &&
3752 rec->takeover_runs_disable_ctx == NULL) {
3753 if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
3754 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
3759 /* if we are not the recmaster then we do not need to check
3760 if recovery is needed
3762 if (pnn != rec->recmaster) {
3763 return;
3767 /* ensure our local copies of flags are right */
3768 ret = update_local_flags(rec, nodemap);
3769 if (ret == MONITOR_ELECTION_NEEDED) {
3770 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
3771 force_election(rec, pnn, nodemap);
3772 return;
3774 if (ret != MONITOR_OK) {
3775 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3776 return;
3779 if (ctdb->num_nodes != nodemap->num) {
3780 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3781 ctdb_load_nodes_file(ctdb);
3782 return;
3785 /* verify that all active nodes agree that we are the recmaster */
3786 switch (verify_recmaster(rec, nodemap, pnn)) {
3787 case MONITOR_RECOVERY_NEEDED:
3788 /* can not happen */
3789 return;
3790 case MONITOR_ELECTION_NEEDED:
3791 force_election(rec, pnn, nodemap);
3792 return;
3793 case MONITOR_OK:
3794 break;
3795 case MONITOR_FAILED:
3796 return;
3800 if (rec->need_recovery) {
3801 /* a previous recovery didn't finish */
3802 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3803 return;
3806 /* verify that all active nodes are in normal mode
3807 and not in recovery mode
3809 switch (verify_recmode(ctdb, nodemap)) {
3810 case MONITOR_RECOVERY_NEEDED:
3811 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3812 return;
3813 case MONITOR_FAILED:
3814 return;
3815 case MONITOR_ELECTION_NEEDED:
3816 /* can not happen */
3817 case MONITOR_OK:
3818 break;
3822 if (ctdb->tunable.verify_recovery_lock != 0) {
3823 /* we should have the reclock - check its not stale */
3824 ret = check_recovery_lock(ctdb);
3825 if (ret != 0) {
3826 DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
3827 ctdb_set_culprit(rec, ctdb->pnn);
3828 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3829 return;
3834 /* if there are takeovers requested, perform it and notify the waiters */
3835 if (rec->takeover_runs_disable_ctx == NULL &&
3836 rec->reallocate_requests) {
3837 process_ipreallocate_requests(ctdb, rec);
3840 /* get the nodemap for all active remote nodes
3842 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3843 if (remote_nodemaps == NULL) {
3844 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3845 return;
3847 for(i=0; i<nodemap->num; i++) {
3848 remote_nodemaps[i] = NULL;
3850 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3851 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3852 return;
3855 /* verify that all other nodes have the same nodemap as we have
3857 for (j=0; j<nodemap->num; j++) {
3858 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3859 continue;
3862 if (remote_nodemaps[j] == NULL) {
3863 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3864 ctdb_set_culprit(rec, j);
3866 return;
3869 /* if the nodes disagree on how many nodes there are
3870 then this is a good reason to try recovery
3872 if (remote_nodemaps[j]->num != nodemap->num) {
3873 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3874 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3875 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3876 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3877 return;
3880 /* if the nodes disagree on which nodes exist and are
3881 active, then that is also a good reason to do recovery
3883 for (i=0;i<nodemap->num;i++) {
3884 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3885 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3886 nodemap->nodes[j].pnn, i,
3887 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3888 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3889 do_recovery(rec, mem_ctx, pnn, nodemap,
3890 vnnmap);
3891 return;
3897 * Update node flags obtained from each active node. This ensures we have
3898 * up-to-date information for all the nodes.
3900 for (j=0; j<nodemap->num; j++) {
3901 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3902 continue;
3904 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
3907 for (j=0; j<nodemap->num; j++) {
3908 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3909 continue;
3912 /* verify the flags are consistent
3913 */
3914 for (i=0; i<nodemap->num; i++) {
3915 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3916 continue;
3919 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3920 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3921 nodemap->nodes[j].pnn,
3922 nodemap->nodes[i].pnn,
3923 remote_nodemaps[j]->nodes[i].flags,
3924 nodemap->nodes[i].flags));
3925 if (i == j) {
3926 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3927 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3928 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3929 do_recovery(rec, mem_ctx, pnn, nodemap,
3930 vnnmap);
3931 return;
3932 } else {
3933 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3934 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3935 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3936 do_recovery(rec, mem_ctx, pnn, nodemap,
3937 vnnmap);
3938 return;
3939 }
3940 }
3941 }
3942 }
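/* Summary of the reconciliation above: a node is treated as authoritative
 * for its own flags (the i == j case), while the recovery master's view wins
 * for every other node.  Either way the corrected flags are pushed to all
 * nodes and a recovery is triggered so the cluster converges on one view. */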
3945 /* There must be the same number of lmasters in the vnn map as
3946  * there are active nodes with the lmaster capability... or
3947  * do a recovery.
3948  */
3949 if (vnnmap->size != rec->num_lmasters) {
3950 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
3951 vnnmap->size, rec->num_lmasters));
3952 ctdb_set_culprit(rec, ctdb->pnn);
3953 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3954 return;
3957 /* verify that all active nodes in the nodemap also exist in
3958    the vnnmap.
3959 */
3960 for (j=0; j<nodemap->num; j++) {
3961 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3962 continue;
3964 if (nodemap->nodes[j].pnn == pnn) {
3965 continue;
3968 for (i=0; i<vnnmap->size; i++) {
3969 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3970 break;
3973 if (i == vnnmap->size) {
3974 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3975 nodemap->nodes[j].pnn));
3976 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3977 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3978 return;
3983 /* verify that all other nodes have the same vnnmap
3984    and are from the same generation
3985 */
3986 for (j=0; j<nodemap->num; j++) {
3987 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3988 continue;
3990 if (nodemap->nodes[j].pnn == pnn) {
3991 continue;
3994 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3995 mem_ctx, &remote_vnnmap);
3996 if (ret != 0) {
3997 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3998 nodemap->nodes[j].pnn));
3999 return;
4002 /* verify the vnnmap generation is the same */
4003 if (vnnmap->generation != remote_vnnmap->generation) {
4004 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
4005 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
4006 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
4007 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
4008 return;
4011 /* verify the vnnmap size is the same */
4012 if (vnnmap->size != remote_vnnmap->size) {
4013 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
4014 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
4015 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
4016 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
4017 return;
4020 /* verify the vnnmap is the same */
4021 for (i=0;i<vnnmap->size;i++) {
4022 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
4023 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
4024 nodemap->nodes[j].pnn));
4025 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
4026 do_recovery(rec, mem_ctx, pnn, nodemap,
4027 vnnmap);
4028 return;
4029 }
4030 }
4031 }
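/* Any disagreement detected above (generation, size or the mapping itself)
 * means the nodes no longer agree on which node is lmaster for which hash
 * range, so the only remedy is a recovery that rebuilds and pushes out a
 * fresh vnnmap. */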
4033 /* we might need to change who has what IP assigned */
4034 if (rec->need_takeover_run) {
4035 uint32_t culprit = (uint32_t)-1;
4037 rec->need_takeover_run = false;
4039 /* update the list of public ips that a node can handle for
4040    all connected nodes
4041 */
4042 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
4043 if (ret != 0) {
4044 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
4045 culprit));
4046 rec->need_takeover_run = true;
4047 return;
4050 /* execute the "startrecovery" event script on all nodes */
4051 ret = run_startrecovery_eventscript(rec, nodemap);
4052 if (ret!=0) {
4053 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
4054 ctdb_set_culprit(rec, ctdb->pnn);
4055 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
4056 return;
4059 /* If the takeover run fails, the offending nodes are
4060  * assigned ban culprit counts and the takeover is retried.
4061  * If the takeover run fails repeatedly, the node will
4062  * eventually get banned.
4063  *
4064  * If rec->need_takeover_run is not set back to true on this
4065  * failure, monitoring stays disabled cluster-wide (via the
4066  * startrecovery eventscript) and will not get re-enabled.
4067  */
4068 if (!do_takeover_run(rec, nodemap, true)) {
4069 return;
4072 /* execute the "recovered" event script on all nodes */
4073 ret = run_recovered_eventscript(rec, nodemap, "monitor_cluster");
4074 #if 0
4075 // we can't check whether the event completed successfully
4076 // since this script WILL fail if the node is in recovery mode
4077 // and if that race happens, the code here would just cause a second
4078 // cascading recovery.
4079 if (ret!=0) {
4080 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
4081 ctdb_set_culprit(rec, ctdb->pnn);
4082 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
4083 }
4084 #endif
4088 /*
4089   the main monitoring loop
4090  */
4091 static void monitor_cluster(struct ctdb_context *ctdb)
4092 {
4093 struct ctdb_recoverd *rec;
4095 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
4097 rec = talloc_zero(ctdb, struct ctdb_recoverd);
4098 CTDB_NO_MEMORY_FATAL(ctdb, rec);
4100 rec->ctdb = ctdb;
4102 rec->takeover_run_in_progress = false;
4104 rec->priority_time = timeval_current();
4106 /* register a message port for sending memory dumps */
4107 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
4109 /* register a message port for requesting logs */
4110 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_GETLOG, getlog_handler, rec);
4112 /* register a message port for clearing logs */
4113 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_CLEARLOG, clearlog_handler, rec);
4115 /* register a message port for recovery elections */
4116 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
4118 /* when nodes are disabled/enabled */
4119 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
4121 /* when we are asked to push out a flag change */
4122 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
4124 /* register a message port for vacuum fetch */
4125 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
4127 /* register a message port for reloadnodes */
4128 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
4130 /* register a message port for performing a takeover run */
4131 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
4133 /* register a message port for disabling the ip check for a short while */
4134 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
4136 /* register a message port for updating the recovery daemon's node assignment for an ip */
4137 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);
4139 /* register a message port for forcing a rebalance of a node at the next
4140    reallocation */
4141 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
4143 /* Register a message port for disabling takeover runs */
4144 ctdb_client_set_message_handler(ctdb,
4145 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
4146 disable_takeover_runs_handler, rec);
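/* Illustrative sketch (not part of this file): a client elsewhere in the
 * cluster can drive one of the handlers registered above by sending a
 * message to the matching SRVID.  For example, to request a takeover run
 * from the recovery master, assuming the handler expects the same
 * struct srvid_request that is used for SRVID replies in this file:
 *
 *   struct srvid_request rq;
 *   TDB_DATA data;
 *
 *   rq.pnn   = own_pnn;          // node that should receive the reply
 *   rq.srvid = getpid();         // non-zero, so a reply is sent back
 *   data.dptr  = (uint8_t *)&rq;
 *   data.dsize = sizeof(rq);
 *   ctdb_client_send_message(ctdb, recmaster_pnn,
 *                            CTDB_SRVID_TAKEOVER_RUN, data);
 *
 * own_pnn and recmaster_pnn are placeholders for values the caller already
 * knows; they are not defined in this file.
 */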
4148 for (;;) {
4149 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
4150 struct timeval start;
4151 double elapsed;
4153 if (!mem_ctx) {
4154 DEBUG(DEBUG_CRIT,(__location__
4155 " Failed to create temp context\n"));
4156 exit(-1);
4159 start = timeval_current();
4160 main_loop(ctdb, rec, mem_ctx);
4161 talloc_free(mem_ctx);
4163 /* we only run the recovery checks once every recover interval */
4164 elapsed = timeval_elapsed(&start);
4165 if (elapsed < ctdb->tunable.recover_interval) {
4166 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
4167 - elapsed);
4168 }
4169 }
4170 }
4172 /*
4173   event handler for when the main ctdbd dies
4174  */
4175 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
4176 uint16_t flags, void *private_data)
4177 {
4178 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
4179 _exit(1);
4182 /*
4183   called regularly to verify that the recovery daemon is still running
4184  */
4185 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
4186 struct timeval yt, void *p)
4187 {
4188 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
4190 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
4191 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
4193 event_add_timed(ctdb->ev, ctdb, timeval_zero(),
4194 ctdb_restart_recd, ctdb);
4196 return;
4199 event_add_timed(ctdb->ev, ctdb->recd_ctx,
4200 timeval_current_ofs(30, 0),
4201 ctdb_check_recd, ctdb);
4202 }
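/* The 30 second re-check interval above is hard-coded and is independent of
 * ctdb->tunable.recover_interval, which only paces the recovery daemon's own
 * monitoring loop. */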
4204 static void recd_sig_child_handler(struct event_context *ev,
4205 struct signal_event *se, int signum, int count,
4206 void *dont_care,
4207 void *private_data)
4209 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4210 int status;
4211 pid_t pid = -1;
4213 while (pid != 0) {
4214 pid = waitpid(-1, &status, WNOHANG);
4215 if (pid == -1) {
4216 if (errno != ECHILD) {
4217 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
4219 return;
4221 if (pid > 0) {
4222 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
4227 /*
4228   startup the recovery daemon as a child of the main ctdb daemon
4229  */
4230 int ctdb_start_recoverd(struct ctdb_context *ctdb)
4231 {
4232 int fd[2];
4233 struct signal_event *se;
4234 struct tevent_fd *fde;
4236 if (pipe(fd) != 0) {
4237 return -1;
4240 ctdb->ctdbd_pid = getpid();
4242 ctdb->recoverd_pid = ctdb_fork_no_free_ringbuffer(ctdb);
4243 if (ctdb->recoverd_pid == -1) {
4244 return -1;
4247 if (ctdb->recoverd_pid != 0) {
4248 talloc_free(ctdb->recd_ctx);
4249 ctdb->recd_ctx = talloc_new(ctdb);
4250 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
4252 close(fd[0]);
4253 event_add_timed(ctdb->ev, ctdb->recd_ctx,
4254 timeval_current_ofs(30, 0),
4255 ctdb_check_recd, ctdb);
4256 return 0;
4259 close(fd[1]);
4261 srandom(getpid() ^ time(NULL));
4263 /* Clear the log ringbuffer */
4264 ctdb_clear_log(ctdb);
4266 ctdb_set_process_name("ctdb_recovered");
4267 if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
4268 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
4269 exit(1);
4272 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
4274 fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
4275 ctdb_recoverd_parent, &fd[0]);
4276 tevent_fd_set_auto_close(fde);
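/* The pipe created at the top of this function is the parent-death detector:
 * the main ctdbd keeps fd[1] open for its lifetime while the recovery daemon
 * watches fd[0].  When ctdbd exits, fd[0] sees EOF, the read event fires and
 * ctdb_recoverd_parent() terminates this child.  A minimal sketch of the same
 * pattern in plain C (illustrative only, not ctdb code):
 *
 *   int fd[2];
 *   pipe(fd);
 *   if (fork() == 0) {                  // child
 *       close(fd[1]);
 *       struct pollfd p = { .fd = fd[0], .events = POLLIN };
 *       poll(&p, 1, -1);                // wakes up with EOF when the parent exits
 *       _exit(1);
 *   }
 *   close(fd[0]);                       // parent keeps fd[1] until it exits
 */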
4278 /* set up a handler to pick up sigchld */
4279 se = event_add_signal(ctdb->ev, ctdb,
4280 SIGCHLD, 0,
4281 recd_sig_child_handler,
4282 ctdb);
4283 if (se == NULL) {
4284 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
4285 exit(1);
4288 monitor_cluster(ctdb);
4290 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
4291 return -1;
4294 /*
4295   shutdown the recovery daemon
4296  */
4297 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
4298 {
4299 if (ctdb->recoverd_pid == 0) {
4300 return;
4303 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
4304 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
4306 TALLOC_FREE(ctdb->recd_ctx);
4307 TALLOC_FREE(ctdb->recd_ping_count);
4310 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te,
4311 struct timeval t, void *private_data)
4312 {
4313 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4315 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
4316 ctdb_stop_recoverd(ctdb);
4317 ctdb_start_recoverd(ctdb);
4318 }