ctdb/server/ctdb_recoverd.c
1 /*
2 ctdb recovery daemon
4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
20 #include "includes.h"
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
25 #include "popt.h"
26 #include "cmdline.h"
27 #include "../include/ctdb_client.h"
28 #include "../include/ctdb_private.h"
29 #include "db_wrap.h"
30 #include "dlinklist.h"
33 /* List of SRVID requests that need to be processed */
34 struct srvid_list {
35 struct srvid_list *next, *prev;
36 struct srvid_request *request;
39 struct srvid_requests {
40 struct srvid_list *requests;
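/*
  send 'result' back to the node/srvid recorded in the request and free it;
  srvid 0 means the sender does not want a reply
 */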
43 static void srvid_request_reply(struct ctdb_context *ctdb,
44 struct srvid_request *request,
45 TDB_DATA result)
47 /* Someone that sent srvid==0 does not want a reply */
48 if (request->srvid == 0) {
49 talloc_free(request);
50 return;
53 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
54 result) == 0) {
55 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
56 (unsigned)request->pnn,
57 (unsigned long long)request->srvid));
58 } else {
59 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
60 (unsigned)request->pnn,
61 (unsigned long long)request->srvid));
64 talloc_free(request);
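/*
  send the same result to every queued request and then free the whole list
 */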
67 static void srvid_requests_reply(struct ctdb_context *ctdb,
68 struct srvid_requests **requests,
69 TDB_DATA result)
71 struct srvid_list *r;
73 for (r = (*requests)->requests; r != NULL; r = r->next) {
74 srvid_request_reply(ctdb, r->request, result);
77 /* Free the list structure... */
78 TALLOC_FREE(*requests);
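/*
  queue a request to be answered later; the list head is allocated on first
  use and the request is stolen onto the list.  If allocation fails, reply
  immediately with -ENOMEM
 */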
81 static void srvid_request_add(struct ctdb_context *ctdb,
82 struct srvid_requests **requests,
83 struct srvid_request *request)
85 struct srvid_list *t;
86 int32_t ret;
87 TDB_DATA result;
89 if (*requests == NULL) {
90 *requests = talloc_zero(ctdb, struct srvid_requests);
91 if (*requests == NULL) {
92 goto nomem;
96 t = talloc_zero(*requests, struct srvid_list);
97 if (t == NULL) {
98 /* If *requests was just allocated above then free it */
99 if ((*requests)->requests == NULL) {
100 TALLOC_FREE(*requests);
102 goto nomem;
105 t->request = (struct srvid_request *)talloc_steal(t, request);
106 DLIST_ADD((*requests)->requests, t);
108 return;
110 nomem:
111 /* Failed to add the request to the list. Send a fail. */
112 DEBUG(DEBUG_ERR, (__location__
113 " Out of memory, failed to queue SRVID request\n"));
114 ret = -ENOMEM;
115 result.dsize = sizeof(ret);
116 result.dptr = (uint8_t *)&ret;
117 srvid_request_reply(ctdb, request, result);
120 struct ctdb_banning_state {
121 uint32_t count;
122 struct timeval last_reported_time;
126 private state of recovery daemon
128 struct ctdb_recoverd {
129 struct ctdb_context *ctdb;
130 uint32_t recmaster;
131 uint32_t num_active;
132 uint32_t num_lmasters;
133 uint32_t num_connected;
134 uint32_t last_culprit_node;
135 struct ctdb_node_map *nodemap;
136 struct timeval priority_time;
137 bool need_takeover_run;
138 bool need_recovery;
139 uint32_t node_flags;
140 struct timed_event *send_election_te;
141 struct timed_event *election_timeout;
142 struct vacuum_info *vacuum_info;
143 struct srvid_requests *reallocate_requests;
144 bool takeover_run_in_progress;
145 TALLOC_CTX *takeover_runs_disable_ctx;
146 struct ctdb_control_get_ifaces *ifaces;
147 uint32_t *force_rebalance_nodes;
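/* control timeout and monitor interval, driven by the recover_timeout and
   recover_interval tunables */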
150 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
151 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
153 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data);
156 ban a node for a period of time
158 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
160 int ret;
161 struct ctdb_context *ctdb = rec->ctdb;
162 struct ctdb_ban_time bantime;
164 if (!ctdb_validate_pnn(ctdb, pnn)) {
165 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
166 return;
169 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
171 bantime.pnn = pnn;
172 bantime.time = ban_time;
174 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
175 if (ret != 0) {
176 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
177 return;
182 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
186 remember the trouble maker
188 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
190 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
191 struct ctdb_banning_state *ban_state;
193 if (culprit >= ctdb->num_nodes) {
194 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
195 return;
198 /* If we are banned or stopped, do not set other nodes as culprits */
199 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
200 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
201 return;
204 if (ctdb->nodes[culprit]->ban_state == NULL) {
205 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
206 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
210 ban_state = ctdb->nodes[culprit]->ban_state;
211 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
212 /* this was the first time in a long while this node
213 misbehaved so we will forgive any old transgressions.
215 ban_state->count = 0;
218 ban_state->count += count;
219 ban_state->last_reported_time = timeval_current();
220 rec->last_culprit_node = culprit;
224 remember the trouble maker
226 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
228 ctdb_set_culprit_count(rec, culprit, 1);
232 /* this callback is called for every node that failed to execute the
233 recovered event
235 static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
237 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
239 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));
241 ctdb_set_culprit(rec, node_pnn);
245 run the "recovered" eventscript on all nodes
247 static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, const char *caller)
249 TALLOC_CTX *tmp_ctx;
250 uint32_t *nodes;
251 struct ctdb_context *ctdb = rec->ctdb;
253 tmp_ctx = talloc_new(ctdb);
254 CTDB_NO_MEMORY(ctdb, tmp_ctx);
256 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
257 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
258 nodes, 0,
259 CONTROL_TIMEOUT(), false, tdb_null,
260 NULL, recovered_fail_callback,
261 rec) != 0) {
262 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
264 talloc_free(tmp_ctx);
265 return -1;
268 talloc_free(tmp_ctx);
269 return 0;
272 /* this callback is called for every node that failed to execute the
273 start recovery event
275 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
277 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
279 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
281 ctdb_set_culprit(rec, node_pnn);
285 run the "startrecovery" eventscript on all nodes
287 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
289 TALLOC_CTX *tmp_ctx;
290 uint32_t *nodes;
291 struct ctdb_context *ctdb = rec->ctdb;
293 tmp_ctx = talloc_new(ctdb);
294 CTDB_NO_MEMORY(ctdb, tmp_ctx);
296 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
297 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
298 nodes, 0,
299 CONTROL_TIMEOUT(), false, tdb_null,
300 NULL,
301 startrecovery_fail_callback,
302 rec) != 0) {
303 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
304 talloc_free(tmp_ctx);
305 return -1;
308 talloc_free(tmp_ctx);
309 return 0;
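/*
  callback for CTDB_CONTROL_GET_CAPABILITIES: store the returned capability
  mask for the reporting node and, if it is the local node, update
  ctdb->capabilities as well
 */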
312 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
314 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
315 DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
316 return;
318 if (node_pnn < ctdb->num_nodes) {
319 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
322 if (node_pnn == ctdb->pnn) {
323 ctdb->capabilities = ctdb->nodes[node_pnn]->capabilities;
328 update the node capabilities for all connected nodes
330 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
332 uint32_t *nodes;
333 TALLOC_CTX *tmp_ctx;
335 tmp_ctx = talloc_new(ctdb);
336 CTDB_NO_MEMORY(ctdb, tmp_ctx);
338 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
339 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
340 nodes, 0,
341 CONTROL_TIMEOUT(),
342 false, tdb_null,
343 async_getcap_callback, NULL,
344 NULL) != 0) {
345 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
346 talloc_free(tmp_ctx);
347 return -1;
350 talloc_free(tmp_ctx);
351 return 0;
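/*
  called when a node fails the FREEZE control during recovery: charge it a
  full round (nodemap->num) of banning credits
 */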
354 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
356 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
358 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
359 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
362 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
364 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
366 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
367 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
371 change recovery mode on all nodes
373 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
375 TDB_DATA data;
376 uint32_t *nodes;
377 TALLOC_CTX *tmp_ctx;
379 tmp_ctx = talloc_new(ctdb);
380 CTDB_NO_MEMORY(ctdb, tmp_ctx);
382 /* freeze all nodes */
383 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
384 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
385 int i;
387 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
388 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
389 nodes, i,
390 CONTROL_TIMEOUT(),
391 false, tdb_null,
392 NULL,
393 set_recmode_fail_callback,
394 rec) != 0) {
395 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
396 talloc_free(tmp_ctx);
397 return -1;
403 data.dsize = sizeof(uint32_t);
404 data.dptr = (unsigned char *)&rec_mode;
406 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
407 nodes, 0,
408 CONTROL_TIMEOUT(),
409 false, data,
410 NULL, NULL,
411 NULL) != 0) {
412 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
413 talloc_free(tmp_ctx);
414 return -1;
417 talloc_free(tmp_ctx);
418 return 0;
422 change recovery master on all nodes
424 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
426 TDB_DATA data;
427 TALLOC_CTX *tmp_ctx;
428 uint32_t *nodes;
430 tmp_ctx = talloc_new(ctdb);
431 CTDB_NO_MEMORY(ctdb, tmp_ctx);
433 data.dsize = sizeof(uint32_t);
434 data.dptr = (unsigned char *)&pnn;
436 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
437 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
438 nodes, 0,
439 CONTROL_TIMEOUT(), false, data,
440 NULL, NULL,
441 NULL) != 0) {
442 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
443 talloc_free(tmp_ctx);
444 return -1;
447 talloc_free(tmp_ctx);
448 return 0;
451 /* update all remote nodes to use the same db priority that we have
452 this can fail if the remote node has not yet been upgraded to
453 support this function, so we always return success and never fail
454 a recovery if this call fails.
456 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
457 struct ctdb_node_map *nodemap,
458 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
460 int db;
461 uint32_t *nodes;
463 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
465 /* step through all local databases */
466 for (db=0; db<dbmap->num;db++) {
467 TDB_DATA data;
468 struct ctdb_db_priority db_prio;
469 int ret;
471 db_prio.db_id = dbmap->dbs[db].dbid;
472 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
473 if (ret != 0) {
474 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
475 continue;
478 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
480 data.dptr = (uint8_t *)&db_prio;
481 data.dsize = sizeof(db_prio);
483 if (ctdb_client_async_control(ctdb,
484 CTDB_CONTROL_SET_DB_PRIORITY,
485 nodes, 0,
486 CONTROL_TIMEOUT(), false, data,
487 NULL, NULL,
488 NULL) != 0) {
489 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n", db_prio.db_id));
493 return 0;
497 ensure all other nodes have attached to any databases that we have
499 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
500 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
502 int i, j, db, ret;
503 struct ctdb_dbid_map *remote_dbmap;
505 /* verify that all other nodes have all our databases */
506 for (j=0; j<nodemap->num; j++) {
507 /* we don't need to check ourselves */
508 if (nodemap->nodes[j].pnn == pnn) {
509 continue;
511 /* don't check nodes that are unavailable */
512 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
513 continue;
516 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
517 mem_ctx, &remote_dbmap);
518 if (ret != 0) {
519 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
520 return -1;
523 /* step through all local databases */
524 for (db=0; db<dbmap->num;db++) {
525 const char *name;
528 for (i=0;i<remote_dbmap->num;i++) {
529 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
530 break;
533 /* the remote node already has this database */
534 if (i!=remote_dbmap->num) {
535 continue;
537 /* ok so we need to create this database */
538 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
539 dbmap->dbs[db].dbid, mem_ctx,
540 &name);
541 if (ret != 0) {
542 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
543 return -1;
545 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
546 nodemap->nodes[j].pnn,
547 mem_ctx, name,
548 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
549 if (ret != 0) {
550 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
551 return -1;
556 return 0;
561 ensure we are attached to any databases that anyone else is attached to
563 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
564 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
566 int i, j, db, ret;
567 struct ctdb_dbid_map *remote_dbmap;
569 /* verify that we have all databases any other node has */
570 for (j=0; j<nodemap->num; j++) {
571 /* we don't need to check ourselves */
572 if (nodemap->nodes[j].pnn == pnn) {
573 continue;
575 /* don't check nodes that are unavailable */
576 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
577 continue;
580 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
581 mem_ctx, &remote_dbmap);
582 if (ret != 0) {
583 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
584 return -1;
587 /* step through all databases on the remote node */
588 for (db=0; db<remote_dbmap->num;db++) {
589 const char *name;
591 for (i=0;i<(*dbmap)->num;i++) {
592 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
593 break;
596 /* we already have this db locally */
597 if (i!=(*dbmap)->num) {
598 continue;
600 /* ok so we need to create this database and
601 rebuild dbmap
603 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
604 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
605 if (ret != 0) {
606 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
607 nodemap->nodes[j].pnn));
608 return -1;
610 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
611 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
612 if (ret != 0) {
613 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
614 return -1;
616 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
617 if (ret != 0) {
618 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
619 return -1;
624 return 0;
629 pull the remote database contents from one node into the recdb
631 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
632 struct tdb_wrap *recdb, uint32_t dbid)
634 int ret;
635 TDB_DATA outdata;
636 struct ctdb_marshall_buffer *reply;
637 struct ctdb_rec_data *rec;
638 int i;
639 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
641 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
642 CONTROL_TIMEOUT(), &outdata);
643 if (ret != 0) {
644 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
645 talloc_free(tmp_ctx);
646 return -1;
649 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
651 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
652 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
653 talloc_free(tmp_ctx);
654 return -1;
657 rec = (struct ctdb_rec_data *)&reply->data[0];
659 for (i=0;
660 i<reply->count;
661 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
662 TDB_DATA key, data;
663 struct ctdb_ltdb_header *hdr;
664 TDB_DATA existing;
666 key.dptr = &rec->data[0];
667 key.dsize = rec->keylen;
668 data.dptr = &rec->data[key.dsize];
669 data.dsize = rec->datalen;
671 hdr = (struct ctdb_ltdb_header *)data.dptr;
673 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
674 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
675 talloc_free(tmp_ctx);
676 return -1;
679 /* fetch the existing record, if any */
680 existing = tdb_fetch(recdb->tdb, key);
682 if (existing.dptr != NULL) {
683 struct ctdb_ltdb_header header;
684 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
685 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
686 (unsigned)existing.dsize, srcnode));
687 free(existing.dptr);
688 talloc_free(tmp_ctx);
689 return -1;
691 header = *(struct ctdb_ltdb_header *)existing.dptr;
692 free(existing.dptr);
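/* keep our copy unless the incoming record has a higher rsn, or the same
   rsn while the stored copy's dmaster is not the recovery master */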
693 if (!(header.rsn < hdr->rsn ||
694 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
695 continue;
699 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
700 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
701 talloc_free(tmp_ctx);
702 return -1;
706 talloc_free(tmp_ctx);
708 return 0;
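/*
  state shared by the GET_DB_SEQNUM callbacks below: pnn/seqnum track the
  node with the highest sequence number seen so far, failed aborts the scan
 */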
712 struct pull_seqnum_cbdata {
713 int failed;
714 uint32_t pnn;
715 uint64_t seqnum;
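/*
  per-node GET_DB_SEQNUM callback: ignore results once the scan has failed,
  mark the scan failed on error or short reply, otherwise remember the
  highest sequence number and the node that reported it
 */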
718 static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
720 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
721 uint64_t seqnum;
723 if (cb_data->failed != 0) {
724 DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
725 return;
728 if (res != 0) {
729 DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
730 cb_data->failed = 1;
731 return;
734 if (outdata.dsize != sizeof(uint64_t)) {
735 DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
736 cb_data->failed = 1;
737 return;
740 seqnum = *((uint64_t *)outdata.dptr);
742 if (seqnum > cb_data->seqnum ||
743 (cb_data->pnn == -1 && seqnum == 0)) {
744 cb_data->seqnum = seqnum;
745 cb_data->pnn = node_pnn;
749 static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
751 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
753 DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
754 cb_data->failed = 1;
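/*
  recover a persistent database by asking every active node for its database
  sequence number and pulling the whole database from the node that reported
  the highest one
 */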
757 static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
758 struct ctdb_recoverd *rec,
759 struct ctdb_node_map *nodemap,
760 struct tdb_wrap *recdb, uint32_t dbid)
762 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
763 uint32_t *nodes;
764 TDB_DATA data;
765 uint32_t outdata[2];
766 struct pull_seqnum_cbdata *cb_data;
768 DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));
770 outdata[0] = dbid;
771 outdata[1] = 0;
773 data.dsize = sizeof(outdata);
774 data.dptr = (uint8_t *)&outdata[0];
776 cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
777 if (cb_data == NULL) {
778 DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
779 talloc_free(tmp_ctx);
780 return -1;
783 cb_data->failed = 0;
784 cb_data->pnn = -1;
785 cb_data->seqnum = 0;
787 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
788 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
789 nodes, 0,
790 CONTROL_TIMEOUT(), false, data,
791 pull_seqnum_cb,
792 pull_seqnum_fail_cb,
793 cb_data) != 0) {
794 DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));
796 talloc_free(tmp_ctx);
797 return -1;
800 if (cb_data->failed != 0) {
801 DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
802 talloc_free(tmp_ctx);
803 return -1;
806 if (cb_data->pnn == -1) {
807 DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
808 talloc_free(tmp_ctx);
809 return -1;
812 DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum));
814 if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
815 DEBUG(DEBUG_ERR, ("Failed to pull highest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
816 talloc_free(tmp_ctx);
817 return -1;
820 talloc_free(tmp_ctx);
821 return 0;
826 pull all the remote database contents into the recdb
828 static int pull_remote_database(struct ctdb_context *ctdb,
829 struct ctdb_recoverd *rec,
830 struct ctdb_node_map *nodemap,
831 struct tdb_wrap *recdb, uint32_t dbid,
832 bool persistent)
834 int j;
836 if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
837 int ret;
838 ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
839 if (ret == 0) {
840 return 0;
844 /* pull all records from all other nodes across onto this node
845 (this merges based on rsn)
847 for (j=0; j<nodemap->num; j++) {
848 /* don't merge from nodes that are unavailable */
849 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
850 continue;
852 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
853 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
854 nodemap->nodes[j].pnn));
855 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
856 return -1;
860 return 0;
865 update flags on all active nodes
867 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
869 int ret;
871 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
872 if (ret != 0) {
873 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
874 return -1;
877 return 0;
881 ensure all nodes have the same vnnmap we do
883 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
884 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
886 int j, ret;
888 /* push the new vnn map out to all the nodes */
889 for (j=0; j<nodemap->num; j++) {
890 /* don't push to nodes that are unavailable */
891 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
892 continue;
895 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
896 if (ret != 0) {
897 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
898 return -1;
902 return 0;
906 struct vacuum_info {
907 struct vacuum_info *next, *prev;
908 struct ctdb_recoverd *rec;
909 uint32_t srcnode;
910 struct ctdb_db_context *ctdb_db;
911 struct ctdb_marshall_buffer *recs;
912 struct ctdb_rec_data *r;
915 static void vacuum_fetch_next(struct vacuum_info *v);
918 called when a vacuum fetch has completed - just free it and do the next one
920 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
922 struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
923 talloc_free(state);
924 vacuum_fetch_next(v);
929 process the next element from the vacuum list
931 static void vacuum_fetch_next(struct vacuum_info *v)
933 struct ctdb_call call;
934 struct ctdb_rec_data *r;
936 while (v->recs->count) {
937 struct ctdb_client_call_state *state;
938 TDB_DATA data;
939 struct ctdb_ltdb_header *hdr;
941 ZERO_STRUCT(call);
942 call.call_id = CTDB_NULL_FUNC;
943 call.flags = CTDB_IMMEDIATE_MIGRATION;
944 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
946 r = v->r;
947 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
948 v->recs->count--;
950 call.key.dptr = &r->data[0];
951 call.key.dsize = r->keylen;
953 /* ensure we don't block this daemon - just skip a record if we can't get
954 the chainlock */
955 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
956 continue;
959 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
960 if (data.dptr == NULL) {
961 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
962 continue;
965 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
966 free(data.dptr);
967 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
968 continue;
971 hdr = (struct ctdb_ltdb_header *)data.dptr;
972 if (hdr->dmaster == v->rec->ctdb->pnn) {
973 /* it's already local */
974 free(data.dptr);
975 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
976 continue;
979 free(data.dptr);
981 state = ctdb_call_send(v->ctdb_db, &call);
982 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
983 if (state == NULL) {
984 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
985 talloc_free(v);
986 return;
988 state->async.fn = vacuum_fetch_callback;
989 state->async.private_data = v;
990 return;
993 talloc_free(v);
998 destroy a vacuum info structure
1000 static int vacuum_info_destructor(struct vacuum_info *v)
1002 DLIST_REMOVE(v->rec->vacuum_info, v);
1003 return 0;
1008 handler for vacuum fetch
1010 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
1011 TDB_DATA data, void *private_data)
1013 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1014 struct ctdb_marshall_buffer *recs;
1015 int ret, i;
1016 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1017 const char *name;
1018 struct ctdb_dbid_map *dbmap=NULL;
1019 bool persistent = false;
1020 struct ctdb_db_context *ctdb_db;
1021 struct ctdb_rec_data *r;
1022 uint32_t srcnode;
1023 struct vacuum_info *v;
1025 recs = (struct ctdb_marshall_buffer *)data.dptr;
1026 r = (struct ctdb_rec_data *)&recs->data[0];
1028 if (recs->count == 0) {
1029 talloc_free(tmp_ctx);
1030 return;
1033 srcnode = r->reqid;
1035 for (v=rec->vacuum_info;v;v=v->next) {
1036 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
1037 /* we're already working on records from this node */
1038 talloc_free(tmp_ctx);
1039 return;
1043 /* work out if the database is persistent */
1044 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
1045 if (ret != 0) {
1046 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
1047 talloc_free(tmp_ctx);
1048 return;
1051 for (i=0;i<dbmap->num;i++) {
1052 if (dbmap->dbs[i].dbid == recs->db_id) {
1053 persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
1054 break;
1057 if (i == dbmap->num) {
1058 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
1059 talloc_free(tmp_ctx);
1060 return;
1063 /* find the name of this database */
1064 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
1065 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
1066 talloc_free(tmp_ctx);
1067 return;
1070 /* attach to it */
1071 ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
1072 if (ctdb_db == NULL) {
1073 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
1074 talloc_free(tmp_ctx);
1075 return;
1078 v = talloc_zero(rec, struct vacuum_info);
1079 if (v == NULL) {
1080 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
1081 talloc_free(tmp_ctx);
1082 return;
1085 v->rec = rec;
1086 v->srcnode = srcnode;
1087 v->ctdb_db = ctdb_db;
1088 v->recs = talloc_memdup(v, recs, data.dsize);
1089 if (v->recs == NULL) {
1090 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
1091 talloc_free(v);
1092 talloc_free(tmp_ctx);
1093 return;
1095 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
1097 DLIST_ADD(rec->vacuum_info, v);
1099 talloc_set_destructor(v, vacuum_info_destructor);
1101 vacuum_fetch_next(v);
1102 talloc_free(tmp_ctx);
1107 called when ctdb_wait_timeout should finish
1109 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
1110 struct timeval yt, void *p)
1112 uint32_t *timed_out = (uint32_t *)p;
1113 (*timed_out) = 1;
1117 wait for a given number of seconds
1119 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
1121 uint32_t timed_out = 0;
1122 time_t usecs = (secs - (time_t)secs) * 1000000;
1123 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), ctdb_wait_handler, &timed_out);
1124 while (!timed_out) {
1125 event_loop_once(ctdb->ev);
1130 called when an election times out (ends)
1132 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
1133 struct timeval t, void *p)
1135 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1136 rec->election_timeout = NULL;
1137 fast_start = false;
1139 DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
1144 wait for an election to finish. It finishes election_timeout seconds after
1145 the last election packet is received
1147 static void ctdb_wait_election(struct ctdb_recoverd *rec)
1149 struct ctdb_context *ctdb = rec->ctdb;
1150 while (rec->election_timeout) {
1151 event_loop_once(ctdb->ev);
1156 Update our local flags from all remote connected nodes.
1157 This is only run when we are, or believe we are, the recovery master
1159 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
1161 int j;
1162 struct ctdb_context *ctdb = rec->ctdb;
1163 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1165 /* get the nodemap for all active remote nodes and verify
1166 they are the same as for this node
1168 for (j=0; j<nodemap->num; j++) {
1169 struct ctdb_node_map *remote_nodemap=NULL;
1170 int ret;
1172 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
1173 continue;
1175 if (nodemap->nodes[j].pnn == ctdb->pnn) {
1176 continue;
1179 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1180 mem_ctx, &remote_nodemap);
1181 if (ret != 0) {
1182 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
1183 nodemap->nodes[j].pnn));
1184 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
1185 talloc_free(mem_ctx);
1186 return MONITOR_FAILED;
1188 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
1189 /* We should tell our daemon about this so it
1190 updates its flags or else we will log the same
1191 message again in the next iteration of recovery.
1192 Since we are the recovery master we can just as
1193 well update the flags on all nodes.
1195 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
1196 if (ret != 0) {
1197 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
1198 return -1;
1201 /* Update our local copy of the flags in the recovery
1202 daemon.
1204 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1205 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
1206 nodemap->nodes[j].flags));
1207 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1209 talloc_free(remote_nodemap);
1211 talloc_free(mem_ctx);
1212 return MONITOR_OK;
1216 /* Create a new random generation id.
1217 The generation id can not be the INVALID_GENERATION id
1219 static uint32_t new_generation(void)
1221 uint32_t generation;
1223 while (1) {
1224 generation = random();
1226 if (generation != INVALID_GENERATION) {
1227 break;
1231 return generation;
1236 create a temporary working database
1238 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1240 char *name;
1241 struct tdb_wrap *recdb;
1242 unsigned tdb_flags;
1244 /* open up the temporary recovery database */
1245 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1246 ctdb->db_directory_state,
1247 ctdb->pnn);
1248 if (name == NULL) {
1249 return NULL;
1251 unlink(name);
1253 tdb_flags = TDB_NOLOCK;
1254 if (ctdb->valgrinding) {
1255 tdb_flags |= TDB_NOMMAP;
1257 tdb_flags |= (TDB_INCOMPATIBLE_HASH | TDB_DISALLOW_NESTING);
1259 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1260 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1261 if (recdb == NULL) {
1262 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1265 talloc_free(name);
1267 return recdb;
1272 a traverse function for pulling all relevant records from recdb
1274 struct recdb_data {
1275 struct ctdb_context *ctdb;
1276 struct ctdb_marshall_buffer *recdata;
1277 uint32_t len;
1278 uint32_t allocated_len;
1279 bool failed;
1280 bool persistent;
1283 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1285 struct recdb_data *params = (struct recdb_data *)p;
1286 struct ctdb_rec_data *rec;
1287 struct ctdb_ltdb_header *hdr;
1290 * skip empty records - but NOT for persistent databases:
1292 * The record-by-record mode of recovery deletes empty records.
1293 * For persistent databases, this can lead to data corruption
1294 * by deleting records that should be there:
1296 * - Assume the cluster has been running for a while.
1298 * - A record R in a persistent database has been created and
1299 * deleted a couple of times, the last operation being deletion,
1300 * leaving an empty record with a high RSN, say 10.
1302 * - Now a node N is turned off.
1304 * - This leaves the local copy of the database on N with the empty
1305 * copy of R and RSN 10. On all other nodes, the recovery has deleted
1306 * the copy of record R.
1308 * - Now the record is created again while node N is turned off.
1309 * This creates R with RSN = 1 on all nodes except for N.
1311 * - Now node N is turned on again. The following recovery will choose
1312 * the older empty copy of R due to RSN 10 > RSN 1.
1314 * ==> Hence the record is gone after the recovery.
1316 * On databases like Samba's registry, this can damage the higher-level
1317 * data structures built from the various tdb-level records.
1319 if (!params->persistent && data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1320 return 0;
1323 /* update the dmaster field to point to us */
1324 hdr = (struct ctdb_ltdb_header *)data.dptr;
1325 if (!params->persistent) {
1326 hdr->dmaster = params->ctdb->pnn;
1327 hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
1330 /* add the record to the blob ready to send to the nodes */
1331 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1332 if (rec == NULL) {
1333 params->failed = true;
1334 return -1;
1336 if (params->len + rec->length >= params->allocated_len) {
1337 params->allocated_len = rec->length + params->len + params->ctdb->tunable.pulldb_preallocation_size;
1338 params->recdata = talloc_realloc_size(NULL, params->recdata, params->allocated_len);
1340 if (params->recdata == NULL) {
1341 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u\n",
1342 rec->length + params->len));
1343 params->failed = true;
1344 return -1;
1346 params->recdata->count++;
1347 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1348 params->len += rec->length;
1349 talloc_free(rec);
1351 return 0;
1355 push the recdb database out to all nodes
1357 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1358 bool persistent,
1359 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1361 struct recdb_data params;
1362 struct ctdb_marshall_buffer *recdata;
1363 TDB_DATA outdata;
1364 TALLOC_CTX *tmp_ctx;
1365 uint32_t *nodes;
1367 tmp_ctx = talloc_new(ctdb);
1368 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1370 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1371 CTDB_NO_MEMORY(ctdb, recdata);
1373 recdata->db_id = dbid;
1375 params.ctdb = ctdb;
1376 params.recdata = recdata;
1377 params.len = offsetof(struct ctdb_marshall_buffer, data);
1378 params.allocated_len = params.len;
1379 params.failed = false;
1380 params.persistent = persistent;
1382 if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
1383 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1384 talloc_free(params.recdata);
1385 talloc_free(tmp_ctx);
1386 return -1;
1389 if (params.failed) {
1390 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1391 talloc_free(params.recdata);
1392 talloc_free(tmp_ctx);
1393 return -1;
1396 recdata = params.recdata;
1398 outdata.dptr = (void *)recdata;
1399 outdata.dsize = params.len;
1401 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1402 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1403 nodes, 0,
1404 CONTROL_TIMEOUT(), false, outdata,
1405 NULL, NULL,
1406 NULL) != 0) {
1407 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1408 talloc_free(recdata);
1409 talloc_free(tmp_ctx);
1410 return -1;
1413 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1414 dbid, recdata->count));
1416 talloc_free(recdata);
1417 talloc_free(tmp_ctx);
1419 return 0;
1424 go through a full recovery on one database
1426 static int recover_database(struct ctdb_recoverd *rec,
1427 TALLOC_CTX *mem_ctx,
1428 uint32_t dbid,
1429 bool persistent,
1430 uint32_t pnn,
1431 struct ctdb_node_map *nodemap,
1432 uint32_t transaction_id)
1434 struct tdb_wrap *recdb;
1435 int ret;
1436 struct ctdb_context *ctdb = rec->ctdb;
1437 TDB_DATA data;
1438 struct ctdb_control_wipe_database w;
1439 uint32_t *nodes;
1441 recdb = create_recdb(ctdb, mem_ctx);
1442 if (recdb == NULL) {
1443 return -1;
1446 /* pull all remote databases onto the recdb */
1447 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1448 if (ret != 0) {
1449 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1450 return -1;
1453 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1455 /* wipe all the remote databases. This is safe as we are in a transaction */
1456 w.db_id = dbid;
1457 w.transaction_id = transaction_id;
1459 data.dptr = (void *)&w;
1460 data.dsize = sizeof(w);
1462 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1463 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1464 nodes, 0,
1465 CONTROL_TIMEOUT(), false, data,
1466 NULL, NULL,
1467 NULL) != 0) {
1468 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1469 talloc_free(recdb);
1470 return -1;
1473 /* push out the correct database. This sets the dmaster and skips
1474 the empty records */
1475 ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1476 if (ret != 0) {
1477 talloc_free(recdb);
1478 return -1;
1481 /* all done with this database */
1482 talloc_free(recdb);
1484 return 0;
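/*
  refresh each active node's lists of known and available public IPs;
  if the reported allocation looks wrong, flag that a takeover run is needed.
  On failure the offending node is reported via *culprit
 */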
1487 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1488 struct ctdb_recoverd *rec,
1489 struct ctdb_node_map *nodemap,
1490 uint32_t *culprit)
1492 int j;
1493 int ret;
1495 if (ctdb->num_nodes != nodemap->num) {
1496 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1497 ctdb->num_nodes, nodemap->num));
1498 if (culprit) {
1499 *culprit = ctdb->pnn;
1501 return -1;
1504 for (j=0; j<nodemap->num; j++) {
1505 /* For readability */
1506 struct ctdb_node *node = ctdb->nodes[j];
1508 /* release any existing data */
1509 if (node->known_public_ips) {
1510 talloc_free(node->known_public_ips);
1511 node->known_public_ips = NULL;
1513 if (node->available_public_ips) {
1514 talloc_free(node->available_public_ips);
1515 node->available_public_ips = NULL;
1518 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1519 continue;
1522 /* Retrieve the list of known public IPs from the node */
1523 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1524 CONTROL_TIMEOUT(),
1525 node->pnn,
1526 ctdb->nodes,
1528 &node->known_public_ips);
1529 if (ret != 0) {
1530 DEBUG(DEBUG_ERR,
1531 ("Failed to read known public IPs from node: %u\n",
1532 node->pnn));
1533 if (culprit) {
1534 *culprit = node->pnn;
1536 return -1;
1539 if (ctdb->do_checkpublicip &&
1540 rec->takeover_runs_disable_ctx == NULL &&
1541 verify_remote_ip_allocation(ctdb,
1542 node->known_public_ips,
1543 node->pnn)) {
1544 DEBUG(DEBUG_ERR,("Trigger IP reallocation\n"));
1545 rec->need_takeover_run = true;
1548 /* Retrieve the list of available public IPs from the node */
1549 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1550 CONTROL_TIMEOUT(),
1551 node->pnn,
1552 ctdb->nodes,
1553 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1554 &node->available_public_ips);
1555 if (ret != 0) {
1556 DEBUG(DEBUG_ERR,
1557 ("Failed to read available public IPs from node: %u\n",
1558 node->pnn));
1559 if (culprit) {
1560 *culprit = node->pnn;
1562 return -1;
1566 return 0;
1569 /* when we start a recovery, make sure all nodes use the same reclock file
1570 setting
1572 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
1574 struct ctdb_context *ctdb = rec->ctdb;
1575 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
1576 TDB_DATA data;
1577 uint32_t *nodes;
1579 if (ctdb->recovery_lock_file == NULL) {
1580 data.dptr = NULL;
1581 data.dsize = 0;
1582 } else {
1583 data.dsize = strlen(ctdb->recovery_lock_file) + 1;
1584 data.dptr = (uint8_t *)ctdb->recovery_lock_file;
1587 nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
1588 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
1589 nodes, 0,
1590 CONTROL_TIMEOUT(),
1591 false, data,
1592 NULL, NULL,
1593 rec) != 0) {
1594 DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
1595 talloc_free(tmp_ctx);
1596 return -1;
1599 talloc_free(tmp_ctx);
1600 return 0;
1605 * this callback is called for every node that failed to execute ctdb_takeover_run()
1606 * and set flag to re-run takeover run.
1608 static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
1610 DEBUG(DEBUG_ERR, ("Node %u failed the takeover run\n", node_pnn));
1612 if (callback_data != NULL) {
1613 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
1615 DEBUG(DEBUG_ERR, ("Setting node %u as recovery fail culprit\n", node_pnn));
1617 ctdb_set_culprit(rec, node_pnn);
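/*
  ban every node that has collected at least 2*num_nodes banning credits;
  *self_ban reports whether the local node was among those banned
 */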
1622 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
1624 struct ctdb_context *ctdb = rec->ctdb;
1625 int i;
1626 struct ctdb_banning_state *ban_state;
1628 *self_ban = false;
1629 for (i=0; i<ctdb->num_nodes; i++) {
1630 if (ctdb->nodes[i]->ban_state == NULL) {
1631 continue;
1633 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1634 if (ban_state->count < 2*ctdb->num_nodes) {
1635 continue;
1638 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
1639 ctdb->nodes[i]->pnn, ban_state->count,
1640 ctdb->tunable.recovery_ban_period));
1641 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1642 ban_state->count = 0;
1644 /* Banning ourself? */
1645 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
1646 *self_ban = true;
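/*
  perform an IP takeover run: temporarily disable takeover runs on the other
  connected nodes, call ctdb_takeover_run(), then re-enable them.  On failure
  need_takeover_run stays set so the run is retried later
 */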
1651 static bool do_takeover_run(struct ctdb_recoverd *rec,
1652 struct ctdb_node_map *nodemap,
1653 bool banning_credits_on_fail)
1655 uint32_t *nodes = NULL;
1656 struct srvid_request_data dtr;
1657 TDB_DATA data;
1658 int i;
1659 uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
1660 int ret;
1661 bool ok;
1663 DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
1665 if (rec->takeover_run_in_progress) {
1666 DEBUG(DEBUG_ERR, (__location__
1667 " takeover run already in progress \n"));
1668 ok = false;
1669 goto done;
1672 rec->takeover_run_in_progress = true;
1674 /* If takeover runs are disabled then fail... */
1675 if (rec->takeover_runs_disable_ctx != NULL) {
1676 DEBUG(DEBUG_ERR,
1677 ("Takeover runs are disabled so refusing to run one\n"));
1678 ok = false;
1679 goto done;
1682 /* Disable IP checks (takeover runs, really) on other nodes
1683 * while doing this takeover run. This will stop those other
1684 * nodes from triggering takeover runs when they think they should
1685 * be hosting an IP but it isn't yet on an interface. Don't
1686 * wait for replies since a failure here might cause some
1687 * noise in the logs but will not actually cause a problem.
1689 dtr.srvid = 0; /* No reply */
1690 dtr.pnn = -1;
1692 data.dptr = (uint8_t*)&dtr;
1693 data.dsize = sizeof(dtr);
1695 nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
1697 /* Disable for 60 seconds. This can be a tunable later if
1698 * necessary.
1700 dtr.data = 60;
1701 for (i = 0; i < talloc_array_length(nodes); i++) {
1702 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1703 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1704 data) != 0) {
1705 DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
1709 ret = ctdb_takeover_run(rec->ctdb, nodemap,
1710 rec->force_rebalance_nodes,
1711 takeover_fail_callback,
1712 banning_credits_on_fail ? rec : NULL);
1714 /* Reenable takeover runs and IP checks on other nodes */
1715 dtr.data = 0;
1716 for (i = 0; i < talloc_array_length(nodes); i++) {
1717 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1718 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1719 data) != 0) {
1720 DEBUG(DEBUG_INFO,("Failed to reenable takeover runs\n"));
1724 if (ret != 0) {
1725 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
1726 ok = false;
1727 goto done;
1730 ok = true;
1731 /* Takeover run was successful so clear force rebalance targets */
1732 if (rebalance_nodes == rec->force_rebalance_nodes) {
1733 TALLOC_FREE(rec->force_rebalance_nodes);
1734 } else {
1735 DEBUG(DEBUG_WARNING,
1736 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1738 done:
1739 rec->need_takeover_run = !ok;
1740 talloc_free(nodes);
1741 rec->takeover_run_in_progress = false;
1743 DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
1744 return ok;
1749 we are the recmaster, and recovery is needed - start a recovery run
1751 static int do_recovery(struct ctdb_recoverd *rec,
1752 TALLOC_CTX *mem_ctx, uint32_t pnn,
1753 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1755 struct ctdb_context *ctdb = rec->ctdb;
1756 int i, j, ret;
1757 uint32_t generation;
1758 struct ctdb_dbid_map *dbmap;
1759 TDB_DATA data;
1760 uint32_t *nodes;
1761 struct timeval start_time;
1762 uint32_t culprit = (uint32_t)-1;
1763 bool self_ban;
1765 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1767 /* if recovery fails, force it again */
1768 rec->need_recovery = true;
1770 ban_misbehaving_nodes(rec, &self_ban);
1771 if (self_ban) {
1772 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
1773 return -1;
1776 if (ctdb->tunable.verify_recovery_lock != 0) {
1777 DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1778 start_time = timeval_current();
1779 if (!ctdb_recovery_lock(ctdb, true)) {
1780 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
1781 "and ban ourself for %u seconds\n",
1782 ctdb->tunable.recovery_ban_period));
1783 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
1784 return -1;
1786 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1787 DEBUG(DEBUG_NOTICE,("Recovery lock taken successfully by recovery daemon\n"));
1790 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1792 /* get a list of all databases */
1793 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1794 if (ret != 0) {
1795 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1796 return -1;
1799 /* we do the db creation before we set the recovery mode, so the freeze happens
1800 on all databases we will be dealing with. */
1802 /* verify that we have all the databases any other node has */
1803 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1804 if (ret != 0) {
1805 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1806 return -1;
1809 /* verify that all other nodes have all our databases */
1810 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1811 if (ret != 0) {
1812 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1813 return -1;
1815 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1817 /* update the database priority for all remote databases */
1818 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1819 if (ret != 0) {
1820 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1822 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1825 /* update all other nodes to use the same setting for reclock files
1826 as the local recovery master.
1828 sync_recovery_lock_file_across_cluster(rec);
1830 /* set recovery mode to active on all nodes */
1831 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1832 if (ret != 0) {
1833 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1834 return -1;
1837 /* execute the "startrecovery" event script on all nodes */
1838 ret = run_startrecovery_eventscript(rec, nodemap);
1839 if (ret!=0) {
1840 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1841 return -1;
1845 update all nodes to have the same flags that we have
1847 for (i=0;i<nodemap->num;i++) {
1848 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1849 continue;
1852 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1853 if (ret != 0) {
1854 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1855 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
1856 } else {
1857 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1858 return -1;
1863 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1865 /* pick a new generation number */
1866 generation = new_generation();
1868 /* change the vnnmap on this node to use the new generation
1869 number but not on any other nodes.
1870 this guarantees that if we abort the recovery prematurely
1871 for some reason (a node stops responding?)
1872 that we can just return immediately and we will reenter
1873 recovery shortly again.
1874 I.e. we deliberately leave the cluster with an inconsistent
1875 generation id to allow us to abort recovery at any stage and
1876 just restart it from scratch.
1878 vnnmap->generation = generation;
1879 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1880 if (ret != 0) {
1881 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1882 return -1;
1885 data.dptr = (void *)&generation;
1886 data.dsize = sizeof(uint32_t);
1888 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1889 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1890 nodes, 0,
1891 CONTROL_TIMEOUT(), false, data,
1892 NULL,
1893 transaction_start_fail_callback,
1894 rec) != 0) {
1895 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1896 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1897 nodes, 0,
1898 CONTROL_TIMEOUT(), false, tdb_null,
1899 NULL,
1900 NULL,
1901 NULL) != 0) {
1902 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1904 return -1;
1907 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
1909 for (i=0;i<dbmap->num;i++) {
1910 ret = recover_database(rec, mem_ctx,
1911 dbmap->dbs[i].dbid,
1912 dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
1913 pnn, nodemap, generation);
1914 if (ret != 0) {
1915 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1916 return -1;
1920 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1922 /* commit all the changes */
1923 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1924 nodes, 0,
1925 CONTROL_TIMEOUT(), false, data,
1926 NULL, NULL,
1927 NULL) != 0) {
1928 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1929 return -1;
1932 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1935 /* update the capabilities for all nodes */
1936 ret = update_capabilities(ctdb, nodemap);
1937 if (ret!=0) {
1938 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1939 return -1;
1942 /* build a new vnn map with all the currently active and
1943 unbanned nodes */
1944 generation = new_generation();
1945 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1946 CTDB_NO_MEMORY(ctdb, vnnmap);
1947 vnnmap->generation = generation;
1948 vnnmap->size = 0;
1949 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1950 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1951 for (i=j=0;i<nodemap->num;i++) {
1952 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1953 continue;
1955 if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1956 /* this node can not be an lmaster */
1957 DEBUG(DEBUG_DEBUG, ("Node %d can't be an LMASTER, skipping it\n", i));
1958 continue;
1961 vnnmap->size++;
1962 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1963 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1964 vnnmap->map[j++] = nodemap->nodes[i].pnn;
1967 if (vnnmap->size == 0) {
1968 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1969 vnnmap->size++;
1970 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1971 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1972 vnnmap->map[0] = pnn;
1975 /* update to the new vnnmap on all nodes */
1976 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1977 if (ret != 0) {
1978 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1979 return -1;
1982 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1984 /* update recmaster to point to us for all nodes */
1985 ret = set_recovery_master(ctdb, nodemap, pnn);
1986 if (ret!=0) {
1987 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1988 return -1;
1991 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
1993 /* disable recovery mode */
1994 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
1995 if (ret != 0) {
1996 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1997 return -1;
2000 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
2002 /* Fetch known/available public IPs from each active node */
2003 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
2004 if (ret != 0) {
2005 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2006 culprit));
2007 rec->need_takeover_run = true;
2008 return -1;
2011 do_takeover_run(rec, nodemap, false);
2013 /* execute the "recovered" event script on all nodes */
2014 ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
2015 if (ret!=0) {
2016 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
2017 return -1;
2020 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
2022 /* send a message to all clients telling them that the cluster
2023 has been reconfigured */
2024 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2025 CTDB_SRVID_RECONFIGURE, tdb_null);
2026 if (ret != 0) {
2027 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
2028 return -1;
2031 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
2033 rec->need_recovery = false;
2035 /* we managed to complete a full recovery, make sure to forgive
2036 any past sins by the nodes that could now participate in the
2037 recovery.
2039 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
2040 for (i=0;i<nodemap->num;i++) {
2041 struct ctdb_banning_state *ban_state;
2043 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2044 continue;
2047 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
2048 if (ban_state == NULL) {
2049 continue;
2052 ban_state->count = 0;
2056 /* We just finished a recovery successfully.
2057 We now wait for rerecovery_timeout before we allow
2058 another recovery to take place.
2060 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
2061 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
2062 DEBUG(DEBUG_NOTICE, ("The rerecovery timeout has elapsed. We now allow recoveries to trigger again.\n"));
2064 return 0;
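/*
  A condensed, illustrative restatement of the vnnmap rebuild above (not
  part of ctdb; the array handling is simplified): every active node that
  advertises the LMASTER capability gets a slot, and if no such node is
  available the local node (the recmaster) is used so the map is never
  empty.

    static uint32_t demo_build_vnnmap(uint32_t *map,          // out: at least num_nodes+1 slots
                                      const uint32_t *pnns,   // PNNs of the active nodes
                                      const bool *is_lmaster, // LMASTER capability per node
                                      uint32_t num_nodes,
                                      uint32_t local_pnn)
    {
            uint32_t i, size = 0;

            for (i = 0; i < num_nodes; i++) {
                    if (is_lmaster[i]) {
                            map[size++] = pnns[i];
                    }
            }
            if (size == 0) {
                    map[size++] = local_pnn;  // fall back to the recmaster itself
            }
            return size;
    }
 */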
2069 elections are won by first checking the number of connected nodes, then
2070 the priority time, then the pnn
2072 struct election_message {
2073 uint32_t num_connected;
2074 struct timeval priority_time;
2075 uint32_t pnn;
2076 uint32_t node_flags;
2080 form this node's election data
2082 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
2084 int ret, i;
2085 struct ctdb_node_map *nodemap;
2086 struct ctdb_context *ctdb = rec->ctdb;
2088 ZERO_STRUCTP(em);
2090 em->pnn = rec->ctdb->pnn;
2091 em->priority_time = rec->priority_time;
2093 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
2094 if (ret != 0) {
2095 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
2096 return;
2099 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
2100 em->node_flags = rec->node_flags;
2102 for (i=0;i<nodemap->num;i++) {
2103 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2104 em->num_connected++;
2108 /* we shouldn't try to win this election if we can't be a recmaster */
2109 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2110 em->num_connected = 0;
2111 em->priority_time = timeval_current();
2114 talloc_free(nodemap);
2118 see if the given election data wins
2120 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
2122 struct election_message myem;
2123 int cmp = 0;
2125 ctdb_election_data(rec, &myem);
2127 /* we can't win if we don't have the recmaster capability */
2128 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2129 return false;
2132 /* we can't win if we are banned */
2133 if (rec->node_flags & NODE_FLAGS_BANNED) {
2134 return false;
2137 /* we can't win if we are stopped */
2138 if (rec->node_flags & NODE_FLAGS_STOPPED) {
2139 return false;
2142 /* we will automatically win if the other node is banned */
2143 if (em->node_flags & NODE_FLAGS_BANNED) {
2144 return true;
2147 /* we will automatically win if the other node is stopped */
2148 if (em->node_flags & NODE_FLAGS_STOPPED) {
2149 return true;
2152 /* try to use the most connected node */
2153 if (cmp == 0) {
2154 cmp = (int)myem.num_connected - (int)em->num_connected;
2157 /* then the longest running node */
2158 if (cmp == 0) {
2159 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
2162 if (cmp == 0) {
2163 cmp = (int)myem.pnn - (int)em->pnn;
2166 return cmp > 0;
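/*
  A standalone sketch (illustrative only, and ignoring the early checks on
  our own flags and capabilities) of the comparison order used in
  ctdb_election_win() above: a banned or stopped rival loses outright,
  otherwise the candidate with more connected nodes wins, then the
  longer-running one (earlier priority_time), then the higher PNN.

    static bool demo_wins(const struct election_message *mine,
                          const struct election_message *other)
    {
            if (other->node_flags & (NODE_FLAGS_BANNED | NODE_FLAGS_STOPPED)) {
                    return true;            // a banned or stopped rival always loses
            }
            if (mine->num_connected != other->num_connected) {
                    return mine->num_connected > other->num_connected;
            }
            if (timeval_compare(&other->priority_time,
                                &mine->priority_time) != 0) {
                    return timeval_compare(&other->priority_time,
                                           &mine->priority_time) > 0;
            }
            return mine->pnn > other->pnn;  // final tie-break on PNN
    }
 */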
2170 send out an election request
2172 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
2174 int ret;
2175 TDB_DATA election_data;
2176 struct election_message emsg;
2177 uint64_t srvid;
2178 struct ctdb_context *ctdb = rec->ctdb;
2180 srvid = CTDB_SRVID_RECOVERY;
2182 ctdb_election_data(rec, &emsg);
2184 election_data.dsize = sizeof(struct election_message);
2185 election_data.dptr = (unsigned char *)&emsg;
2188 /* first we assume we will win the election and set the
2189 recovery master to be ourselves on the current node
2191 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
2192 if (ret != 0) {
2193 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
2194 return -1;
2198 /* send an election message to all active nodes */
2199 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
2200 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
2204 this function will unban all nodes in the cluster
2206 static void unban_all_nodes(struct ctdb_context *ctdb)
2208 int ret, i;
2209 struct ctdb_node_map *nodemap;
2210 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2212 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2213 if (ret != 0) {
2214 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
2215 return;
2218 for (i=0;i<nodemap->num;i++) {
2219 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
2220 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
2221 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(),
2222 nodemap->nodes[i].pnn, 0,
2223 NODE_FLAGS_BANNED);
2224 if (ret != 0) {
2225 DEBUG(DEBUG_ERR, (__location__ " failed to reset ban state\n"));
2230 talloc_free(tmp_ctx);
2235 we think we are winning the election - send a broadcast election request
2237 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
2239 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2240 int ret;
2242 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
2243 if (ret != 0) {
2244 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
2247 talloc_free(rec->send_election_te);
2248 rec->send_election_te = NULL;
2252 handler for memory dumps
2254 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
2255 TDB_DATA data, void *private_data)
2257 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2258 TDB_DATA *dump;
2259 int ret;
2260 struct srvid_request *rd;
2262 if (data.dsize != sizeof(struct srvid_request)) {
2263 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2264 talloc_free(tmp_ctx);
2265 return;
2267 rd = (struct srvid_request *)data.dptr;
2269 dump = talloc_zero(tmp_ctx, TDB_DATA);
2270 if (dump == NULL) {
2271 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
2272 talloc_free(tmp_ctx);
2273 return;
2275 ret = ctdb_dump_memory(ctdb, dump);
2276 if (ret != 0) {
2277 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
2278 talloc_free(tmp_ctx);
2279 return;
2282 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
2284 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
2285 if (ret != 0) {
2286 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
2287 talloc_free(tmp_ctx);
2288 return;
2291 talloc_free(tmp_ctx);
2295 handler for getlog
2297 static void getlog_handler(struct ctdb_context *ctdb, uint64_t srvid,
2298 TDB_DATA data, void *private_data)
2300 struct ctdb_get_log_addr *log_addr;
2301 pid_t child;
2303 if (data.dsize != sizeof(struct ctdb_get_log_addr)) {
2304 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2305 return;
2307 log_addr = (struct ctdb_get_log_addr *)data.dptr;
2309 child = ctdb_fork_no_free_ringbuffer(ctdb);
2310 if (child == (pid_t)-1) {
2311 DEBUG(DEBUG_ERR,("Failed to fork a log collector child\n"));
2312 return;
2315 if (child == 0) {
2316 ctdb_set_process_name("ctdb_rec_log_collector");
2317 if (switch_from_server_to_client(ctdb, "recoverd-log-collector") != 0) {
2318 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch log collector child into client mode.\n"));
2319 _exit(1);
2321 ctdb_collect_log(ctdb, log_addr);
2322 _exit(0);
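/*
  The handler above pushes the potentially slow log collection into a
  forked child so the recovery daemon's event loop is never blocked. A
  generic sketch of that fork-and-exit shape; demo_do_slow_work() is a
  hypothetical placeholder and plain fork() stands in for the ctdb fork
  helpers used above:

    static void demo_run_in_child(void)
    {
            pid_t child = fork();

            if (child == (pid_t)-1) {
                    return;                  // fork failed, give up quietly
            }
            if (child == 0) {
                    demo_do_slow_work();     // child: do the slow work...
                    _exit(0);                // ...and never return to the caller
            }
            // parent: continues immediately; the child is reaped elsewhere
    }
 */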
2327 handler for clearlog
2329 static void clearlog_handler(struct ctdb_context *ctdb, uint64_t srvid,
2330 TDB_DATA data, void *private_data)
2332 ctdb_clear_log(ctdb);
2336 handler for reload_nodes
2338 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
2339 TDB_DATA data, void *private_data)
2341 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2343 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
2345 ctdb_load_nodes_file(rec->ctdb);
2349 static void ctdb_rebalance_timeout(struct event_context *ev,
2350 struct timed_event *te,
2351 struct timeval t, void *p)
2353 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2355 if (rec->force_rebalance_nodes == NULL) {
2356 DEBUG(DEBUG_ERR,
2357 ("Rebalance timeout occurred - no nodes to rebalance\n"));
2358 return;
2361 DEBUG(DEBUG_NOTICE,
2362 ("Rebalance timeout occurred - do takeover run\n"));
2363 do_takeover_run(rec, rec->nodemap, false);
2367 static void recd_node_rebalance_handler(struct ctdb_context *ctdb,
2368 uint64_t srvid,
2369 TDB_DATA data, void *private_data)
2371 uint32_t pnn;
2372 uint32_t *t;
2373 int len;
2374 uint32_t deferred_rebalance;
2375 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2377 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
2378 return;
2381 if (data.dsize != sizeof(uint32_t)) {
2382 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
2383 return;
2386 pnn = *(uint32_t *)&data.dptr[0];
2388 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
2390 /* Copy any existing list of nodes. There's probably some
2391 * sort of realloc variant that will do this but we need to
2392 * make sure that freeing the old array also cancels the timer
2393 * event for the timeout... not sure if realloc will do that.
2395 len = (rec->force_rebalance_nodes != NULL) ?
2396 talloc_array_length(rec->force_rebalance_nodes) :
2397 0;
2399 /* This allows duplicates to be added but they don't cause
2400 * harm. A call to add a duplicate PNN arguably means that
2401 * the timeout should be reset, so this is the simplest
2402 * solution.
2404 t = talloc_zero_array(rec, uint32_t, len+1);
2405 CTDB_NO_MEMORY_VOID(ctdb, t);
2406 if (len > 0) {
2407 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
2409 t[len] = pnn;
2411 talloc_free(rec->force_rebalance_nodes);
2413 rec->force_rebalance_nodes = t;
2415 /* If configured, setup a deferred takeover run to make sure
2416 * that certain nodes get IPs rebalanced to them. This will
2417 * be cancelled if a successful takeover run happens before
2418 * the timeout. Assign tunable value to variable for
2419 * readability.
2421 deferred_rebalance = ctdb->tunable.deferred_rebalance_on_node_add;
2422 if (deferred_rebalance != 0) {
2423 event_add_timed(ctdb->ev, rec->force_rebalance_nodes,
2424 timeval_current_ofs(deferred_rebalance, 0),
2425 ctdb_rebalance_timeout, rec);
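/*
  The copy-instead-of-realloc above leans on a talloc property: the
  deferred-rebalance timer is added with rec->force_rebalance_nodes as its
  talloc parent, so freeing the old array also cancels any timer still
  hanging off it. The ownership idiom in isolation (demo names only):

    static void demo_replace_list(TALLOC_CTX *owner, uint32_t **list,
                                  size_t len, uint32_t extra)
    {
            uint32_t *t = talloc_zero_array(owner, uint32_t, len + 1);

            if (t == NULL) {
                    return;
            }
            if (len > 0) {
                    memcpy(t, *list, sizeof(uint32_t) * len);
            }
            t[len] = extra;
            talloc_free(*list);   // frees the old array and all of its talloc
                                  // children, e.g. a timed event parented to it
            *list = t;
    }
 */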
2431 static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid,
2432 TDB_DATA data, void *private_data)
2434 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2435 struct ctdb_public_ip *ip;
2437 if (rec->recmaster != rec->ctdb->pnn) {
2438 DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
2439 return;
2442 if (data.dsize != sizeof(struct ctdb_public_ip)) {
2443 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
2444 return;
2447 ip = (struct ctdb_public_ip *)data.dptr;
2449 update_ip_assignment_tree(rec->ctdb, ip);
2453 static void clear_takeover_runs_disable(struct ctdb_recoverd *rec)
2455 TALLOC_FREE(rec->takeover_runs_disable_ctx);
2458 static void reenable_takeover_runs(struct event_context *ev,
2459 struct timed_event *te,
2460 struct timeval yt, void *p)
2462 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2464 DEBUG(DEBUG_NOTICE,("Reenabling takeover runs after timeout\n"));
2465 clear_takeover_runs_disable(rec);
2468 static void disable_takeover_runs_handler(struct ctdb_context *ctdb,
2469 uint64_t srvid, TDB_DATA data,
2470 void *private_data)
2472 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2473 struct ctdb_recoverd);
2474 struct srvid_request_data *r;
2475 uint32_t timeout;
2476 TDB_DATA result;
2477 int32_t ret = 0;
2479 /* Validate input data */
2480 if (data.dsize != sizeof(struct srvid_request_data)) {
2481 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2482 "expecting %lu\n", (long unsigned)data.dsize,
2483 (long unsigned)sizeof(struct srvid_request_data)));
2484 return;
2486 if (data.dptr == NULL) {
2487 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2488 return;
2491 r = (struct srvid_request_data *)data.dptr;
2492 timeout = r->data;
2494 if (timeout == 0) {
2495 DEBUG(DEBUG_NOTICE,("Reenabling takeover runs\n"));
2496 clear_takeover_runs_disable(rec);
2497 ret = ctdb_get_pnn(ctdb);
2498 goto done;
2501 if (rec->takeover_run_in_progress) {
2502 DEBUG(DEBUG_ERR,
2503 ("Unable to disable takeover runs - in progress\n"));
2504 ret = -EAGAIN;
2505 goto done;
2508 DEBUG(DEBUG_NOTICE,("Disabling takeover runs for %u seconds\n", timeout));
2510 /* Clear any old timers */
2511 clear_takeover_runs_disable(rec);
2513 /* When this is non-NULL it indicates that takeover runs are
2514 * disabled. This context also holds the timeout timer.
2516 rec->takeover_runs_disable_ctx = talloc_new(rec);
2517 if (rec->takeover_runs_disable_ctx == NULL) {
2518 DEBUG(DEBUG_ERR,(__location__ " Unable to allocate memory\n"));
2519 ret = -ENOMEM;
2520 goto done;
2523 /* Arrange for the timeout to occur */
2524 event_add_timed(ctdb->ev, rec->takeover_runs_disable_ctx,
2525 timeval_current_ofs(timeout, 0),
2526 reenable_takeover_runs,
2527 rec);
2529 /* Returning our PNN tells the caller that we succeeded */
2530 ret = ctdb_get_pnn(ctdb);
2531 done:
2532 result.dsize = sizeof(int32_t);
2533 result.dptr = (uint8_t *)&ret;
2534 srvid_request_reply(ctdb, (struct srvid_request *)r, result);
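/*
  rec->takeover_runs_disable_ctx doubles as a flag and an owner: non-NULL
  means "takeover runs are disabled", and because the re-enable timer is
  allocated as a child of that context, a single TALLOC_FREE() both clears
  the flag and cancels the pending timeout. The idiom in isolation
  (demo_* names are illustrative, not ctdb APIs):

    struct demo_feature {
            TALLOC_CTX *disabled_ctx;       // NULL == feature is enabled
    };

    static bool demo_feature_is_disabled(const struct demo_feature *f)
    {
            return f->disabled_ctx != NULL;
    }

    static void demo_feature_enable(struct demo_feature *f)
    {
            TALLOC_FREE(f->disabled_ctx);   // also frees any timer parented here
    }
 */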
2537 /* Backward compatibility for this SRVID - call
2538 * disable_takeover_runs_handler() instead
2540 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
2541 TDB_DATA data, void *private_data)
2543 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2544 struct ctdb_recoverd);
2545 TDB_DATA data2;
2546 struct srvid_request_data *req;
2548 if (data.dsize != sizeof(uint32_t)) {
2549 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2550 "expecting %lu\n", (long unsigned)data.dsize,
2551 (long unsigned)sizeof(uint32_t)));
2552 return;
2554 if (data.dptr == NULL) {
2555 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2556 return;
2559 req = talloc(ctdb, struct srvid_request_data);
2560 CTDB_NO_MEMORY_VOID(ctdb, req);
2562 req->srvid = 0; /* No reply */
2563 req->pnn = -1;
2564 req->data = *((uint32_t *)data.dptr); /* Timeout */
2566 data2.dsize = sizeof(*req);
2567 data2.dptr = (uint8_t *)req;
2569 disable_takeover_runs_handler(rec->ctdb,
2570 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
2571 data2, rec);
2575 handler for ip reallocate, just add it to the list of requests and
2576 handle this later in the monitor_cluster loop so we do not recurse
2577 with other requests to takeover_run()
2579 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
2580 TDB_DATA data, void *private_data)
2582 struct srvid_request *request;
2583 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2584 struct ctdb_recoverd);
2586 if (data.dsize != sizeof(struct srvid_request)) {
2587 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2588 return;
2591 request = (struct srvid_request *)data.dptr;
2593 srvid_request_add(ctdb, &rec->reallocate_requests, request);
2596 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
2597 struct ctdb_recoverd *rec)
2599 TDB_DATA result;
2600 int32_t ret;
2601 uint32_t culprit;
2602 struct srvid_requests *current;
2604 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
2606 /* Only process requests that are currently pending. More
2607 * might come in while the takeover run is in progress and
2608 * they will need to be processed later since they might
2609 * be in response to flag changes.
2611 current = rec->reallocate_requests;
2612 rec->reallocate_requests = NULL;
2614 /* update the list of public ips that a node can handle for
2615 all connected nodes
2617 ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
2618 if (ret != 0) {
2619 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2620 culprit));
2621 rec->need_takeover_run = true;
2623 if (ret == 0) {
2624 if (do_takeover_run(rec, rec->nodemap, false)) {
2625 ret = ctdb_get_pnn(ctdb);
2626 } else {
2627 ret = -1;
2631 result.dsize = sizeof(int32_t);
2632 result.dptr = (uint8_t *)&ret;
2634 srvid_requests_reply(ctdb, &current, result);
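/*
  process_ipreallocate_requests() uses a detach-then-process queue: the
  pending list is taken off rec before the takeover run starts, so any
  request arriving while the run is in flight lands on a fresh list and is
  answered by a later run rather than being acknowledged prematurely. The
  pattern reduced to its core (illustrative sketch):

    static void demo_process_pending(struct ctdb_context *ctdb,
                                     struct srvid_requests **queue,
                                     TDB_DATA result)
    {
            struct srvid_requests *current = *queue;

            *queue = NULL;                  // new arrivals start a new list
            // ... perform the possibly long-running work here ...
            srvid_requests_reply(ctdb, &current, result);
    }
 */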
2639 handler for recovery master elections
2641 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
2642 TDB_DATA data, void *private_data)
2644 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2645 int ret;
2646 struct election_message *em = (struct election_message *)data.dptr;
2647 TALLOC_CTX *mem_ctx;
2649 /* Ignore election packets from ourselves */
2650 if (ctdb->pnn == em->pnn) {
2651 return;
2654 /* we got an election packet - update the timeout for the election */
2655 talloc_free(rec->election_timeout);
2656 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2657 fast_start ?
2658 timeval_current_ofs(0, 500000) :
2659 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2660 ctdb_election_timeout, rec);
2662 mem_ctx = talloc_new(ctdb);
2664 /* someone called an election. check their election data
2665 and if we disagree and we would rather be the elected node,
2666 send a new election message to all other nodes
2668 if (ctdb_election_win(rec, em)) {
2669 if (!rec->send_election_te) {
2670 rec->send_election_te = event_add_timed(ctdb->ev, rec,
2671 timeval_current_ofs(0, 500000),
2672 election_send_request, rec);
2674 talloc_free(mem_ctx);
2675 /*unban_all_nodes(ctdb);*/
2676 return;
2679 /* we didn't win */
2680 talloc_free(rec->send_election_te);
2681 rec->send_election_te = NULL;
2683 if (ctdb->tunable.verify_recovery_lock != 0) {
2684 /* release the recmaster lock */
2685 if (em->pnn != ctdb->pnn &&
2686 ctdb->recovery_lock_fd != -1) {
2687 close(ctdb->recovery_lock_fd);
2688 ctdb->recovery_lock_fd = -1;
2689 unban_all_nodes(ctdb);
2693 /* ok, let that guy become recmaster then */
2694 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
2695 if (ret != 0) {
2696 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
2697 talloc_free(mem_ctx);
2698 return;
2701 talloc_free(mem_ctx);
2702 return;
2707 force the start of the election process
2709 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2710 struct ctdb_node_map *nodemap)
2712 int ret;
2713 struct ctdb_context *ctdb = rec->ctdb;
2715 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2717 /* set all nodes to recovery mode to stop all internode traffic */
2718 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
2719 if (ret != 0) {
2720 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
2721 return;
2724 talloc_free(rec->election_timeout);
2725 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2726 fast_start ?
2727 timeval_current_ofs(0, 500000) :
2728 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2729 ctdb_election_timeout, rec);
2731 ret = send_election_request(rec, pnn);
2732 if (ret!=0) {
2733 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2734 return;
2737 /* wait for a few seconds to collect all responses */
2738 ctdb_wait_election(rec);
2744 handler for when a node changes its flags
2746 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
2747 TDB_DATA data, void *private_data)
2749 int ret;
2750 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2751 struct ctdb_node_map *nodemap=NULL;
2752 TALLOC_CTX *tmp_ctx;
2753 int i;
2754 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2755 int disabled_flag_changed;
2757 if (data.dsize != sizeof(*c)) {
2758 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2759 return;
2762 tmp_ctx = talloc_new(ctdb);
2763 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2765 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2766 if (ret != 0) {
2767 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2768 talloc_free(tmp_ctx);
2769 return;
2773 for (i=0;i<nodemap->num;i++) {
2774 if (nodemap->nodes[i].pnn == c->pnn) break;
2777 if (i == nodemap->num) {
2778 DEBUG(DEBUG_CRIT,(__location__ " Flag change for non-existent node %u\n", c->pnn));
2779 talloc_free(tmp_ctx);
2780 return;
2783 if (c->old_flags != c->new_flags) {
2784 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
2787 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2789 nodemap->nodes[i].flags = c->new_flags;
2791 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2792 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2794 if (ret == 0) {
2795 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2796 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2799 if (ret == 0 &&
2800 ctdb->recovery_master == ctdb->pnn &&
2801 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2802 /* Only do the takeover run if the permanently-disabled or unhealthy
2803 flags changed, since these will cause an ip failover but not
2804 a recovery.
2805 If the node became disconnected or banned this will also
2806 lead to an ip address failover, but that is handled
2807 during recovery
2809 if (disabled_flag_changed) {
2810 rec->need_takeover_run = true;
2814 talloc_free(tmp_ctx);
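/*
  The disabled_flag_changed test above is an XOR trick: it asks "did the
  DISABLED bits differ between the locally known flags and the new flags?"
  regardless of which direction they changed. A small worked example,
  assuming the usual ctdb flag definitions where NODE_FLAGS_DISABLED
  covers the unhealthy and permanently-disabled bits:

    uint32_t old_flags = NODE_FLAGS_UNHEALTHY;
    uint32_t new_flags = 0;                         // node became healthy

    // non-zero: the UNHEALTHY bit (part of NODE_FLAGS_DISABLED) changed,
    // so a takeover run is warranted even though no recovery is needed
    int disabled_changed = (old_flags ^ new_flags) & NODE_FLAGS_DISABLED;
 */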
2818 handler for when we need to push out flag changes to all other nodes
2820 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2821 TDB_DATA data, void *private_data)
2823 int ret;
2824 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2825 struct ctdb_node_map *nodemap=NULL;
2826 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2827 uint32_t recmaster;
2828 uint32_t *nodes;
2830 /* find the recovery master */
2831 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
2832 if (ret != 0) {
2833 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
2834 talloc_free(tmp_ctx);
2835 return;
2838 /* read the node flags from the recmaster */
2839 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
2840 if (ret != 0) {
2841 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2842 talloc_free(tmp_ctx);
2843 return;
2845 if (c->pnn >= nodemap->num) {
2846 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2847 talloc_free(tmp_ctx);
2848 return;
2851 /* send the flags update to all connected nodes */
2852 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2854 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2855 nodes, 0, CONTROL_TIMEOUT(),
2856 false, data,
2857 NULL, NULL,
2858 NULL) != 0) {
2859 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2861 talloc_free(tmp_ctx);
2862 return;
2865 talloc_free(tmp_ctx);
2869 struct verify_recmode_normal_data {
2870 uint32_t count;
2871 enum monitor_result status;
2874 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2876 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2879 /* one more node has responded with recmode data*/
2880 rmdata->count--;
2882 /* if we failed to get the recmode, then return an error and let
2883 the main loop try again.
2885 if (state->state != CTDB_CONTROL_DONE) {
2886 if (rmdata->status == MONITOR_OK) {
2887 rmdata->status = MONITOR_FAILED;
2889 return;
2892 /* if we got a response, then the recmode will be stored in the
2893 status field
2895 if (state->status != CTDB_RECOVERY_NORMAL) {
2896 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2897 rmdata->status = MONITOR_RECOVERY_NEEDED;
2900 return;
2904 /* verify that all nodes are in normal recovery mode */
2905 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2907 struct verify_recmode_normal_data *rmdata;
2908 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2909 struct ctdb_client_control_state *state;
2910 enum monitor_result status;
2911 int j;
2913 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2914 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2915 rmdata->count = 0;
2916 rmdata->status = MONITOR_OK;
2918 /* loop over all active nodes and send an async getrecmode call to
2919 them*/
2920 for (j=0; j<nodemap->num; j++) {
2921 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2922 continue;
2924 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2925 CONTROL_TIMEOUT(),
2926 nodemap->nodes[j].pnn);
2927 if (state == NULL) {
2928 /* we failed to send the control, treat this as
2929 an error and try again next iteration
2931 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2932 talloc_free(mem_ctx);
2933 return MONITOR_FAILED;
2936 /* set up the callback functions */
2937 state->async.fn = verify_recmode_normal_callback;
2938 state->async.private_data = rmdata;
2940 /* one more control to wait for to complete */
2941 rmdata->count++;
2945 /* now wait for up to the maximum number of seconds allowed
2946 or until all nodes we expect a response from have replied
2948 while (rmdata->count > 0) {
2949 event_loop_once(ctdb->ev);
2952 status = rmdata->status;
2953 talloc_free(mem_ctx);
2954 return status;
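/*
  verify_recmode() (and verify_recmaster() below) share the same async
  fan-out shape: send one non-blocking control per active node, keep a
  count of outstanding replies, and spin the event loop until the
  callbacks have drained the counter. Stripped to its essentials
  (demo names only):

    struct demo_fanout {
            uint32_t pending;
    };

    static void demo_reply_cb(struct ctdb_client_control_state *state)
    {
            struct demo_fanout *f = talloc_get_type(state->async.private_data,
                                                    struct demo_fanout);
            f->pending--;                   // one more node has answered
    }

    // After sending N controls with state->async.fn = demo_reply_cb and
    // f->pending == N, the caller simply waits:
    //
    //     while (f->pending > 0) {
    //             event_loop_once(ctdb->ev);
    //     }
 */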
2958 struct verify_recmaster_data {
2959 struct ctdb_recoverd *rec;
2960 uint32_t count;
2961 uint32_t pnn;
2962 enum monitor_result status;
2965 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2967 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2970 /* one more node has responded with recmaster data*/
2971 rmdata->count--;
2973 /* if we failed to get the recmaster, then return an error and let
2974 the main loop try again.
2976 if (state->state != CTDB_CONTROL_DONE) {
2977 if (rmdata->status == MONITOR_OK) {
2978 rmdata->status = MONITOR_FAILED;
2980 return;
2983 /* if we got a response, then the recmaster will be stored in the
2984 status field
2986 if (state->status != rmdata->pnn) {
2987 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
2988 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2989 rmdata->status = MONITOR_ELECTION_NEEDED;
2992 return;
2996 /* verify that all nodes agree that we are the recmaster */
2997 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2999 struct ctdb_context *ctdb = rec->ctdb;
3000 struct verify_recmaster_data *rmdata;
3001 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3002 struct ctdb_client_control_state *state;
3003 enum monitor_result status;
3004 int j;
3006 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
3007 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
3008 rmdata->rec = rec;
3009 rmdata->count = 0;
3010 rmdata->pnn = pnn;
3011 rmdata->status = MONITOR_OK;
3013 /* loop over all active nodes and send an async getrecmaster call to
3014 them*/
3015 for (j=0; j<nodemap->num; j++) {
3016 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3017 continue;
3019 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
3020 CONTROL_TIMEOUT(),
3021 nodemap->nodes[j].pnn);
3022 if (state == NULL) {
3023 /* we failed to send the control, treat this as
3024 an error and try again next iteration
3026 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
3027 talloc_free(mem_ctx);
3028 return MONITOR_FAILED;
3031 /* set up the callback functions */
3032 state->async.fn = verify_recmaster_callback;
3033 state->async.private_data = rmdata;
3035 /* one more control to wait for to complete */
3036 rmdata->count++;
3040 /* now wait for up to the maximum number of seconds allowed
3041 or until all nodes we expect a response from have replied
3043 while (rmdata->count > 0) {
3044 event_loop_once(ctdb->ev);
3047 status = rmdata->status;
3048 talloc_free(mem_ctx);
3049 return status;
3052 static bool interfaces_have_changed(struct ctdb_context *ctdb,
3053 struct ctdb_recoverd *rec)
3055 struct ctdb_control_get_ifaces *ifaces = NULL;
3056 TALLOC_CTX *mem_ctx;
3057 bool ret = false;
3059 mem_ctx = talloc_new(NULL);
3061 /* Read the interfaces from the local node */
3062 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
3063 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
3064 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
3065 /* We could return an error. However, this will be
3066 * rare so we'll decide that the interfaces have
3067 * actually changed, just in case.
3069 talloc_free(mem_ctx);
3070 return true;
3073 if (!rec->ifaces) {
3074 /* We haven't been here before so things have changed */
3075 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
3076 ret = true;
3077 } else if (rec->ifaces->num != ifaces->num) {
3078 /* Number of interfaces has changed */
3079 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
3080 rec->ifaces->num, ifaces->num));
3081 ret = true;
3082 } else {
3083 /* See if interface names or link states have changed */
3084 int i;
3085 for (i = 0; i < rec->ifaces->num; i++) {
3086 struct ctdb_control_iface_info * iface = &rec->ifaces->ifaces[i];
3087 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
3088 DEBUG(DEBUG_NOTICE,
3089 ("Interface in slot %d changed: %s => %s\n",
3090 i, iface->name, ifaces->ifaces[i].name));
3091 ret = true;
3092 break;
3094 if (iface->link_state != ifaces->ifaces[i].link_state) {
3095 DEBUG(DEBUG_NOTICE,
3096 ("Interface %s changed state: %d => %d\n",
3097 iface->name, iface->link_state,
3098 ifaces->ifaces[i].link_state));
3099 ret = true;
3100 break;
3105 talloc_free(rec->ifaces);
3106 rec->ifaces = talloc_steal(rec, ifaces);
3108 talloc_free(mem_ctx);
3109 return ret;
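/*
  A compact restatement (illustrative only) of the comparison above: two
  interface lists count as unchanged only if they have the same length and
  every slot matches by both name and link state.

    static bool demo_ifaces_equal(const struct ctdb_control_get_ifaces *a,
                                  const struct ctdb_control_get_ifaces *b)
    {
            int i;

            if (a->num != b->num) {
                    return false;
            }
            for (i = 0; i < a->num; i++) {
                    if (strcmp(a->ifaces[i].name, b->ifaces[i].name) != 0 ||
                        a->ifaces[i].link_state != b->ifaces[i].link_state) {
                            return false;
                    }
            }
            return true;
    }
 */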
3112 /* called to check that the local allocation of public ip addresses is ok.
3114 static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
3116 TALLOC_CTX *mem_ctx = talloc_new(NULL);
3117 struct ctdb_uptime *uptime1 = NULL;
3118 struct ctdb_uptime *uptime2 = NULL;
3119 int ret, j;
3120 bool need_takeover_run = false;
3122 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
3123 CTDB_CURRENT_NODE, &uptime1);
3124 if (ret != 0) {
3125 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
3126 talloc_free(mem_ctx);
3127 return -1;
3130 if (interfaces_have_changed(ctdb, rec)) {
3131 DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
3132 "local node %u - force takeover run\n",
3133 pnn));
3134 need_takeover_run = true;
3137 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
3138 CTDB_CURRENT_NODE, &uptime2);
3139 if (ret != 0) {
3140 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
3141 talloc_free(mem_ctx);
3142 return -1;
3145 /* skip the check if the startrecovery time has changed */
3146 if (timeval_compare(&uptime1->last_recovery_started,
3147 &uptime2->last_recovery_started) != 0) {
3148 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3149 talloc_free(mem_ctx);
3150 return 0;
3153 /* skip the check if the endrecovery time has changed */
3154 if (timeval_compare(&uptime1->last_recovery_finished,
3155 &uptime2->last_recovery_finished) != 0) {
3156 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3157 talloc_free(mem_ctx);
3158 return 0;
3161 /* skip the check if we have started but not finished recovery */
3162 if (timeval_compare(&uptime1->last_recovery_finished,
3163 &uptime1->last_recovery_started) != 1) {
3164 DEBUG(DEBUG_INFO, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
3165 talloc_free(mem_ctx);
3167 return 0;
3170 /* verify that we have the ip addresses we should have
3171 and we don't have ones we shouldn't have.
3172 if we find an inconsistency we set recmode to
3173 active on the local node and wait for the recmaster
3174 to do a full-blown recovery.
3175 also if the pnn is -1 and we are healthy and can host the ip
3176 we also request an ip reallocation.
3178 if (ctdb->tunable.disable_ip_failover == 0) {
3179 struct ctdb_all_public_ips *ips = NULL;
3181 /* read the *available* IPs from the local node */
3182 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
3183 if (ret != 0) {
3184 DEBUG(DEBUG_ERR, ("Unable to get available public IPs from local node %u\n", pnn));
3185 talloc_free(mem_ctx);
3186 return -1;
3189 for (j=0; j<ips->num; j++) {
3190 if (ips->ips[j].pnn == -1 &&
3191 nodemap->nodes[pnn].flags == 0) {
3192 DEBUG(DEBUG_CRIT,("Public IP '%s' is not assigned and we could serve it\n",
3193 ctdb_addr_to_str(&ips->ips[j].addr)));
3194 need_takeover_run = true;
3198 talloc_free(ips);
3200 /* read the *known* IPs from the local node */
3201 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
3202 if (ret != 0) {
3203 DEBUG(DEBUG_ERR, ("Unable to get known public IPs from local node %u\n", pnn));
3204 talloc_free(mem_ctx);
3205 return -1;
3208 for (j=0; j<ips->num; j++) {
3209 if (ips->ips[j].pnn == pnn) {
3210 if (ctdb->do_checkpublicip && !ctdb_sys_have_ip(&ips->ips[j].addr)) {
3211 DEBUG(DEBUG_CRIT,("Public IP '%s' is assigned to us but not on an interface\n",
3212 ctdb_addr_to_str(&ips->ips[j].addr)));
3213 need_takeover_run = true;
3215 } else {
3216 if (ctdb->do_checkpublicip &&
3217 ctdb_sys_have_ip(&ips->ips[j].addr)) {
3219 DEBUG(DEBUG_CRIT,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
3220 ctdb_addr_to_str(&ips->ips[j].addr)));
3222 if (ctdb_ctrl_release_ip(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ips->ips[j]) != 0) {
3223 DEBUG(DEBUG_ERR,("Failed to release local IP address\n"));
3230 if (need_takeover_run) {
3231 struct srvid_request rd;
3232 TDB_DATA data;
3234 DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
3236 rd.pnn = ctdb->pnn;
3237 rd.srvid = 0;
3238 data.dptr = (uint8_t *)&rd;
3239 data.dsize = sizeof(rd);
3241 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
3242 if (ret != 0) {
3243 DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
3246 talloc_free(mem_ctx);
3247 return 0;
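/*
  verify_local_ip_allocation() brackets its IP checks with two uptime
  snapshots: if last_recovery_started or last_recovery_finished differs
  between them, a recovery ran while we were looking and the whole check
  is abandoned instead of acted upon. The read-check-reread idea in
  miniature (demo name only):

    static bool demo_snapshot_is_stable(const struct ctdb_uptime *before,
                                        const struct ctdb_uptime *after)
    {
            return timeval_compare(&before->last_recovery_started,
                                   &after->last_recovery_started) == 0 &&
                   timeval_compare(&before->last_recovery_finished,
                                   &after->last_recovery_finished) == 0;
    }
 */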
3251 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
3253 struct ctdb_node_map **remote_nodemaps = callback_data;
3255 if (node_pnn >= ctdb->num_nodes) {
3256 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
3257 return;
3260 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
3264 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
3265 struct ctdb_node_map *nodemap,
3266 struct ctdb_node_map **remote_nodemaps)
3268 uint32_t *nodes;
3270 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
3271 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
3272 nodes, 0,
3273 CONTROL_TIMEOUT(), false, tdb_null,
3274 async_getnodemap_callback,
3275 NULL,
3276 remote_nodemaps) != 0) {
3277 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
3279 return -1;
3282 return 0;
3285 enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};
3286 struct ctdb_check_reclock_state {
3287 struct ctdb_context *ctdb;
3288 struct timeval start_time;
3289 int fd[2];
3290 pid_t child;
3291 struct timed_event *te;
3292 struct fd_event *fde;
3293 enum reclock_child_status status;
3296 /* when we free the reclock state we must kill any child process.
3298 static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
3300 struct ctdb_context *ctdb = state->ctdb;
3302 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
3304 if (state->fd[0] != -1) {
3305 close(state->fd[0]);
3306 state->fd[0] = -1;
3308 if (state->fd[1] != -1) {
3309 close(state->fd[1]);
3310 state->fd[1] = -1;
3312 ctdb_kill(ctdb, state->child, SIGKILL);
3313 return 0;
3317 called if our check_reclock child times out. this would happen if
3318 i/o to the reclock file blocks.
3320 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
3321 struct timeval t, void *private_data)
3323 struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
3324 struct ctdb_check_reclock_state);
3326 DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timed out - CFS slow to grant locks?\n"));
3327 state->status = RECLOCK_TIMEOUT;
3330 /* this is called when the child process has completed checking the reclock
3331 file and has written data back to us through the pipe.
3333 static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
3334 uint16_t flags, void *private_data)
3336 struct ctdb_check_reclock_state *state= talloc_get_type(private_data,
3337 struct ctdb_check_reclock_state);
3338 char c = 0;
3339 int ret;
3341 /* we got a response from our child process so we can abort the
3342 timeout.
3344 talloc_free(state->te);
3345 state->te = NULL;
3347 ret = read(state->fd[0], &c, 1);
3348 if (ret != 1 || c != RECLOCK_OK) {
3349 DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
3350 state->status = RECLOCK_FAILED;
3352 return;
3355 state->status = RECLOCK_OK;
3356 return;
3359 static int check_recovery_lock(struct ctdb_context *ctdb)
3361 int ret;
3362 struct ctdb_check_reclock_state *state;
3363 pid_t parent = getpid();
3365 if (ctdb->recovery_lock_fd == -1) {
3366 DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
3367 return -1;
3370 state = talloc(ctdb, struct ctdb_check_reclock_state);
3371 CTDB_NO_MEMORY(ctdb, state);
3373 state->ctdb = ctdb;
3374 state->start_time = timeval_current();
3375 state->status = RECLOCK_CHECKING;
3376 state->fd[0] = -1;
3377 state->fd[1] = -1;
3379 ret = pipe(state->fd);
3380 if (ret != 0) {
3381 talloc_free(state);
3382 DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
3383 return -1;
3386 state->child = ctdb_fork(ctdb);
3387 if (state->child == (pid_t)-1) {
3388 DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
3389 close(state->fd[0]);
3390 state->fd[0] = -1;
3391 close(state->fd[1]);
3392 state->fd[1] = -1;
3393 talloc_free(state);
3394 return -1;
3397 if (state->child == 0) {
3398 char cc = RECLOCK_OK;
3399 close(state->fd[0]);
3400 state->fd[0] = -1;
3402 ctdb_set_process_name("ctdb_rec_reclock");
3403 debug_extra = talloc_asprintf(NULL, "recovery-lock:");
3404 if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
3405 DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
3406 cc = RECLOCK_FAILED;
3409 write(state->fd[1], &cc, 1);
3410 /* make sure we die when our parent dies */
3411 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
3412 sleep(5);
3414 _exit(0);
3416 close(state->fd[1]);
3417 state->fd[1] = -1;
3418 set_close_on_exec(state->fd[0]);
3420 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for check_recovery_lock\n", state->fd[0]));
3422 talloc_set_destructor(state, check_reclock_destructor);
3424 state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
3425 ctdb_check_reclock_timeout, state);
3426 if (state->te == NULL) {
3427 DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
3428 talloc_free(state);
3429 return -1;
3432 state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
3433 EVENT_FD_READ,
3434 reclock_child_handler,
3435 (void *)state);
3437 if (state->fde == NULL) {
3438 DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
3439 talloc_free(state);
3440 return -1;
3442 tevent_fd_set_auto_close(state->fde);
3444 while (state->status == RECLOCK_CHECKING) {
3445 event_loop_once(ctdb->ev);
3448 if (state->status == RECLOCK_FAILED) {
3449 DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
3450 close(ctdb->recovery_lock_fd);
3451 ctdb->recovery_lock_fd = -1;
3452 talloc_free(state);
3453 return -1;
3456 talloc_free(state);
3457 return 0;
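/*
  check_recovery_lock() keeps the recovery daemon responsive even if the
  cluster filesystem hangs: the pread() on the reclock file happens in a
  forked child, the verdict travels back as a single status byte over a
  pipe, and a 15 second timer declares RECLOCK_TIMEOUT if nothing arrives.
  The child side, reduced to its essentials (the real child also lingers
  until its parent exits; demo name only):

    static void demo_reclock_child(int lock_fd, int report_fd)
    {
            char status = RECLOCK_OK;
            char buf;

            if (pread(lock_fd, &buf, 1, 0) == -1) {
                    status = RECLOCK_FAILED;    // i/o error, or the read failed
            }
            if (write(report_fd, &status, 1) != 1) {
                    _exit(1);                   // parent will hit its timeout
            }
            _exit(0);
    }
 */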
3460 static int update_recovery_lock_file(struct ctdb_context *ctdb)
3462 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
3463 const char *reclockfile;
3465 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
3466 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
3467 talloc_free(tmp_ctx);
3468 return -1;
3471 if (reclockfile == NULL) {
3472 if (ctdb->recovery_lock_file != NULL) {
3473 DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
3474 talloc_free(ctdb->recovery_lock_file);
3475 ctdb->recovery_lock_file = NULL;
3476 if (ctdb->recovery_lock_fd != -1) {
3477 close(ctdb->recovery_lock_fd);
3478 ctdb->recovery_lock_fd = -1;
3481 ctdb->tunable.verify_recovery_lock = 0;
3482 talloc_free(tmp_ctx);
3483 return 0;
3486 if (ctdb->recovery_lock_file == NULL) {
3487 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3488 if (ctdb->recovery_lock_fd != -1) {
3489 close(ctdb->recovery_lock_fd);
3490 ctdb->recovery_lock_fd = -1;
3492 talloc_free(tmp_ctx);
3493 return 0;
3497 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
3498 talloc_free(tmp_ctx);
3499 return 0;
3502 talloc_free(ctdb->recovery_lock_file);
3503 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3504 ctdb->tunable.verify_recovery_lock = 0;
3505 if (ctdb->recovery_lock_fd != -1) {
3506 close(ctdb->recovery_lock_fd);
3507 ctdb->recovery_lock_fd = -1;
3510 talloc_free(tmp_ctx);
3511 return 0;
3514 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
3515 TALLOC_CTX *mem_ctx)
3517 uint32_t pnn;
3518 struct ctdb_node_map *nodemap=NULL;
3519 struct ctdb_node_map *recmaster_nodemap=NULL;
3520 struct ctdb_node_map **remote_nodemaps=NULL;
3521 struct ctdb_vnn_map *vnnmap=NULL;
3522 struct ctdb_vnn_map *remote_vnnmap=NULL;
3523 int32_t debug_level;
3524 int i, j, ret;
3525 bool self_ban;
3528 /* verify that the main daemon is still running */
3529 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
3530 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3531 exit(-1);
3534 /* ping the local daemon to tell it we are alive */
3535 ctdb_ctrl_recd_ping(ctdb);
3537 if (rec->election_timeout) {
3538 /* an election is in progress */
3539 return;
3542 /* read the debug level from the parent and update locally */
3543 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
3544 if (ret !=0) {
3545 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
3546 return;
3548 LogLevel = debug_level;
3550 /* get relevant tunables */
3551 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
3552 if (ret != 0) {
3553 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
3554 return;
3557 /* get the current recovery lock file from the server */
3558 if (update_recovery_lock_file(ctdb) != 0) {
3559 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
3560 return;
3563 /* Make sure that if recovery lock verification becomes disabled,
3564 we close the file
3566 if (ctdb->tunable.verify_recovery_lock == 0) {
3567 if (ctdb->recovery_lock_fd != -1) {
3568 close(ctdb->recovery_lock_fd);
3569 ctdb->recovery_lock_fd = -1;
3573 pnn = ctdb_get_pnn(ctdb);
3575 /* get the vnnmap */
3576 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
3577 if (ret != 0) {
3578 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
3579 return;
3583 /* get number of nodes */
3584 if (rec->nodemap) {
3585 talloc_free(rec->nodemap);
3586 rec->nodemap = NULL;
3587 nodemap=NULL;
3589 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
3590 if (ret != 0) {
3591 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
3592 return;
3594 nodemap = rec->nodemap;
3596 /* remember our own node flags */
3597 rec->node_flags = nodemap->nodes[pnn].flags;
3599 ban_misbehaving_nodes(rec, &self_ban);
3600 if (self_ban) {
3601 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
3602 return;
3605 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
3606 also frozen and that the recmode is set to active.
3608 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
3609 /* If this node has become inactive then we want to
3610 * reduce the chances of it taking over the recovery
3611 * master role when it becomes active again. This
3612 * helps to stabilise the recovery master role so that
3613 * it stays on the most stable node.
3615 rec->priority_time = timeval_current();
3617 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3618 if (ret != 0) {
3619 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3621 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3622 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
3624 ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
3625 if (ret != 0) {
3626 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node in STOPPED or BANNED state\n"));
3627 return;
3629 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3630 if (ret != 0) {
3631 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
3633 return;
3637 /* If this node is stopped or banned then it is not the recovery
3638 * master, so don't do anything. This prevents a stopped or banned
3639 * node from starting an election and sending unnecessary controls.
3641 return;
3644 /* check which node is the recovery master */
3645 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
3646 if (ret != 0) {
3647 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
3648 return;
3651 /* If we are not the recmaster then do some housekeeping */
3652 if (rec->recmaster != pnn) {
3653 /* Ignore any IP reallocate requests - only recmaster
3654 * processes them
3656 TALLOC_FREE(rec->reallocate_requests);
3657 /* Clear any nodes that should be force rebalanced in
3658 * the next takeover run. If the recovery master role
3659 * has moved then we don't want to process these some
3660 * time in the future.
3662 TALLOC_FREE(rec->force_rebalance_nodes);
3665 /* This is a special case. When the recovery daemon is started, recmaster
3666 * is set to -1. If the node is not started in the stopped state, then
3667 * start an election to decide the recovery master
3669 if (rec->recmaster == (uint32_t)-1) {
3670 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
3671 force_election(rec, pnn, nodemap);
3672 return;
3675 /* update the capabilities for all nodes */
3676 ret = update_capabilities(ctdb, nodemap);
3677 if (ret != 0) {
3678 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
3679 return;
3683 * If the current recmaster does not have CTDB_CAP_RECMASTER,
3684 * but we have, then force an election and try to become the new
3685 * recmaster.
3687 if ((rec->ctdb->nodes[rec->recmaster]->capabilities & CTDB_CAP_RECMASTER) == 0 &&
3688 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
3689 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
3690 DEBUG(DEBUG_ERR, (__location__ " Current recmaster node %u does not have CAP_RECMASTER,"
3691 " but we (node %u) have - force an election\n",
3692 rec->recmaster, pnn));
3693 force_election(rec, pnn, nodemap);
3694 return;
3697 /* count how many active nodes there are */
3698 rec->num_active = 0;
3699 rec->num_lmasters = 0;
3700 rec->num_connected = 0;
3701 for (i=0; i<nodemap->num; i++) {
3702 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
3703 rec->num_active++;
3704 if (rec->ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER) {
3705 rec->num_lmasters++;
3708 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
3709 rec->num_connected++;
3714 /* verify that the recmaster node is still active */
3715 for (j=0; j<nodemap->num; j++) {
3716 if (nodemap->nodes[j].pnn==rec->recmaster) {
3717 break;
3721 if (j == nodemap->num) {
3722 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
3723 force_election(rec, pnn, nodemap);
3724 return;
3727 /* if recovery master is disconnected we must elect a new recmaster */
3728 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
3729 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
3730 force_election(rec, pnn, nodemap);
3731 return;
3734 /* get nodemap from the recovery master to check if it is inactive */
3735 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3736 mem_ctx, &recmaster_nodemap);
3737 if (ret != 0) {
3738 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
3739 nodemap->nodes[j].pnn));
3740 return;
3744 if ((recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) &&
3745 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
3746 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
3748 * update our nodemap to carry the recmaster's notion of
3749 * its own flags, so that we don't keep freezing the
3750 * inactive recmaster node...
3752 nodemap->nodes[j].flags = recmaster_nodemap->nodes[j].flags;
3753 force_election(rec, pnn, nodemap);
3754 return;
3757 /* verify that we have all ip addresses we should have and we don't
3758 * have addresses we shouldn't have.
3760 if (ctdb->tunable.disable_ip_failover == 0 &&
3761 rec->takeover_runs_disable_ctx == NULL) {
3762 if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
3763 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
3768 /* if we are not the recmaster then we do not need to check
3769 if recovery is needed
3771 if (pnn != rec->recmaster) {
3772 return;
3776 /* ensure our local copies of flags are right */
3777 ret = update_local_flags(rec, nodemap);
3778 if (ret == MONITOR_ELECTION_NEEDED) {
3779 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
3780 force_election(rec, pnn, nodemap);
3781 return;
3783 if (ret != MONITOR_OK) {
3784 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3785 return;
3788 if (ctdb->num_nodes != nodemap->num) {
3789 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3790 ctdb_load_nodes_file(ctdb);
3791 return;
3794 /* verify that all active nodes agree that we are the recmaster */
3795 switch (verify_recmaster(rec, nodemap, pnn)) {
3796 case MONITOR_RECOVERY_NEEDED:
3797 /* can not happen */
3798 return;
3799 case MONITOR_ELECTION_NEEDED:
3800 force_election(rec, pnn, nodemap);
3801 return;
3802 case MONITOR_OK:
3803 break;
3804 case MONITOR_FAILED:
3805 return;
3809 if (rec->need_recovery) {
3810 /* a previous recovery didn't finish */
3811 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3812 return;
3815 /* verify that all active nodes are in normal mode
3816 and not in recovery mode
3818 switch (verify_recmode(ctdb, nodemap)) {
3819 case MONITOR_RECOVERY_NEEDED:
3820 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3821 return;
3822 case MONITOR_FAILED:
3823 return;
3824 case MONITOR_ELECTION_NEEDED:
3825 /* can not happen */
3826 case MONITOR_OK:
3827 break;
3831 if (ctdb->tunable.verify_recovery_lock != 0) {
3832 /* we should have the reclock - check its not stale */
3833 ret = check_recovery_lock(ctdb);
3834 if (ret != 0) {
3835 DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
3836 ctdb_set_culprit(rec, ctdb->pnn);
3837 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3838 return;
3843 /* if there are takeovers requested, perform it and notify the waiters */
3844 if (rec->takeover_runs_disable_ctx == NULL &&
3845 rec->reallocate_requests) {
3846 process_ipreallocate_requests(ctdb, rec);
3849 /* get the nodemap for all active remote nodes
3851 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3852 if (remote_nodemaps == NULL) {
3853 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3854 return;
3856 for(i=0; i<nodemap->num; i++) {
3857 remote_nodemaps[i] = NULL;
3859 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3860 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3861 return;
3864 /* verify that all other nodes have the same nodemap as we have
3866 for (j=0; j<nodemap->num; j++) {
3867 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3868 continue;
3871 if (remote_nodemaps[j] == NULL) {
3872 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3873 ctdb_set_culprit(rec, j);
3875 return;
3878 /* if the nodes disagree on how many nodes there are
3879 then this is a good reason to try recovery
3881 if (remote_nodemaps[j]->num != nodemap->num) {
3882 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3883 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3884 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3885 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3886 return;
3889 /* if the nodes disagree on which nodes exist and are
3890 active, then that is also a good reason to do recovery
3892 for (i=0;i<nodemap->num;i++) {
3893 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3894 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3895 nodemap->nodes[j].pnn, i,
3896 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3897 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3898 do_recovery(rec, mem_ctx, pnn, nodemap,
3899 vnnmap);
3900 return;
3906 * Update node flags obtained from each active node. This ensures we have
3907 * up-to-date information for all the nodes.
3909 for (j=0; j<nodemap->num; j++) {
3910 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3911 continue;
3913 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
3916 for (j=0; j<nodemap->num; j++) {
3917 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3918 continue;
3921 /* verify the flags are consistent
3923 for (i=0; i<nodemap->num; i++) {
3924 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3925 continue;
3928 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3929 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3930 nodemap->nodes[j].pnn,
3931 nodemap->nodes[i].pnn,
3932 remote_nodemaps[j]->nodes[i].flags,
3933 nodemap->nodes[i].flags));
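/* A node is authoritative for its own flags; for every other node the
   recmaster's (i.e. our) view wins.  Either way, push the chosen flags
   out to the whole cluster and force a recovery so all nodes converge. */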
3934 if (i == j) {
3935 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3936 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3937 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3938 do_recovery(rec, mem_ctx, pnn, nodemap,
3939 vnnmap);
3940 return;
3941 } else {
3942 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3943 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3944 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3945 do_recovery(rec, mem_ctx, pnn, nodemap,
3946 vnnmap);
3947 return;
3954 /* There must be the same number of lmasters in the vnn map as
3955 * there are active nodes with the lmaster capability... or
3956 * do a recovery.
3958 if (vnnmap->size != rec->num_lmasters) {
3959 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
3960 vnnmap->size, rec->num_lmasters));
3961 ctdb_set_culprit(rec, ctdb->pnn);
3962 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3963 return;
3966 /* verify that all active nodes in the nodemap also exist in
3967 the vnnmap.
3969 for (j=0; j<nodemap->num; j++) {
3970 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3971 continue;
3973 if (nodemap->nodes[j].pnn == pnn) {
3974 continue;
3977 for (i=0; i<vnnmap->size; i++) {
3978 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3979 break;
3982 if (i == vnnmap->size) {
3983 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3984 nodemap->nodes[j].pnn));
3985 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3986 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3987 return;
3992 /* verify that all other nodes have the same vnnmap
3993 and are from the same generation
3995 for (j=0; j<nodemap->num; j++) {
3996 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3997 continue;
3999 if (nodemap->nodes[j].pnn == pnn) {
4000 continue;
4003 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
4004 mem_ctx, &remote_vnnmap);
4005 if (ret != 0) {
4006 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
4007 nodemap->nodes[j].pnn));
4008 return;
4011 /* verify the vnnmap generation is the same */
4012 if (vnnmap->generation != remote_vnnmap->generation) {
4013 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
4014 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
4015 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
4016 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
4017 return;
4020 /* verify the vnnmap size is the same */
4021 if (vnnmap->size != remote_vnnmap->size) {
4022 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
4023 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
4024 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
4025 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
4026 return;
4029 /* verify the vnnmap is the same */
4030 for (i=0;i<vnnmap->size;i++) {
4031 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
4032 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
4033 nodemap->nodes[j].pnn));
4034 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
4035 do_recovery(rec, mem_ctx, pnn, nodemap,
4036 vnnmap);
4037 return;
4042 /* we might need to change who has what IP assigned */
4043 if (rec->need_takeover_run) {
4044 uint32_t culprit = (uint32_t)-1;
4046 rec->need_takeover_run = false;
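/* Clear the flag up front; the failure path below sets it back to true
   so the takeover run is retried on the next monitoring iteration. */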
4048 /* update the list of public ips that a node can handle for
4049 all connected nodes
4051 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
4052 if (ret != 0) {
4053 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
4054 culprit));
4055 rec->need_takeover_run = true;
4056 return;
4059 /* execute the "startrecovery" event script on all nodes */
4060 ret = run_startrecovery_eventscript(rec, nodemap);
4061 if (ret!=0) {
4062 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
4063 ctdb_set_culprit(rec, ctdb->pnn);
4064 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
4065 return;
4068 /* If the takeover run fails, the offending nodes are
4069 * assigned ban culprit counts and the takeover run is
4070 * retried. A node that keeps failing will eventually
4071 * get banned.
4073 * If rec->need_takeover_run is not set back to true on
4074 * such a failure, monitoring stays disabled cluster-wide
4075 * (via the startrecovery eventscript) and will never be re-enabled.
4077 if (!do_takeover_run(rec, nodemap, true)) {
4078 return;
4081 /* execute the "recovered" event script on all nodes */
4082 ret = run_recovered_eventscript(rec, nodemap, "monitor_cluster");
4083 #if 0
4084 // we can't check whether the event completed successfully
4085 // since this script WILL fail if the node is in recovery mode
4086 // and if that race happens, the code here would just cause a second
4087 // cascading recovery.
4088 if (ret!=0) {
4089 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
4090 ctdb_set_culprit(rec, ctdb->pnn);
4091 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
4093 #endif
4098 the main monitoring loop
4100 static void monitor_cluster(struct ctdb_context *ctdb)
4102 struct ctdb_recoverd *rec;
4104 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
4106 rec = talloc_zero(ctdb, struct ctdb_recoverd);
4107 CTDB_NO_MEMORY_FATAL(ctdb, rec);
4109 rec->ctdb = ctdb;
4111 rec->takeover_run_in_progress = false;
4113 rec->priority_time = timeval_current();
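/* Note: priority_time is used by the recovery master election; all else
   being equal, the longest-running recovery daemon (earliest
   priority_time) is preferred. */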
4115 /* register a message port for sending memory dumps */
4116 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
4118 /* register a message port for requesting logs */
4119 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_GETLOG, getlog_handler, rec);
4121 /* register a message port for clearing logs */
4122 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_CLEARLOG, clearlog_handler, rec);
4124 /* register a message port for recovery elections */
4125 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
4127 /* when nodes are disabled/enabled */
4128 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
4130 /* when we are asked to push out a flag change */
4131 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
4133 /* register a message port for vacuum fetch */
4134 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
4136 /* register a message port for reloadnodes */
4137 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
4139 /* register a message port for performing a takeover run */
4140 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
4142 /* register a message port for disabling the ip check for a short while */
4143 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
4145 /* register a message port for updating the recovery daemon's node assignment for an ip */
4146 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);
4148 /* register a message port for forcing a rebalance of a node at the next
4149 reallocation */
4150 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
4152 /* Register a message port for disabling takeover runs */
4153 ctdb_client_set_message_handler(ctdb,
4154 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
4155 disable_takeover_runs_handler, rec);
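/* All handlers are registered; enter the endless monitoring loop.  Each
   iteration runs main_loop() under its own temporary talloc context so
   that per-iteration allocations are released straight away. */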
4157 for (;;) {
4158 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
4159 struct timeval start;
4160 double elapsed;
4162 if (!mem_ctx) {
4163 DEBUG(DEBUG_CRIT,(__location__
4164 " Failed to create temp context\n"));
4165 exit(-1);
4168 start = timeval_current();
4169 main_loop(ctdb, rec, mem_ctx);
4170 talloc_free(mem_ctx);
4172 /* we only check for recovery once every recover_interval seconds */
4173 elapsed = timeval_elapsed(&start);
4174 if (elapsed < ctdb->tunable.recover_interval) {
4175 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
4176 - elapsed);
4182 event handler for when the main ctdbd dies
4184 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
4185 uint16_t flags, void *private_data)
4187 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
4188 _exit(1);
4192 called regularly to verify that the recovery daemon is still running
4194 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
4195 struct timeval yt, void *p)
4197 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
4199 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
4200 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
4202 event_add_timed(ctdb->ev, ctdb, timeval_zero(),
4203 ctdb_restart_recd, ctdb);
4205 return;
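/* The recovery daemon is alive; schedule the next liveness check in
   30 seconds. */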
4208 event_add_timed(ctdb->ev, ctdb->recd_ctx,
4209 timeval_current_ofs(30, 0),
4210 ctdb_check_recd, ctdb);
4213 static void recd_sig_child_handler(struct event_context *ev,
4214 struct signal_event *se, int signum, int count,
4215 void *dont_care,
4216 void *private_data)
4218 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4219 int status;
4220 pid_t pid = -1;
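/* Reap every exited child without blocking so that no zombies are left
   behind. */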
4222 while (pid != 0) {
4223 pid = waitpid(-1, &status, WNOHANG);
4224 if (pid == -1) {
4225 if (errno != ECHILD) {
4226 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
4228 return;
4230 if (pid > 0) {
4231 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
4237 start up the recovery daemon as a child of the main ctdb daemon
4239 int ctdb_start_recoverd(struct ctdb_context *ctdb)
4241 int fd[2];
4242 struct signal_event *se;
4243 struct tevent_fd *fde;
4245 if (pipe(fd) != 0) {
4246 return -1;
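/* The pipe is used only to detect the death of the parent ctdbd: the
   parent keeps fd[1] open, the child watches fd[0], and when the parent
   exits fd[0] becomes readable (EOF) and ctdb_recoverd_parent()
   terminates the child. */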
4249 ctdb->ctdbd_pid = getpid();
4251 ctdb->recoverd_pid = ctdb_fork_no_free_ringbuffer(ctdb);
4252 if (ctdb->recoverd_pid == -1) {
4253 return -1;
4256 if (ctdb->recoverd_pid != 0) {
4257 talloc_free(ctdb->recd_ctx);
4258 ctdb->recd_ctx = talloc_new(ctdb);
4259 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
4261 close(fd[0]);
4262 event_add_timed(ctdb->ev, ctdb->recd_ctx,
4263 timeval_current_ofs(30, 0),
4264 ctdb_check_recd, ctdb);
4265 return 0;
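/* From this point on we are in the child process, i.e. the recovery
   daemon itself. */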
4268 close(fd[1]);
4270 srandom(getpid() ^ time(NULL));
4272 /* Clear the log ringbuffer */
4273 ctdb_clear_log(ctdb);
4275 ctdb_set_process_name("ctdb_recoverd");
4276 if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
4277 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
4278 exit(1);
4281 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
4283 fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
4284 ctdb_recoverd_parent, &fd[0]);
4285 tevent_fd_set_auto_close(fde);
4287 /* set up a handler to pick up sigchld */
4288 se = event_add_signal(ctdb->ev, ctdb,
4289 SIGCHLD, 0,
4290 recd_sig_child_handler,
4291 ctdb);
4292 if (se == NULL) {
4293 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
4294 exit(1);
4297 monitor_cluster(ctdb);
4299 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
4300 return -1;
4304 shut down the recovery daemon
4306 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
4308 if (ctdb->recoverd_pid == 0) {
4309 return;
4312 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
4313 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
4315 TALLOC_FREE(ctdb->recd_ctx);
4316 TALLOC_FREE(ctdb->recd_ping_count);
4319 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te,
4320 struct timeval t, void *private_data)
4322 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4324 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
4325 ctdb_stop_recoverd(ctdb);
4326 ctdb_start_recoverd(ctdb);