recoverd: fix a comment typo
[Samba.git] / ctdb / server / ctdb_recoverd.c
blob13949cac38893dc6047d2c83fad3aeb84ec1d4ae
1 /*
2 ctdb recovery daemon
4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
20 #include "includes.h"
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
25 #include "popt.h"
26 #include "cmdline.h"
27 #include "../include/ctdb_client.h"
28 #include "../include/ctdb_private.h"
29 #include "db_wrap.h"
30 #include "dlinklist.h"
33 /* most recent reload all ips request we need to perform during the
34 next monitoring loop
36 struct reloadips_all_reply *reload_all_ips_request = NULL;
38 /* list of "ctdb ipreallocate" processes to call back when we have
39 finished the takeover run.
41 struct ip_reallocate_list {
42 struct ip_reallocate_list *next;
43 struct rd_memdump_reply *rd;
46 struct ctdb_banning_state {
47 uint32_t count;
48 struct timeval last_reported_time;
52 private state of recovery daemon
54 struct ctdb_recoverd {
55 struct ctdb_context *ctdb;
56 uint32_t recmaster;
57 uint32_t num_active;
58 uint32_t num_connected;
59 uint32_t last_culprit_node;
60 struct ctdb_node_map *nodemap;
61 struct timeval priority_time;
62 bool need_takeover_run;
63 bool need_recovery;
64 uint32_t node_flags;
65 struct timed_event *send_election_te;
66 struct timed_event *election_timeout;
67 struct vacuum_info *vacuum_info;
68 TALLOC_CTX *ip_reallocate_ctx;
69 struct ip_reallocate_list *reallocate_callers;
70 TALLOC_CTX *ip_check_disable_ctx;
71 struct ctdb_control_get_ifaces *ifaces;
72 TALLOC_CTX *deferred_rebalance_ctx;
/* timeout for a single control, driven by the RecoverTimeout tunable */
#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
/* length of one monitoring interval, driven by the RecoverInterval tunable */
#define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)

static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data);
81 ban a node for a period of time
83 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
85 int ret;
86 struct ctdb_context *ctdb = rec->ctdb;
87 struct ctdb_ban_time bantime;
89 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
91 if (!ctdb_validate_pnn(ctdb, pnn)) {
92 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
93 return;
96 bantime.pnn = pnn;
97 bantime.time = ban_time;
99 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
100 if (ret != 0) {
101 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
102 return;
/* outcome of one monitoring pass; drives what the recovery daemon does next */
enum monitor_result {
	MONITOR_OK,
	MONITOR_RECOVERY_NEEDED,
	MONITOR_ELECTION_NEEDED,
	MONITOR_FAILED
};
111 remember the trouble maker
113 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
115 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
116 struct ctdb_banning_state *ban_state;
118 if (culprit > ctdb->num_nodes) {
119 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
120 return;
123 if (ctdb->nodes[culprit]->ban_state == NULL) {
124 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
125 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
129 ban_state = ctdb->nodes[culprit]->ban_state;
130 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
131 /* this was the first time in a long while this node
132 misbehaved so we will forgive any old transgressions.
134 ban_state->count = 0;
137 ban_state->count += count;
138 ban_state->last_reported_time = timeval_current();
139 rec->last_culprit_node = culprit;
143 remember the trouble maker
145 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
147 ctdb_set_culprit_count(rec, culprit, 1);
151 /* this callback is called for every node that failed to execute the
152 recovered event
154 static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
156 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
158 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));
160 ctdb_set_culprit(rec, node_pnn);
164 run the "recovered" eventscript on all nodes
166 static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, const char *caller)
168 TALLOC_CTX *tmp_ctx;
169 uint32_t *nodes;
170 struct ctdb_context *ctdb = rec->ctdb;
172 tmp_ctx = talloc_new(ctdb);
173 CTDB_NO_MEMORY(ctdb, tmp_ctx);
175 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
176 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
177 nodes, 0,
178 CONTROL_TIMEOUT(), false, tdb_null,
179 NULL, recovered_fail_callback,
180 rec) != 0) {
181 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
183 talloc_free(tmp_ctx);
184 return -1;
187 talloc_free(tmp_ctx);
188 return 0;
191 /* this callback is called for every node that failed to execute the
192 start recovery event
194 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
196 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
198 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
200 ctdb_set_culprit(rec, node_pnn);
204 run the "startrecovery" eventscript on all nodes
206 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
208 TALLOC_CTX *tmp_ctx;
209 uint32_t *nodes;
210 struct ctdb_context *ctdb = rec->ctdb;
212 tmp_ctx = talloc_new(ctdb);
213 CTDB_NO_MEMORY(ctdb, tmp_ctx);
215 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
216 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
217 nodes, 0,
218 CONTROL_TIMEOUT(), false, tdb_null,
219 NULL,
220 startrecovery_fail_callback,
221 rec) != 0) {
222 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
223 talloc_free(tmp_ctx);
224 return -1;
227 talloc_free(tmp_ctx);
228 return 0;
231 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
233 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
234 DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
235 return;
237 if (node_pnn < ctdb->num_nodes) {
238 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
241 if (node_pnn == ctdb->pnn) {
242 ctdb->capabilities = ctdb->nodes[node_pnn]->capabilities;
247 update the node capabilities for all connected nodes
249 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
251 uint32_t *nodes;
252 TALLOC_CTX *tmp_ctx;
254 tmp_ctx = talloc_new(ctdb);
255 CTDB_NO_MEMORY(ctdb, tmp_ctx);
257 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
258 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
259 nodes, 0,
260 CONTROL_TIMEOUT(),
261 false, tdb_null,
262 async_getcap_callback, NULL,
263 NULL) != 0) {
264 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
265 talloc_free(tmp_ctx);
266 return -1;
269 talloc_free(tmp_ctx);
270 return 0;
273 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
275 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
277 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
278 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
281 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
283 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
285 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
286 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
290 change recovery mode on all nodes
292 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
294 TDB_DATA data;
295 uint32_t *nodes;
296 TALLOC_CTX *tmp_ctx;
298 tmp_ctx = talloc_new(ctdb);
299 CTDB_NO_MEMORY(ctdb, tmp_ctx);
301 /* freeze all nodes */
302 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
303 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
304 int i;
306 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
307 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
308 nodes, i,
309 CONTROL_TIMEOUT(),
310 false, tdb_null,
311 NULL,
312 set_recmode_fail_callback,
313 rec) != 0) {
314 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
315 talloc_free(tmp_ctx);
316 return -1;
322 data.dsize = sizeof(uint32_t);
323 data.dptr = (unsigned char *)&rec_mode;
325 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
326 nodes, 0,
327 CONTROL_TIMEOUT(),
328 false, data,
329 NULL, NULL,
330 NULL) != 0) {
331 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
332 talloc_free(tmp_ctx);
333 return -1;
336 talloc_free(tmp_ctx);
337 return 0;
341 change recovery master on all node
343 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
345 TDB_DATA data;
346 TALLOC_CTX *tmp_ctx;
347 uint32_t *nodes;
349 tmp_ctx = talloc_new(ctdb);
350 CTDB_NO_MEMORY(ctdb, tmp_ctx);
352 data.dsize = sizeof(uint32_t);
353 data.dptr = (unsigned char *)&pnn;
355 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
356 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
357 nodes, 0,
358 CONTROL_TIMEOUT(), false, data,
359 NULL, NULL,
360 NULL) != 0) {
361 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
362 talloc_free(tmp_ctx);
363 return -1;
366 talloc_free(tmp_ctx);
367 return 0;
370 /* update all remote nodes to use the same db priority that we have
371 this can fail if the remove node has not yet been upgraded to
372 support this function, so we always return success and never fail
373 a recovery if this call fails.
375 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
376 struct ctdb_node_map *nodemap,
377 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
379 int db;
380 uint32_t *nodes;
382 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
384 /* step through all local databases */
385 for (db=0; db<dbmap->num;db++) {
386 TDB_DATA data;
387 struct ctdb_db_priority db_prio;
388 int ret;
390 db_prio.db_id = dbmap->dbs[db].dbid;
391 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
392 if (ret != 0) {
393 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
394 continue;
397 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
399 data.dptr = (uint8_t *)&db_prio;
400 data.dsize = sizeof(db_prio);
402 if (ctdb_client_async_control(ctdb,
403 CTDB_CONTROL_SET_DB_PRIORITY,
404 nodes, 0,
405 CONTROL_TIMEOUT(), false, data,
406 NULL, NULL,
407 NULL) != 0) {
408 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n", db_prio.db_id));
412 return 0;
416 ensure all other nodes have attached to any databases that we have
418 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
419 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
421 int i, j, db, ret;
422 struct ctdb_dbid_map *remote_dbmap;
424 /* verify that all other nodes have all our databases */
425 for (j=0; j<nodemap->num; j++) {
426 /* we dont need to ourself ourselves */
427 if (nodemap->nodes[j].pnn == pnn) {
428 continue;
430 /* dont check nodes that are unavailable */
431 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
432 continue;
435 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
436 mem_ctx, &remote_dbmap);
437 if (ret != 0) {
438 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
439 return -1;
442 /* step through all local databases */
443 for (db=0; db<dbmap->num;db++) {
444 const char *name;
447 for (i=0;i<remote_dbmap->num;i++) {
448 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
449 break;
452 /* the remote node already have this database */
453 if (i!=remote_dbmap->num) {
454 continue;
456 /* ok so we need to create this database */
457 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
458 mem_ctx, &name);
459 if (ret != 0) {
460 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
461 return -1;
463 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
464 mem_ctx, name,
465 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
466 if (ret != 0) {
467 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
468 return -1;
473 return 0;
478 ensure we are attached to any databases that anyone else is attached to
480 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
481 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
483 int i, j, db, ret;
484 struct ctdb_dbid_map *remote_dbmap;
486 /* verify that we have all database any other node has */
487 for (j=0; j<nodemap->num; j++) {
488 /* we dont need to ourself ourselves */
489 if (nodemap->nodes[j].pnn == pnn) {
490 continue;
492 /* dont check nodes that are unavailable */
493 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
494 continue;
497 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
498 mem_ctx, &remote_dbmap);
499 if (ret != 0) {
500 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
501 return -1;
504 /* step through all databases on the remote node */
505 for (db=0; db<remote_dbmap->num;db++) {
506 const char *name;
508 for (i=0;i<(*dbmap)->num;i++) {
509 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
510 break;
513 /* we already have this db locally */
514 if (i!=(*dbmap)->num) {
515 continue;
517 /* ok so we need to create this database and
518 rebuild dbmap
520 ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
521 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
522 if (ret != 0) {
523 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
524 nodemap->nodes[j].pnn));
525 return -1;
527 ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
528 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
529 if (ret != 0) {
530 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
531 return -1;
533 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
534 if (ret != 0) {
535 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
536 return -1;
541 return 0;
546 pull the remote database contents from one node into the recdb
548 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
549 struct tdb_wrap *recdb, uint32_t dbid)
551 int ret;
552 TDB_DATA outdata;
553 struct ctdb_marshall_buffer *reply;
554 struct ctdb_rec_data *rec;
555 int i;
556 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
558 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
559 CONTROL_TIMEOUT(), &outdata);
560 if (ret != 0) {
561 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
562 talloc_free(tmp_ctx);
563 return -1;
566 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
568 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
569 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
570 talloc_free(tmp_ctx);
571 return -1;
574 rec = (struct ctdb_rec_data *)&reply->data[0];
576 for (i=0;
577 i<reply->count;
578 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
579 TDB_DATA key, data;
580 struct ctdb_ltdb_header *hdr;
581 TDB_DATA existing;
583 key.dptr = &rec->data[0];
584 key.dsize = rec->keylen;
585 data.dptr = &rec->data[key.dsize];
586 data.dsize = rec->datalen;
588 hdr = (struct ctdb_ltdb_header *)data.dptr;
590 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
591 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
592 talloc_free(tmp_ctx);
593 return -1;
596 /* fetch the existing record, if any */
597 existing = tdb_fetch(recdb->tdb, key);
599 if (existing.dptr != NULL) {
600 struct ctdb_ltdb_header header;
601 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
602 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
603 (unsigned)existing.dsize, srcnode));
604 free(existing.dptr);
605 talloc_free(tmp_ctx);
606 return -1;
608 header = *(struct ctdb_ltdb_header *)existing.dptr;
609 free(existing.dptr);
610 if (!(header.rsn < hdr->rsn ||
611 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
612 continue;
616 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
617 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
618 talloc_free(tmp_ctx);
619 return -1;
623 talloc_free(tmp_ctx);
625 return 0;
629 struct pull_seqnum_cbdata {
630 int failed;
631 uint32_t pnn;
632 uint64_t seqnum;
635 static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
637 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
638 uint64_t seqnum;
640 if (cb_data->failed != 0) {
641 DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
642 return;
645 if (res != 0) {
646 DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
647 cb_data->failed = 1;
648 return;
651 if (outdata.dsize != sizeof(uint64_t)) {
652 DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
653 cb_data->failed = -1;
654 return;
657 seqnum = *((uint64_t *)outdata.dptr);
659 if (seqnum > cb_data->seqnum) {
660 cb_data->seqnum = seqnum;
661 cb_data->pnn = node_pnn;
665 static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
667 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
669 DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
670 cb_data->failed = 1;
673 static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
674 struct ctdb_recoverd *rec,
675 struct ctdb_node_map *nodemap,
676 struct tdb_wrap *recdb, uint32_t dbid)
678 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
679 uint32_t *nodes;
680 TDB_DATA data;
681 uint32_t outdata[2];
682 struct pull_seqnum_cbdata *cb_data;
684 DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));
686 outdata[0] = dbid;
687 outdata[1] = 0;
689 data.dsize = sizeof(outdata);
690 data.dptr = (uint8_t *)&outdata[0];
692 cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
693 if (cb_data == NULL) {
694 DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
695 talloc_free(tmp_ctx);
696 return -1;
699 cb_data->failed = 0;
700 cb_data->pnn = -1;
701 cb_data->seqnum = 0;
703 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
704 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
705 nodes, 0,
706 CONTROL_TIMEOUT(), false, data,
707 pull_seqnum_cb,
708 pull_seqnum_fail_cb,
709 cb_data) != 0) {
710 DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));
712 talloc_free(tmp_ctx);
713 return -1;
716 if (cb_data->failed != 0) {
717 DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
718 talloc_free(tmp_ctx);
719 return -1;
722 if (cb_data->seqnum == 0 || cb_data->pnn == -1) {
723 DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
724 talloc_free(tmp_ctx);
725 return -1;
728 DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum));
730 if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
731 DEBUG(DEBUG_ERR, ("Failed to pull higest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
732 talloc_free(tmp_ctx);
733 return -1;
736 talloc_free(tmp_ctx);
737 return 0;
742 pull all the remote database contents into the recdb
744 static int pull_remote_database(struct ctdb_context *ctdb,
745 struct ctdb_recoverd *rec,
746 struct ctdb_node_map *nodemap,
747 struct tdb_wrap *recdb, uint32_t dbid,
748 bool persistent)
750 int j;
752 if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
753 int ret;
754 ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
755 if (ret == 0) {
756 return 0;
760 /* pull all records from all other nodes across onto this node
761 (this merges based on rsn)
763 for (j=0; j<nodemap->num; j++) {
764 /* dont merge from nodes that are unavailable */
765 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
766 continue;
768 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
769 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
770 nodemap->nodes[j].pnn));
771 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
772 return -1;
776 return 0;
781 update flags on all active nodes
783 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
785 int ret;
787 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
788 if (ret != 0) {
789 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
790 return -1;
793 return 0;
797 ensure all nodes have the same vnnmap we do
799 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
800 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
802 int j, ret;
804 /* push the new vnn map out to all the nodes */
805 for (j=0; j<nodemap->num; j++) {
806 /* dont push to nodes that are unavailable */
807 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
808 continue;
811 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
812 if (ret != 0) {
813 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
814 return -1;
818 return 0;
822 struct vacuum_info {
823 struct vacuum_info *next, *prev;
824 struct ctdb_recoverd *rec;
825 uint32_t srcnode;
826 struct ctdb_db_context *ctdb_db;
827 struct ctdb_marshall_buffer *recs;
828 struct ctdb_rec_data *r;
831 static void vacuum_fetch_next(struct vacuum_info *v);
834 called when a vacuum fetch has completed - just free it and do the next one
836 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
838 struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
839 talloc_free(state);
840 vacuum_fetch_next(v);
845 process the next element from the vacuum list
847 static void vacuum_fetch_next(struct vacuum_info *v)
849 struct ctdb_call call;
850 struct ctdb_rec_data *r;
852 while (v->recs->count) {
853 struct ctdb_client_call_state *state;
854 TDB_DATA data;
855 struct ctdb_ltdb_header *hdr;
857 ZERO_STRUCT(call);
858 call.call_id = CTDB_NULL_FUNC;
859 call.flags = CTDB_IMMEDIATE_MIGRATION;
860 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
862 r = v->r;
863 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
864 v->recs->count--;
866 call.key.dptr = &r->data[0];
867 call.key.dsize = r->keylen;
869 /* ensure we don't block this daemon - just skip a record if we can't get
870 the chainlock */
871 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
872 continue;
875 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
876 if (data.dptr == NULL) {
877 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
878 continue;
881 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
882 free(data.dptr);
883 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
884 continue;
887 hdr = (struct ctdb_ltdb_header *)data.dptr;
888 if (hdr->dmaster == v->rec->ctdb->pnn) {
889 /* its already local */
890 free(data.dptr);
891 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
892 continue;
895 free(data.dptr);
897 state = ctdb_call_send(v->ctdb_db, &call);
898 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
899 if (state == NULL) {
900 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
901 talloc_free(v);
902 return;
904 state->async.fn = vacuum_fetch_callback;
905 state->async.private_data = v;
906 return;
909 talloc_free(v);
914 destroy a vacuum info structure
916 static int vacuum_info_destructor(struct vacuum_info *v)
918 DLIST_REMOVE(v->rec->vacuum_info, v);
919 return 0;
924 handler for vacuum fetch
926 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
927 TDB_DATA data, void *private_data)
929 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
930 struct ctdb_marshall_buffer *recs;
931 int ret, i;
932 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
933 const char *name;
934 struct ctdb_dbid_map *dbmap=NULL;
935 bool persistent = false;
936 struct ctdb_db_context *ctdb_db;
937 struct ctdb_rec_data *r;
938 uint32_t srcnode;
939 struct vacuum_info *v;
941 recs = (struct ctdb_marshall_buffer *)data.dptr;
942 r = (struct ctdb_rec_data *)&recs->data[0];
944 if (recs->count == 0) {
945 talloc_free(tmp_ctx);
946 return;
949 srcnode = r->reqid;
951 for (v=rec->vacuum_info;v;v=v->next) {
952 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
953 /* we're already working on records from this node */
954 talloc_free(tmp_ctx);
955 return;
959 /* work out if the database is persistent */
960 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
961 if (ret != 0) {
962 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
963 talloc_free(tmp_ctx);
964 return;
967 for (i=0;i<dbmap->num;i++) {
968 if (dbmap->dbs[i].dbid == recs->db_id) {
969 persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
970 break;
973 if (i == dbmap->num) {
974 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
975 talloc_free(tmp_ctx);
976 return;
979 /* find the name of this database */
980 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
981 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
982 talloc_free(tmp_ctx);
983 return;
986 /* attach to it */
987 ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
988 if (ctdb_db == NULL) {
989 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
990 talloc_free(tmp_ctx);
991 return;
994 v = talloc_zero(rec, struct vacuum_info);
995 if (v == NULL) {
996 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
997 talloc_free(tmp_ctx);
998 return;
1001 v->rec = rec;
1002 v->srcnode = srcnode;
1003 v->ctdb_db = ctdb_db;
1004 v->recs = talloc_memdup(v, recs, data.dsize);
1005 if (v->recs == NULL) {
1006 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
1007 talloc_free(v);
1008 talloc_free(tmp_ctx);
1009 return;
1011 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
1013 DLIST_ADD(rec->vacuum_info, v);
1015 talloc_set_destructor(v, vacuum_info_destructor);
1017 vacuum_fetch_next(v);
1018 talloc_free(tmp_ctx);
1023 called when ctdb_wait_timeout should finish
1025 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
1026 struct timeval yt, void *p)
1028 uint32_t *timed_out = (uint32_t *)p;
1029 (*timed_out) = 1;
1033 wait for a given number of seconds
1035 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
1037 uint32_t timed_out = 0;
1038 time_t usecs = (secs - (time_t)secs) * 1000000;
1039 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), ctdb_wait_handler, &timed_out);
1040 while (!timed_out) {
1041 event_loop_once(ctdb->ev);
1046 called when an election times out (ends)
1048 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
1049 struct timeval t, void *p)
1051 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1052 rec->election_timeout = NULL;
1053 fast_start = false;
1055 DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
1060 wait for an election to finish. It finished election_timeout seconds after
1061 the last election packet is received
1063 static void ctdb_wait_election(struct ctdb_recoverd *rec)
1065 struct ctdb_context *ctdb = rec->ctdb;
1066 while (rec->election_timeout) {
1067 event_loop_once(ctdb->ev);
1072 Update our local flags from all remote connected nodes.
1073 This is only run when we are or we belive we are the recovery master
1075 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
1077 int j;
1078 struct ctdb_context *ctdb = rec->ctdb;
1079 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1081 /* get the nodemap for all active remote nodes and verify
1082 they are the same as for this node
1084 for (j=0; j<nodemap->num; j++) {
1085 struct ctdb_node_map *remote_nodemap=NULL;
1086 int ret;
1088 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
1089 continue;
1091 if (nodemap->nodes[j].pnn == ctdb->pnn) {
1092 continue;
1095 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1096 mem_ctx, &remote_nodemap);
1097 if (ret != 0) {
1098 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
1099 nodemap->nodes[j].pnn));
1100 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
1101 talloc_free(mem_ctx);
1102 return MONITOR_FAILED;
1104 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
1105 /* We should tell our daemon about this so it
1106 updates its flags or else we will log the same
1107 message again in the next iteration of recovery.
1108 Since we are the recovery master we can just as
1109 well update the flags on all nodes.
1111 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, nodemap->nodes[j].flags, ~nodemap->nodes[j].flags);
1112 if (ret != 0) {
1113 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
1114 return -1;
1117 /* Update our local copy of the flags in the recovery
1118 daemon.
1120 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1121 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
1122 nodemap->nodes[j].flags));
1123 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1125 talloc_free(remote_nodemap);
1127 talloc_free(mem_ctx);
1128 return MONITOR_OK;
1132 /* Create a new random generation ip.
1133 The generation id can not be the INVALID_GENERATION id
1135 static uint32_t new_generation(void)
1137 uint32_t generation;
1139 while (1) {
1140 generation = random();
1142 if (generation != INVALID_GENERATION) {
1143 break;
1147 return generation;
1152 create a temporary working database
1154 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1156 char *name;
1157 struct tdb_wrap *recdb;
1158 unsigned tdb_flags;
1160 /* open up the temporary recovery database */
1161 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1162 ctdb->db_directory_state,
1163 ctdb->pnn);
1164 if (name == NULL) {
1165 return NULL;
1167 unlink(name);
1169 tdb_flags = TDB_NOLOCK;
1170 if (ctdb->valgrinding) {
1171 tdb_flags |= TDB_NOMMAP;
1173 tdb_flags |= TDB_DISALLOW_NESTING;
1175 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1176 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1177 if (recdb == NULL) {
1178 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1181 talloc_free(name);
1183 return recdb;
1188 a traverse function for pulling all relevant records from recdb
/* Accumulator passed through traverse_recdb() while marshalling the
 * temporary recovery database into one PUSH_DB blob. */
1190 struct recdb_data {
1191 struct ctdb_context *ctdb;
/* the marshall blob being assembled */
1192 struct ctdb_marshall_buffer *recdata;
/* bytes of recdata currently in use */
1193 uint32_t len;
/* bytes currently allocated for recdata */
1194 uint32_t allocated_len;
/* set to true if marshalling failed; the traverse is then aborted */
1195 bool failed;
/* persistent databases keep their stored dmaster untouched */
1196 bool persistent;
/* tdb_traverse_read() callback: append one record from the temporary
 * recovery db to the marshall buffer, taking over as dmaster for
 * non-persistent databases.  Returns 0 to continue, -1 (aborting the
 * traverse, with params->failed set) on allocation failure. */
1199 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1201 struct recdb_data *params = (struct recdb_data *)p;
1202 struct ctdb_rec_data *rec;
1203 struct ctdb_ltdb_header *hdr;
1205 /* skip empty records */
1206 if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1207 return 0;
1210 /* update the dmaster field to point to us */
1211 hdr = (struct ctdb_ltdb_header *)data.dptr;
1212 if (!params->persistent) {
1213 hdr->dmaster = params->ctdb->pnn;
1214 hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
1217 /* add the record to the blob ready to send to the nodes */
1218 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1219 if (rec == NULL) {
1220 params->failed = true;
1221 return -1;
/* grow the blob with a preallocation slack so we do not realloc
 * once per record */
1223 if (params->len + rec->length >= params->allocated_len) {
1224 params->allocated_len = rec->length + params->len + params->ctdb->tunable.pulldb_preallocation_size;
1225 params->recdata = talloc_realloc_size(NULL, params->recdata, params->allocated_len);
1227 if (params->recdata == NULL) {
/* NOTE(review): params->recdata is NULL inside this branch, so the
 * DEBUG below dereferences it via params->recdata->count — confirm */
1228 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u (%u records)\n",
1229 rec->length + params->len, params->recdata->count));
1230 params->failed = true;
1231 return -1;
1233 params->recdata->count++;
1234 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1235 params->len += rec->length;
1236 talloc_free(rec);
1238 return 0;
1242 push the recdb database out to all nodes
/* Marshall the whole temporary recovery db into a single blob and push
 * it to all active nodes with CTDB_CONTROL_PUSH_DB.  Returns 0 on
 * success, -1 on traverse or push failure. */
1244 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1245 bool persistent,
1246 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1248 struct recdb_data params;
1249 struct ctdb_marshall_buffer *recdata;
1250 TDB_DATA outdata;
1251 TALLOC_CTX *tmp_ctx;
1252 uint32_t *nodes;
1254 tmp_ctx = talloc_new(ctdb);
1255 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1257 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1258 CTDB_NO_MEMORY(ctdb, recdata);
1260 recdata->db_id = dbid;
/* seed the accumulator: len starts at the marshall header size */
1262 params.ctdb = ctdb;
1263 params.recdata = recdata;
1264 params.len = offsetof(struct ctdb_marshall_buffer, data);
1265 params.allocated_len = params.len;
1266 params.failed = false;
1267 params.persistent = persistent;
1269 if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
1270 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1271 talloc_free(params.recdata);
1272 talloc_free(tmp_ctx);
1273 return -1;
1276 if (params.failed) {
1277 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1278 talloc_free(params.recdata);
1279 talloc_free(tmp_ctx);
1280 return -1;
/* the traverse may have reallocated the buffer; pick up the
 * current pointer */
1283 recdata = params.recdata;
1285 outdata.dptr = (void *)recdata;
1286 outdata.dsize = params.len;
1288 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1289 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1290 nodes, 0,
1291 CONTROL_TIMEOUT(), false, outdata,
1292 NULL, NULL,
1293 NULL) != 0) {
1294 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1295 talloc_free(recdata);
1296 talloc_free(tmp_ctx);
1297 return -1;
1300 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1301 dbid, recdata->count));
1303 talloc_free(recdata);
1304 talloc_free(tmp_ctx);
1306 return 0;
1311 go through a full recovery on one database
/* Run a full recovery on one database: pull every node's copy into a
 * local scratch tdb, wipe the database on all active nodes (inside the
 * cluster-wide transaction identified by transaction_id), then push the
 * merged copy back out.  Returns 0 on success, -1 on any failure. */
1313 static int recover_database(struct ctdb_recoverd *rec,
1314 TALLOC_CTX *mem_ctx,
1315 uint32_t dbid,
1316 bool persistent,
1317 uint32_t pnn,
1318 struct ctdb_node_map *nodemap,
1319 uint32_t transaction_id)
1321 struct tdb_wrap *recdb;
1322 int ret;
1323 struct ctdb_context *ctdb = rec->ctdb;
1324 TDB_DATA data;
1325 struct ctdb_control_wipe_database w;
1326 uint32_t *nodes;
1328 recdb = create_recdb(ctdb, mem_ctx);
1329 if (recdb == NULL) {
1330 return -1;
1333 /* pull all remote databases onto the recdb */
1334 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1335 if (ret != 0) {
1336 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1337 return -1;
1340 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1342 /* wipe all the remote databases. This is safe as we are in a transaction */
1343 w.db_id = dbid;
1344 w.transaction_id = transaction_id;
1346 data.dptr = (void *)&w;
1347 data.dsize = sizeof(w);
1349 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1350 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1351 nodes, 0,
1352 CONTROL_TIMEOUT(), false, data,
1353 NULL, NULL,
1354 NULL) != 0) {
1355 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1356 talloc_free(recdb);
1357 return -1;
1360 /* push out the correct database. This sets the dmaster and skips
1361 the empty records */
1362 ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1363 if (ret != 0) {
1364 talloc_free(recdb);
1365 return -1;
1368 /* all done with this database */
1369 talloc_free(recdb);
1371 return 0;
1375 reload the nodes file
/* Re-read the nodes file.  The old ctdb->nodes pointer is simply
 * overwritten with NULL before ctdb_load_nodes_file() repopulates it. */
1377 static void reload_nodes_file(struct ctdb_context *ctdb)
1379 ctdb->nodes = NULL;
1380 ctdb_load_nodes_file(ctdb);
/* Refresh the known and available public-IP lists for every node in
 * ctdb->nodes from the nodes themselves.  Inactive nodes only have
 * their cached lists dropped.  On failure returns -1 and, when culprit
 * is non-NULL, stores the pnn of the node that failed. */
1383 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1384 struct ctdb_recoverd *rec,
1385 struct ctdb_node_map *nodemap,
1386 uint32_t *culprit)
1388 int j;
1389 int ret;
/* sanity check: the nodemap and our nodes array must describe the
 * same set of nodes, since they are indexed in parallel below */
1391 if (ctdb->num_nodes != nodemap->num) {
1392 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1393 ctdb->num_nodes, nodemap->num));
1394 if (culprit) {
1395 *culprit = ctdb->pnn;
1397 return -1;
1400 for (j=0; j<nodemap->num; j++) {
1401 /* release any existing data */
1402 if (ctdb->nodes[j]->known_public_ips) {
1403 talloc_free(ctdb->nodes[j]->known_public_ips);
1404 ctdb->nodes[j]->known_public_ips = NULL;
1406 if (ctdb->nodes[j]->available_public_ips) {
1407 talloc_free(ctdb->nodes[j]->available_public_ips);
1408 ctdb->nodes[j]->available_public_ips = NULL;
1411 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1412 continue;
1415 /* grab a new shiny list of public ips from the node */
1416 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1417 CONTROL_TIMEOUT(),
1418 ctdb->nodes[j]->pnn,
1419 ctdb->nodes,
1421 &ctdb->nodes[j]->known_public_ips);
1422 if (ret != 0) {
1423 DEBUG(DEBUG_ERR,("Failed to read known public ips from node : %u\n",
1424 ctdb->nodes[j]->pnn));
1425 if (culprit) {
1426 *culprit = ctdb->nodes[j]->pnn;
1428 return -1;
/* only verify the allocation while IP checks are not disabled */
1431 if (ctdb->do_checkpublicip) {
1432 if (rec->ip_check_disable_ctx == NULL) {
1433 if (verify_remote_ip_allocation(ctdb, ctdb->nodes[j]->known_public_ips)) {
1434 DEBUG(DEBUG_ERR,("Node %d has inconsistent public ip allocation and needs update.\n", ctdb->nodes[j]->pnn));
1435 rec->need_takeover_run = true;
1440 /* grab a new shiny list of public ips from the node */
1441 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1442 CONTROL_TIMEOUT(),
1443 ctdb->nodes[j]->pnn,
1444 ctdb->nodes,
1445 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1446 &ctdb->nodes[j]->available_public_ips);
1447 if (ret != 0) {
1448 DEBUG(DEBUG_ERR,("Failed to read available public ips from node : %u\n",
1449 ctdb->nodes[j]->pnn));
1450 if (culprit) {
1451 *culprit = ctdb->nodes[j]->pnn;
1453 return -1;
1457 return 0;
1460 /* when we start a recovery, make sure all nodes use the same reclock file
1461 setting
1463 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
1465 struct ctdb_context *ctdb = rec->ctdb;
1466 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
1467 TDB_DATA data;
1468 uint32_t *nodes;
1470 if (ctdb->recovery_lock_file == NULL) {
1471 data.dptr = NULL;
1472 data.dsize = 0;
1473 } else {
1474 data.dsize = strlen(ctdb->recovery_lock_file) + 1;
1475 data.dptr = (uint8_t *)ctdb->recovery_lock_file;
1478 nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
1479 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
1480 nodes, 0,
1481 CONTROL_TIMEOUT(),
1482 false, data,
1483 NULL, NULL,
1484 rec) != 0) {
1485 DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
1486 talloc_free(tmp_ctx);
1487 return -1;
1490 talloc_free(tmp_ctx);
1491 return 0;
1496 * this callback is called for every node that failed to execute ctdb_takeover_run()
1497 * and set flag to re-run takeover run.
/* Async-control callback invoked for every node that failed
 * ctdb_takeover_run(): mark the node as recovery culprit and request
 * another takeover run. */
1499 static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
1501 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
1503 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the takeover run. Setting it as recovery fail culprit\n", node_pnn));
1505 ctdb_set_culprit(rec, node_pnn);
1506 rec->need_takeover_run = true;
1511 we are the recmaster, and recovery is needed - start a recovery run
/* Main recovery driver, run only on the recovery master.  Sequence:
 * ban flapping nodes, take the recovery lock, reconcile databases
 * (create missing, wipe+pull+push inside a cluster-wide transaction),
 * rebuild the vnnmap, resync flags and recmaster, rerun IP takeover,
 * fire the recovery event scripts and finally notify clients.
 * Returns 0 on success, -1 on any failure (need_recovery stays set so
 * the recovery is retried). */
1513 static int do_recovery(struct ctdb_recoverd *rec,
1514 TALLOC_CTX *mem_ctx, uint32_t pnn,
1515 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1517 struct ctdb_context *ctdb = rec->ctdb;
1518 int i, j, ret;
1519 uint32_t generation;
1520 struct ctdb_dbid_map *dbmap;
1521 TDB_DATA data;
1522 uint32_t *nodes;
1523 struct timeval start_time;
1524 uint32_t culprit = (uint32_t)-1;
1526 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1528 /* if recovery fails, force it again */
1529 rec->need_recovery = true;
/* ban any node that has been blamed for too many recent recoveries */
1531 for (i=0; i<ctdb->num_nodes; i++) {
1532 struct ctdb_banning_state *ban_state;
1534 if (ctdb->nodes[i]->ban_state == NULL) {
1535 continue;
1537 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1538 if (ban_state->count < 2*ctdb->num_nodes) {
1539 continue;
1541 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
1542 ctdb->nodes[i]->pnn, ban_state->count,
1543 ctdb->tunable.recovery_ban_period));
1544 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1545 ban_state->count = 0;
/* take the cluster-wide recovery lock; if we cannot get it some
 * other node holds it, so ban ourselves and bail out */
1549 if (ctdb->tunable.verify_recovery_lock != 0) {
1550 DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1551 start_time = timeval_current();
1552 if (!ctdb_recovery_lock(ctdb, true)) {
1553 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
1554 "and ban ourself for %u seconds\n",
1555 ctdb->tunable.recovery_ban_period));
1556 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
1557 return -1;
1559 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1560 DEBUG(DEBUG_NOTICE,("Recovery lock taken successfully by recovery daemon\n"));
1563 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1565 /* get a list of all databases */
1566 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1567 if (ret != 0) {
1568 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1569 return -1;
1572 /* we do the db creation before we set the recovery mode, so the freeze happens
1573 on all databases we will be dealing with. */
1575 /* verify that we have all the databases any other node has */
1576 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1577 if (ret != 0) {
1578 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1579 return -1;
1582 /* verify that all other nodes have all our databases */
1583 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1584 if (ret != 0) {
1585 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1586 return -1;
1588 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1590 /* update the database priority for all remote databases */
1591 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1592 if (ret != 0) {
1593 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1595 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1598 /* update all other nodes to use the same setting for reclock files
1599 as the local recovery master.
1601 sync_recovery_lock_file_across_cluster(rec);
1603 /* set recovery mode to active on all nodes */
1604 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1605 if (ret != 0) {
1606 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1607 return -1;
1610 /* execute the "startrecovery" event script on all nodes */
1611 ret = run_startrecovery_eventscript(rec, nodemap);
1612 if (ret!=0) {
1613 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1614 return -1;
1618 update all nodes to have the same flags that we have
1620 for (i=0;i<nodemap->num;i++) {
1621 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1622 continue;
1625 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1626 if (ret != 0) {
1627 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1628 return -1;
1632 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1634 /* pick a new generation number */
1635 generation = new_generation();
1637 /* change the vnnmap on this node to use the new generation
1638 number but not on any other nodes.
1639 this guarantees that if we abort the recovery prematurely
1640 for some reason (a node stops responding?)
1641 that we can just return immediately and we will reenter
1642 recovery shortly again.
1643 I.e. we deliberately leave the cluster with an inconsistent
1644 generation id to allow us to abort recovery at any stage and
1645 just restart it from scratch.
1647 vnnmap->generation = generation;
1648 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1649 if (ret != 0) {
1650 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1651 return -1;
/* start a cluster-wide transaction tagged with the new generation;
 * the per-database wipe/push below all happen inside it */
1654 data.dptr = (void *)&generation;
1655 data.dsize = sizeof(uint32_t);
1657 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1658 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1659 nodes, 0,
1660 CONTROL_TIMEOUT(), false, data,
1661 NULL,
1662 transaction_start_fail_callback,
1663 rec) != 0) {
1664 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1665 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1666 nodes, 0,
1667 CONTROL_TIMEOUT(), false, tdb_null,
1668 NULL,
1669 NULL,
1670 NULL) != 0) {
1671 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1673 return -1;
1676 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
1678 for (i=0;i<dbmap->num;i++) {
1679 ret = recover_database(rec, mem_ctx,
1680 dbmap->dbs[i].dbid,
1681 dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
1682 pnn, nodemap, generation);
1683 if (ret != 0) {
1684 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1685 return -1;
1689 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1691 /* commit all the changes */
1692 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1693 nodes, 0,
1694 CONTROL_TIMEOUT(), false, data,
1695 NULL, NULL,
1696 NULL) != 0) {
1697 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1698 return -1;
1701 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1704 /* update the capabilities for all nodes */
1705 ret = update_capabilities(ctdb, nodemap);
1706 if (ret!=0) {
1707 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1708 return -1;
1711 /* build a new vnn map with all the currently active and
1712 unbanned nodes */
1713 generation = new_generation();
1714 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1715 CTDB_NO_MEMORY(ctdb, vnnmap);
1716 vnnmap->generation = generation;
1717 vnnmap->size = 0;
1718 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1719 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1720 for (i=j=0;i<nodemap->num;i++) {
1721 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1722 continue;
1724 if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1725 /* this node can not be an lmaster */
1726 DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
1727 continue;
1730 vnnmap->size++;
1731 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1732 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1733 vnnmap->map[j++] = nodemap->nodes[i].pnn;
/* degenerate case: nobody advertises the lmaster capability, so
 * fall back to a one-entry map containing only ourselves */
1736 if (vnnmap->size == 0) {
1737 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1738 vnnmap->size++;
1739 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1740 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1741 vnnmap->map[0] = pnn;
1744 /* update to the new vnnmap on all nodes */
1745 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1746 if (ret != 0) {
1747 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1748 return -1;
1751 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1753 /* update recmaster to point to us for all nodes */
1754 ret = set_recovery_master(ctdb, nodemap, pnn);
1755 if (ret!=0) {
1756 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1757 return -1;
1760 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
1763 update all nodes to have the same flags that we have
1765 for (i=0;i<nodemap->num;i++) {
1766 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1767 continue;
1770 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1771 if (ret != 0) {
1772 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1773 return -1;
1777 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1779 /* disable recovery mode */
1780 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
1781 if (ret != 0) {
1782 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1783 return -1;
1786 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
1789 tell nodes to takeover their public IPs
1791 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
1792 if (ret != 0) {
1793 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
1794 culprit));
1795 rec->need_takeover_run = true;
1796 return -1;
1798 rec->need_takeover_run = false;
1799 ret = ctdb_takeover_run(ctdb, nodemap, NULL, NULL);
1800 if (ret != 0) {
1801 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. ctdb_takeover_run() failed.\n"));
1802 rec->need_takeover_run = true;
1805 /* execute the "recovered" event script on all nodes */
1806 ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
1807 if (ret!=0) {
1808 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
1809 return -1;
1812 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
1814 /* send a message to all clients telling them that the cluster
1815 has been reconfigured */
1816 ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
1818 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1820 rec->need_recovery = false;
1822 /* we managed to complete a full recovery, make sure to forgive
1823 any past sins by the nodes that could now participate in the
1824 recovery.
1826 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1827 for (i=0;i<nodemap->num;i++) {
1828 struct ctdb_banning_state *ban_state;
1830 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1831 continue;
1834 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1835 if (ban_state == NULL) {
1836 continue;
1839 ban_state->count = 0;
1843 /* We just finished a recovery successfully.
1844 We now wait for rerecovery_timeout before we allow
1845 another recovery to take place.
1847 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout))\
;
1848 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
1849 DEBUG(DEBUG_NOTICE, ("The rerecovery timeout has elapsed. We now allow recoveries to trigger again.\n"));
1851 return 0;
1856 elections are won by first checking the number of connected nodes, then
1857 the priority time, then the pnn
/* Payload broadcast during a recmaster election.  Elections are won by
 * most connected nodes first, then longest uptime, then lowest pnn. */
1859 struct election_message {
/* number of nodes this candidate can see */
1860 uint32_t num_connected;
/* when this recovery daemon started (earlier wins) */
1861 struct timeval priority_time;
1862 uint32_t pnn;
/* candidate's own node flags (banned/stopped disqualify it) */
1863 uint32_t node_flags;
1867 form this nodes election data
/* Fill in this node's election message: pnn, start time, current node
 * flags and the count of connected nodes.  A node without the
 * RECMASTER capability zeroes its connectivity and resets its priority
 * time so it loses against any capable node. */
1869 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1871 int ret, i;
1872 struct ctdb_node_map *nodemap;
1873 struct ctdb_context *ctdb = rec->ctdb;
1875 ZERO_STRUCTP(em);
1877 em->pnn = rec->ctdb->pnn;
1878 em->priority_time = rec->priority_time;
1880 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1881 if (ret != 0) {
1882 DEBUG(DEBUG_ERR,(__location__ " unable to get election data\n"));
1883 return;
/* cache our own flags while we have the nodemap at hand */
1886 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1887 em->node_flags = rec->node_flags;
1889 for (i=0;i<nodemap->num;i++) {
1890 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1891 em->num_connected++;
1895 /* we shouldnt try to win this election if we cant be a recmaster */
1896 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1897 em->num_connected = 0;
1898 em->priority_time = timeval_current();
1901 talloc_free(nodemap);
1905 see if the given election data wins
/* Decide whether WE beat the election message em sent by another node.
 * Returns true when our own election data outranks em: capability and
 * banned/stopped state first, then connectivity, then uptime, then pnn
 * as the final tiebreak. */
1907 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1909 struct election_message myem;
1910 int cmp = 0;
1912 ctdb_election_data(rec, &myem);
1914 /* we cant win if we dont have the recmaster capability */
1915 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1916 return false;
1919 /* we cant win if we are banned */
1920 if (rec->node_flags & NODE_FLAGS_BANNED) {
1921 return false;
1924 /* we cant win if we are stopped */
1925 if (rec->node_flags & NODE_FLAGS_STOPPED) {
1926 return false;
1929 /* we will automatically win if the other node is banned */
1930 if (em->node_flags & NODE_FLAGS_BANNED) {
1931 return true;
1934 /* we will automatically win if the other node is stopped */
1935 if (em->node_flags & NODE_FLAGS_STOPPED) {
1936 return true;
1939 /* try to use the most connected node */
1940 if (cmp == 0) {
1941 cmp = (int)myem.num_connected - (int)em->num_connected;
1944 /* then the longest running node */
1945 if (cmp == 0) {
1946 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
/* final tiebreak: higher pnn wins */
1949 if (cmp == 0) {
1950 cmp = (int)myem.pnn - (int)em->pnn;
1953 return cmp > 0;
1957 send out an election request
/* Broadcast our election message to all nodes.  When update_recmaster
 * is true we also optimistically record ourselves as recmaster on the
 * local node.  Returns 0 on success, -1 if setting the recmaster
 * failed. */
1959 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool update_recmaster)
1961 int ret;
1962 TDB_DATA election_data;
1963 struct election_message emsg;
1964 uint64_t srvid;
1965 struct ctdb_context *ctdb = rec->ctdb;
1967 srvid = CTDB_SRVID_RECOVERY;
1969 ctdb_election_data(rec, &emsg);
1971 election_data.dsize = sizeof(struct election_message);
1972 election_data.dptr = (unsigned char *)&emsg;
1975 /* send an election message to all active nodes */
1976 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1977 ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1980 /* A new node that is already frozen has entered the cluster.
1981 The existing nodes are not frozen and dont need to be frozen
1982 until the election has ended and we start the actual recovery
1984 if (update_recmaster == true) {
1985 /* first we assume we will win the election and set
1986 recoverymaster to be ourself on the current node
1988 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
1989 if (ret != 0) {
1990 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
1991 return -1;
1996 return 0;
2000 this function will unban all nodes in the cluster
/* Clear the BANNED flag on every connected node in the cluster.
 * Failures to fetch the nodemap are logged and the function simply
 * returns. */
2002 static void unban_all_nodes(struct ctdb_context *ctdb)
2004 int ret, i;
2005 struct ctdb_node_map *nodemap;
2006 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2008 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2009 if (ret != 0) {
2010 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
2011 return;
/* NOTE(review): tmp_ctx is not freed on the error return above —
 * looks like a small leak; confirm */
2014 for (i=0;i<nodemap->num;i++) {
2015 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
2016 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
2017 ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
2021 talloc_free(tmp_ctx);
2026 we think we are winning the election - send a broadcast election request
/* Timed-event callback fired while we believe we are winning an
 * election: rebroadcast the election request and drop the one-shot
 * timer that invoked us. */
2028 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
2030 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2031 int ret;
2033 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb), false);
2034 if (ret != 0) {
2035 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
2038 talloc_free(rec->send_election_te);
2039 rec->send_election_te = NULL;
2043 handler for memory dumps
/* Message handler: dump the recovery master's talloc memory usage and
 * send the result back to the requester identified by the
 * rd_memdump_reply payload (pnn + srvid). */
2045 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
2046 TDB_DATA data, void *private_data)
2048 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2049 TDB_DATA *dump;
2050 int ret;
2051 struct rd_memdump_reply *rd;
2053 if (data.dsize != sizeof(struct rd_memdump_reply)) {
2054 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2055 talloc_free(tmp_ctx);
2056 return;
2058 rd = (struct rd_memdump_reply *)data.dptr;
2060 dump = talloc_zero(tmp_ctx, TDB_DATA);
2061 if (dump == NULL) {
2062 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
2063 talloc_free(tmp_ctx);
2064 return;
2066 ret = ctdb_dump_memory(ctdb, dump);
2067 if (ret != 0) {
2068 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
2069 talloc_free(tmp_ctx);
2070 return;
2073 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
2075 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
2076 if (ret != 0) {
2077 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
2078 talloc_free(tmp_ctx);
2079 return;
2082 talloc_free(tmp_ctx);
2086 handler for getlog
/* Message handler: collect the in-memory log and ship it to the
 * address in the ctdb_get_log_addr payload.  The work happens in a
 * forked child which switches itself to client mode; the parent
 * returns immediately. */
2088 static void getlog_handler(struct ctdb_context *ctdb, uint64_t srvid,
2089 TDB_DATA data, void *private_data)
2091 struct ctdb_get_log_addr *log_addr;
2092 pid_t child;
2094 if (data.dsize != sizeof(struct ctdb_get_log_addr)) {
2095 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2096 return;
2098 log_addr = (struct ctdb_get_log_addr *)data.dptr;
2100 child = ctdb_fork(ctdb);
2101 if (child == (pid_t)-1) {
2102 DEBUG(DEBUG_ERR,("Failed to fork a log collector child\n"));
2103 return;
/* child process: become a client and send the log, then exit */
2106 if (child == 0) {
2107 if (switch_from_server_to_client(ctdb, "recoverd-log-collector") != 0) {
2108 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch log collector child into client mode.\n"));
2109 _exit(1);
2111 ctdb_collect_log(ctdb, log_addr);
2112 _exit(0);
2117 handler for clearlog
/* Message handler: clear the in-memory ringbuffer log. */
2119 static void clearlog_handler(struct ctdb_context *ctdb, uint64_t srvid,
2120 TDB_DATA data, void *private_data)
2122 ctdb_clear_log(ctdb);
2126 handler for reload_nodes
/* Message handler: re-read the nodes file on request. */
2128 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
2129 TDB_DATA data, void *private_data)
2131 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2133 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
2135 reload_nodes_file(rec->ctdb);
/* Timed-event callback: the ip-check disable period has expired, so
 * drop the context whose existence suppresses the checks. */
2139 static void reenable_ip_check(struct event_context *ev, struct timed_event *te,
2140 struct timeval yt, void *p)
2142 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2144 talloc_free(rec->ip_check_disable_ctx);
2145 rec->ip_check_disable_ctx = NULL;
/* Timed-event callback for a deferred rebalance: run a takeover run
 * now and drop the deferral context. */
2149 static void ctdb_rebalance_timeout(struct event_context *ev, struct timed_event *te,
2150 struct timeval t, void *p)
2152 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2153 struct ctdb_context *ctdb = rec->ctdb;
2154 int ret;
2156 DEBUG(DEBUG_NOTICE,("Rebalance all nodes that have had ip assignment changes.\n"));
2158 ret = ctdb_takeover_run(ctdb, rec->nodemap, NULL, NULL);
2159 if (ret != 0) {
2160 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. ctdb_takeover_run() failed.\n"));
2161 rec->need_takeover_run = true;
2164 talloc_free(rec->deferred_rebalance_ctx);
2165 rec->deferred_rebalance_ctx = NULL;
/* Message handler: a node (payload = its pnn) asks for IP rebalancing.
 * Force-rebalance towards that node and (re)arm a deferred takeover
 * run controlled by the DeferredRebalanceOnNodeAdd tunable. */
2169 static void recd_node_rebalance_handler(struct ctdb_context *ctdb, uint64_t srvid,
2170 TDB_DATA data, void *private_data)
2172 uint32_t pnn;
2173 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2175 if (data.dsize != sizeof(uint32_t)) {
2176 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
2177 return;
/* the feature is disabled by tunable */
2180 if (ctdb->tunable.deferred_rebalance_on_node_add == 0) {
2181 return;
2184 pnn = *(uint32_t *)&data.dptr[0];
2186 lcp2_forcerebalance(ctdb, pnn);
2187 DEBUG(DEBUG_NOTICE,("Received message to perform node rebalancing for node %d\n", pnn));
/* restart the deferral window if one was already pending */
2189 if (rec->deferred_rebalance_ctx != NULL) {
2190 talloc_free(rec->deferred_rebalance_ctx);
2192 rec->deferred_rebalance_ctx = talloc_new(rec);
2193 event_add_timed(ctdb->ev, rec->deferred_rebalance_ctx,
2194 timeval_current_ofs(ctdb->tunable.deferred_rebalance_on_node_add, 0),
2195 ctdb_rebalance_timeout, rec);
/* Message handler: record a changed public-IP assignment in the
 * recmaster's assignment tree.  Ignored unless we are the current
 * recmaster. */
2200 static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid,
2201 TDB_DATA data, void *private_data)
2203 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2204 struct ctdb_public_ip *ip;
2206 if (rec->recmaster != rec->ctdb->pnn) {
2207 DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
2208 return;
2211 if (data.dsize != sizeof(struct ctdb_public_ip)) {
2212 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
2213 return;
2216 ip = (struct ctdb_public_ip *)data.dptr;
2218 update_ip_assignment_tree(rec->ctdb, ip);
2222 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
2223 TDB_DATA data, void *private_data)
2225 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2226 uint32_t timeout;
2228 if (rec->ip_check_disable_ctx != NULL) {
2229 talloc_free(rec->ip_check_disable_ctx);
2230 rec->ip_check_disable_ctx = NULL;
2233 if (data.dsize != sizeof(uint32_t)) {
2234 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2235 "expexting %lu\n", (long unsigned)data.dsize,
2236 (long unsigned)sizeof(uint32_t)));
2237 return;
2239 if (data.dptr == NULL) {
2240 DEBUG(DEBUG_ERR,(__location__ " No data recaived\n"));
2241 return;
2244 timeout = *((uint32_t *)data.dptr);
2246 if (timeout == 0) {
2247 DEBUG(DEBUG_NOTICE,("Reenabling ip check\n"));
2248 return;
2251 DEBUG(DEBUG_NOTICE,("Disabling ip check for %u seconds\n", timeout));
2253 rec->ip_check_disable_ctx = talloc_new(rec);
2254 CTDB_NO_MEMORY_VOID(ctdb, rec->ip_check_disable_ctx);
2256 event_add_timed(ctdb->ev, rec->ip_check_disable_ctx, timeval_current_ofs(timeout, 0), reenable_ip_check, rec);
/*
  handler for reload all ips.
  Stash the requester's reply address; the actual reload is performed
  later from the monitoring loop.
 */
static void ip_reloadall_handler(struct ctdb_context *ctdb, uint64_t srvid,
				 TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);

	if (data.dsize != sizeof(struct reloadips_all_reply)) {
		DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
		return;
	}

	/* take ownership of the request payload; it is consumed in the
	   next monitoring loop (see reload_all_ips_request above) */
	reload_all_ips_request = (struct reloadips_all_reply *)talloc_steal(rec, data.dptr);

	DEBUG(DEBUG_NOTICE,("RELOAD_ALL_IPS message received from node:%d srvid:%d\n", reload_all_ips_request->pnn, (int)reload_all_ips_request->srvid));
	return;
}
2279 static void async_reloadips_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2281 uint32_t *status = callback_data;
2283 if (res != 0) {
2284 DEBUG(DEBUG_ERR,("Reload ips all failed on node %d\n", node_pnn));
2285 *status = 1;
/*
  Reload the public ip configuration on all nodes, then notify the
  original requester (rips) that the operation completed.
  Returns 0 on success, -1 on any failure.
 */
static int
reload_all_ips(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, struct reloadips_all_reply *rips)
{
	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
	uint32_t *nodes;
	uint32_t status;
	int i;

	DEBUG(DEBUG_ERR,("RELOAD ALL IPS on all active nodes\n"));

	/* refuse unless every node is up and healthy (flags == 0) */
	for (i = 0; i< nodemap->num; i++) {
		if (nodemap->nodes[i].flags != 0) {
			DEBUG(DEBUG_ERR, ("Can not reload ips on all nodes. Node %d is not up and healthy\n", i));
			talloc_free(tmp_ctx);
			return -1;
		}
	}

	/* send the reload control to all connected nodes */
	nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
	status = 0;
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RELOAD_PUBLIC_IPS,
					nodes, 0,
					CONTROL_TIMEOUT(),
					false, tdb_null,
					async_reloadips_callback, NULL,
					&status) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to reloadips on all nodes.\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	/* status is set non-zero by async_reloadips_callback on any
	   per-node failure */
	if (status != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to reloadips on all nodes.\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	/* wake up the requester; an empty message signals completion */
	ctdb_client_send_message(ctdb, rips->pnn, rips->srvid, tdb_null);

	talloc_free(tmp_ctx);
	return 0;
}
/*
  handler for ip reallocate, just add it to the list of callers and
  handle this later in the monitor_cluster loop so we do not recurse
  with other callers to takeover_run()
 */
static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
				  TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
	struct ip_reallocate_list *caller;

	/* the message payload is the caller's reply address */
	if (data.dsize != sizeof(struct rd_memdump_reply)) {
		DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
		return;
	}

	/* lazily create a talloc context that owns the whole queue of
	   callers, so it can be freed in one go after processing */
	if (rec->ip_reallocate_ctx == NULL) {
		rec->ip_reallocate_ctx = talloc_new(rec);
		CTDB_NO_MEMORY_FATAL(ctdb, rec->ip_reallocate_ctx);
	}

	caller = talloc(rec->ip_reallocate_ctx, struct ip_reallocate_list);
	CTDB_NO_MEMORY_FATAL(ctdb, caller);

	/* prepend the caller to the pending list */
	caller->rd = (struct rd_memdump_reply *)talloc_steal(caller, data.dptr);
	caller->next = rec->reallocate_callers;
	rec->reallocate_callers = caller;

	return;
}
/*
  Perform the deferred ip reallocation queued by ip_reallocate_handler
  and send each caller (that asked for one) the int32_t result code.
 */
static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb_recoverd *rec)
{
	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
	TDB_DATA result;
	int32_t ret;
	struct ip_reallocate_list *callers;
	uint32_t culprit;

	DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));

	/* update the list of public ips that a node can handle for
	   all connected nodes
	*/
	ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
				 culprit));
		rec->need_takeover_run = true;
	}
	if (ret == 0) {
		ret = ctdb_takeover_run(ctdb, rec->nodemap, NULL, NULL);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,("Failed to reallocate addresses: ctdb_takeover_run() failed.\n"));
			rec->need_takeover_run = true;
		}
	}

	/* the reply carries the result of the takeover run */
	result.dsize = sizeof(int32_t);
	result.dptr  = (uint8_t *)&ret;

	for (callers=rec->reallocate_callers; callers; callers=callers->next) {

		/* Someone that sent srvid==0 does not want a reply */
		if (callers->rd->srvid == 0) {
			continue;
		}
		DEBUG(DEBUG_INFO,("Sending ip reallocate reply message to "
				  "%u:%llu\n", (unsigned)callers->rd->pnn,
				  (unsigned long long)callers->rd->srvid));
		ret = ctdb_client_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,("Failed to send ip reallocate reply "
					 "message to %u:%llu\n",
					 (unsigned)callers->rd->pnn,
					 (unsigned long long)callers->rd->srvid));
		}
	}

	/* freeing ip_reallocate_ctx releases every queued caller */
	talloc_free(tmp_ctx);
	talloc_free(rec->ip_reallocate_ctx);
	rec->ip_reallocate_ctx = NULL;
	rec->reallocate_callers = NULL;
}
/*
  handler for recovery master elections
 */
static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
			     TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
	int ret;
	struct election_message *em = (struct election_message *)data.dptr;
	TALLOC_CTX *mem_ctx;

	/* we got an election packet - update the timeout for the election */
	talloc_free(rec->election_timeout);
	rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
						fast_start ?
						timeval_current_ofs(0, 500000) :
						timeval_current_ofs(ctdb->tunable.election_timeout, 0),
						ctdb_election_timeout, rec);

	mem_ctx = talloc_new(ctdb);

	/* someone called an election. check their election data
	   and if we disagree and we would rather be the elected node,
	   send a new election message to all other nodes
	 */
	if (ctdb_election_win(rec, em)) {
		/* defer our counter-request slightly; reuse any pending one */
		if (!rec->send_election_te) {
			rec->send_election_te = event_add_timed(ctdb->ev, rec,
								timeval_current_ofs(0, 500000),
								election_send_request, rec);
		}
		talloc_free(mem_ctx);
		/*unban_all_nodes(ctdb);*/
		return;
	}

	/* we didn't win */
	talloc_free(rec->send_election_te);
	rec->send_election_te = NULL;

	if (ctdb->tunable.verify_recovery_lock != 0) {
		/* release the recmaster lock */
		if (em->pnn != ctdb->pnn &&
		    ctdb->recovery_lock_fd != -1) {
			close(ctdb->recovery_lock_fd);
			ctdb->recovery_lock_fd = -1;
			unban_all_nodes(ctdb);
		}
	}

	/* ok, let that guy become recmaster then */
	ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
		talloc_free(mem_ctx);
		return;
	}

	talloc_free(mem_ctx);
	return;
}
/*
  force the start of the election process
 */
static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
			   struct ctdb_node_map *nodemap)
{
	int ret;
	struct ctdb_context *ctdb = rec->ctdb;

	DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));

	/* set all nodes to recovery mode to stop all internode traffic */
	ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
		return;
	}

	/* (re)arm the election timeout before sending our request */
	talloc_free(rec->election_timeout);
	rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
						fast_start ?
						timeval_current_ofs(0, 500000) :
						timeval_current_ofs(ctdb->tunable.election_timeout, 0),
						ctdb_election_timeout, rec);

	ret = send_election_request(rec, pnn, true);
	if (ret!=0) {
		DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
		return;
	}

	/* wait for a few seconds to collect all responses */
	ctdb_wait_election(rec);
}
2521 handler for when a node changes its flags
2523 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
2524 TDB_DATA data, void *private_data)
2526 int ret;
2527 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2528 struct ctdb_node_map *nodemap=NULL;
2529 TALLOC_CTX *tmp_ctx;
2530 int i;
2531 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2532 int disabled_flag_changed;
2534 if (data.dsize != sizeof(*c)) {
2535 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2536 return;
2539 tmp_ctx = talloc_new(ctdb);
2540 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2542 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2543 if (ret != 0) {
2544 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2545 talloc_free(tmp_ctx);
2546 return;
2550 for (i=0;i<nodemap->num;i++) {
2551 if (nodemap->nodes[i].pnn == c->pnn) break;
2554 if (i == nodemap->num) {
2555 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2556 talloc_free(tmp_ctx);
2557 return;
2560 if (nodemap->nodes[i].flags != c->new_flags) {
2561 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, nodemap->nodes[i].flags));
2564 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2566 nodemap->nodes[i].flags = c->new_flags;
2568 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2569 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2571 if (ret == 0) {
2572 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2573 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2576 if (ret == 0 &&
2577 ctdb->recovery_master == ctdb->pnn &&
2578 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2579 /* Only do the takeover run if the perm disabled or unhealthy
2580 flags changed since these will cause an ip failover but not
2581 a recovery.
2582 If the node became disconnected or banned this will also
2583 lead to an ip address failover but that is handled
2584 during recovery
2586 if (disabled_flag_changed) {
2587 rec->need_takeover_run = true;
2591 talloc_free(tmp_ctx);
2595 handler for when we need to push out flag changes to all other nodes
static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
			       TDB_DATA data, void *private_data)
{
	int ret;
	struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
	struct ctdb_node_map *nodemap=NULL;
	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
	uint32_t recmaster;
	uint32_t *nodes;

	/* find the recovery master */
	ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
		talloc_free(tmp_ctx);
		return;
	}

	/* read the node flags from the recmaster */
	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
		talloc_free(tmp_ctx);
		return;
	}
	if (c->pnn >= nodemap->num) {
		DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
		talloc_free(tmp_ctx);
		return;
	}

	/* send the flags update to all connected nodes; the original
	   message payload is forwarded unchanged as the control data */
	nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);

	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
				      nodes, 0, CONTROL_TIMEOUT(),
				      false, data,
				      NULL, NULL,
				      NULL) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));

		talloc_free(tmp_ctx);
		return;
	}

	talloc_free(tmp_ctx);
}
/* state shared by the async getrecmode calls made by verify_recmode() */
struct verify_recmode_normal_data {
	uint32_t count;              /* number of replies still outstanding */
	enum monitor_result status;  /* aggregated result across all nodes */
};
/* completion callback for the async getrecmode controls sent by
   verify_recmode() */
static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
{
	struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);

	/* one more node has responded with recmode data*/
	rmdata->count--;

	/* if we failed to get the recmode, then return an error and let
	   the main loop try again.
	*/
	if (state->state != CTDB_CONTROL_DONE) {
		if (rmdata->status == MONITOR_OK) {
			rmdata->status = MONITOR_FAILED;
		}
		return;
	}

	/* if we got a response, then the recmode will be stored in the
	   status field
	*/
	if (state->status != CTDB_RECOVERY_NORMAL) {
		DEBUG(DEBUG_NOTICE, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
		rmdata->status = MONITOR_RECOVERY_NEEDED;
	}

	return;
}
/* verify that all nodes are in normal recovery mode */
static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
{
	struct verify_recmode_normal_data *rmdata;
	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
	struct ctdb_client_control_state *state;
	enum monitor_result status;
	int j;

	rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
	CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
	rmdata->count  = 0;
	rmdata->status = MONITOR_OK;

	/* loop over all active nodes and send an async getrecmode call to
	   them*/
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
					CONTROL_TIMEOUT(),
					nodemap->nodes[j].pnn);
		if (state == NULL) {
			/* we failed to send the control, treat this as
			   an error and try again next iteration
			*/
			DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
			talloc_free(mem_ctx);
			return MONITOR_FAILED;
		}

		/* set up the callback functions */
		state->async.fn = verify_recmode_normal_callback;
		state->async.private_data = rmdata;

		/* one more control to wait for to complete */
		rmdata->count++;
	}

	/* now wait for up to the maximum number of seconds allowed
	   or until all nodes we expect a response from has replied
	*/
	while (rmdata->count > 0) {
		event_loop_once(ctdb->ev);
	}

	status = rmdata->status;
	talloc_free(mem_ctx);
	return status;
}
/* state shared by the async getrecmaster calls made by verify_recmaster() */
struct verify_recmaster_data {
	struct ctdb_recoverd *rec;
	uint32_t count;              /* number of replies still outstanding */
	uint32_t pnn;                /* pnn each node is expected to report as recmaster */
	enum monitor_result status;  /* aggregated result across all nodes */
};
/* completion callback for the async getrecmaster controls sent by
   verify_recmaster() */
static void verify_recmaster_callback(struct ctdb_client_control_state *state)
{
	struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);

	/* one more node has responded with recmaster data*/
	rmdata->count--;

	/* if we failed to get the recmaster, then return an error and let
	   the main loop try again.
	*/
	if (state->state != CTDB_CONTROL_DONE) {
		if (rmdata->status == MONITOR_OK) {
			rmdata->status = MONITOR_FAILED;
		}
		return;
	}

	/* if we got a response, then the recmaster will be stored in the
	   status field
	*/
	if (state->status != rmdata->pnn) {
		DEBUG(DEBUG_ERR,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
		/* record the disagreeing node as a culprit for banning stats */
		ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
		rmdata->status = MONITOR_ELECTION_NEEDED;
	}

	return;
}
/* verify that all nodes agree that we are the recmaster */
static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
{
	struct ctdb_context *ctdb = rec->ctdb;
	struct verify_recmaster_data *rmdata;
	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
	struct ctdb_client_control_state *state;
	enum monitor_result status;
	int j;

	rmdata = talloc(mem_ctx, struct verify_recmaster_data);
	CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
	rmdata->rec    = rec;
	rmdata->count  = 0;
	rmdata->pnn    = pnn;
	rmdata->status = MONITOR_OK;

	/* loop over all active nodes and send an async getrecmaster call to
	   them*/
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
					CONTROL_TIMEOUT(),
					nodemap->nodes[j].pnn);
		if (state == NULL) {
			/* we failed to send the control, treat this as
			   an error and try again next iteration
			*/
			DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
			talloc_free(mem_ctx);
			return MONITOR_FAILED;
		}

		/* set up the callback functions */
		state->async.fn = verify_recmaster_callback;
		state->async.private_data = rmdata;

		/* one more control to wait for to complete */
		rmdata->count++;
	}

	/* now wait for up to the maximum number of seconds allowed
	   or until all nodes we expect a response from has replied
	*/
	while (rmdata->count > 0) {
		event_loop_once(ctdb->ev);
	}

	status = rmdata->status;
	talloc_free(mem_ctx);
	return status;
}
/* called to check that the local allocation of public ip addresses is ok.
*/
static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
{
	TALLOC_CTX *mem_ctx = talloc_new(NULL);
	struct ctdb_control_get_ifaces *ifaces = NULL;
	struct ctdb_uptime *uptime1 = NULL;
	struct ctdb_uptime *uptime2 = NULL;
	int ret, j;
	bool need_iface_check = false;
	bool need_takeover_run = false;

	/* sample the uptime before and after reading the ip state so a
	   recovery racing with this check can be detected below */
	ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
				CTDB_CURRENT_NODE, &uptime1);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
		talloc_free(mem_ctx);
		return -1;
	}

	/* read the interfaces from the local node */
	ret = ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ifaces);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", pnn));
		talloc_free(mem_ctx);
		return -1;
	}

	/* compare against the interface state cached from the previous
	   iteration; any difference forces a takeover run */
	if (!rec->ifaces) {
		need_iface_check = true;
	} else if (rec->ifaces->num != ifaces->num) {
		need_iface_check = true;
	} else if (memcmp(rec->ifaces, ifaces, talloc_get_size(ifaces)) != 0) {
		need_iface_check = true;
	}

	talloc_free(rec->ifaces);
	rec->ifaces = talloc_steal(rec, ifaces);

	if (need_iface_check) {
		DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
				     "local node %u - force takeover run\n",
				     pnn));
		need_takeover_run = true;
	}

	ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
				CTDB_CURRENT_NODE, &uptime2);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
		talloc_free(mem_ctx);
		return -1;
	}

	/* skip the check if the startrecovery time has changed */
	if (timeval_compare(&uptime1->last_recovery_started,
			    &uptime2->last_recovery_started) != 0) {
		DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
		talloc_free(mem_ctx);
		return 0;
	}

	/* skip the check if the endrecovery time has changed */
	if (timeval_compare(&uptime1->last_recovery_finished,
			    &uptime2->last_recovery_finished) != 0) {
		DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
		talloc_free(mem_ctx);
		return 0;
	}

	/* skip the check if we have started but not finished recovery */
	if (timeval_compare(&uptime1->last_recovery_finished,
			    &uptime1->last_recovery_started) != 1) {
		DEBUG(DEBUG_INFO, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
		talloc_free(mem_ctx);

		return 0;
	}

	/* verify that we have the ip addresses we should have
	   and we dont have ones we shouldnt have.
	   if we find an inconsistency we set recmode to
	   active on the local node and wait for the recmaster
	   to do a full blown recovery.
	   also if the pnn is -1 and we are healthy and can host the ip
	   we also request a ip reallocation.
	*/
	if (ctdb->tunable.disable_ip_failover == 0) {
		struct ctdb_all_public_ips *ips = NULL;

		/* read the *available* IPs from the local node */
		ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, ("Unable to get available public IPs from local node %u\n", pnn));
			talloc_free(mem_ctx);
			return -1;
		}

		for (j=0; j<ips->num; j++) {
			/* unassigned ip we could host while healthy */
			if (ips->ips[j].pnn == -1 &&
			    nodemap->nodes[pnn].flags == 0) {
				DEBUG(DEBUG_CRIT,("Public IP '%s' is not assigned and we could serve it\n",
						  ctdb_addr_to_str(&ips->ips[j].addr)));
				need_takeover_run = true;
			}
		}

		talloc_free(ips);

		/* read the *known* IPs from the local node */
		ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, ("Unable to get known public IPs from local node %u\n", pnn));
			talloc_free(mem_ctx);
			return -1;
		}

		for (j=0; j<ips->num; j++) {
			if (ips->ips[j].pnn == pnn) {
				/* assigned to us but not actually configured */
				if (ctdb->do_checkpublicip && !ctdb_sys_have_ip(&ips->ips[j].addr)) {
					DEBUG(DEBUG_CRIT,("Public IP '%s' is assigned to us but not on an interface\n",
						ctdb_addr_to_str(&ips->ips[j].addr)));
					need_takeover_run = true;
				}
			} else {
				/* configured locally but assigned elsewhere */
				if (ctdb->do_checkpublicip &&
				    ctdb_sys_have_ip(&ips->ips[j].addr)) {

					DEBUG(DEBUG_CRIT,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
						ctdb_addr_to_str(&ips->ips[j].addr)));

					if (ctdb_ctrl_release_ip(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ips->ips[j]) != 0) {
						DEBUG(DEBUG_ERR,("Failed to release local IP address\n"));
					}
				}
			}
		}
	}

	if (need_takeover_run) {
		struct takeover_run_reply rd;
		TDB_DATA data;

		DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));

		/* srvid==0 means we do not expect a reply from the
		   recmaster (see ip_reallocate_handler) */
		rd.pnn = ctdb->pnn;
		rd.srvid = 0;
		data.dptr = (uint8_t *)&rd;
		data.dsize = sizeof(rd);

		ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
		}
	}

	talloc_free(mem_ctx);
	return 0;
}
2991 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2993 struct ctdb_node_map **remote_nodemaps = callback_data;
2995 if (node_pnn >= ctdb->num_nodes) {
2996 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2997 return;
3000 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
/*
  Pull the nodemap from every active node into remote_nodemaps[]
  (indexed by pnn) using async GET_NODEMAP controls.
  Returns 0 on success, -1 if any node failed to respond.
 */
static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
			       struct ctdb_node_map *nodemap,
			       struct ctdb_node_map **remote_nodemaps)
{
	uint32_t *nodes;

	nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
					nodes, 0,
					CONTROL_TIMEOUT(), false, tdb_null,
					async_getnodemap_callback,
					NULL,
					remote_nodemaps) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));

		return -1;
	}

	return 0;
}
/* result states reported by the reclock checking child process */
enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};

/* state for one asynchronous check of the recovery lock file */
struct ctdb_check_reclock_state {
	struct ctdb_context *ctdb;
	struct timeval start_time;   /* when the check started; used for latency reporting */
	int fd[2];                   /* pipe: child writes a result byte, parent reads fd[0] */
	pid_t child;                 /* pid of the checking child process */
	struct timed_event *te;      /* timeout event, fires if the child hangs */
	struct fd_event *fde;        /* read event on fd[0] */
	enum reclock_child_status status;
};
/* when we free the reclock state we must kill any child process.
*/
static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
{
	struct ctdb_context *ctdb = state->ctdb;

	/* report how long the reclock check took before tearing down */
	ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));

	if (state->fd[0] != -1) {
		close(state->fd[0]);
		state->fd[0] = -1;
	}
	if (state->fd[1] != -1) {
		close(state->fd[1]);
		state->fd[1] = -1;
	}
	ctdb_kill(ctdb, state->child, SIGKILL);
	return 0;
}
3057 called if our check_reclock child times out. this would happen if
3058 i/o to the reclock file blocks.
3060 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
3061 struct timeval t, void *private_data)
3063 struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
3064 struct ctdb_check_reclock_state);
3066 DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timedout CFS slow to grant locks?\n"));
3067 state->status = RECLOCK_TIMEOUT;
/* this is called when the child process has completed checking the reclock
   file and has written data back to us through the pipe.
*/
static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
				  uint16_t flags, void *private_data)
{
	struct ctdb_check_reclock_state *state= talloc_get_type(private_data,
					     struct ctdb_check_reclock_state);
	char c = 0;
	int ret;

	/* we got a response from our child process so we can abort the
	   timeout.
	*/
	talloc_free(state->te);
	state->te = NULL;

	/* the child writes a single status byte; anything other than
	   RECLOCK_OK (or a short read) counts as failure */
	ret = read(state->fd[0], &c, 1);
	if (ret != 1 || c != RECLOCK_OK) {
		DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
		state->status = RECLOCK_FAILED;

		return;
	}

	state->status = RECLOCK_OK;
	return;
}
/*
  Verify that the recovery lock file is still readable, using a child
  process and a pipe so that blocking i/o on the cluster filesystem
  cannot hang the recovery daemon (a 15 second timeout fires instead).
  Returns 0 when the lock checks out, -1 otherwise.
 */
static int check_recovery_lock(struct ctdb_context *ctdb)
{
	int ret;
	struct ctdb_check_reclock_state *state;
	pid_t parent = getpid();

	if (ctdb->recovery_lock_fd == -1) {
		DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
		return -1;
	}

	state = talloc(ctdb, struct ctdb_check_reclock_state);
	CTDB_NO_MEMORY(ctdb, state);

	state->ctdb = ctdb;
	state->start_time = timeval_current();
	state->status = RECLOCK_CHECKING;
	state->fd[0] = -1;
	state->fd[1] = -1;

	ret = pipe(state->fd);
	if (ret != 0) {
		talloc_free(state);
		DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
		return -1;
	}

	state->child = ctdb_fork(ctdb);
	if (state->child == (pid_t)-1) {
		DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
		close(state->fd[0]);
		state->fd[0] = -1;
		close(state->fd[1]);
		state->fd[1] = -1;
		talloc_free(state);
		return -1;
	}

	if (state->child == 0) {
		/* child: try to read one byte from the lock file and
		   report the outcome through the pipe */
		char cc = RECLOCK_OK;
		close(state->fd[0]);
		state->fd[0] = -1;

		debug_extra = talloc_asprintf(NULL, "recovery-lock:");
		if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
			DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
			cc = RECLOCK_FAILED;
		}

		write(state->fd[1], &cc, 1);
		/* make sure we die when our parent dies */
		while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
			sleep(5);
			write(state->fd[1], &cc, 1);
		}
		_exit(0);
	}
	/* parent: close the write end and watch the read end */
	close(state->fd[1]);
	state->fd[1] = -1;
	set_close_on_exec(state->fd[0]);

	DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for check_recovery_lock\n", state->fd[0]));

	/* destructor kills the child and closes the pipe on any exit path */
	talloc_set_destructor(state, check_reclock_destructor);

	state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
				    ctdb_check_reclock_timeout, state);
	if (state->te == NULL) {
		DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
		talloc_free(state);
		return -1;
	}

	state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
				EVENT_FD_READ,
				reclock_child_handler,
				(void *)state);

	if (state->fde == NULL) {
		DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
		talloc_free(state);
		return -1;
	}
	tevent_fd_set_auto_close(state->fde);

	/* pump the event loop until the child answers or the timeout fires */
	while (state->status == RECLOCK_CHECKING) {
		event_loop_once(ctdb->ev);
	}

	if (state->status == RECLOCK_FAILED) {
		DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
		close(ctdb->recovery_lock_fd);
		ctdb->recovery_lock_fd = -1;
		talloc_free(state);
		return -1;
	}

	talloc_free(state);
	return 0;
}
/*
  Re-read the recovery lock file setting from the main daemon and
  bring the local copy up to date, closing the old lock fd whenever
  the configured file changes or is disabled.
  Returns 0 on success, -1 if the setting could not be read.
 */
static int update_recovery_lock_file(struct ctdb_context *ctdb)
{
	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
	const char *reclockfile;

	if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
		DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	if (reclockfile == NULL) {
		/* the daemon no longer has a reclock file configured */
		if (ctdb->recovery_lock_file != NULL) {
			DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
			talloc_free(ctdb->recovery_lock_file);
			ctdb->recovery_lock_file = NULL;
			if (ctdb->recovery_lock_fd != -1) {
				close(ctdb->recovery_lock_fd);
				ctdb->recovery_lock_fd = -1;
			}
		}
		ctdb->tunable.verify_recovery_lock = 0;
		talloc_free(tmp_ctx);
		return 0;
	}

	if (ctdb->recovery_lock_file == NULL) {
		/* first time we learn about a reclock file */
		ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
		if (ctdb->recovery_lock_fd != -1) {
			close(ctdb->recovery_lock_fd);
			ctdb->recovery_lock_fd = -1;
		}
		talloc_free(tmp_ctx);
		return 0;
	}

	if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
		/* file name has not changed - nothing to do */
		talloc_free(tmp_ctx);
		return 0;
	}

	/* the reclock file name has changed - switch to the new one */
	talloc_free(ctdb->recovery_lock_file);
	ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
	ctdb->tunable.verify_recovery_lock = 0;
	if (ctdb->recovery_lock_fd != -1) {
		close(ctdb->recovery_lock_fd);
		ctdb->recovery_lock_fd = -1;
	}

	talloc_free(tmp_ctx);
	return 0;
}
/*
  One pass of the recovery daemon's monitoring work: verify the local
  daemon and our own role, validate cluster-wide consistency of the
  nodemaps, node flags and vnnmap, and trigger an election, an IP
  takeover run or a full recovery when anything disagrees.  Called once
  per recover_interval from monitor_cluster().
 */
static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
		      TALLOC_CTX *mem_ctx)
{
	uint32_t pnn;
	struct ctdb_node_map *nodemap=NULL;
	struct ctdb_node_map *recmaster_nodemap=NULL;
	struct ctdb_node_map **remote_nodemaps=NULL;
	struct ctdb_vnn_map *vnnmap=NULL;
	struct ctdb_vnn_map *remote_vnnmap=NULL;
	int32_t debug_level;
	int i, j, ret;


	/* verify that the main daemon is still running */
	if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
		DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
		exit(-1);
	}

	/* ping the local daemon to tell it we are alive */
	ctdb_ctrl_recd_ping(ctdb);

	if (rec->election_timeout) {
		/* an election is in progress */
		return;
	}

	/* read the debug level from the parent and update locally */
	ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
	if (ret !=0) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
		return;
	}
	LogLevel = debug_level;

	/* We must check if we need to ban a node here but we want to do this
	   as early as possible so we dont wait until we have pulled the node
	   map from the local node. thats why we have the hardcoded value 20
	*/
	for (i=0; i<ctdb->num_nodes; i++) {
		struct ctdb_banning_state *ban_state;

		if (ctdb->nodes[i]->ban_state == NULL) {
			continue;
		}
		ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
		if (ban_state->count < 20) {
			continue;
		}
		DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
			ctdb->nodes[i]->pnn, ban_state->count,
			ctdb->tunable.recovery_ban_period));
		ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
		ban_state->count = 0;
	}

	/* get relevant tunables */
	ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
		return;
	}

	/* get the current recovery lock file from the server */
	if (update_recovery_lock_file(ctdb) != 0) {
		DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
		return;
	}

	/* Make sure that if recovery lock verification becomes disabled,
	   we close the file
	*/
	if (ctdb->tunable.verify_recovery_lock == 0) {
		if (ctdb->recovery_lock_fd != -1) {
			close(ctdb->recovery_lock_fd);
			ctdb->recovery_lock_fd = -1;
		}
	}

	pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
	if (pnn == (uint32_t)-1) {
		DEBUG(DEBUG_ERR,("Failed to get local pnn - retrying\n"));
		return;
	}

	/* get the vnnmap */
	ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
		return;
	}


	/* get number of nodes */
	if (rec->nodemap) {
		talloc_free(rec->nodemap);
		rec->nodemap = NULL;
		nodemap=NULL;
	}
	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
		return;
	}
	nodemap = rec->nodemap;

	/* update the capabilities for all nodes */
	ret = update_capabilities(ctdb, nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
		return;
	}

	/* check which node is the recovery master */
	ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
		return;
	}

	/* if we are not the recmaster we can safely ignore any ip reallocate requests */
	if (rec->recmaster != pnn) {
		if (rec->ip_reallocate_ctx != NULL) {
			talloc_free(rec->ip_reallocate_ctx);
			rec->ip_reallocate_ctx = NULL;
			rec->reallocate_callers = NULL;
		}
	}

	if (rec->recmaster == (uint32_t)-1) {
		DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
		force_election(rec, pnn, nodemap);
		return;
	}

	/* if the local daemon is STOPPED, we verify that the databases are
	   also frozen and that the recmode is set to active
	*/
	if (nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) {
		ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
		}
		if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
			DEBUG(DEBUG_ERR,("Node is stopped but recovery mode is not active. Activate recovery mode and lock databases\n"));

			ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
			if (ret != 0) {
				DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node in STOPPED state\n"));
				return;
			}
			ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
			if (ret != 0) {
				DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED state\n"));
			}
			return;
		}
		return;
	}

	/* If the local node is inactive (stopped or banned), verify we are
	   not the recmaster and yield this role if so
	*/
	if ((nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) && (rec->recmaster == pnn)) {
		DEBUG(DEBUG_ERR,("Local node is INACTIVE. Yielding recmaster role\n"));
		force_election(rec, pnn, nodemap);
		return;
	}

	/*
	 * if the current recmaster does not have CTDB_CAP_RECMASTER,
	 * but we do, then force an election and try to become the new
	 * recmaster
	 */
	if ((rec->ctdb->nodes[rec->recmaster]->capabilities & CTDB_CAP_RECMASTER) == 0 &&
	    (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
	    !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
		DEBUG(DEBUG_ERR, (__location__ " Current recmaster node %u does not have CAP_RECMASTER,"
				  " but we (node %u) have - force an election\n",
				  rec->recmaster, pnn));
		force_election(rec, pnn, nodemap);
		return;
	}

	/* check that we (recovery daemon) and the local ctdb daemon
	   agree on whether we are banned or not
	*/
	/* TODO: this check was never implemented (original placeholder "qqq") */

	/* remember our own node flags */
	rec->node_flags = nodemap->nodes[pnn].flags;

	/* count how many active nodes there are */
	rec->num_active    = 0;
	rec->num_connected = 0;
	for (i=0; i<nodemap->num; i++) {
		if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
			rec->num_active++;
		}
		if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
			rec->num_connected++;
		}
	}


	/* verify that the recmaster node is still active */
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].pnn==rec->recmaster) {
			break;
		}
	}

	if (j == nodemap->num) {
		DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
		force_election(rec, pnn, nodemap);
		return;
	}

	/* if recovery master is disconnected we must elect a new recmaster */
	if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
		DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
		force_election(rec, pnn, nodemap);
		return;
	}

	/* get nodemap from the recovery master to check if it is inactive */
	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
				   mem_ctx, &recmaster_nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
			  nodemap->nodes[j].pnn));
		return;
	}


	if ((recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) &&
	    (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
		DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
		force_election(rec, pnn, nodemap);
		return;
	}

	/* If this node is stopped then it is not the recovery master
	 * so the only remaining action is to potentially to verify
	 * the local IP allocation below.  This won't accomplish
	 * anything useful so skip it.
	 */
	if (rec->node_flags & NODE_FLAGS_STOPPED) {
		return;
	}

	/* verify that we have all ip addresses we should have and we dont
	 * have addresses we shouldnt have.
	 */
	if (ctdb->tunable.disable_ip_failover == 0) {
		if (rec->ip_check_disable_ctx == NULL) {
			if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
			}
		}
	}


	/* if we are not the recmaster then we do not need to check
	   if recovery is needed
	 */
	if (pnn != rec->recmaster) {
		return;
	}


	/* ensure our local copies of flags are right */
	ret = update_local_flags(rec, nodemap);
	if (ret == MONITOR_ELECTION_NEEDED) {
		DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
		force_election(rec, pnn, nodemap);
		return;
	}
	if (ret != MONITOR_OK) {
		DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
		return;
	}

	if (ctdb->num_nodes != nodemap->num) {
		DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
		reload_nodes_file(ctdb);
		return;
	}

	/* verify that all active nodes agree that we are the recmaster */
	switch (verify_recmaster(rec, nodemap, pnn)) {
	case MONITOR_RECOVERY_NEEDED:
		/* can not happen */
		return;
	case MONITOR_ELECTION_NEEDED:
		force_election(rec, pnn, nodemap);
		return;
	case MONITOR_OK:
		break;
	case MONITOR_FAILED:
		return;
	}


	if (rec->need_recovery) {
		/* a previous recovery didn't finish */
		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		return;
	}

	/* verify that all active nodes are in normal mode
	   and not in recovery mode
	 */
	switch (verify_recmode(ctdb, nodemap)) {
	case MONITOR_RECOVERY_NEEDED:
		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		return;
	case MONITOR_FAILED:
		return;
	case MONITOR_ELECTION_NEEDED:
		/* can not happen */
		/* fallthrough */
	case MONITOR_OK:
		break;
	}


	if (ctdb->tunable.verify_recovery_lock != 0) {
		/* we should have the reclock - check its not stale */
		ret = check_recovery_lock(ctdb);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
			ctdb_set_culprit(rec, ctdb->pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}
	}


	/* is there a pending reload all ips ? */
	if (reload_all_ips_request != NULL) {
		reload_all_ips(ctdb, rec, nodemap, reload_all_ips_request);
		talloc_free(reload_all_ips_request);
		reload_all_ips_request = NULL;
	}

	/* if there are takeovers requested, perform it and notify the waiters */
	if (rec->reallocate_callers) {
		process_ipreallocate_requests(ctdb, rec);
	}

	/* get the nodemap for all active remote nodes
	 */
	remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
	if (remote_nodemaps == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
		return;
	}
	for(i=0; i<nodemap->num; i++) {
		remote_nodemaps[i] = NULL;
	}
	if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
		return;
	}

	/* verify that all other nodes have the same nodemap as we have
	*/
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		if (remote_nodemaps[j] == NULL) {
			DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
			ctdb_set_culprit(rec, j);

			return;
		}

		/* if the nodes disagree on how many nodes there are
		   then this is a good reason to try recovery
		 */
		if (remote_nodemaps[j]->num != nodemap->num) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
				  nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* if the nodes disagree on which nodes exist and are
		   active, then that is also a good reason to do recovery
		 */
		for (i=0;i<nodemap->num;i++) {
			if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
				DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
					  nodemap->nodes[j].pnn, i,
					  remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
				do_recovery(rec, mem_ctx, pnn, nodemap,
					    vnnmap);
				return;
			}
		}

		/* verify the flags are consistent
		*/
		for (i=0; i<nodemap->num; i++) {
			if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
				continue;
			}

			if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
				/* NOTE(review): the log message prints
				   nodemap->nodes[j].flags as "our" flags while
				   the comparison above uses nodes[i].flags -
				   looks like it should print nodes[i].flags;
				   confirm before changing. */
				DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
				  nodemap->nodes[j].pnn,
				  nodemap->nodes[i].pnn,
				  remote_nodemaps[j]->nodes[i].flags,
				  nodemap->nodes[j].flags));
				if (i == j) {
					DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
					update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
					ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
					do_recovery(rec, mem_ctx, pnn, nodemap,
						    vnnmap);
					return;
				} else {
					DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
					update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
					ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
					do_recovery(rec, mem_ctx, pnn, nodemap,
						    vnnmap);
					return;
				}
			}
		}
	}


	/* there better be the same number of lmasters in the vnn map
	   as there are active nodes or we will have to do a recovery
	 */
	if (vnnmap->size != rec->num_active) {
		DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
			  vnnmap->size, rec->num_active));
		ctdb_set_culprit(rec, ctdb->pnn);
		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		return;
	}

	/* verify that all active nodes in the nodemap also exist in
	   the vnnmap.
	 */
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}

		for (i=0; i<vnnmap->size; i++) {
			if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
				break;
			}
		}
		if (i == vnnmap->size) {
			DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
				  nodemap->nodes[j].pnn));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}
	}


	/* verify that all other nodes have the same vnnmap
	   and are from the same generation
	 */
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}

		ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					  mem_ctx, &remote_vnnmap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
				  nodemap->nodes[j].pnn));
			return;
		}

		/* verify the vnnmap generation is the same */
		if (vnnmap->generation != remote_vnnmap->generation) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
				  nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* verify the vnnmap size is the same */
		if (vnnmap->size != remote_vnnmap->size) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
				  nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* verify the vnnmap is the same */
		for (i=0;i<vnnmap->size;i++) {
			if (remote_vnnmap->map[i] != vnnmap->map[i]) {
				DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
					  nodemap->nodes[j].pnn));
				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
				do_recovery(rec, mem_ctx, pnn, nodemap,
					    vnnmap);
				return;
			}
		}
	}

	/* we might need to change who has what IP assigned */
	if (rec->need_takeover_run) {
		uint32_t culprit = (uint32_t)-1;

		rec->need_takeover_run = false;

		/* update the list of public ips that a node can handle for
		   all connected nodes
		*/
		ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
					 culprit));
			rec->need_takeover_run = true;
			return;
		}

		/* execute the "startrecovery" event script on all nodes */
		ret = run_startrecovery_eventscript(rec, nodemap);
		if (ret!=0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
			ctdb_set_culprit(rec, ctdb->pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* If takeover run fails, then the offending nodes are
		 * assigned ban culprit counts. And we re-try takeover.
		 * If takeover run fails repeatedly, the node would get
		 * banned.
		 *
		 * If rec->need_takeover_run is not set to true at this
		 * failure, monitoring is disabled cluster-wide (via
		 * startrecovery eventscript) and will not get enabled.
		 */
		ret = ctdb_takeover_run(ctdb, nodemap, takeover_fail_callback, rec);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. Trying again\n"));
			return;
		}

		/* execute the "recovered" event script on all nodes */
		ret = run_recovered_eventscript(rec, nodemap, "monitor_cluster");
#if 0
		// we cant check whether the event completed successfully
		// since this script WILL fail if the node is in recovery mode
		// and if that race happens, the code here would just cause a second
		// cascading recovery.
		if (ret!=0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
			ctdb_set_culprit(rec, ctdb->pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		}
#endif
	}
}
/*
  the main monitoring loop: set up the recovery daemon state, register
  all message-port handlers, then run main_loop() once per
  recover_interval forever.  Never returns.
 */
static void monitor_cluster(struct ctdb_context *ctdb)
{
	struct ctdb_recoverd *rec;

	DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));

	/* rec holds the recovery daemon's private state; parented to ctdb
	   so it lives for the whole daemon lifetime */
	rec = talloc_zero(ctdb, struct ctdb_recoverd);
	CTDB_NO_MEMORY_FATAL(ctdb, rec);

	rec->ctdb = ctdb;

	rec->priority_time = timeval_current();

	/* register a message port for sending memory dumps */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);

	/* register a message port for requesting logs */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_GETLOG, getlog_handler, rec);

	/* register a message port for clearing logs */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_CLEARLOG, clearlog_handler, rec);

	/* register a message port for recovery elections */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);

	/* when nodes are disabled/enabled */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);

	/* when we are asked to push out a flag change */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);

	/* register a message port for vacuum fetch */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);

	/* register a message port for reloadnodes  */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);

	/* register a message port for performing a takeover run */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);

	/* register a message port for performing a reload all ips */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_ALL_IPS, ip_reloadall_handler, rec);

	/* register a message port for disabling the ip check for a short while */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);

	/* register a message port for updating the recovery daemons node assignment for an ip */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);

	/* register a message port for forcing a rebalance of a node next
	   reallocation */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);

	/* run one monitoring pass per iteration, each with a fresh
	   temporary talloc context, then sleep out the remainder of the
	   recover_interval */
	for (;;) {
		TALLOC_CTX *mem_ctx = talloc_new(ctdb);
		struct timeval start;
		double elapsed;

		if (!mem_ctx) {
			DEBUG(DEBUG_CRIT,(__location__
					  " Failed to create temp context\n"));
			exit(-1);
		}

		start = timeval_current();
		main_loop(ctdb, rec, mem_ctx);
		talloc_free(mem_ctx);

		/* we only check for recovery once every second */
		elapsed = timeval_elapsed(&start);
		if (elapsed < ctdb->tunable.recover_interval) {
			ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
					  - elapsed);
		}
	}
}
3919 event handler for when the main ctdbd dies
3921 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
3922 uint16_t flags, void *private_data)
3924 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3925 _exit(1);
3929 called regularly to verify that the recovery daemon is still running
3931 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
3932 struct timeval yt, void *p)
3934 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
3936 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
3937 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
3939 event_add_timed(ctdb->ev, ctdb, timeval_zero(),
3940 ctdb_restart_recd, ctdb);
3942 return;
3945 event_add_timed(ctdb->ev, ctdb,
3946 timeval_current_ofs(30, 0),
3947 ctdb_check_recd, ctdb);
3950 static void recd_sig_child_handler(struct event_context *ev,
3951 struct signal_event *se, int signum, int count,
3952 void *dont_care,
3953 void *private_data)
3955 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3956 int status;
3957 pid_t pid = -1;
3959 while (pid != 0) {
3960 pid = waitpid(-1, &status, WNOHANG);
3961 if (pid == -1) {
3962 if (errno != ECHILD) {
3963 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3965 return;
3967 if (pid > 0) {
3968 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3974 startup the recovery daemon as a child of the main ctdb daemon
3976 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3978 int fd[2];
3979 struct signal_event *se;
3980 struct tevent_fd *fde;
3982 if (pipe(fd) != 0) {
3983 return -1;
3986 ctdb->ctdbd_pid = getpid();
3988 ctdb->recoverd_pid = ctdb_fork(ctdb);
3989 if (ctdb->recoverd_pid == -1) {
3990 return -1;
3993 if (ctdb->recoverd_pid != 0) {
3994 close(fd[0]);
3995 event_add_timed(ctdb->ev, ctdb,
3996 timeval_current_ofs(30, 0),
3997 ctdb_check_recd, ctdb);
3998 return 0;
4001 close(fd[1]);
4003 srandom(getpid() ^ time(NULL));
4005 if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
4006 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
4007 exit(1);
4010 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
4012 fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
4013 ctdb_recoverd_parent, &fd[0]);
4014 tevent_fd_set_auto_close(fde);
4016 /* set up a handler to pick up sigchld */
4017 se = event_add_signal(ctdb->ev, ctdb,
4018 SIGCHLD, 0,
4019 recd_sig_child_handler,
4020 ctdb);
4021 if (se == NULL) {
4022 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
4023 exit(1);
4026 monitor_cluster(ctdb);
4028 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
4029 return -1;
4033 shutdown the recovery daemon
4035 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
4037 if (ctdb->recoverd_pid == 0) {
4038 return;
4041 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
4042 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
4045 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te,
4046 struct timeval t, void *private_data)
4048 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4050 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
4051 ctdb_stop_recoverd(ctdb);
4052 ctdb_start_recoverd(ctdb);