4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
27 #include "../include/ctdb_client.h"
28 #include "../include/ctdb_private.h"
30 #include "dlinklist.h"
33 /* List of SRVID requests that need to be processed */
35 struct srvid_list
*next
, *prev
;
36 struct srvid_request
*request
;
39 struct srvid_requests
{
40 struct srvid_list
*requests
;
43 static void srvid_request_reply(struct ctdb_context
*ctdb
,
44 struct srvid_request
*request
,
47 /* Someone that sent srvid==0 does not want a reply */
48 if (request
->srvid
== 0) {
53 if (ctdb_client_send_message(ctdb
, request
->pnn
, request
->srvid
,
55 DEBUG(DEBUG_INFO
,("Sent SRVID reply to %u:%llu\n",
56 (unsigned)request
->pnn
,
57 (unsigned long long)request
->srvid
));
59 DEBUG(DEBUG_ERR
,("Failed to send SRVID reply to %u:%llu\n",
60 (unsigned)request
->pnn
,
61 (unsigned long long)request
->srvid
));
67 static void srvid_requests_reply(struct ctdb_context
*ctdb
,
68 struct srvid_requests
**requests
,
73 for (r
= (*requests
)->requests
; r
!= NULL
; r
= r
->next
) {
74 srvid_request_reply(ctdb
, r
->request
, result
);
77 /* Free the list structure... */
78 TALLOC_FREE(*requests
);
81 static void srvid_request_add(struct ctdb_context
*ctdb
,
82 struct srvid_requests
**requests
,
83 struct srvid_request
*request
)
89 if (*requests
== NULL
) {
90 *requests
= talloc_zero(ctdb
, struct srvid_requests
);
91 if (*requests
== NULL
) {
96 t
= talloc_zero(*requests
, struct srvid_list
);
98 /* If *requests was just allocated above then free it */
99 if ((*requests
)->requests
== NULL
) {
100 TALLOC_FREE(*requests
);
105 t
->request
= (struct srvid_request
*)talloc_steal(t
, request
);
106 DLIST_ADD((*requests
)->requests
, t
);
111 /* Failed to add the request to the list. Send a fail. */
112 DEBUG(DEBUG_ERR
, (__location__
113 " Out of memory, failed to queue SRVID request\n"));
115 result
.dsize
= sizeof(ret
);
116 result
.dptr
= (uint8_t *)&ret
;
117 srvid_request_reply(ctdb
, request
, result
);
120 struct ctdb_banning_state
{
122 struct timeval last_reported_time
;
126 private state of recovery daemon
128 struct ctdb_recoverd
{
129 struct ctdb_context
*ctdb
;
132 uint32_t num_lmasters
;
133 uint32_t num_connected
;
134 uint32_t last_culprit_node
;
135 struct ctdb_node_map
*nodemap
;
136 struct timeval priority_time
;
137 bool need_takeover_run
;
140 struct timed_event
*send_election_te
;
141 struct timed_event
*election_timeout
;
142 struct vacuum_info
*vacuum_info
;
143 struct srvid_requests
*reallocate_requests
;
144 bool takeover_run_in_progress
;
145 TALLOC_CTX
*takeover_runs_disable_ctx
;
146 struct ctdb_control_get_ifaces
*ifaces
;
147 uint32_t *force_rebalance_nodes
;
150 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
151 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
153 static void ctdb_restart_recd(struct event_context
*ev
, struct timed_event
*te
, struct timeval t
, void *private_data
);
156 ban a node for a period of time
158 static void ctdb_ban_node(struct ctdb_recoverd
*rec
, uint32_t pnn
, uint32_t ban_time
)
161 struct ctdb_context
*ctdb
= rec
->ctdb
;
162 struct ctdb_ban_time bantime
;
164 if (!ctdb_validate_pnn(ctdb
, pnn
)) {
165 DEBUG(DEBUG_ERR
,("Bad pnn %u in ctdb_ban_node\n", pnn
));
169 DEBUG(DEBUG_NOTICE
,("Banning node %u for %u seconds\n", pnn
, ban_time
));
172 bantime
.time
= ban_time
;
174 ret
= ctdb_ctrl_set_ban(ctdb
, CONTROL_TIMEOUT(), pnn
, &bantime
);
176 DEBUG(DEBUG_ERR
,(__location__
" Failed to ban node %d\n", pnn
));
182 enum monitor_result
{ MONITOR_OK
, MONITOR_RECOVERY_NEEDED
, MONITOR_ELECTION_NEEDED
, MONITOR_FAILED
};
186 remember the trouble maker
188 static void ctdb_set_culprit_count(struct ctdb_recoverd
*rec
, uint32_t culprit
, uint32_t count
)
190 struct ctdb_context
*ctdb
= talloc_get_type(rec
->ctdb
, struct ctdb_context
);
191 struct ctdb_banning_state
*ban_state
;
193 if (culprit
> ctdb
->num_nodes
) {
194 DEBUG(DEBUG_ERR
,("Trying to set culprit %d but num_nodes is %d\n", culprit
, ctdb
->num_nodes
));
198 /* If we are banned or stopped, do not set other nodes as culprits */
199 if (rec
->node_flags
& NODE_FLAGS_INACTIVE
) {
200 DEBUG(DEBUG_NOTICE
, ("This node is INACTIVE, cannot set culprit node %d\n", culprit
));
204 if (ctdb
->nodes
[culprit
]->ban_state
== NULL
) {
205 ctdb
->nodes
[culprit
]->ban_state
= talloc_zero(ctdb
->nodes
[culprit
], struct ctdb_banning_state
);
206 CTDB_NO_MEMORY_VOID(ctdb
, ctdb
->nodes
[culprit
]->ban_state
);
210 ban_state
= ctdb
->nodes
[culprit
]->ban_state
;
211 if (timeval_elapsed(&ban_state
->last_reported_time
) > ctdb
->tunable
.recovery_grace_period
) {
212 /* this was the first time in a long while this node
213 misbehaved so we will forgive any old transgressions.
215 ban_state
->count
= 0;
218 ban_state
->count
+= count
;
219 ban_state
->last_reported_time
= timeval_current();
220 rec
->last_culprit_node
= culprit
;
224 remember the trouble maker
226 static void ctdb_set_culprit(struct ctdb_recoverd
*rec
, uint32_t culprit
)
228 ctdb_set_culprit_count(rec
, culprit
, 1);
232 /* this callback is called for every node that failed to execute the
235 static void recovered_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
237 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
239 DEBUG(DEBUG_ERR
, (__location__
" Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn
));
241 ctdb_set_culprit(rec
, node_pnn
);
245 run the "recovered" eventscript on all nodes
247 static int run_recovered_eventscript(struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
, const char *caller
)
251 struct ctdb_context
*ctdb
= rec
->ctdb
;
253 tmp_ctx
= talloc_new(ctdb
);
254 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
256 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
257 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_END_RECOVERY
,
259 CONTROL_TIMEOUT(), false, tdb_null
,
260 NULL
, recovered_fail_callback
,
262 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'recovered' event when called from %s\n", caller
));
264 talloc_free(tmp_ctx
);
268 talloc_free(tmp_ctx
);
272 /* this callback is called for every node that failed to execute the
275 static void startrecovery_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
277 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
279 DEBUG(DEBUG_ERR
, (__location__
" Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn
));
281 ctdb_set_culprit(rec
, node_pnn
);
285 run the "startrecovery" eventscript on all nodes
287 static int run_startrecovery_eventscript(struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
)
291 struct ctdb_context
*ctdb
= rec
->ctdb
;
293 tmp_ctx
= talloc_new(ctdb
);
294 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
296 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
297 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_START_RECOVERY
,
299 CONTROL_TIMEOUT(), false, tdb_null
,
301 startrecovery_fail_callback
,
303 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'startrecovery' event. Recovery failed.\n"));
304 talloc_free(tmp_ctx
);
308 talloc_free(tmp_ctx
);
312 static void async_getcap_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
314 if ( (outdata
.dsize
!= sizeof(uint32_t)) || (outdata
.dptr
== NULL
) ) {
315 DEBUG(DEBUG_ERR
, (__location__
" Invalid length/pointer for getcap callback : %u %p\n", (unsigned)outdata
.dsize
, outdata
.dptr
));
318 if (node_pnn
< ctdb
->num_nodes
) {
319 ctdb
->nodes
[node_pnn
]->capabilities
= *((uint32_t *)outdata
.dptr
);
322 if (node_pnn
== ctdb
->pnn
) {
323 ctdb
->capabilities
= ctdb
->nodes
[node_pnn
]->capabilities
;
328 update the node capabilities for all connected nodes
330 static int update_capabilities(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
)
335 tmp_ctx
= talloc_new(ctdb
);
336 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
338 nodes
= list_of_connected_nodes(ctdb
, nodemap
, tmp_ctx
, true);
339 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_GET_CAPABILITIES
,
343 async_getcap_callback
, NULL
,
345 DEBUG(DEBUG_ERR
, (__location__
" Failed to read node capabilities.\n"));
346 talloc_free(tmp_ctx
);
350 talloc_free(tmp_ctx
);
354 static void set_recmode_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
356 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
358 DEBUG(DEBUG_ERR
,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn
, rec
->nodemap
->num
));
359 ctdb_set_culprit_count(rec
, node_pnn
, rec
->nodemap
->num
);
362 static void transaction_start_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
364 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
366 DEBUG(DEBUG_ERR
,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn
, rec
->nodemap
->num
));
367 ctdb_set_culprit_count(rec
, node_pnn
, rec
->nodemap
->num
);
371 change recovery mode on all nodes
373 static int set_recovery_mode(struct ctdb_context
*ctdb
, struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
, uint32_t rec_mode
)
379 tmp_ctx
= talloc_new(ctdb
);
380 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
382 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
384 data
.dsize
= sizeof(uint32_t);
385 data
.dptr
= (unsigned char *)&rec_mode
;
387 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_SET_RECMODE
,
393 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode. Recovery failed.\n"));
394 talloc_free(tmp_ctx
);
398 /* freeze all nodes */
399 if (rec_mode
== CTDB_RECOVERY_ACTIVE
) {
402 for (i
=1; i
<=NUM_DB_PRIORITIES
; i
++) {
403 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_FREEZE
,
408 set_recmode_fail_callback
,
410 DEBUG(DEBUG_ERR
, (__location__
" Unable to freeze nodes. Recovery failed.\n"));
411 talloc_free(tmp_ctx
);
417 talloc_free(tmp_ctx
);
422 change recovery master on all node
424 static int set_recovery_master(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
, uint32_t pnn
)
430 tmp_ctx
= talloc_new(ctdb
);
431 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
433 data
.dsize
= sizeof(uint32_t);
434 data
.dptr
= (unsigned char *)&pnn
;
436 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
437 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_SET_RECMASTER
,
439 CONTROL_TIMEOUT(), false, data
,
442 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recmaster. Recovery failed.\n"));
443 talloc_free(tmp_ctx
);
447 talloc_free(tmp_ctx
);
451 /* update all remote nodes to use the same db priority that we have
452 this can fail if the remove node has not yet been upgraded to
453 support this function, so we always return success and never fail
454 a recovery if this call fails.
456 static int update_db_priority_on_remote_nodes(struct ctdb_context
*ctdb
,
457 struct ctdb_node_map
*nodemap
,
458 uint32_t pnn
, struct ctdb_dbid_map
*dbmap
, TALLOC_CTX
*mem_ctx
)
462 /* step through all local databases */
463 for (db
=0; db
<dbmap
->num
;db
++) {
464 struct ctdb_db_priority db_prio
;
467 db_prio
.db_id
= dbmap
->dbs
[db
].dbid
;
468 ret
= ctdb_ctrl_get_db_priority(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, dbmap
->dbs
[db
].dbid
, &db_prio
.priority
);
470 DEBUG(DEBUG_ERR
,(__location__
" Failed to read database priority from local node for db 0x%08x\n", dbmap
->dbs
[db
].dbid
));
474 DEBUG(DEBUG_INFO
,("Update DB priority for db 0x%08x to %u\n", dbmap
->dbs
[db
].dbid
, db_prio
.priority
));
476 ret
= ctdb_ctrl_set_db_priority(ctdb
, CONTROL_TIMEOUT(),
477 CTDB_CURRENT_NODE
, &db_prio
);
479 DEBUG(DEBUG_ERR
,(__location__
" Failed to set DB priority for 0x%08x\n",
488 ensure all other nodes have attached to any databases that we have
490 static int create_missing_remote_databases(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
,
491 uint32_t pnn
, struct ctdb_dbid_map
*dbmap
, TALLOC_CTX
*mem_ctx
)
494 struct ctdb_dbid_map
*remote_dbmap
;
496 /* verify that all other nodes have all our databases */
497 for (j
=0; j
<nodemap
->num
; j
++) {
498 /* we dont need to ourself ourselves */
499 if (nodemap
->nodes
[j
].pnn
== pnn
) {
502 /* dont check nodes that are unavailable */
503 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
507 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
508 mem_ctx
, &remote_dbmap
);
510 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbids from node %u\n", pnn
));
514 /* step through all local databases */
515 for (db
=0; db
<dbmap
->num
;db
++) {
519 for (i
=0;i
<remote_dbmap
->num
;i
++) {
520 if (dbmap
->dbs
[db
].dbid
== remote_dbmap
->dbs
[i
].dbid
) {
524 /* the remote node already have this database */
525 if (i
!=remote_dbmap
->num
) {
528 /* ok so we need to create this database */
529 ret
= ctdb_ctrl_getdbname(ctdb
, CONTROL_TIMEOUT(), pnn
,
530 dbmap
->dbs
[db
].dbid
, mem_ctx
,
533 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbname from node %u\n", pnn
));
536 ret
= ctdb_ctrl_createdb(ctdb
, CONTROL_TIMEOUT(),
537 nodemap
->nodes
[j
].pnn
,
539 dbmap
->dbs
[db
].flags
& CTDB_DB_FLAGS_PERSISTENT
);
541 DEBUG(DEBUG_ERR
, (__location__
" Unable to create remote db:%s\n", name
));
552 ensure we are attached to any databases that anyone else is attached to
554 static int create_missing_local_databases(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
,
555 uint32_t pnn
, struct ctdb_dbid_map
**dbmap
, TALLOC_CTX
*mem_ctx
)
558 struct ctdb_dbid_map
*remote_dbmap
;
560 /* verify that we have all database any other node has */
561 for (j
=0; j
<nodemap
->num
; j
++) {
562 /* we dont need to ourself ourselves */
563 if (nodemap
->nodes
[j
].pnn
== pnn
) {
566 /* dont check nodes that are unavailable */
567 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
571 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
572 mem_ctx
, &remote_dbmap
);
574 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbids from node %u\n", pnn
));
578 /* step through all databases on the remote node */
579 for (db
=0; db
<remote_dbmap
->num
;db
++) {
582 for (i
=0;i
<(*dbmap
)->num
;i
++) {
583 if (remote_dbmap
->dbs
[db
].dbid
== (*dbmap
)->dbs
[i
].dbid
) {
587 /* we already have this db locally */
588 if (i
!=(*dbmap
)->num
) {
591 /* ok so we need to create this database and
594 ctdb_ctrl_getdbname(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
595 remote_dbmap
->dbs
[db
].dbid
, mem_ctx
, &name
);
597 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbname from node %u\n",
598 nodemap
->nodes
[j
].pnn
));
601 ctdb_ctrl_createdb(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, name
,
602 remote_dbmap
->dbs
[db
].flags
& CTDB_DB_FLAGS_PERSISTENT
);
604 DEBUG(DEBUG_ERR
, (__location__
" Unable to create local db:%s\n", name
));
607 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, dbmap
);
609 DEBUG(DEBUG_ERR
, (__location__
" Unable to reread dbmap on node %u\n", pnn
));
620 pull the remote database contents from one node into the recdb
622 static int pull_one_remote_database(struct ctdb_context
*ctdb
, uint32_t srcnode
,
623 struct tdb_wrap
*recdb
, uint32_t dbid
)
627 struct ctdb_marshall_buffer
*reply
;
628 struct ctdb_rec_data
*rec
;
630 TALLOC_CTX
*tmp_ctx
= talloc_new(recdb
);
632 ret
= ctdb_ctrl_pulldb(ctdb
, srcnode
, dbid
, CTDB_LMASTER_ANY
, tmp_ctx
,
633 CONTROL_TIMEOUT(), &outdata
);
635 DEBUG(DEBUG_ERR
,(__location__
" Unable to copy db from node %u\n", srcnode
));
636 talloc_free(tmp_ctx
);
640 reply
= (struct ctdb_marshall_buffer
*)outdata
.dptr
;
642 if (outdata
.dsize
< offsetof(struct ctdb_marshall_buffer
, data
)) {
643 DEBUG(DEBUG_ERR
,(__location__
" invalid data in pulldb reply\n"));
644 talloc_free(tmp_ctx
);
648 rec
= (struct ctdb_rec_data
*)&reply
->data
[0];
652 rec
= (struct ctdb_rec_data
*)(rec
->length
+ (uint8_t *)rec
), i
++) {
654 struct ctdb_ltdb_header
*hdr
;
657 key
.dptr
= &rec
->data
[0];
658 key
.dsize
= rec
->keylen
;
659 data
.dptr
= &rec
->data
[key
.dsize
];
660 data
.dsize
= rec
->datalen
;
662 hdr
= (struct ctdb_ltdb_header
*)data
.dptr
;
664 if (data
.dsize
< sizeof(struct ctdb_ltdb_header
)) {
665 DEBUG(DEBUG_CRIT
,(__location__
" bad ltdb record\n"));
666 talloc_free(tmp_ctx
);
670 /* fetch the existing record, if any */
671 existing
= tdb_fetch(recdb
->tdb
, key
);
673 if (existing
.dptr
!= NULL
) {
674 struct ctdb_ltdb_header header
;
675 if (existing
.dsize
< sizeof(struct ctdb_ltdb_header
)) {
676 DEBUG(DEBUG_CRIT
,(__location__
" Bad record size %u from node %u\n",
677 (unsigned)existing
.dsize
, srcnode
));
679 talloc_free(tmp_ctx
);
682 header
= *(struct ctdb_ltdb_header
*)existing
.dptr
;
684 if (!(header
.rsn
< hdr
->rsn
||
685 (header
.dmaster
!= ctdb
->recovery_master
&& header
.rsn
== hdr
->rsn
))) {
690 if (tdb_store(recdb
->tdb
, key
, data
, TDB_REPLACE
) != 0) {
691 DEBUG(DEBUG_CRIT
,(__location__
" Failed to store record\n"));
692 talloc_free(tmp_ctx
);
697 talloc_free(tmp_ctx
);
703 struct pull_seqnum_cbdata
{
709 static void pull_seqnum_cb(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
711 struct pull_seqnum_cbdata
*cb_data
= talloc_get_type(callback_data
, struct pull_seqnum_cbdata
);
714 if (cb_data
->failed
!= 0) {
715 DEBUG(DEBUG_ERR
, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn
));
720 DEBUG(DEBUG_ERR
, ("Error when pulling seqnum from node %d\n", node_pnn
));
725 if (outdata
.dsize
!= sizeof(uint64_t)) {
726 DEBUG(DEBUG_ERR
, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn
, (int)outdata
.dsize
, (int)sizeof(uint64_t)));
727 cb_data
->failed
= -1;
731 seqnum
= *((uint64_t *)outdata
.dptr
);
733 if (seqnum
> cb_data
->seqnum
||
734 (cb_data
->pnn
== -1 && seqnum
== 0)) {
735 cb_data
->seqnum
= seqnum
;
736 cb_data
->pnn
= node_pnn
;
740 static void pull_seqnum_fail_cb(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
742 struct pull_seqnum_cbdata
*cb_data
= talloc_get_type(callback_data
, struct pull_seqnum_cbdata
);
744 DEBUG(DEBUG_ERR
, ("Failed to pull db seqnum from node %d\n", node_pnn
));
748 static int pull_highest_seqnum_pdb(struct ctdb_context
*ctdb
,
749 struct ctdb_recoverd
*rec
,
750 struct ctdb_node_map
*nodemap
,
751 struct tdb_wrap
*recdb
, uint32_t dbid
)
753 TALLOC_CTX
*tmp_ctx
= talloc_new(NULL
);
757 struct pull_seqnum_cbdata
*cb_data
;
759 DEBUG(DEBUG_NOTICE
, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid
));
764 data
.dsize
= sizeof(outdata
);
765 data
.dptr
= (uint8_t *)&outdata
[0];
767 cb_data
= talloc(tmp_ctx
, struct pull_seqnum_cbdata
);
768 if (cb_data
== NULL
) {
769 DEBUG(DEBUG_ERR
, ("Failed to allocate pull highest seqnum cb_data structure\n"));
770 talloc_free(tmp_ctx
);
778 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
779 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_GET_DB_SEQNUM
,
781 CONTROL_TIMEOUT(), false, data
,
785 DEBUG(DEBUG_ERR
, (__location__
" Failed to run async GET_DB_SEQNUM\n"));
787 talloc_free(tmp_ctx
);
791 if (cb_data
->failed
!= 0) {
792 DEBUG(DEBUG_NOTICE
, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid
));
793 talloc_free(tmp_ctx
);
797 if (cb_data
->pnn
== -1) {
798 DEBUG(DEBUG_NOTICE
, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid
));
799 talloc_free(tmp_ctx
);
803 DEBUG(DEBUG_NOTICE
, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid
, cb_data
->pnn
, (long long)cb_data
->seqnum
));
805 if (pull_one_remote_database(ctdb
, cb_data
->pnn
, recdb
, dbid
) != 0) {
806 DEBUG(DEBUG_ERR
, ("Failed to pull higest seqnum database 0x%08x from node %d\n", dbid
, cb_data
->pnn
));
807 talloc_free(tmp_ctx
);
811 talloc_free(tmp_ctx
);
817 pull all the remote database contents into the recdb
819 static int pull_remote_database(struct ctdb_context
*ctdb
,
820 struct ctdb_recoverd
*rec
,
821 struct ctdb_node_map
*nodemap
,
822 struct tdb_wrap
*recdb
, uint32_t dbid
,
827 if (persistent
&& ctdb
->tunable
.recover_pdb_by_seqnum
!= 0) {
829 ret
= pull_highest_seqnum_pdb(ctdb
, rec
, nodemap
, recdb
, dbid
);
835 /* pull all records from all other nodes across onto this node
836 (this merges based on rsn)
838 for (j
=0; j
<nodemap
->num
; j
++) {
839 /* dont merge from nodes that are unavailable */
840 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
843 if (pull_one_remote_database(ctdb
, nodemap
->nodes
[j
].pnn
, recdb
, dbid
) != 0) {
844 DEBUG(DEBUG_ERR
,(__location__
" Failed to pull remote database from node %u\n",
845 nodemap
->nodes
[j
].pnn
));
846 ctdb_set_culprit_count(rec
, nodemap
->nodes
[j
].pnn
, nodemap
->num
);
856 update flags on all active nodes
858 static int update_flags_on_all_nodes(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
, uint32_t pnn
, uint32_t flags
)
862 ret
= ctdb_ctrl_modflags(ctdb
, CONTROL_TIMEOUT(), pnn
, flags
, ~flags
);
864 DEBUG(DEBUG_ERR
, (__location__
" Unable to update nodeflags on remote nodes\n"));
872 ensure all nodes have the same vnnmap we do
874 static int update_vnnmap_on_all_nodes(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
,
875 uint32_t pnn
, struct ctdb_vnn_map
*vnnmap
, TALLOC_CTX
*mem_ctx
)
879 /* push the new vnn map out to all the nodes */
880 for (j
=0; j
<nodemap
->num
; j
++) {
881 /* dont push to nodes that are unavailable */
882 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
886 ret
= ctdb_ctrl_setvnnmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
, mem_ctx
, vnnmap
);
888 DEBUG(DEBUG_ERR
, (__location__
" Unable to set vnnmap for node %u\n", pnn
));
898 struct vacuum_info
*next
, *prev
;
899 struct ctdb_recoverd
*rec
;
901 struct ctdb_db_context
*ctdb_db
;
902 struct ctdb_marshall_buffer
*recs
;
903 struct ctdb_rec_data
*r
;
906 static void vacuum_fetch_next(struct vacuum_info
*v
);
909 called when a vacuum fetch has completed - just free it and do the next one
911 static void vacuum_fetch_callback(struct ctdb_client_call_state
*state
)
913 struct vacuum_info
*v
= talloc_get_type(state
->async
.private_data
, struct vacuum_info
);
915 vacuum_fetch_next(v
);
920 process the next element from the vacuum list
922 static void vacuum_fetch_next(struct vacuum_info
*v
)
924 struct ctdb_call call
;
925 struct ctdb_rec_data
*r
;
927 while (v
->recs
->count
) {
928 struct ctdb_client_call_state
*state
;
930 struct ctdb_ltdb_header
*hdr
;
933 call
.call_id
= CTDB_NULL_FUNC
;
934 call
.flags
= CTDB_IMMEDIATE_MIGRATION
;
935 call
.flags
|= CTDB_CALL_FLAG_VACUUM_MIGRATION
;
938 v
->r
= (struct ctdb_rec_data
*)(r
->length
+ (uint8_t *)r
);
941 call
.key
.dptr
= &r
->data
[0];
942 call
.key
.dsize
= r
->keylen
;
944 /* ensure we don't block this daemon - just skip a record if we can't get
946 if (tdb_chainlock_nonblock(v
->ctdb_db
->ltdb
->tdb
, call
.key
) != 0) {
950 data
= tdb_fetch(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
951 if (data
.dptr
== NULL
) {
952 tdb_chainunlock(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
956 if (data
.dsize
< sizeof(struct ctdb_ltdb_header
)) {
958 tdb_chainunlock(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
962 hdr
= (struct ctdb_ltdb_header
*)data
.dptr
;
963 if (hdr
->dmaster
== v
->rec
->ctdb
->pnn
) {
964 /* its already local */
966 tdb_chainunlock(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
972 state
= ctdb_call_send(v
->ctdb_db
, &call
);
973 tdb_chainunlock(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
975 DEBUG(DEBUG_ERR
,(__location__
" Failed to setup vacuum fetch call\n"));
979 state
->async
.fn
= vacuum_fetch_callback
;
980 state
->async
.private_data
= v
;
989 destroy a vacuum info structure
991 static int vacuum_info_destructor(struct vacuum_info
*v
)
993 DLIST_REMOVE(v
->rec
->vacuum_info
, v
);
999 handler for vacuum fetch
1001 static void vacuum_fetch_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
1002 TDB_DATA data
, void *private_data
)
1004 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
1005 struct ctdb_marshall_buffer
*recs
;
1007 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
1009 struct ctdb_dbid_map
*dbmap
=NULL
;
1010 bool persistent
= false;
1011 struct ctdb_db_context
*ctdb_db
;
1012 struct ctdb_rec_data
*r
;
1014 struct vacuum_info
*v
;
1016 recs
= (struct ctdb_marshall_buffer
*)data
.dptr
;
1017 r
= (struct ctdb_rec_data
*)&recs
->data
[0];
1019 if (recs
->count
== 0) {
1020 talloc_free(tmp_ctx
);
1026 for (v
=rec
->vacuum_info
;v
;v
=v
->next
) {
1027 if (srcnode
== v
->srcnode
&& recs
->db_id
== v
->ctdb_db
->db_id
) {
1028 /* we're already working on records from this node */
1029 talloc_free(tmp_ctx
);
1034 /* work out if the database is persistent */
1035 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, tmp_ctx
, &dbmap
);
1037 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbids from local node\n"));
1038 talloc_free(tmp_ctx
);
1042 for (i
=0;i
<dbmap
->num
;i
++) {
1043 if (dbmap
->dbs
[i
].dbid
== recs
->db_id
) {
1044 persistent
= dbmap
->dbs
[i
].flags
& CTDB_DB_FLAGS_PERSISTENT
;
1048 if (i
== dbmap
->num
) {
1049 DEBUG(DEBUG_ERR
, (__location__
" Unable to find db_id 0x%x on local node\n", recs
->db_id
));
1050 talloc_free(tmp_ctx
);
1054 /* find the name of this database */
1055 if (ctdb_ctrl_getdbname(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, recs
->db_id
, tmp_ctx
, &name
) != 0) {
1056 DEBUG(DEBUG_ERR
,(__location__
" Failed to get name of db 0x%x\n", recs
->db_id
));
1057 talloc_free(tmp_ctx
);
1062 ctdb_db
= ctdb_attach(ctdb
, CONTROL_TIMEOUT(), name
, persistent
, 0);
1063 if (ctdb_db
== NULL
) {
1064 DEBUG(DEBUG_ERR
,(__location__
" Failed to attach to database '%s'\n", name
));
1065 talloc_free(tmp_ctx
);
1069 v
= talloc_zero(rec
, struct vacuum_info
);
1071 DEBUG(DEBUG_CRIT
,(__location__
" Out of memory\n"));
1072 talloc_free(tmp_ctx
);
1077 v
->srcnode
= srcnode
;
1078 v
->ctdb_db
= ctdb_db
;
1079 v
->recs
= talloc_memdup(v
, recs
, data
.dsize
);
1080 if (v
->recs
== NULL
) {
1081 DEBUG(DEBUG_CRIT
,(__location__
" Out of memory\n"));
1083 talloc_free(tmp_ctx
);
1086 v
->r
= (struct ctdb_rec_data
*)&v
->recs
->data
[0];
1088 DLIST_ADD(rec
->vacuum_info
, v
);
1090 talloc_set_destructor(v
, vacuum_info_destructor
);
1092 vacuum_fetch_next(v
);
1093 talloc_free(tmp_ctx
);
1098 * handler for database detach
1100 static void detach_database_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
1101 TDB_DATA data
, void *private_data
)
1103 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
,
1104 struct ctdb_recoverd
);
1106 struct vacuum_info
*v
, *vnext
;
1107 struct ctdb_db_context
*ctdb_db
;
1109 if (data
.dsize
!= sizeof(db_id
)) {
1112 db_id
= *(uint32_t *)data
.dptr
;
1114 ctdb_db
= find_ctdb_db(ctdb
, db_id
);
1115 if (ctdb_db
== NULL
) {
1116 /* database is not attached */
1120 /* Stop any active vacuum fetch */
1121 v
= rec
->vacuum_info
;
1125 if (v
->ctdb_db
->db_id
== db_id
) {
1131 DLIST_REMOVE(ctdb
->db_list
, ctdb_db
);
1133 DEBUG(DEBUG_NOTICE
, ("Detached from database '%s'\n",
1135 talloc_free(ctdb_db
);
1139 called when ctdb_wait_timeout should finish
1141 static void ctdb_wait_handler(struct event_context
*ev
, struct timed_event
*te
,
1142 struct timeval yt
, void *p
)
1144 uint32_t *timed_out
= (uint32_t *)p
;
1149 wait for a given number of seconds
1151 static void ctdb_wait_timeout(struct ctdb_context
*ctdb
, double secs
)
1153 uint32_t timed_out
= 0;
1154 time_t usecs
= (secs
- (time_t)secs
) * 1000000;
1155 event_add_timed(ctdb
->ev
, ctdb
, timeval_current_ofs(secs
, usecs
), ctdb_wait_handler
, &timed_out
);
1156 while (!timed_out
) {
1157 event_loop_once(ctdb
->ev
);
1162 called when an election times out (ends)
1164 static void ctdb_election_timeout(struct event_context
*ev
, struct timed_event
*te
,
1165 struct timeval t
, void *p
)
1167 struct ctdb_recoverd
*rec
= talloc_get_type(p
, struct ctdb_recoverd
);
1168 rec
->election_timeout
= NULL
;
1171 DEBUG(DEBUG_WARNING
,("Election period ended\n"));
1176 wait for an election to finish. It finished election_timeout seconds after
1177 the last election packet is received
1179 static void ctdb_wait_election(struct ctdb_recoverd
*rec
)
1181 struct ctdb_context
*ctdb
= rec
->ctdb
;
1182 while (rec
->election_timeout
) {
1183 event_loop_once(ctdb
->ev
);
1188 Update our local flags from all remote connected nodes.
1189 This is only run when we are or we belive we are the recovery master
1191 static int update_local_flags(struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
)
1194 struct ctdb_context
*ctdb
= rec
->ctdb
;
1195 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
1197 /* get the nodemap for all active remote nodes and verify
1198 they are the same as for this node
1200 for (j
=0; j
<nodemap
->num
; j
++) {
1201 struct ctdb_node_map
*remote_nodemap
=NULL
;
1204 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_DISCONNECTED
) {
1207 if (nodemap
->nodes
[j
].pnn
== ctdb
->pnn
) {
1211 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
1212 mem_ctx
, &remote_nodemap
);
1214 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from remote node %u\n",
1215 nodemap
->nodes
[j
].pnn
));
1216 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
1217 talloc_free(mem_ctx
);
1218 return MONITOR_FAILED
;
1220 if (nodemap
->nodes
[j
].flags
!= remote_nodemap
->nodes
[j
].flags
) {
1221 /* We should tell our daemon about this so it
1222 updates its flags or else we will log the same
1223 message again in the next iteration of recovery.
1224 Since we are the recovery master we can just as
1225 well update the flags on all nodes.
1227 ret
= ctdb_ctrl_modflags(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
, remote_nodemap
->nodes
[j
].flags
, ~remote_nodemap
->nodes
[j
].flags
);
1229 DEBUG(DEBUG_ERR
, (__location__
" Unable to update nodeflags on remote nodes\n"));
1233 /* Update our local copy of the flags in the recovery
1236 DEBUG(DEBUG_NOTICE
,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1237 nodemap
->nodes
[j
].pnn
, remote_nodemap
->nodes
[j
].flags
,
1238 nodemap
->nodes
[j
].flags
));
1239 nodemap
->nodes
[j
].flags
= remote_nodemap
->nodes
[j
].flags
;
1241 talloc_free(remote_nodemap
);
1243 talloc_free(mem_ctx
);
1248 /* Create a new random generation ip.
1249 The generation id can not be the INVALID_GENERATION id
1251 static uint32_t new_generation(void)
1253 uint32_t generation
;
1256 generation
= random();
1258 if (generation
!= INVALID_GENERATION
) {
1268 create a temporary working database
1270 static struct tdb_wrap
*create_recdb(struct ctdb_context
*ctdb
, TALLOC_CTX
*mem_ctx
)
1273 struct tdb_wrap
*recdb
;
1276 /* open up the temporary recovery database */
1277 name
= talloc_asprintf(mem_ctx
, "%s/recdb.tdb.%u",
1278 ctdb
->db_directory_state
,
1285 tdb_flags
= TDB_NOLOCK
;
1286 if (ctdb
->valgrinding
) {
1287 tdb_flags
|= TDB_NOMMAP
;
1289 tdb_flags
|= (TDB_INCOMPATIBLE_HASH
| TDB_DISALLOW_NESTING
);
1291 recdb
= tdb_wrap_open(mem_ctx
, name
, ctdb
->tunable
.database_hash_size
,
1292 tdb_flags
, O_RDWR
|O_CREAT
|O_EXCL
, 0600);
1293 if (recdb
== NULL
) {
1294 DEBUG(DEBUG_CRIT
,(__location__
" Failed to create temp recovery database '%s'\n", name
));
/*
  a traverse function for pulling all relevant records from recdb
 */
/* Accumulator state shared with traverse_recdb(): the marshall buffer being
   built, its used/allocated sizes, and per-database flags.
   NOTE(review): the failed/persistent fields were dropped by extraction but
   are referenced by traverse_recdb below — verify against upstream. */
struct recdb_data {
	struct ctdb_context *ctdb;
	struct ctdb_marshall_buffer *recdata;  /* blob being assembled */
	uint32_t len;                          /* bytes used in recdata */
	uint32_t allocated_len;                /* bytes allocated for recdata */
	bool failed;                           /* set on any marshalling error */
	bool persistent;                       /* true for persistent databases */
};
1315 static int traverse_recdb(struct tdb_context
*tdb
, TDB_DATA key
, TDB_DATA data
, void *p
)
1317 struct recdb_data
*params
= (struct recdb_data
*)p
;
1318 struct ctdb_rec_data
*rec
;
1319 struct ctdb_ltdb_header
*hdr
;
1322 * skip empty records - but NOT for persistent databases:
1324 * The record-by-record mode of recovery deletes empty records.
1325 * For persistent databases, this can lead to data corruption
1326 * by deleting records that should be there:
1328 * - Assume the cluster has been running for a while.
1330 * - A record R in a persistent database has been created and
1331 * deleted a couple of times, the last operation being deletion,
1332 * leaving an empty record with a high RSN, say 10.
1334 * - Now a node N is turned off.
1336 * - This leaves the local database copy of D on N with the empty
1337 * copy of R and RSN 10. On all other nodes, the recovery has deleted
1338 * the copy of record R.
1340 * - Now the record is created again while node N is turned off.
1341 * This creates R with RSN = 1 on all nodes except for N.
1343 * - Now node N is turned on again. The following recovery will chose
1344 * the older empty copy of R due to RSN 10 > RSN 1.
1346 * ==> Hence the record is gone after the recovery.
1348 * On databases like Samba's registry, this can damage the higher-level
1349 * data structures built from the various tdb-level records.
1351 if (!params
->persistent
&& data
.dsize
<= sizeof(struct ctdb_ltdb_header
)) {
1355 /* update the dmaster field to point to us */
1356 hdr
= (struct ctdb_ltdb_header
*)data
.dptr
;
1357 if (!params
->persistent
) {
1358 hdr
->dmaster
= params
->ctdb
->pnn
;
1359 hdr
->flags
|= CTDB_REC_FLAG_MIGRATED_WITH_DATA
;
1362 /* add the record to the blob ready to send to the nodes */
1363 rec
= ctdb_marshall_record(params
->recdata
, 0, key
, NULL
, data
);
1365 params
->failed
= true;
1368 if (params
->len
+ rec
->length
>= params
->allocated_len
) {
1369 params
->allocated_len
= rec
->length
+ params
->len
+ params
->ctdb
->tunable
.pulldb_preallocation_size
;
1370 params
->recdata
= talloc_realloc_size(NULL
, params
->recdata
, params
->allocated_len
);
1372 if (params
->recdata
== NULL
) {
1373 DEBUG(DEBUG_CRIT
,(__location__
" Failed to expand recdata to %u\n",
1374 rec
->length
+ params
->len
));
1375 params
->failed
= true;
1378 params
->recdata
->count
++;
1379 memcpy(params
->len
+(uint8_t *)params
->recdata
, rec
, rec
->length
);
1380 params
->len
+= rec
->length
;
1387 push the recdb database out to all nodes
1389 static int push_recdb_database(struct ctdb_context
*ctdb
, uint32_t dbid
,
1391 struct tdb_wrap
*recdb
, struct ctdb_node_map
*nodemap
)
1393 struct recdb_data params
;
1394 struct ctdb_marshall_buffer
*recdata
;
1396 TALLOC_CTX
*tmp_ctx
;
1399 tmp_ctx
= talloc_new(ctdb
);
1400 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
1402 recdata
= talloc_zero(recdb
, struct ctdb_marshall_buffer
);
1403 CTDB_NO_MEMORY(ctdb
, recdata
);
1405 recdata
->db_id
= dbid
;
1408 params
.recdata
= recdata
;
1409 params
.len
= offsetof(struct ctdb_marshall_buffer
, data
);
1410 params
.allocated_len
= params
.len
;
1411 params
.failed
= false;
1412 params
.persistent
= persistent
;
1414 if (tdb_traverse_read(recdb
->tdb
, traverse_recdb
, ¶ms
) == -1) {
1415 DEBUG(DEBUG_ERR
,(__location__
" Failed to traverse recdb database\n"));
1416 talloc_free(params
.recdata
);
1417 talloc_free(tmp_ctx
);
1421 if (params
.failed
) {
1422 DEBUG(DEBUG_ERR
,(__location__
" Failed to traverse recdb database\n"));
1423 talloc_free(params
.recdata
);
1424 talloc_free(tmp_ctx
);
1428 recdata
= params
.recdata
;
1430 outdata
.dptr
= (void *)recdata
;
1431 outdata
.dsize
= params
.len
;
1433 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
1434 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_PUSH_DB
,
1436 CONTROL_TIMEOUT(), false, outdata
,
1439 DEBUG(DEBUG_ERR
,(__location__
" Failed to push recdb records to nodes for db 0x%x\n", dbid
));
1440 talloc_free(recdata
);
1441 talloc_free(tmp_ctx
);
1445 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - pushed remote database 0x%x of size %u\n",
1446 dbid
, recdata
->count
));
1448 talloc_free(recdata
);
1449 talloc_free(tmp_ctx
);
1456 go through a full recovery on one database
1458 static int recover_database(struct ctdb_recoverd
*rec
,
1459 TALLOC_CTX
*mem_ctx
,
1463 struct ctdb_node_map
*nodemap
,
1464 uint32_t transaction_id
)
1466 struct tdb_wrap
*recdb
;
1468 struct ctdb_context
*ctdb
= rec
->ctdb
;
1470 struct ctdb_control_wipe_database w
;
1473 recdb
= create_recdb(ctdb
, mem_ctx
);
1474 if (recdb
== NULL
) {
1478 /* pull all remote databases onto the recdb */
1479 ret
= pull_remote_database(ctdb
, rec
, nodemap
, recdb
, dbid
, persistent
);
1481 DEBUG(DEBUG_ERR
, (__location__
" Unable to pull remote database 0x%x\n", dbid
));
1485 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - pulled remote database 0x%x\n", dbid
));
1487 /* wipe all the remote databases. This is safe as we are in a transaction */
1489 w
.transaction_id
= transaction_id
;
1491 data
.dptr
= (void *)&w
;
1492 data
.dsize
= sizeof(w
);
1494 nodes
= list_of_active_nodes(ctdb
, nodemap
, recdb
, true);
1495 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_WIPE_DATABASE
,
1497 CONTROL_TIMEOUT(), false, data
,
1500 DEBUG(DEBUG_ERR
, (__location__
" Unable to wipe database. Recovery failed.\n"));
1505 /* push out the correct database. This sets the dmaster and skips
1506 the empty records */
1507 ret
= push_recdb_database(ctdb
, dbid
, persistent
, recdb
, nodemap
);
1513 /* all done with this database */
1519 static int ctdb_reload_remote_public_ips(struct ctdb_context
*ctdb
,
1520 struct ctdb_recoverd
*rec
,
1521 struct ctdb_node_map
*nodemap
,
1527 if (ctdb
->num_nodes
!= nodemap
->num
) {
1528 DEBUG(DEBUG_ERR
, (__location__
" ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1529 ctdb
->num_nodes
, nodemap
->num
));
1531 *culprit
= ctdb
->pnn
;
1536 for (j
=0; j
<nodemap
->num
; j
++) {
1537 /* For readability */
1538 struct ctdb_node
*node
= ctdb
->nodes
[j
];
1540 /* release any existing data */
1541 if (node
->known_public_ips
) {
1542 talloc_free(node
->known_public_ips
);
1543 node
->known_public_ips
= NULL
;
1545 if (node
->available_public_ips
) {
1546 talloc_free(node
->available_public_ips
);
1547 node
->available_public_ips
= NULL
;
1550 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
1554 /* Retrieve the list of known public IPs from the node */
1555 ret
= ctdb_ctrl_get_public_ips_flags(ctdb
,
1560 &node
->known_public_ips
);
1563 ("Failed to read known public IPs from node: %u\n",
1566 *culprit
= node
->pnn
;
1571 if (ctdb
->do_checkpublicip
&&
1572 rec
->takeover_runs_disable_ctx
== NULL
&&
1573 verify_remote_ip_allocation(ctdb
,
1574 node
->known_public_ips
,
1576 DEBUG(DEBUG_ERR
,("Trigger IP reallocation\n"));
1577 rec
->need_takeover_run
= true;
1580 /* Retrieve the list of available public IPs from the node */
1581 ret
= ctdb_ctrl_get_public_ips_flags(ctdb
,
1585 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE
,
1586 &node
->available_public_ips
);
1589 ("Failed to read available public IPs from node: %u\n",
1592 *culprit
= node
->pnn
;
1601 /* when we start a recovery, make sure all nodes use the same reclock file
1604 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd
*rec
)
1606 struct ctdb_context
*ctdb
= rec
->ctdb
;
1607 TALLOC_CTX
*tmp_ctx
= talloc_new(NULL
);
1611 if (ctdb
->recovery_lock_file
== NULL
) {
1615 data
.dsize
= strlen(ctdb
->recovery_lock_file
) + 1;
1616 data
.dptr
= (uint8_t *)ctdb
->recovery_lock_file
;
1619 nodes
= list_of_active_nodes(ctdb
, rec
->nodemap
, tmp_ctx
, true);
1620 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_SET_RECLOCK_FILE
,
1626 DEBUG(DEBUG_ERR
, (__location__
" Failed to sync reclock file settings\n"));
1627 talloc_free(tmp_ctx
);
1631 talloc_free(tmp_ctx
);
1637 * this callback is called for every node that failed to execute ctdb_takeover_run()
1638 * and set flag to re-run takeover run.
1640 static void takeover_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
1642 DEBUG(DEBUG_ERR
, ("Node %u failed the takeover run\n", node_pnn
));
1644 if (callback_data
!= NULL
) {
1645 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
1647 DEBUG(DEBUG_ERR
, ("Setting node %u as recovery fail culprit\n", node_pnn
));
1649 ctdb_set_culprit(rec
, node_pnn
);
1654 static void ban_misbehaving_nodes(struct ctdb_recoverd
*rec
, bool *self_ban
)
1656 struct ctdb_context
*ctdb
= rec
->ctdb
;
1658 struct ctdb_banning_state
*ban_state
;
1661 for (i
=0; i
<ctdb
->num_nodes
; i
++) {
1662 if (ctdb
->nodes
[i
]->ban_state
== NULL
) {
1665 ban_state
= (struct ctdb_banning_state
*)ctdb
->nodes
[i
]->ban_state
;
1666 if (ban_state
->count
< 2*ctdb
->num_nodes
) {
1670 DEBUG(DEBUG_NOTICE
,("Node %u reached %u banning credits - banning it for %u seconds\n",
1671 ctdb
->nodes
[i
]->pnn
, ban_state
->count
,
1672 ctdb
->tunable
.recovery_ban_period
));
1673 ctdb_ban_node(rec
, ctdb
->nodes
[i
]->pnn
, ctdb
->tunable
.recovery_ban_period
);
1674 ban_state
->count
= 0;
1676 /* Banning ourself? */
1677 if (ctdb
->nodes
[i
]->pnn
== rec
->ctdb
->pnn
) {
1683 static bool do_takeover_run(struct ctdb_recoverd
*rec
,
1684 struct ctdb_node_map
*nodemap
,
1685 bool banning_credits_on_fail
)
1687 uint32_t *nodes
= NULL
;
1688 struct srvid_request_data dtr
;
1691 uint32_t *rebalance_nodes
= rec
->force_rebalance_nodes
;
1695 DEBUG(DEBUG_NOTICE
, ("Takeover run starting\n"));
1697 if (rec
->takeover_run_in_progress
) {
1698 DEBUG(DEBUG_ERR
, (__location__
1699 " takeover run already in progress \n"));
1704 rec
->takeover_run_in_progress
= true;
1706 /* If takeover runs are in disabled then fail... */
1707 if (rec
->takeover_runs_disable_ctx
!= NULL
) {
1709 ("Takeover runs are disabled so refusing to run one\n"));
1714 /* Disable IP checks (takeover runs, really) on other nodes
1715 * while doing this takeover run. This will stop those other
1716 * nodes from triggering takeover runs when think they should
1717 * be hosting an IP but it isn't yet on an interface. Don't
1718 * wait for replies since a failure here might cause some
1719 * noise in the logs but will not actually cause a problem.
1721 dtr
.srvid
= 0; /* No reply */
1724 data
.dptr
= (uint8_t*)&dtr
;
1725 data
.dsize
= sizeof(dtr
);
1727 nodes
= list_of_connected_nodes(rec
->ctdb
, nodemap
, rec
, false);
1729 /* Disable for 60 seconds. This can be a tunable later if
1733 for (i
= 0; i
< talloc_array_length(nodes
); i
++) {
1734 if (ctdb_client_send_message(rec
->ctdb
, nodes
[i
],
1735 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
1737 DEBUG(DEBUG_INFO
,("Failed to disable takeover runs\n"));
1741 ret
= ctdb_takeover_run(rec
->ctdb
, nodemap
,
1742 rec
->force_rebalance_nodes
,
1743 takeover_fail_callback
,
1744 banning_credits_on_fail
? rec
: NULL
);
1746 /* Reenable takeover runs and IP checks on other nodes */
1748 for (i
= 0; i
< talloc_array_length(nodes
); i
++) {
1749 if (ctdb_client_send_message(rec
->ctdb
, nodes
[i
],
1750 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
1752 DEBUG(DEBUG_INFO
,("Failed to reenable takeover runs\n"));
1757 DEBUG(DEBUG_ERR
, ("ctdb_takeover_run() failed\n"));
1763 /* Takeover run was successful so clear force rebalance targets */
1764 if (rebalance_nodes
== rec
->force_rebalance_nodes
) {
1765 TALLOC_FREE(rec
->force_rebalance_nodes
);
1767 DEBUG(DEBUG_WARNING
,
1768 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1771 rec
->need_takeover_run
= !ok
;
1773 rec
->takeover_run_in_progress
= false;
1775 DEBUG(DEBUG_NOTICE
, ("Takeover run %s\n", ok
? "completed successfully" : "unsuccessful"));
1781 we are the recmaster, and recovery is needed - start a recovery run
1783 static int do_recovery(struct ctdb_recoverd
*rec
,
1784 TALLOC_CTX
*mem_ctx
, uint32_t pnn
,
1785 struct ctdb_node_map
*nodemap
, struct ctdb_vnn_map
*vnnmap
)
1787 struct ctdb_context
*ctdb
= rec
->ctdb
;
1789 uint32_t generation
;
1790 struct ctdb_dbid_map
*dbmap
;
1793 struct timeval start_time
;
1794 uint32_t culprit
= (uint32_t)-1;
1797 DEBUG(DEBUG_NOTICE
, (__location__
" Starting do_recovery\n"));
1799 /* if recovery fails, force it again */
1800 rec
->need_recovery
= true;
1802 if (rec
->election_timeout
) {
1803 /* an election is in progress */
1804 DEBUG(DEBUG_ERR
, ("do_recovery called while election in progress - try again later\n"));
1808 ban_misbehaving_nodes(rec
, &self_ban
);
1810 DEBUG(DEBUG_NOTICE
, ("This node was banned, aborting recovery\n"));
1814 if (ctdb
->tunable
.verify_recovery_lock
!= 0) {
1815 DEBUG(DEBUG_ERR
,("Taking out recovery lock from recovery daemon\n"));
1816 start_time
= timeval_current();
1817 if (!ctdb_recovery_lock(ctdb
, true)) {
1818 DEBUG(DEBUG_ERR
,("Unable to get recovery lock - aborting recovery "
1819 "and ban ourself for %u seconds\n",
1820 ctdb
->tunable
.recovery_ban_period
));
1821 ctdb_ban_node(rec
, pnn
, ctdb
->tunable
.recovery_ban_period
);
1824 ctdb_ctrl_report_recd_lock_latency(ctdb
, CONTROL_TIMEOUT(), timeval_elapsed(&start_time
));
1825 DEBUG(DEBUG_NOTICE
,("Recovery lock taken successfully by recovery daemon\n"));
1828 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery initiated due to problem with node %u\n", rec
->last_culprit_node
));
1830 /* get a list of all databases */
1831 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, &dbmap
);
1833 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbids from node :%u\n", pnn
));
1837 /* we do the db creation before we set the recovery mode, so the freeze happens
1838 on all databases we will be dealing with. */
1840 /* verify that we have all the databases any other node has */
1841 ret
= create_missing_local_databases(ctdb
, nodemap
, pnn
, &dbmap
, mem_ctx
);
1843 DEBUG(DEBUG_ERR
, (__location__
" Unable to create missing local databases\n"));
1847 /* verify that all other nodes have all our databases */
1848 ret
= create_missing_remote_databases(ctdb
, nodemap
, pnn
, dbmap
, mem_ctx
);
1850 DEBUG(DEBUG_ERR
, (__location__
" Unable to create missing remote databases\n"));
1853 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - created remote databases\n"));
1855 /* update the database priority for all remote databases */
1856 ret
= update_db_priority_on_remote_nodes(ctdb
, nodemap
, pnn
, dbmap
, mem_ctx
);
1858 DEBUG(DEBUG_ERR
, (__location__
" Unable to set db priority on remote nodes\n"));
1860 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - updated db priority for all databases\n"));
1863 /* update all other nodes to use the same setting for reclock files
1864 as the local recovery master.
1866 sync_recovery_lock_file_across_cluster(rec
);
1868 /* set recovery mode to active on all nodes */
1869 ret
= set_recovery_mode(ctdb
, rec
, nodemap
, CTDB_RECOVERY_ACTIVE
);
1871 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode to active on cluster\n"));
1875 /* execute the "startrecovery" event script on all nodes */
1876 ret
= run_startrecovery_eventscript(rec
, nodemap
);
1878 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'startrecovery' event on cluster\n"));
1883 update all nodes to have the same flags that we have
1885 for (i
=0;i
<nodemap
->num
;i
++) {
1886 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
) {
1890 ret
= update_flags_on_all_nodes(ctdb
, nodemap
, i
, nodemap
->nodes
[i
].flags
);
1892 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_INACTIVE
) {
1893 DEBUG(DEBUG_WARNING
, (__location__
"Unable to update flags on inactive node %d\n", i
));
1895 DEBUG(DEBUG_ERR
, (__location__
" Unable to update flags on all nodes for node %d\n", i
));
1901 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - updated flags\n"));
1903 /* pick a new generation number */
1904 generation
= new_generation();
1906 /* change the vnnmap on this node to use the new generation
1907 number but not on any other nodes.
1908 this guarantees that if we abort the recovery prematurely
1909 for some reason (a node stops responding?)
1910 that we can just return immediately and we will reenter
1911 recovery shortly again.
1912 I.e. we deliberately leave the cluster with an inconsistent
1913 generation id to allow us to abort recovery at any stage and
1914 just restart it from scratch.
1916 vnnmap
->generation
= generation
;
1917 ret
= ctdb_ctrl_setvnnmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, vnnmap
);
1919 DEBUG(DEBUG_ERR
, (__location__
" Unable to set vnnmap for node %u\n", pnn
));
1923 data
.dptr
= (void *)&generation
;
1924 data
.dsize
= sizeof(uint32_t);
1926 nodes
= list_of_active_nodes(ctdb
, nodemap
, mem_ctx
, true);
1927 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_TRANSACTION_START
,
1929 CONTROL_TIMEOUT(), false, data
,
1931 transaction_start_fail_callback
,
1933 DEBUG(DEBUG_ERR
, (__location__
" Unable to start transactions. Recovery failed.\n"));
1934 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_TRANSACTION_CANCEL
,
1936 CONTROL_TIMEOUT(), false, tdb_null
,
1940 DEBUG(DEBUG_ERR
,("Failed to cancel recovery transaction\n"));
1945 DEBUG(DEBUG_NOTICE
,(__location__
" started transactions on all nodes\n"));
1947 for (i
=0;i
<dbmap
->num
;i
++) {
1948 ret
= recover_database(rec
, mem_ctx
,
1950 dbmap
->dbs
[i
].flags
& CTDB_DB_FLAGS_PERSISTENT
,
1951 pnn
, nodemap
, generation
);
1953 DEBUG(DEBUG_ERR
, (__location__
" Failed to recover database 0x%x\n", dbmap
->dbs
[i
].dbid
));
1958 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - starting database commits\n"));
1960 /* commit all the changes */
1961 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_TRANSACTION_COMMIT
,
1963 CONTROL_TIMEOUT(), false, data
,
1966 DEBUG(DEBUG_ERR
, (__location__
" Unable to commit recovery changes. Recovery failed.\n"));
1970 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - committed databases\n"));
1973 /* update the capabilities for all nodes */
1974 ret
= update_capabilities(ctdb
, nodemap
);
1976 DEBUG(DEBUG_ERR
, (__location__
" Unable to update node capabilities.\n"));
1980 /* build a new vnn map with all the currently active and
1982 generation
= new_generation();
1983 vnnmap
= talloc(mem_ctx
, struct ctdb_vnn_map
);
1984 CTDB_NO_MEMORY(ctdb
, vnnmap
);
1985 vnnmap
->generation
= generation
;
1987 vnnmap
->map
= talloc_zero_array(vnnmap
, uint32_t, vnnmap
->size
);
1988 CTDB_NO_MEMORY(ctdb
, vnnmap
->map
);
1989 for (i
=j
=0;i
<nodemap
->num
;i
++) {
1990 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_INACTIVE
) {
1993 if (!(ctdb
->nodes
[i
]->capabilities
& CTDB_CAP_LMASTER
)) {
1994 /* this node can not be an lmaster */
1995 DEBUG(DEBUG_DEBUG
, ("Node %d cant be a LMASTER, skipping it\n", i
));
2000 vnnmap
->map
= talloc_realloc(vnnmap
, vnnmap
->map
, uint32_t, vnnmap
->size
);
2001 CTDB_NO_MEMORY(ctdb
, vnnmap
->map
);
2002 vnnmap
->map
[j
++] = nodemap
->nodes
[i
].pnn
;
2005 if (vnnmap
->size
== 0) {
2006 DEBUG(DEBUG_NOTICE
, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
2008 vnnmap
->map
= talloc_realloc(vnnmap
, vnnmap
->map
, uint32_t, vnnmap
->size
);
2009 CTDB_NO_MEMORY(ctdb
, vnnmap
->map
);
2010 vnnmap
->map
[0] = pnn
;
2013 /* update to the new vnnmap on all nodes */
2014 ret
= update_vnnmap_on_all_nodes(ctdb
, nodemap
, pnn
, vnnmap
, mem_ctx
);
2016 DEBUG(DEBUG_ERR
, (__location__
" Unable to update vnnmap on all nodes\n"));
2020 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - updated vnnmap\n"));
2022 /* update recmaster to point to us for all nodes */
2023 ret
= set_recovery_master(ctdb
, nodemap
, pnn
);
2025 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery master\n"));
2029 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - updated recmaster\n"));
2031 /* disable recovery mode */
2032 ret
= set_recovery_mode(ctdb
, rec
, nodemap
, CTDB_RECOVERY_NORMAL
);
2034 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode to normal on cluster\n"));
2038 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - disabled recovery mode\n"));
2040 /* Fetch known/available public IPs from each active node */
2041 ret
= ctdb_reload_remote_public_ips(ctdb
, rec
, nodemap
, &culprit
);
2043 DEBUG(DEBUG_ERR
,("Failed to read public ips from remote node %d\n",
2045 rec
->need_takeover_run
= true;
2049 do_takeover_run(rec
, nodemap
, false);
2051 /* execute the "recovered" event script on all nodes */
2052 ret
= run_recovered_eventscript(rec
, nodemap
, "do_recovery");
2054 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
2058 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - finished the recovered event\n"));
2060 /* send a message to all clients telling them that the cluster
2061 has been reconfigured */
2062 ret
= ctdb_client_send_message(ctdb
, CTDB_BROADCAST_CONNECTED
,
2063 CTDB_SRVID_RECONFIGURE
, tdb_null
);
2065 DEBUG(DEBUG_ERR
, (__location__
" Failed to send reconfigure message\n"));
2069 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery complete\n"));
2071 rec
->need_recovery
= false;
2073 /* we managed to complete a full recovery, make sure to forgive
2074 any past sins by the nodes that could now participate in the
2077 DEBUG(DEBUG_ERR
,("Resetting ban count to 0 for all nodes\n"));
2078 for (i
=0;i
<nodemap
->num
;i
++) {
2079 struct ctdb_banning_state
*ban_state
;
2081 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
) {
2085 ban_state
= (struct ctdb_banning_state
*)ctdb
->nodes
[nodemap
->nodes
[i
].pnn
]->ban_state
;
2086 if (ban_state
== NULL
) {
2090 ban_state
->count
= 0;
2094 /* We just finished a recovery successfully.
2095 We now wait for rerecovery_timeout before we allow
2096 another recovery to take place.
2098 DEBUG(DEBUG_NOTICE
, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb
->tunable
.rerecovery_timeout
));
2099 ctdb_wait_timeout(ctdb
, ctdb
->tunable
.rerecovery_timeout
);
2100 DEBUG(DEBUG_NOTICE
, ("The rerecovery timeout has elapsed. We now allow recoveries to trigger again.\n"));
/*
  elections are won by first checking the number of connected nodes, then
  the priority time, then the pnn.
  NOTE(review): the pnn field was dropped by extraction but is referenced by
  ctdb_election_win/ctdb_election_data — verify against upstream.
 */
struct election_message {
	uint32_t num_connected;
	struct timeval priority_time;
	uint32_t pnn;
	uint32_t node_flags;
};
2118 form this nodes election data
2120 static void ctdb_election_data(struct ctdb_recoverd
*rec
, struct election_message
*em
)
2123 struct ctdb_node_map
*nodemap
;
2124 struct ctdb_context
*ctdb
= rec
->ctdb
;
2128 em
->pnn
= rec
->ctdb
->pnn
;
2129 em
->priority_time
= rec
->priority_time
;
2131 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, rec
, &nodemap
);
2133 DEBUG(DEBUG_ERR
,(__location__
" unable to get node map\n"));
2137 rec
->node_flags
= nodemap
->nodes
[ctdb
->pnn
].flags
;
2138 em
->node_flags
= rec
->node_flags
;
2140 for (i
=0;i
<nodemap
->num
;i
++) {
2141 if (!(nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
)) {
2142 em
->num_connected
++;
2146 /* we shouldnt try to win this election if we cant be a recmaster */
2147 if ((ctdb
->capabilities
& CTDB_CAP_RECMASTER
) == 0) {
2148 em
->num_connected
= 0;
2149 em
->priority_time
= timeval_current();
2152 talloc_free(nodemap
);
2156 see if the given election data wins
2158 static bool ctdb_election_win(struct ctdb_recoverd
*rec
, struct election_message
*em
)
2160 struct election_message myem
;
2163 ctdb_election_data(rec
, &myem
);
2165 /* we cant win if we dont have the recmaster capability */
2166 if ((rec
->ctdb
->capabilities
& CTDB_CAP_RECMASTER
) == 0) {
2170 /* we cant win if we are banned */
2171 if (rec
->node_flags
& NODE_FLAGS_BANNED
) {
2175 /* we cant win if we are stopped */
2176 if (rec
->node_flags
& NODE_FLAGS_STOPPED
) {
2180 /* we will automatically win if the other node is banned */
2181 if (em
->node_flags
& NODE_FLAGS_BANNED
) {
2185 /* we will automatically win if the other node is banned */
2186 if (em
->node_flags
& NODE_FLAGS_STOPPED
) {
2190 /* try to use the most connected node */
2192 cmp
= (int)myem
.num_connected
- (int)em
->num_connected
;
2195 /* then the longest running node */
2197 cmp
= timeval_compare(&em
->priority_time
, &myem
.priority_time
);
2201 cmp
= (int)myem
.pnn
- (int)em
->pnn
;
2208 send out an election request
2210 static int send_election_request(struct ctdb_recoverd
*rec
, uint32_t pnn
)
2213 TDB_DATA election_data
;
2214 struct election_message emsg
;
2216 struct ctdb_context
*ctdb
= rec
->ctdb
;
2218 srvid
= CTDB_SRVID_RECOVERY
;
2220 ctdb_election_data(rec
, &emsg
);
2222 election_data
.dsize
= sizeof(struct election_message
);
2223 election_data
.dptr
= (unsigned char *)&emsg
;
2226 /* first we assume we will win the election and set
2227 recoverymaster to be ourself on the current node
2229 ret
= ctdb_ctrl_setrecmaster(ctdb
, CONTROL_TIMEOUT(), pnn
, pnn
);
2231 DEBUG(DEBUG_ERR
, (__location__
" failed to send recmaster election request\n"));
2236 /* send an election message to all active nodes */
2237 DEBUG(DEBUG_INFO
,(__location__
" Send election request to all active nodes\n"));
2238 return ctdb_client_send_message(ctdb
, CTDB_BROADCAST_ALL
, srvid
, election_data
);
2242 this function will unban all nodes in the cluster
2244 static void unban_all_nodes(struct ctdb_context
*ctdb
)
2247 struct ctdb_node_map
*nodemap
;
2248 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
2250 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, tmp_ctx
, &nodemap
);
2252 DEBUG(DEBUG_ERR
,(__location__
" failed to get nodemap to unban all nodes\n"));
2256 for (i
=0;i
<nodemap
->num
;i
++) {
2257 if ( (!(nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
))
2258 && (nodemap
->nodes
[i
].flags
& NODE_FLAGS_BANNED
) ) {
2259 ret
= ctdb_ctrl_modflags(ctdb
, CONTROL_TIMEOUT(),
2260 nodemap
->nodes
[i
].pnn
, 0,
2263 DEBUG(DEBUG_ERR
, (__location__
" failed to reset ban state\n"));
2268 talloc_free(tmp_ctx
);
2273 we think we are winning the election - send a broadcast election request
2275 static void election_send_request(struct event_context
*ev
, struct timed_event
*te
, struct timeval t
, void *p
)
2277 struct ctdb_recoverd
*rec
= talloc_get_type(p
, struct ctdb_recoverd
);
2280 ret
= send_election_request(rec
, ctdb_get_pnn(rec
->ctdb
));
2282 DEBUG(DEBUG_ERR
,("Failed to send election request!\n"));
2285 talloc_free(rec
->send_election_te
);
2286 rec
->send_election_te
= NULL
;
2290 handler for memory dumps
2292 static void mem_dump_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2293 TDB_DATA data
, void *private_data
)
2295 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
2298 struct srvid_request
*rd
;
2300 if (data
.dsize
!= sizeof(struct srvid_request
)) {
2301 DEBUG(DEBUG_ERR
, (__location__
" Wrong size of return address.\n"));
2302 talloc_free(tmp_ctx
);
2305 rd
= (struct srvid_request
*)data
.dptr
;
2307 dump
= talloc_zero(tmp_ctx
, TDB_DATA
);
2309 DEBUG(DEBUG_ERR
, (__location__
" Failed to allocate memory for memdump\n"));
2310 talloc_free(tmp_ctx
);
2313 ret
= ctdb_dump_memory(ctdb
, dump
);
2315 DEBUG(DEBUG_ERR
, (__location__
" ctdb_dump_memory() failed\n"));
2316 talloc_free(tmp_ctx
);
2320 DEBUG(DEBUG_ERR
, ("recovery master memory dump\n"));
2322 ret
= ctdb_client_send_message(ctdb
, rd
->pnn
, rd
->srvid
, *dump
);
2324 DEBUG(DEBUG_ERR
,("Failed to send rd memdump reply message\n"));
2325 talloc_free(tmp_ctx
);
2329 talloc_free(tmp_ctx
);
2335 static void getlog_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2336 TDB_DATA data
, void *private_data
)
2338 struct ctdb_get_log_addr
*log_addr
;
2341 if (data
.dsize
!= sizeof(struct ctdb_get_log_addr
)) {
2342 DEBUG(DEBUG_ERR
, (__location__
" Wrong size of return address.\n"));
2345 log_addr
= (struct ctdb_get_log_addr
*)data
.dptr
;
2347 child
= ctdb_fork_no_free_ringbuffer(ctdb
);
2348 if (child
== (pid_t
)-1) {
2349 DEBUG(DEBUG_ERR
,("Failed to fork a log collector child\n"));
2354 ctdb_set_process_name("ctdb_rec_log_collector");
2355 if (switch_from_server_to_client(ctdb
, "recoverd-log-collector") != 0) {
2356 DEBUG(DEBUG_CRIT
, (__location__
"ERROR: failed to switch log collector child into client mode.\n"));
2359 ctdb_collect_log(ctdb
, log_addr
);
2365 handler for clearlog
2367 static void clearlog_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2368 TDB_DATA data
, void *private_data
)
2370 ctdb_clear_log(ctdb
);
2374 handler for reload_nodes
2376 static void reload_nodes_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2377 TDB_DATA data
, void *private_data
)
2379 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2381 DEBUG(DEBUG_ERR
, (__location__
" Reload nodes file from recovery daemon\n"));
2383 ctdb_load_nodes_file(rec
->ctdb
);
2387 static void ctdb_rebalance_timeout(struct event_context
*ev
,
2388 struct timed_event
*te
,
2389 struct timeval t
, void *p
)
2391 struct ctdb_recoverd
*rec
= talloc_get_type(p
, struct ctdb_recoverd
);
2393 if (rec
->force_rebalance_nodes
== NULL
) {
2395 ("Rebalance timeout occurred - no nodes to rebalance\n"));
2400 ("Rebalance timeout occurred - do takeover run\n"));
2401 do_takeover_run(rec
, rec
->nodemap
, false);
2405 static void recd_node_rebalance_handler(struct ctdb_context
*ctdb
,
2407 TDB_DATA data
, void *private_data
)
2412 uint32_t deferred_rebalance
;
2413 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2415 if (rec
->recmaster
!= ctdb_get_pnn(ctdb
)) {
2419 if (data
.dsize
!= sizeof(uint32_t)) {
2420 DEBUG(DEBUG_ERR
,(__location__
" Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data
.dsize
, sizeof(uint32_t)));
2424 pnn
= *(uint32_t *)&data
.dptr
[0];
2426 DEBUG(DEBUG_NOTICE
,("Setting up rebalance of IPs to node %u\n", pnn
));
2428 /* Copy any existing list of nodes. There's probably some
2429 * sort of realloc variant that will do this but we need to
2430 * make sure that freeing the old array also cancels the timer
2431 * event for the timeout... not sure if realloc will do that.
2433 len
= (rec
->force_rebalance_nodes
!= NULL
) ?
2434 talloc_array_length(rec
->force_rebalance_nodes
) :
2437 /* This allows duplicates to be added but they don't cause
2438 * harm. A call to add a duplicate PNN arguably means that
2439 * the timeout should be reset, so this is the simplest
2442 t
= talloc_zero_array(rec
, uint32_t, len
+1);
2443 CTDB_NO_MEMORY_VOID(ctdb
, t
);
2445 memcpy(t
, rec
->force_rebalance_nodes
, sizeof(uint32_t) * len
);
2449 talloc_free(rec
->force_rebalance_nodes
);
2451 rec
->force_rebalance_nodes
= t
;
2453 /* If configured, setup a deferred takeover run to make sure
2454 * that certain nodes get IPs rebalanced to them. This will
2455 * be cancelled if a successful takeover run happens before
2456 * the timeout. Assign tunable value to variable for
2459 deferred_rebalance
= ctdb
->tunable
.deferred_rebalance_on_node_add
;
2460 if (deferred_rebalance
!= 0) {
2461 event_add_timed(ctdb
->ev
, rec
->force_rebalance_nodes
,
2462 timeval_current_ofs(deferred_rebalance
, 0),
2463 ctdb_rebalance_timeout
, rec
);
/* SRVID message handler: fold a public-IP update into the recovery
 * daemon's IP-assignment tree via update_ip_assignment_tree().
 * Only the recovery master acts on it (other nodes log at INFO and
 * ignore); the payload must be exactly one struct ctdb_public_ip.
 * NOTE(review): this extract is missing some original lines (braces /
 * early returns), so the visible control flow is incomplete. */
2469 static void recd_update_ip_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2470 TDB_DATA data
, void *private_data
)
2472 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2473 struct ctdb_public_ip
*ip
;
2475 if (rec
->recmaster
!= rec
->ctdb
->pnn
) {
2476 DEBUG(DEBUG_INFO
,("Not recmaster, ignore update ip message\n"));
2480 if (data
.dsize
!= sizeof(struct ctdb_public_ip
)) {
2481 DEBUG(DEBUG_ERR
,(__location__
" Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data
.dsize
, sizeof(struct ctdb_public_ip
)));
2485 ip
= (struct ctdb_public_ip
*)data
.dptr
;
2487 update_ip_assignment_tree(rec
->ctdb
, ip
);
2491 static void clear_takeover_runs_disable(struct ctdb_recoverd
*rec
)
2493 TALLOC_FREE(rec
->takeover_runs_disable_ctx
);
/* Timer callback: fires when the takeover-runs-disable timeout
 * expires and re-enables takeover runs by clearing the disable
 * context.  Registered by disable_takeover_runs_handler(). */
2496 static void reenable_takeover_runs(struct event_context
*ev
,
2497 struct timed_event
*te
,
2498 struct timeval yt
, void *p
)
2500 struct ctdb_recoverd
*rec
= talloc_get_type(p
, struct ctdb_recoverd
);
2502 DEBUG(DEBUG_NOTICE
,("Reenabling takeover runs after timeout\n"));
2503 clear_takeover_runs_disable(rec
);
/* SRVID handler: disable takeover runs for a requested number of
 * seconds (timeout 0 re-enables immediately).  Validates the payload
 * as a struct srvid_request_data, allocates a fresh
 * takeover_runs_disable_ctx holding the timeout timer, and replies to
 * the sender with this node's PNN on success via srvid_request_reply().
 * NOTE(review): extract is missing some original lines, so error-path
 * returns and the reply-on-failure value are not all visible here. */
2506 static void disable_takeover_runs_handler(struct ctdb_context
*ctdb
,
2507 uint64_t srvid
, TDB_DATA data
,
2510 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
,
2511 struct ctdb_recoverd
);
2512 struct srvid_request_data
*r
;
2517 /* Validate input data */
/* NOTE(review): the check compares against sizeof(struct
 * srvid_request_data) but the error message prints
 * sizeof(struct srvid_request) — looks like a copy/paste slip. */
2518 if (data
.dsize
!= sizeof(struct srvid_request_data
)) {
2519 DEBUG(DEBUG_ERR
,(__location__
" Wrong size for data :%lu "
2520 "expecting %lu\n", (long unsigned)data
.dsize
,
2521 (long unsigned)sizeof(struct srvid_request
)));
2524 if (data
.dptr
== NULL
) {
2525 DEBUG(DEBUG_ERR
,(__location__
" No data received\n"));
2529 r
= (struct srvid_request_data
*)data
.dptr
;
2533 DEBUG(DEBUG_NOTICE
,("Reenabling takeover runs\n"));
2534 clear_takeover_runs_disable(rec
);
2535 ret
= ctdb_get_pnn(ctdb
);
2539 if (rec
->takeover_run_in_progress
) {
2541 ("Unable to disable takeover runs - in progress\n"));
2546 DEBUG(DEBUG_NOTICE
,("Disabling takeover runs for %u seconds\n", timeout
));
2548 /* Clear any old timers */
2549 clear_takeover_runs_disable(rec
);
2551 /* When this is non-NULL it indicates that takeover runs are
2552 * disabled. This context also holds the timeout timer.
2554 rec
->takeover_runs_disable_ctx
= talloc_new(rec
);
2555 if (rec
->takeover_runs_disable_ctx
== NULL
) {
2556 DEBUG(DEBUG_ERR
,(__location__
" Unable to allocate memory\n"));
2561 /* Arrange for the timeout to occur */
2562 event_add_timed(ctdb
->ev
, rec
->takeover_runs_disable_ctx
,
2563 timeval_current_ofs(timeout
, 0),
2564 reenable_takeover_runs
,
2567 /* Returning our PNN tells the caller that we succeeded */
2568 ret
= ctdb_get_pnn(ctdb
);
2570 result
.dsize
= sizeof(int32_t);
2571 result
.dptr
= (uint8_t *)&ret
;
2572 srvid_request_reply(ctdb
, (struct srvid_request
*)r
, result
);
/* Backward compatibility for this SRVID - call
 * disable_takeover_runs_handler() instead.
 * Converts the legacy payload (a bare uint32_t timeout) into a
 * struct srvid_request_data with srvid==0 (meaning "no reply"),
 * then forwards it to disable_takeover_runs_handler().
 * NOTE(review): req is talloc'd on ctdb and no free is visible in
 * this extract — possibly leaked until ctdb is freed; confirm
 * against the full source. */
2575 /* Backward compatibility for this SRVID - call
2576 * disable_takeover_runs_handler() instead
2578 static void disable_ip_check_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2579 TDB_DATA data
, void *private_data
)
2581 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
,
2582 struct ctdb_recoverd
);
2584 struct srvid_request_data
*req
;
2586 if (data
.dsize
!= sizeof(uint32_t)) {
2587 DEBUG(DEBUG_ERR
,(__location__
" Wrong size for data :%lu "
2588 "expecting %lu\n", (long unsigned)data
.dsize
,
2589 (long unsigned)sizeof(uint32_t)));
2592 if (data
.dptr
== NULL
) {
2593 DEBUG(DEBUG_ERR
,(__location__
" No data received\n"));
2597 req
= talloc(ctdb
, struct srvid_request_data
);
2598 CTDB_NO_MEMORY_VOID(ctdb
, req
);
2600 req
->srvid
= 0; /* No reply */
2602 req
->data
= *((uint32_t *)data
.dptr
); /* Timeout */
2604 data2
.dsize
= sizeof(*req
);
2605 data2
.dptr
= (uint8_t *)req
;
2607 disable_takeover_runs_handler(rec
->ctdb
,
2608 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
/* SRVID handler for IP reallocation requests: validates the payload
 * as a struct srvid_request and queues it on rec->reallocate_requests
 * (via srvid_request_add) for deferred processing in the
 * monitor_cluster loop, avoiding recursion into takeover_run(). */
2613 handler for ip reallocate, just add it to the list of requests and
2614 handle this later in the monitor_cluster loop so we do not recurse
2615 with other requests to takeover_run()
2617 static void ip_reallocate_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2618 TDB_DATA data
, void *private_data
)
2620 struct srvid_request
*request
;
2621 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
,
2622 struct ctdb_recoverd
);
2624 if (data
.dsize
!= sizeof(struct srvid_request
)) {
2625 DEBUG(DEBUG_ERR
, (__location__
" Wrong size of return address.\n"));
2629 request
= (struct srvid_request
*)data
.dptr
;
2631 srvid_request_add(ctdb
, &rec
->reallocate_requests
, request
);
/* Drain the currently-pending IP reallocation requests: detach the
 * pending list (new requests arriving during the run are processed on
 * a later iteration), refresh the remote public-IP lists, perform a
 * takeover run, and reply to every queued requester with this node's
 * PNN (success) via srvid_requests_reply().
 * Fix: the final call's second argument had been mojibake-corrupted
 * to "¤t" (an "&curren"/HTML-entity garbling of "&current");
 * srvid_requests_reply() takes struct srvid_requests ** so the
 * correct argument is &current. */
2634 static void process_ipreallocate_requests(struct ctdb_context
*ctdb
,
2635 struct ctdb_recoverd
*rec
)
2640 struct srvid_requests
*current
;
2642 DEBUG(DEBUG_INFO
, ("recovery master forced ip reallocation\n"));
2644 /* Only process requests that are currently pending. More
2645 * might come in while the takeover run is in progress and
2646 * they will need to be processed later since they might
2647 * be in response flag changes.
2649 current
= rec
->reallocate_requests
;
2650 rec
->reallocate_requests
= NULL
;
2652 /* update the list of public ips that a node can handle for
2655 ret
= ctdb_reload_remote_public_ips(ctdb
, rec
, rec
->nodemap
, &culprit
);
2657 DEBUG(DEBUG_ERR
,("Failed to read public ips from remote node %d\n",
2659 rec
->need_takeover_run
= true;
2662 if (do_takeover_run(rec
, rec
->nodemap
, false)) {
2663 ret
= ctdb_get_pnn(ctdb
);
2669 result
.dsize
= sizeof(int32_t);
2670 result
.dptr
= (uint8_t *)&ret
;
2672 srvid_requests_reply(ctdb
, &current
, result
);
/* SRVID handler for recovery-master election packets.  Ignores our
 * own packets, restarts the election timeout, and either (a) if we
 * would win per ctdb_election_win(), schedules our own election
 * request after 500ms, or (b) concedes: cancels any pending send,
 * releases the recovery lock when verify_recovery_lock is enabled
 * and the winner is another node, unbans all nodes, and records the
 * sender as recmaster via ctdb_ctrl_setrecmaster(). */
2677 handler for recovery master elections
2679 static void election_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2680 TDB_DATA data
, void *private_data
)
2682 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2684 struct election_message
*em
= (struct election_message
*)data
.dptr
;
2685 TALLOC_CTX
*mem_ctx
;
2687 /* Ignore election packets from ourself */
2688 if (ctdb
->pnn
== em
->pnn
) {
2692 /* we got an election packet - update the timeout for the election */
2693 talloc_free(rec
->election_timeout
);
2694 rec
->election_timeout
= event_add_timed(ctdb
->ev
, ctdb
,
2696 timeval_current_ofs(0, 500000) :
2697 timeval_current_ofs(ctdb
->tunable
.election_timeout
, 0),
2698 ctdb_election_timeout
, rec
);
2700 mem_ctx
= talloc_new(ctdb
);
2702 /* someone called an election. check their election data
2703 and if we disagree and we would rather be the elected node,
2704 send a new election message to all other nodes
2706 if (ctdb_election_win(rec
, em
)) {
2707 if (!rec
->send_election_te
) {
2708 rec
->send_election_te
= event_add_timed(ctdb
->ev
, rec
,
2709 timeval_current_ofs(0, 500000),
2710 election_send_request
, rec
);
2712 talloc_free(mem_ctx
);
2713 /*unban_all_nodes(ctdb);*/
2718 talloc_free(rec
->send_election_te
);
2719 rec
->send_election_te
= NULL
;
2721 if (ctdb
->tunable
.verify_recovery_lock
!= 0) {
2722 /* release the recmaster lock */
2723 if (em
->pnn
!= ctdb
->pnn
&&
2724 ctdb
->recovery_lock_fd
!= -1) {
2725 DEBUG(DEBUG_NOTICE
, ("Release the recovery lock\n"));
2726 close(ctdb
->recovery_lock_fd
);
2727 ctdb
->recovery_lock_fd
= -1;
2728 unban_all_nodes(ctdb
);
2732 /* ok, let that guy become recmaster then */
2733 ret
= ctdb_ctrl_setrecmaster(ctdb
, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb
), em
->pnn
);
2735 DEBUG(DEBUG_ERR
, (__location__
" failed to send recmaster election request"));
2736 talloc_free(mem_ctx
);
2740 talloc_free(mem_ctx
);
/* Force a new recovery-master election: put the whole cluster into
 * CTDB_RECOVERY_ACTIVE to stop internode traffic, (re)arm the
 * election timeout, broadcast our election request, then block in
 * ctdb_wait_election() to collect responses. */
2746 force the start of the election process
2748 static void force_election(struct ctdb_recoverd
*rec
, uint32_t pnn
,
2749 struct ctdb_node_map
*nodemap
)
2752 struct ctdb_context
*ctdb
= rec
->ctdb
;
2754 DEBUG(DEBUG_INFO
,(__location__
" Force an election\n"));
2756 /* set all nodes to recovery mode to stop all internode traffic */
2757 ret
= set_recovery_mode(ctdb
, rec
, nodemap
, CTDB_RECOVERY_ACTIVE
);
2759 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode to active on cluster\n"));
2763 talloc_free(rec
->election_timeout
);
2764 rec
->election_timeout
= event_add_timed(ctdb
->ev
, ctdb
,
2766 timeval_current_ofs(0, 500000) :
2767 timeval_current_ofs(ctdb
->tunable
.election_timeout
, 0),
2768 ctdb_election_timeout
, rec
);
2770 ret
= send_election_request(rec
, pnn
);
2772 DEBUG(DEBUG_ERR
, (__location__
" failed to initiate recmaster election"));
2776 /* wait for a few seconds to collect all responses */
2777 ctdb_wait_election(rec
);
/* SRVID handler for node flag changes.  Locates the changed node in
 * the current nodemap, records the new flags, and — when this node is
 * the recmaster and the cluster is in normal recovery mode — flags a
 * takeover run (rec->need_takeover_run) if the DISABLED/unhealthy bit
 * actually changed.  Disconnected/banned transitions are handled by
 * the recovery path instead. */
2783 handler for when a node changes its flags
2785 static void monitor_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2786 TDB_DATA data
, void *private_data
)
2789 struct ctdb_node_flag_change
*c
= (struct ctdb_node_flag_change
*)data
.dptr
;
2790 struct ctdb_node_map
*nodemap
=NULL
;
2791 TALLOC_CTX
*tmp_ctx
;
2793 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2794 int disabled_flag_changed
;
2796 if (data
.dsize
!= sizeof(*c
)) {
2797 DEBUG(DEBUG_ERR
,(__location__
"Invalid data in ctdb_node_flag_change\n"));
2801 tmp_ctx
= talloc_new(ctdb
);
2802 CTDB_NO_MEMORY_VOID(ctdb
, tmp_ctx
);
2804 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, tmp_ctx
, &nodemap
);
2806 DEBUG(DEBUG_ERR
,(__location__
"ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2807 talloc_free(tmp_ctx
);
/* find the changed node's slot in the nodemap */
2812 for (i
=0;i
<nodemap
->num
;i
++) {
2813 if (nodemap
->nodes
[i
].pnn
== c
->pnn
) break;
2816 if (i
== nodemap
->num
) {
2817 DEBUG(DEBUG_CRIT
,(__location__
"Flag change for non-existant node %u\n", c
->pnn
));
2818 talloc_free(tmp_ctx
);
2822 if (c
->old_flags
!= c
->new_flags
) {
2823 DEBUG(DEBUG_NOTICE
,("Node %u has changed flags - now 0x%x was 0x%x\n", c
->pnn
, c
->new_flags
, c
->old_flags
));
/* XOR old vs new flags and mask so only a DISABLED transition counts */
2826 disabled_flag_changed
= (nodemap
->nodes
[i
].flags
^ c
->new_flags
) & NODE_FLAGS_DISABLED
;
2828 nodemap
->nodes
[i
].flags
= c
->new_flags
;
2830 ret
= ctdb_ctrl_getrecmaster(ctdb
, tmp_ctx
, CONTROL_TIMEOUT(),
2831 CTDB_CURRENT_NODE
, &ctdb
->recovery_master
);
2834 ret
= ctdb_ctrl_getrecmode(ctdb
, tmp_ctx
, CONTROL_TIMEOUT(),
2835 CTDB_CURRENT_NODE
, &ctdb
->recovery_mode
);
2839 ctdb
->recovery_master
== ctdb
->pnn
&&
2840 ctdb
->recovery_mode
== CTDB_RECOVERY_NORMAL
) {
2841 /* Only do the takeover run if the perm disabled or unhealthy
2842 flags changed since these will cause an ip failover but not
2844 If the node became disconnected or banned this will also
2845 lead to an ip address failover but that is handled
2848 if (disabled_flag_changed
) {
2849 rec
->need_takeover_run
= true;
2853 talloc_free(tmp_ctx
);
/* SRVID handler: push a node's flag change to all connected nodes.
 * Fetches the authoritative nodemap from the recmaster, validates the
 * changed node's PNN against it, then broadcasts a MODIFY_FLAGS
 * control to every connected node via ctdb_client_async_control(). */
2857 handler for when we need to push out flag changes to all other nodes
2859 static void push_flags_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2860 TDB_DATA data
, void *private_data
)
2863 struct ctdb_node_flag_change
*c
= (struct ctdb_node_flag_change
*)data
.dptr
;
2864 struct ctdb_node_map
*nodemap
=NULL
;
2865 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
2869 /* find the recovery master */
2870 ret
= ctdb_ctrl_getrecmaster(ctdb
, tmp_ctx
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, &recmaster
);
2872 DEBUG(DEBUG_ERR
, (__location__
" Unable to get recmaster from local node\n"));
2873 talloc_free(tmp_ctx
);
2877 /* read the node flags from the recmaster */
2878 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), recmaster
, tmp_ctx
, &nodemap
);
2880 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from node %u\n", c
->pnn
));
2881 talloc_free(tmp_ctx
);
2884 if (c
->pnn
>= nodemap
->num
) {
2885 DEBUG(DEBUG_ERR
,(__location__
" Nodemap from recmaster does not contain node %d\n", c
->pnn
));
2886 talloc_free(tmp_ctx
);
2890 /* send the flags update to all connected nodes */
2891 nodes
= list_of_connected_nodes(ctdb
, nodemap
, tmp_ctx
, true);
2893 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_MODIFY_FLAGS
,
2894 nodes
, 0, CONTROL_TIMEOUT(),
2898 DEBUG(DEBUG_ERR
, (__location__
" ctdb_control to modify node flags failed\n"));
2900 talloc_free(tmp_ctx
);
2904 talloc_free(tmp_ctx
);
/* Shared accumulator for the async recmode verification: each
 * per-node callback folds its result into 'status'.
 * NOTE(review): the extract drops an original line here (the
 * outstanding-reply counter used as rmdata->count elsewhere) —
 * confirm against the full source. */
2908 struct verify_recmode_normal_data
{
2910 enum monitor_result status
;
/* Async completion callback for one node's GET_RECMODE control.
 * Downgrades the shared status to MONITOR_FAILED if the control
 * itself failed, or to MONITOR_RECOVERY_NEEDED if the node reports a
 * recmode other than CTDB_RECOVERY_NORMAL. */
2913 static void verify_recmode_normal_callback(struct ctdb_client_control_state
*state
)
2915 struct verify_recmode_normal_data
*rmdata
= talloc_get_type(state
->async
.private_data
, struct verify_recmode_normal_data
);
2918 /* one more node has responded with recmode data*/
2921 /* if we failed to get the recmode, then return an error and let
2922 the main loop try again.
2924 if (state
->state
!= CTDB_CONTROL_DONE
) {
2925 if (rmdata
->status
== MONITOR_OK
) {
2926 rmdata
->status
= MONITOR_FAILED
;
2931 /* if we got a response, then the recmode will be stored in the
2934 if (state
->status
!= CTDB_RECOVERY_NORMAL
) {
2935 DEBUG(DEBUG_NOTICE
, ("Node:%u was in recovery mode. Start recovery process\n", state
->c
->hdr
.destnode
));
2936 rmdata
->status
= MONITOR_RECOVERY_NEEDED
;
/* Fan out an async GET_RECMODE to every active node, pump the event
 * loop until all replies arrive (rmdata->count reaches 0), and return
 * the aggregated monitor_result (MONITOR_OK unless a callback
 * downgraded it).  Returns MONITOR_FAILED immediately if a send
 * fails. */
2943 /* verify that all nodes are in normal recovery mode */
2944 static enum monitor_result
verify_recmode(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
)
2946 struct verify_recmode_normal_data
*rmdata
;
2947 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
2948 struct ctdb_client_control_state
*state
;
2949 enum monitor_result status
;
2952 rmdata
= talloc(mem_ctx
, struct verify_recmode_normal_data
);
2953 CTDB_NO_MEMORY_FATAL(ctdb
, rmdata
);
2955 rmdata
->status
= MONITOR_OK
;
2957 /* loop over all active nodes and send an async getrecmode call to
2959 for (j
=0; j
<nodemap
->num
; j
++) {
2960 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
2963 state
= ctdb_ctrl_getrecmode_send(ctdb
, mem_ctx
,
2965 nodemap
->nodes
[j
].pnn
);
2966 if (state
== NULL
) {
2967 /* we failed to send the control, treat this as
2968 an error and try again next iteration
2970 DEBUG(DEBUG_ERR
,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2971 talloc_free(mem_ctx
);
2972 return MONITOR_FAILED
;
2975 /* set up the callback functions */
2976 state
->async
.fn
= verify_recmode_normal_callback
;
2977 state
->async
.private_data
= rmdata
;
2979 /* one more control to wait for to complete */
2984 /* now wait for up to the maximum number of seconds allowed
2985 or until all nodes we expect a response from has replied
2987 while (rmdata
->count
> 0) {
2988 event_loop_once(ctdb
->ev
);
2991 status
= rmdata
->status
;
2992 talloc_free(mem_ctx
);
/* Shared accumulator for the async recmaster verification: holds the
 * recoverd context (for culprit accounting) and the aggregated
 * status.  NOTE(review): the extract drops original lines here (the
 * reply counter and expected-pnn fields referenced as rmdata->count /
 * rmdata->pnn elsewhere) — confirm against the full source. */
2997 struct verify_recmaster_data
{
2998 struct ctdb_recoverd
*rec
;
3001 enum monitor_result status
;
/* Async completion callback for one node's GET_RECMASTER control.
 * Downgrades the shared status to MONITOR_FAILED on control failure;
 * if the node names a recmaster other than the expected PNN it marks
 * that node a culprit and sets MONITOR_ELECTION_NEEDED. */
3004 static void verify_recmaster_callback(struct ctdb_client_control_state
*state
)
3006 struct verify_recmaster_data
*rmdata
= talloc_get_type(state
->async
.private_data
, struct verify_recmaster_data
);
3009 /* one more node has responded with recmaster data*/
3012 /* if we failed to get the recmaster, then return an error and let
3013 the main loop try again.
3015 if (state
->state
!= CTDB_CONTROL_DONE
) {
3016 if (rmdata
->status
== MONITOR_OK
) {
3017 rmdata
->status
= MONITOR_FAILED
;
3022 /* if we got a response, then the recmaster will be stored in the
3025 if (state
->status
!= rmdata
->pnn
) {
3026 DEBUG(DEBUG_ERR
,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state
->c
->hdr
.destnode
, state
->status
));
3027 ctdb_set_culprit(rmdata
->rec
, state
->c
->hdr
.destnode
);
3028 rmdata
->status
= MONITOR_ELECTION_NEEDED
;
/* Fan out an async GET_RECMASTER to every active node, pump the
 * event loop until all replies arrive, and return the aggregated
 * monitor_result (MONITOR_ELECTION_NEEDED if any node disagrees that
 * 'pnn' is the recmaster).  Returns MONITOR_FAILED immediately if a
 * send fails. */
3035 /* verify that all nodes agree that we are the recmaster */
3036 static enum monitor_result
verify_recmaster(struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
, uint32_t pnn
)
3038 struct ctdb_context
*ctdb
= rec
->ctdb
;
3039 struct verify_recmaster_data
*rmdata
;
3040 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
3041 struct ctdb_client_control_state
*state
;
3042 enum monitor_result status
;
3045 rmdata
= talloc(mem_ctx
, struct verify_recmaster_data
);
3046 CTDB_NO_MEMORY_FATAL(ctdb
, rmdata
);
3050 rmdata
->status
= MONITOR_OK
;
3052 /* loop over all active nodes and send an async getrecmaster call to
3054 for (j
=0; j
<nodemap
->num
; j
++) {
3055 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3058 state
= ctdb_ctrl_getrecmaster_send(ctdb
, mem_ctx
,
3060 nodemap
->nodes
[j
].pnn
);
3061 if (state
== NULL
) {
3062 /* we failed to send the control, treat this as
3063 an error and try again next iteration
3065 DEBUG(DEBUG_ERR
,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
3066 talloc_free(mem_ctx
);
3067 return MONITOR_FAILED
;
3070 /* set up the callback functions */
3071 state
->async
.fn
= verify_recmaster_callback
;
3072 state
->async
.private_data
= rmdata
;
3074 /* one more control to wait for to complete */
3079 /* now wait for up to the maximum number of seconds allowed
3080 or until all nodes we expect a response from has replied
3082 while (rmdata
->count
> 0) {
3083 event_loop_once(ctdb
->ev
);
3086 status
= rmdata
->status
;
3087 talloc_free(mem_ctx
);
/* Compare the local node's current interface list against the cached
 * copy in rec->ifaces.  Reports a change when there is no cached copy
 * yet, the interface count differs, or any slot's name or link state
 * differs; on a fetch failure it conservatively treats the
 * interfaces as changed.  The fresh list is stolen onto rec as the
 * new cache before returning. */
3091 static bool interfaces_have_changed(struct ctdb_context
*ctdb
,
3092 struct ctdb_recoverd
*rec
)
3094 struct ctdb_control_get_ifaces
*ifaces
= NULL
;
3095 TALLOC_CTX
*mem_ctx
;
3098 mem_ctx
= talloc_new(NULL
);
3100 /* Read the interfaces from the local node */
3101 if (ctdb_ctrl_get_ifaces(ctdb
, CONTROL_TIMEOUT(),
3102 CTDB_CURRENT_NODE
, mem_ctx
, &ifaces
) != 0) {
3103 DEBUG(DEBUG_ERR
, ("Unable to get interfaces from local node %u\n", ctdb
->pnn
));
3104 /* We could return an error. However, this will be
3105 * rare so we'll decide that the interfaces have
3106 * actually changed, just in case.
3108 talloc_free(mem_ctx
);
3113 /* We haven't been here before so things have changed */
3114 DEBUG(DEBUG_NOTICE
, ("Initial interface fetched\n"));
3116 } else if (rec
->ifaces
->num
!= ifaces
->num
) {
3117 /* Number of interfaces has changed */
3118 DEBUG(DEBUG_NOTICE
, ("Interface count changed from %d to %d\n",
3119 rec
->ifaces
->num
, ifaces
->num
));
3122 /* See if interface names or link states have changed */
3124 for (i
= 0; i
< rec
->ifaces
->num
; i
++) {
3125 struct ctdb_control_iface_info
* iface
= &rec
->ifaces
->ifaces
[i
];
3126 if (strcmp(iface
->name
, ifaces
->ifaces
[i
].name
) != 0) {
3128 ("Interface in slot %d changed: %s => %s\n",
3129 i
, iface
->name
, ifaces
->ifaces
[i
].name
));
3133 if (iface
->link_state
!= ifaces
->ifaces
[i
].link_state
) {
3135 ("Interface %s changed state: %d => %d\n",
3136 iface
->name
, iface
->link_state
,
3137 ifaces
->ifaces
[i
].link_state
));
/* cache the freshly-fetched list on rec for the next comparison */
3144 talloc_free(rec
->ifaces
);
3145 rec
->ifaces
= talloc_steal(rec
, ifaces
);
3147 talloc_free(mem_ctx
);
/* Periodic check that the local node's public-IP assignment matches
 * reality.  Samples uptime before and after an interface check and
 * skips the IP verification entirely if a recovery started/finished
 * in between (or is still in progress).  Unless IP failover is
 * disabled by tunable, it: flags a takeover run for any unassigned IP
 * this healthy node could serve; flags one for any IP assigned to us
 * but missing from an interface; and releases any IP we still hold
 * that should not be ours.  If anything needs rebalancing it sends a
 * CTDB_SRVID_TAKEOVER_RUN message to the recmaster. */
3151 /* called to check that the local allocation of public ip addresses is ok.
3153 static int verify_local_ip_allocation(struct ctdb_context
*ctdb
, struct ctdb_recoverd
*rec
, uint32_t pnn
, struct ctdb_node_map
*nodemap
)
3155 TALLOC_CTX
*mem_ctx
= talloc_new(NULL
);
3156 struct ctdb_uptime
*uptime1
= NULL
;
3157 struct ctdb_uptime
*uptime2
= NULL
;
3159 bool need_takeover_run
= false;
3161 ret
= ctdb_ctrl_uptime(ctdb
, mem_ctx
, CONTROL_TIMEOUT(),
3162 CTDB_CURRENT_NODE
, &uptime1
);
3164 DEBUG(DEBUG_ERR
, ("Unable to get uptime from local node %u\n", pnn
));
3165 talloc_free(mem_ctx
);
3169 if (interfaces_have_changed(ctdb
, rec
)) {
3170 DEBUG(DEBUG_NOTICE
, ("The interfaces status has changed on "
3171 "local node %u - force takeover run\n",
3173 need_takeover_run
= true;
3176 ret
= ctdb_ctrl_uptime(ctdb
, mem_ctx
, CONTROL_TIMEOUT(),
3177 CTDB_CURRENT_NODE
, &uptime2
);
3179 DEBUG(DEBUG_ERR
, ("Unable to get uptime from local node %u\n", pnn
));
3180 talloc_free(mem_ctx
);
3184 /* skip the check if the startrecovery time has changed */
3185 if (timeval_compare(&uptime1
->last_recovery_started
,
3186 &uptime2
->last_recovery_started
) != 0) {
3187 DEBUG(DEBUG_NOTICE
, (__location__
" last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3188 talloc_free(mem_ctx
);
3192 /* skip the check if the endrecovery time has changed */
3193 if (timeval_compare(&uptime1
->last_recovery_finished
,
3194 &uptime2
->last_recovery_finished
) != 0) {
3195 DEBUG(DEBUG_NOTICE
, (__location__
" last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3196 talloc_free(mem_ctx
);
3200 /* skip the check if we have started but not finished recovery */
3201 if (timeval_compare(&uptime1
->last_recovery_finished
,
3202 &uptime1
->last_recovery_started
) != 1) {
3203 DEBUG(DEBUG_INFO
, (__location__
" in the middle of recovery or ip reallocation. skipping public ip address check\n"));
3204 talloc_free(mem_ctx
);
3209 /* verify that we have the ip addresses we should have
3210 and we dont have ones we shouldnt have.
3211 if we find an inconsistency we set recmode to
3212 active on the local node and wait for the recmaster
3213 to do a full blown recovery.
3214 also if the pnn is -1 and we are healthy and can host the ip
3215 we also request a ip reallocation.
3217 if (ctdb
->tunable
.disable_ip_failover
== 0) {
3218 struct ctdb_all_public_ips
*ips
= NULL
;
3220 /* read the *available* IPs from the local node */
3221 ret
= ctdb_ctrl_get_public_ips_flags(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, mem_ctx
, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE
, &ips
);
3223 DEBUG(DEBUG_ERR
, ("Unable to get available public IPs from local node %u\n", pnn
));
3224 talloc_free(mem_ctx
);
3228 for (j
=0; j
<ips
->num
; j
++) {
3229 if (ips
->ips
[j
].pnn
== -1 &&
3230 nodemap
->nodes
[pnn
].flags
== 0) {
3231 DEBUG(DEBUG_CRIT
,("Public IP '%s' is not assigned and we could serve it\n",
3232 ctdb_addr_to_str(&ips
->ips
[j
].addr
)));
3233 need_takeover_run
= true;
3239 /* read the *known* IPs from the local node */
3240 ret
= ctdb_ctrl_get_public_ips_flags(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, mem_ctx
, 0, &ips
);
3242 DEBUG(DEBUG_ERR
, ("Unable to get known public IPs from local node %u\n", pnn
));
3243 talloc_free(mem_ctx
);
3247 for (j
=0; j
<ips
->num
; j
++) {
3248 if (ips
->ips
[j
].pnn
== pnn
) {
3249 if (ctdb
->do_checkpublicip
&& !ctdb_sys_have_ip(&ips
->ips
[j
].addr
)) {
3250 DEBUG(DEBUG_CRIT
,("Public IP '%s' is assigned to us but not on an interface\n",
3251 ctdb_addr_to_str(&ips
->ips
[j
].addr
)));
3252 need_takeover_run
= true;
3255 if (ctdb
->do_checkpublicip
&&
3256 ctdb_sys_have_ip(&ips
->ips
[j
].addr
)) {
3258 DEBUG(DEBUG_CRIT
,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
3259 ctdb_addr_to_str(&ips
->ips
[j
].addr
)));
3261 if (ctdb_ctrl_release_ip(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, &ips
->ips
[j
]) != 0) {
3262 DEBUG(DEBUG_ERR
,("Failed to release local IP address\n"));
3269 if (need_takeover_run
) {
3270 struct srvid_request rd
;
3273 DEBUG(DEBUG_CRIT
,("Trigger takeoverrun\n"));
3277 data
.dptr
= (uint8_t *)&rd
;
3278 data
.dsize
= sizeof(rd
);
3280 ret
= ctdb_client_send_message(ctdb
, rec
->recmaster
, CTDB_SRVID_TAKEOVER_RUN
, data
);
3282 DEBUG(DEBUG_ERR
,(__location__
" Failed to send ipreallocate to recmaster :%d\n", (int)rec
->recmaster
));
3285 talloc_free(mem_ctx
);
/* Async callback for CTDB_CONTROL_GET_NODEMAP: after bounds-checking
 * the replying node's PNN against ctdb->num_nodes, steal the returned
 * nodemap buffer into the remote_nodemaps[] slot for that node. */
3290 static void async_getnodemap_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
3292 struct ctdb_node_map
**remote_nodemaps
= callback_data
;
3294 if (node_pnn
>= ctdb
->num_nodes
) {
3295 DEBUG(DEBUG_ERR
,(__location__
" pnn from invalid node\n"));
3299 remote_nodemaps
[node_pnn
] = (struct ctdb_node_map
*)talloc_steal(remote_nodemaps
, outdata
.dptr
);
/* Collect the nodemap as seen by every active node: broadcast
 * CTDB_CONTROL_GET_NODEMAP via ctdb_client_async_control(), with
 * async_getnodemap_callback() filling remote_nodemaps[] per reply.
 * Logs and reports failure if any node could not be pulled. */
3303 static int get_remote_nodemaps(struct ctdb_context
*ctdb
, TALLOC_CTX
*mem_ctx
,
3304 struct ctdb_node_map
*nodemap
,
3305 struct ctdb_node_map
**remote_nodemaps
)
3309 nodes
= list_of_active_nodes(ctdb
, nodemap
, mem_ctx
, true);
3310 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_GET_NODEMAP
,
3312 CONTROL_TIMEOUT(), false, tdb_null
,
3313 async_getnodemap_callback
,
3315 remote_nodemaps
) != 0) {
3316 DEBUG(DEBUG_ERR
, (__location__
" Unable to pull all remote nodemaps\n"));
/* State for the forked recovery-lock checker: the child's progress
 * (enum reclock_child_status) plus the pipe, timers and fd event used
 * by the parent to collect its verdict.  NOTE(review): the extract
 * drops original lines here (the child pid and pipe fd[] members
 * referenced as state->child / state->fd elsewhere) — confirm against
 * the full source. */
3324 enum reclock_child_status
{ RECLOCK_CHECKING
, RECLOCK_OK
, RECLOCK_FAILED
, RECLOCK_TIMEOUT
};
3325 struct ctdb_check_reclock_state
{
3326 struct ctdb_context
*ctdb
;
3327 struct timeval start_time
;
3330 struct timed_event
*te
;
3331 struct fd_event
*fde
;
3332 enum reclock_child_status status
;
/* talloc destructor for the reclock-check state: report the lock
 * latency to the daemon, close both pipe ends if still open, and
 * SIGKILL the child so it cannot outlive the state object. */
3335 /* when we free the reclock state we must kill any child process.
3337 static int check_reclock_destructor(struct ctdb_check_reclock_state
*state
)
3339 struct ctdb_context
*ctdb
= state
->ctdb
;
3341 ctdb_ctrl_report_recd_lock_latency(ctdb
, CONTROL_TIMEOUT(), timeval_elapsed(&state
->start_time
));
3343 if (state
->fd
[0] != -1) {
3344 close(state
->fd
[0]);
3347 if (state
->fd
[1] != -1) {
3348 close(state
->fd
[1]);
3351 ctdb_kill(ctdb
, state
->child
, SIGKILL
);
3356 called if our check_reclock child times out. this would happen if
3357 i/o to the reclock file blocks.
3359 static void ctdb_check_reclock_timeout(struct event_context
*ev
, struct timed_event
*te
,
3360 struct timeval t
, void *private_data
)
3362 struct ctdb_check_reclock_state
*state
= talloc_get_type(private_data
,
3363 struct ctdb_check_reclock_state
);
3365 DEBUG(DEBUG_ERR
,(__location__
" check_reclock child process hung/timedout CFS slow to grant locks?\n"));
3366 state
->status
= RECLOCK_TIMEOUT
;
/* fd event handler: the reclock child wrote its one-byte verdict to
 * the pipe.  Cancels the timeout timer, reads the byte, and sets
 * state->status to RECLOCK_OK or RECLOCK_FAILED accordingly. */
3369 /* this is called when the child process has completed checking the reclock
3370 file and has written data back to us through the pipe.
3372 static void reclock_child_handler(struct event_context
*ev
, struct fd_event
*fde
,
3373 uint16_t flags
, void *private_data
)
3375 struct ctdb_check_reclock_state
*state
= talloc_get_type(private_data
,
3376 struct ctdb_check_reclock_state
);
3380 /* we got a response from our child process so we can abort the
3383 talloc_free(state
->te
);
3386 ret
= read(state
->fd
[0], &c
, 1);
3387 if (ret
!= 1 || c
!= RECLOCK_OK
) {
3388 DEBUG(DEBUG_ERR
,(__location__
" reclock child process returned error %d\n", c
));
3389 state
->status
= RECLOCK_FAILED
;
3394 state
->status
= RECLOCK_OK
;
/* Verify we still hold the recovery lock without risking the main
 * daemon blocking on cluster-filesystem I/O: fork a child that does a
 * pread() on recovery_lock_fd and writes a one-byte verdict back
 * through a pipe, while the parent waits on an fd event with a
 * 15-second timeout, pumping the event loop until the status leaves
 * RECLOCK_CHECKING.  On RECLOCK_FAILED the fd is closed and reset to
 * -1.  Cleanup (closing fds, killing the child) is handled by the
 * talloc destructor on 'state'. */
3398 static int check_recovery_lock(struct ctdb_context
*ctdb
)
3401 struct ctdb_check_reclock_state
*state
;
3402 pid_t parent
= getpid();
3404 if (ctdb
->recovery_lock_fd
== -1) {
3405 DEBUG(DEBUG_CRIT
,("recovery master doesn't have the recovery lock\n"));
3409 state
= talloc(ctdb
, struct ctdb_check_reclock_state
);
3410 CTDB_NO_MEMORY(ctdb
, state
);
3413 state
->start_time
= timeval_current();
3414 state
->status
= RECLOCK_CHECKING
;
3418 ret
= pipe(state
->fd
);
3421 DEBUG(DEBUG_CRIT
,(__location__
" Failed to open pipe for check_reclock child\n"));
3425 state
->child
= ctdb_fork(ctdb
);
3426 if (state
->child
== (pid_t
)-1) {
3427 DEBUG(DEBUG_CRIT
,(__location__
" fork() failed in check_reclock child\n"));
3428 close(state
->fd
[0]);
3430 close(state
->fd
[1]);
/* child: probe the lock file and report a single status byte */
3436 if (state
->child
== 0) {
3437 char cc
= RECLOCK_OK
;
3438 close(state
->fd
[0]);
3441 ctdb_set_process_name("ctdb_rec_reclock");
3442 debug_extra
= talloc_asprintf(NULL
, "recovery-lock:");
3443 if (pread(ctdb
->recovery_lock_fd
, &cc
, 1, 0) == -1) {
3444 DEBUG(DEBUG_CRIT
,("failed read from recovery_lock_fd - %s\n", strerror(errno
)));
3445 cc
= RECLOCK_FAILED
;
/* NOTE(review): write() return value is ignored here */
3448 write(state
->fd
[1], &cc
, 1);
3449 /* make sure we die when our parent dies */
3450 while (ctdb_kill(ctdb
, parent
, 0) == 0 || errno
!= ESRCH
) {
/* parent: keep only the read end and wait for the verdict */
3455 close(state
->fd
[1]);
3457 set_close_on_exec(state
->fd
[0]);
3459 DEBUG(DEBUG_DEBUG
, (__location__
" Created PIPE FD:%d for check_recovery_lock\n", state
->fd
[0]));
3461 talloc_set_destructor(state
, check_reclock_destructor
);
3463 state
->te
= event_add_timed(ctdb
->ev
, state
, timeval_current_ofs(15, 0),
3464 ctdb_check_reclock_timeout
, state
);
3465 if (state
->te
== NULL
) {
3466 DEBUG(DEBUG_CRIT
,(__location__
" Failed to create a timed event for reclock child\n"));
3471 state
->fde
= event_add_fd(ctdb
->ev
, state
, state
->fd
[0],
3473 reclock_child_handler
,
3476 if (state
->fde
== NULL
) {
3477 DEBUG(DEBUG_CRIT
,(__location__
" Failed to create an fd event for reclock child\n"));
3481 tevent_fd_set_auto_close(state
->fde
);
3483 while (state
->status
== RECLOCK_CHECKING
) {
3484 event_loop_once(ctdb
->ev
);
3487 if (state
->status
== RECLOCK_FAILED
) {
3488 DEBUG(DEBUG_ERR
,(__location__
" reclock child failed when checking file\n"));
3489 close(ctdb
->recovery_lock_fd
);
3490 ctdb
->recovery_lock_fd
= -1;
/* Re-read the reclock file path from the main daemon and reconcile
 * local state with it.  Handles four cases: daemon reports no reclock
 * (drop our copy, close the fd, disable verify_recovery_lock); we had
 * none cached (adopt the new path, close any stale fd); path
 * unchanged (nothing to do); path changed (swap in the new path,
 * disable verification until the new lock is taken, close the old
 * fd). */
3499 static int update_recovery_lock_file(struct ctdb_context
*ctdb
)
3501 TALLOC_CTX
*tmp_ctx
= talloc_new(NULL
);
3502 const char *reclockfile
;
3504 if (ctdb_ctrl_getreclock(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, tmp_ctx
, &reclockfile
) != 0) {
3505 DEBUG(DEBUG_ERR
,("Failed to read reclock file from daemon\n"));
3506 talloc_free(tmp_ctx
);
3510 if (reclockfile
== NULL
) {
3511 if (ctdb
->recovery_lock_file
!= NULL
) {
3512 DEBUG(DEBUG_ERR
,("Reclock file disabled\n"));
3513 talloc_free(ctdb
->recovery_lock_file
);
3514 ctdb
->recovery_lock_file
= NULL
;
3515 if (ctdb
->recovery_lock_fd
!= -1) {
3516 close(ctdb
->recovery_lock_fd
);
3517 ctdb
->recovery_lock_fd
= -1;
3520 ctdb
->tunable
.verify_recovery_lock
= 0;
3521 talloc_free(tmp_ctx
);
3525 if (ctdb
->recovery_lock_file
== NULL
) {
3526 ctdb
->recovery_lock_file
= talloc_strdup(ctdb
, reclockfile
);
3527 if (ctdb
->recovery_lock_fd
!= -1) {
3528 close(ctdb
->recovery_lock_fd
);
3529 ctdb
->recovery_lock_fd
= -1;
3531 talloc_free(tmp_ctx
);
3536 if (!strcmp(reclockfile
, ctdb
->recovery_lock_file
)) {
3537 talloc_free(tmp_ctx
);
3541 talloc_free(ctdb
->recovery_lock_file
);
3542 ctdb
->recovery_lock_file
= talloc_strdup(ctdb
, reclockfile
);
3543 ctdb
->tunable
.verify_recovery_lock
= 0;
3544 if (ctdb
->recovery_lock_fd
!= -1) {
3545 close(ctdb
->recovery_lock_fd
);
3546 ctdb
->recovery_lock_fd
= -1;
3549 talloc_free(tmp_ctx
);
3553 static void main_loop(struct ctdb_context
*ctdb
, struct ctdb_recoverd
*rec
,
3554 TALLOC_CTX
*mem_ctx
)
3557 struct ctdb_node_map
*nodemap
=NULL
;
3558 struct ctdb_node_map
*recmaster_nodemap
=NULL
;
3559 struct ctdb_node_map
**remote_nodemaps
=NULL
;
3560 struct ctdb_vnn_map
*vnnmap
=NULL
;
3561 struct ctdb_vnn_map
*remote_vnnmap
=NULL
;
3562 int32_t debug_level
;
3567 /* verify that the main daemon is still running */
3568 if (ctdb_kill(ctdb
, ctdb
->ctdbd_pid
, 0) != 0) {
3569 DEBUG(DEBUG_CRIT
,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3573 /* ping the local daemon to tell it we are alive */
3574 ctdb_ctrl_recd_ping(ctdb
);
3576 if (rec
->election_timeout
) {
3577 /* an election is in progress */
3581 /* read the debug level from the parent and update locally */
3582 ret
= ctdb_ctrl_get_debuglevel(ctdb
, CTDB_CURRENT_NODE
, &debug_level
);
3584 DEBUG(DEBUG_ERR
, (__location__
" Failed to read debuglevel from parent\n"));
3587 LogLevel
= debug_level
;
3589 /* get relevant tunables */
3590 ret
= ctdb_ctrl_get_all_tunables(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, &ctdb
->tunable
);
3592 DEBUG(DEBUG_ERR
,("Failed to get tunables - retrying\n"));
3596 /* get the current recovery lock file from the server */
3597 if (update_recovery_lock_file(ctdb
) != 0) {
3598 DEBUG(DEBUG_ERR
,("Failed to update the recovery lock file\n"));
3602 /* Make sure that if recovery lock verification becomes disabled when
3605 if (ctdb
->tunable
.verify_recovery_lock
== 0) {
3606 if (ctdb
->recovery_lock_fd
!= -1) {
3607 close(ctdb
->recovery_lock_fd
);
3608 ctdb
->recovery_lock_fd
= -1;
3612 pnn
= ctdb_get_pnn(ctdb
);
3614 /* get the vnnmap */
3615 ret
= ctdb_ctrl_getvnnmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, &vnnmap
);
3617 DEBUG(DEBUG_ERR
, (__location__
" Unable to get vnnmap from node %u\n", pnn
));
3622 /* get number of nodes */
3624 talloc_free(rec
->nodemap
);
3625 rec
->nodemap
= NULL
;
3628 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), pnn
, rec
, &rec
->nodemap
);
3630 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from node %u\n", pnn
));
3633 nodemap
= rec
->nodemap
;
3635 /* remember our own node flags */
3636 rec
->node_flags
= nodemap
->nodes
[pnn
].flags
;
3638 ban_misbehaving_nodes(rec
, &self_ban
);
3640 DEBUG(DEBUG_NOTICE
, ("This node was banned, restart main_loop\n"));
3644 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
3645 also frozen and that the recmode is set to active.
3647 if (rec
->node_flags
& (NODE_FLAGS_STOPPED
| NODE_FLAGS_BANNED
)) {
3648 /* If this node has become inactive then we want to
3649 * reduce the chances of it taking over the recovery
3650 * master role when it becomes active again. This
3651 * helps to stabilise the recovery master role so that
3652 * it stays on the most stable node.
3654 rec
->priority_time
= timeval_current();
3656 ret
= ctdb_ctrl_getrecmode(ctdb
, mem_ctx
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, &ctdb
->recovery_mode
);
3658 DEBUG(DEBUG_ERR
,(__location__
" Failed to read recmode from local node\n"));
3660 if (ctdb
->recovery_mode
== CTDB_RECOVERY_NORMAL
) {
3661 DEBUG(DEBUG_ERR
,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
3663 ret
= ctdb_ctrl_setrecmode(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, CTDB_RECOVERY_ACTIVE
);
3665 DEBUG(DEBUG_ERR
,(__location__
" Failed to activate recovery mode in STOPPED or BANNED state\n"));
3669 ret
= ctdb_ctrl_freeze(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
);
3671 DEBUG(DEBUG_ERR
,(__location__
" Failed to freeze node in STOPPED or BANNED state\n"));
3676 /* If this node is stopped or banned then it is not the recovery
3677 * master, so don't do anything. This prevents stopped or banned
3678 * node from starting election and sending unnecessary controls.
3683 /* check which node is the recovery master */
3684 ret
= ctdb_ctrl_getrecmaster(ctdb
, mem_ctx
, CONTROL_TIMEOUT(), pnn
, &rec
->recmaster
);
3686 DEBUG(DEBUG_ERR
, (__location__
" Unable to get recmaster from node %u\n", pnn
));
3690 /* If we are not the recmaster then do some housekeeping */
3691 if (rec
->recmaster
!= pnn
) {
3692 /* Ignore any IP reallocate requests - only recmaster
3695 TALLOC_FREE(rec
->reallocate_requests
);
3696 /* Clear any nodes that should be force rebalanced in
3697 * the next takeover run. If the recovery master role
3698 * has moved then we don't want to process these some
3699 * time in the future.
3701 TALLOC_FREE(rec
->force_rebalance_nodes
);
3704 /* This is a special case. When recovery daemon is started, recmaster
3705 * is set to -1. If a node is not started in stopped state, then
3706 * start election to decide recovery master
3708 if (rec
->recmaster
== (uint32_t)-1) {
3709 DEBUG(DEBUG_NOTICE
,(__location__
" Initial recovery master set - forcing election\n"));
3710 force_election(rec
, pnn
, nodemap
);
3714 /* update the capabilities for all nodes */
3715 ret
= update_capabilities(ctdb
, nodemap
);
3717 DEBUG(DEBUG_ERR
, (__location__
" Unable to update node capabilities.\n"));
3722 * If the current recmaster does not have CTDB_CAP_RECMASTER,
3723 * but we have, then force an election and try to become the new
3726 if ((rec
->ctdb
->nodes
[rec
->recmaster
]->capabilities
& CTDB_CAP_RECMASTER
) == 0 &&
3727 (rec
->ctdb
->capabilities
& CTDB_CAP_RECMASTER
) &&
3728 !(nodemap
->nodes
[pnn
].flags
& NODE_FLAGS_INACTIVE
)) {
3729 DEBUG(DEBUG_ERR
, (__location__
" Current recmaster node %u does not have CAP_RECMASTER,"
3730 " but we (node %u) have - force an election\n",
3731 rec
->recmaster
, pnn
));
3732 force_election(rec
, pnn
, nodemap
);
3736 /* count how many active nodes there are */
3737 rec
->num_active
= 0;
3738 rec
->num_lmasters
= 0;
3739 rec
->num_connected
= 0;
3740 for (i
=0; i
<nodemap
->num
; i
++) {
3741 if (!(nodemap
->nodes
[i
].flags
& NODE_FLAGS_INACTIVE
)) {
3743 if (rec
->ctdb
->nodes
[i
]->capabilities
& CTDB_CAP_LMASTER
) {
3744 rec
->num_lmasters
++;
3747 if (!(nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
)) {
3748 rec
->num_connected
++;
3753 /* verify that the recmaster node is still active */
3754 for (j
=0; j
<nodemap
->num
; j
++) {
3755 if (nodemap
->nodes
[j
].pnn
==rec
->recmaster
) {
3760 if (j
== nodemap
->num
) {
3761 DEBUG(DEBUG_ERR
, ("Recmaster node %u not in list. Force reelection\n", rec
->recmaster
));
3762 force_election(rec
, pnn
, nodemap
);
3766 /* if recovery master is disconnected we must elect a new recmaster */
3767 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_DISCONNECTED
) {
3768 DEBUG(DEBUG_NOTICE
, ("Recmaster node %u is disconnected. Force reelection\n", nodemap
->nodes
[j
].pnn
));
3769 force_election(rec
, pnn
, nodemap
);
3773 /* get nodemap from the recovery master to check if it is inactive */
3774 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
3775 mem_ctx
, &recmaster_nodemap
);
3777 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from recovery master %u\n",
3778 nodemap
->nodes
[j
].pnn
));
3783 if ((recmaster_nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) &&
3784 (rec
->node_flags
& NODE_FLAGS_INACTIVE
) == 0) {
3785 DEBUG(DEBUG_NOTICE
, ("Recmaster node %u no longer available. Force reelection\n", nodemap
->nodes
[j
].pnn
));
3787 * update our nodemap to carry the recmaster's notion of
3788 * its own flags, so that we don't keep freezing the
3789 * inactive recmaster node...
3791 nodemap
->nodes
[j
].flags
= recmaster_nodemap
->nodes
[j
].flags
;
3792 force_election(rec
, pnn
, nodemap
);
3796 /* verify that we have all ip addresses we should have and we dont
3797 * have addresses we shouldnt have.
3799 if (ctdb
->tunable
.disable_ip_failover
== 0 &&
3800 rec
->takeover_runs_disable_ctx
== NULL
) {
3801 if (verify_local_ip_allocation(ctdb
, rec
, pnn
, nodemap
) != 0) {
3802 DEBUG(DEBUG_ERR
, (__location__
" Public IPs were inconsistent.\n"));
3807 /* if we are not the recmaster then we do not need to check
3808 if recovery is needed
3810 if (pnn
!= rec
->recmaster
) {
3815 /* ensure our local copies of flags are right */
3816 ret
= update_local_flags(rec
, nodemap
);
3817 if (ret
== MONITOR_ELECTION_NEEDED
) {
3818 DEBUG(DEBUG_NOTICE
,("update_local_flags() called for a re-election.\n"));
3819 force_election(rec
, pnn
, nodemap
);
3822 if (ret
!= MONITOR_OK
) {
3823 DEBUG(DEBUG_ERR
,("Unable to update local flags\n"));
3827 if (ctdb
->num_nodes
!= nodemap
->num
) {
3828 DEBUG(DEBUG_ERR
, (__location__
" ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb
->num_nodes
, nodemap
->num
));
3829 ctdb_load_nodes_file(ctdb
);
3833 /* verify that all active nodes agree that we are the recmaster */
3834 switch (verify_recmaster(rec
, nodemap
, pnn
)) {
3835 case MONITOR_RECOVERY_NEEDED
:
3836 /* can not happen */
3838 case MONITOR_ELECTION_NEEDED
:
3839 force_election(rec
, pnn
, nodemap
);
3843 case MONITOR_FAILED
:
3848 if (rec
->need_recovery
) {
3849 /* a previous recovery didn't finish */
3850 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3854 /* verify that all active nodes are in normal mode
3855 and not in recovery mode
3857 switch (verify_recmode(ctdb
, nodemap
)) {
3858 case MONITOR_RECOVERY_NEEDED
:
3859 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3861 case MONITOR_FAILED
:
3863 case MONITOR_ELECTION_NEEDED
:
3864 /* can not happen */
3870 if (ctdb
->tunable
.verify_recovery_lock
!= 0) {
3871 /* we should have the reclock - check its not stale */
3872 ret
= check_recovery_lock(ctdb
);
3874 DEBUG(DEBUG_ERR
,("Failed check_recovery_lock. Force a recovery\n"));
3875 ctdb_set_culprit(rec
, ctdb
->pnn
);
3876 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3882 /* if there are takeovers requested, perform it and notify the waiters */
3883 if (rec
->takeover_runs_disable_ctx
== NULL
&&
3884 rec
->reallocate_requests
) {
3885 process_ipreallocate_requests(ctdb
, rec
);
3888 /* get the nodemap for all active remote nodes
3890 remote_nodemaps
= talloc_array(mem_ctx
, struct ctdb_node_map
*, nodemap
->num
);
3891 if (remote_nodemaps
== NULL
) {
3892 DEBUG(DEBUG_ERR
, (__location__
" failed to allocate remote nodemap array\n"));
3895 for(i
=0; i
<nodemap
->num
; i
++) {
3896 remote_nodemaps
[i
] = NULL
;
3898 if (get_remote_nodemaps(ctdb
, mem_ctx
, nodemap
, remote_nodemaps
) != 0) {
3899 DEBUG(DEBUG_ERR
,(__location__
" Failed to read remote nodemaps\n"));
3903 /* verify that all other nodes have the same nodemap as we have
3905 for (j
=0; j
<nodemap
->num
; j
++) {
3906 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3910 if (remote_nodemaps
[j
] == NULL
) {
3911 DEBUG(DEBUG_ERR
,(__location__
" Did not get a remote nodemap for node %d, restarting monitoring\n", j
));
3912 ctdb_set_culprit(rec
, j
);
3917 /* if the nodes disagree on how many nodes there are
3918 then this is a good reason to try recovery
3920 if (remote_nodemaps
[j
]->num
!= nodemap
->num
) {
3921 DEBUG(DEBUG_ERR
, (__location__
" Remote node:%u has different node count. %u vs %u of the local node\n",
3922 nodemap
->nodes
[j
].pnn
, remote_nodemaps
[j
]->num
, nodemap
->num
));
3923 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3924 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3928 /* if the nodes disagree on which nodes exist and are
3929 active, then that is also a good reason to do recovery
3931 for (i
=0;i
<nodemap
->num
;i
++) {
3932 if (remote_nodemaps
[j
]->nodes
[i
].pnn
!= nodemap
->nodes
[i
].pnn
) {
3933 DEBUG(DEBUG_ERR
, (__location__
" Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3934 nodemap
->nodes
[j
].pnn
, i
,
3935 remote_nodemaps
[j
]->nodes
[i
].pnn
, nodemap
->nodes
[i
].pnn
));
3936 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3937 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
3945 * Update node flags obtained from each active node. This ensure we have
3946 * up-to-date information for all the nodes.
3948 for (j
=0; j
<nodemap
->num
; j
++) {
3949 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3952 nodemap
->nodes
[j
].flags
= remote_nodemaps
[j
]->nodes
[j
].flags
;
3955 for (j
=0; j
<nodemap
->num
; j
++) {
3956 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3960 /* verify the flags are consistent
3962 for (i
=0; i
<nodemap
->num
; i
++) {
3963 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
) {
3967 if (nodemap
->nodes
[i
].flags
!= remote_nodemaps
[j
]->nodes
[i
].flags
) {
3968 DEBUG(DEBUG_ERR
, (__location__
" Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3969 nodemap
->nodes
[j
].pnn
,
3970 nodemap
->nodes
[i
].pnn
,
3971 remote_nodemaps
[j
]->nodes
[i
].flags
,
3972 nodemap
->nodes
[i
].flags
));
3974 DEBUG(DEBUG_ERR
,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps
[j
]->nodes
[i
].flags
, j
));
3975 update_flags_on_all_nodes(ctdb
, nodemap
, nodemap
->nodes
[i
].pnn
, remote_nodemaps
[j
]->nodes
[i
].flags
);
3976 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3977 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
3981 DEBUG(DEBUG_ERR
,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap
->nodes
[i
].flags
, i
));
3982 update_flags_on_all_nodes(ctdb
, nodemap
, nodemap
->nodes
[i
].pnn
, nodemap
->nodes
[i
].flags
);
3983 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3984 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
3993 /* There must be the same number of lmasters in the vnn map as
3994 * there are active nodes with the lmaster capability... or
3997 if (vnnmap
->size
!= rec
->num_lmasters
) {
3998 DEBUG(DEBUG_ERR
, (__location__
" The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
3999 vnnmap
->size
, rec
->num_lmasters
));
4000 ctdb_set_culprit(rec
, ctdb
->pnn
);
4001 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
4005 /* verify that all active nodes in the nodemap also exist in
4008 for (j
=0; j
<nodemap
->num
; j
++) {
4009 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
4012 if (nodemap
->nodes
[j
].pnn
== pnn
) {
4016 for (i
=0; i
<vnnmap
->size
; i
++) {
4017 if (vnnmap
->map
[i
] == nodemap
->nodes
[j
].pnn
) {
4021 if (i
== vnnmap
->size
) {
4022 DEBUG(DEBUG_ERR
, (__location__
" Node %u is active in the nodemap but did not exist in the vnnmap\n",
4023 nodemap
->nodes
[j
].pnn
));
4024 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
4025 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
4031 /* verify that all other nodes have the same vnnmap
4032 and are from the same generation
4034 for (j
=0; j
<nodemap
->num
; j
++) {
4035 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
4038 if (nodemap
->nodes
[j
].pnn
== pnn
) {
4042 ret
= ctdb_ctrl_getvnnmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
4043 mem_ctx
, &remote_vnnmap
);
4045 DEBUG(DEBUG_ERR
, (__location__
" Unable to get vnnmap from remote node %u\n",
4046 nodemap
->nodes
[j
].pnn
));
4050 /* verify the vnnmap generation is the same */
4051 if (vnnmap
->generation
!= remote_vnnmap
->generation
) {
4052 DEBUG(DEBUG_ERR
, (__location__
" Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
4053 nodemap
->nodes
[j
].pnn
, remote_vnnmap
->generation
, vnnmap
->generation
));
4054 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
4055 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
4059 /* verify the vnnmap size is the same */
4060 if (vnnmap
->size
!= remote_vnnmap
->size
) {
4061 DEBUG(DEBUG_ERR
, (__location__
" Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
4062 nodemap
->nodes
[j
].pnn
, remote_vnnmap
->size
, vnnmap
->size
));
4063 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
4064 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
4068 /* verify the vnnmap is the same */
4069 for (i
=0;i
<vnnmap
->size
;i
++) {
4070 if (remote_vnnmap
->map
[i
] != vnnmap
->map
[i
]) {
4071 DEBUG(DEBUG_ERR
, (__location__
" Remote node %u has different vnnmap.\n",
4072 nodemap
->nodes
[j
].pnn
));
4073 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
4074 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
4081 /* we might need to change who has what IP assigned */
4082 if (rec
->need_takeover_run
) {
4083 uint32_t culprit
= (uint32_t)-1;
4085 rec
->need_takeover_run
= false;
4087 /* update the list of public ips that a node can handle for
4090 ret
= ctdb_reload_remote_public_ips(ctdb
, rec
, nodemap
, &culprit
);
4092 DEBUG(DEBUG_ERR
,("Failed to read public ips from remote node %d\n",
4094 rec
->need_takeover_run
= true;
4098 /* execute the "startrecovery" event script on all nodes */
4099 ret
= run_startrecovery_eventscript(rec
, nodemap
);
4101 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'startrecovery' event on cluster\n"));
4102 ctdb_set_culprit(rec
, ctdb
->pnn
);
4103 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
4107 /* If takeover run fails, then the offending nodes are
4108 * assigned ban culprit counts. And we re-try takeover.
4109 * If takeover run fails repeatedly, the node would get
4112 * If rec->need_takeover_run is not set to true at this
4113 * failure, monitoring is disabled cluster-wide (via
4114 * startrecovery eventscript) and will not get enabled.
4116 if (!do_takeover_run(rec
, nodemap
, true)) {
4120 /* execute the "recovered" event script on all nodes */
4121 ret
= run_recovered_eventscript(rec
, nodemap
, "monitor_cluster");
4123 // we cant check whether the event completed successfully
4124 // since this script WILL fail if the node is in recovery mode
4125 // and if that race happens, the code here would just cause a second
4126 // cascading recovery.
4128 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
4129 ctdb_set_culprit(rec
, ctdb
->pnn
);
4130 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
4137 the main monitoring loop
4139 static void monitor_cluster(struct ctdb_context
*ctdb
)
4141 struct ctdb_recoverd
*rec
;
4143 DEBUG(DEBUG_NOTICE
,("monitor_cluster starting\n"));
4145 rec
= talloc_zero(ctdb
, struct ctdb_recoverd
);
4146 CTDB_NO_MEMORY_FATAL(ctdb
, rec
);
4150 rec
->takeover_run_in_progress
= false;
4152 rec
->priority_time
= timeval_current();
4154 /* register a message port for sending memory dumps */
4155 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_MEM_DUMP
, mem_dump_handler
, rec
);
4157 /* register a message port for requesting logs */
4158 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_GETLOG
, getlog_handler
, rec
);
4160 /* register a message port for clearing logs */
4161 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_CLEARLOG
, clearlog_handler
, rec
);
4163 /* register a message port for recovery elections */
4164 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_RECOVERY
, election_handler
, rec
);
4166 /* when nodes are disabled/enabled */
4167 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_SET_NODE_FLAGS
, monitor_handler
, rec
);
4169 /* when we are asked to puch out a flag change */
4170 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_PUSH_NODE_FLAGS
, push_flags_handler
, rec
);
4172 /* register a message port for vacuum fetch */
4173 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_VACUUM_FETCH
, vacuum_fetch_handler
, rec
);
4175 /* register a message port for reloadnodes */
4176 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_RELOAD_NODES
, reload_nodes_handler
, rec
);
4178 /* register a message port for performing a takeover run */
4179 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_TAKEOVER_RUN
, ip_reallocate_handler
, rec
);
4181 /* register a message port for disabling the ip check for a short while */
4182 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_DISABLE_IP_CHECK
, disable_ip_check_handler
, rec
);
4184 /* register a message port for updating the recovery daemons node assignment for an ip */
4185 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_RECD_UPDATE_IP
, recd_update_ip_handler
, rec
);
4187 /* register a message port for forcing a rebalance of a node next
4189 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_REBALANCE_NODE
, recd_node_rebalance_handler
, rec
);
4191 /* Register a message port for disabling takeover runs */
4192 ctdb_client_set_message_handler(ctdb
,
4193 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
4194 disable_takeover_runs_handler
, rec
);
4196 /* register a message port for detaching database */
4197 ctdb_client_set_message_handler(ctdb
,
4198 CTDB_SRVID_DETACH_DATABASE
,
4199 detach_database_handler
, rec
);
4202 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
4203 struct timeval start
;
4207 DEBUG(DEBUG_CRIT
,(__location__
4208 " Failed to create temp context\n"));
4212 start
= timeval_current();
4213 main_loop(ctdb
, rec
, mem_ctx
);
4214 talloc_free(mem_ctx
);
4216 /* we only check for recovery once every second */
4217 elapsed
= timeval_elapsed(&start
);
4218 if (elapsed
< ctdb
->tunable
.recover_interval
) {
4219 ctdb_wait_timeout(ctdb
, ctdb
->tunable
.recover_interval
4226 event handler for when the main ctdbd dies
4228 static void ctdb_recoverd_parent(struct event_context
*ev
, struct fd_event
*fde
,
4229 uint16_t flags
, void *private_data
)
4231 DEBUG(DEBUG_ALERT
,("recovery daemon parent died - exiting\n"));
4236 called regularly to verify that the recovery daemon is still running
4238 static void ctdb_check_recd(struct event_context
*ev
, struct timed_event
*te
,
4239 struct timeval yt
, void *p
)
4241 struct ctdb_context
*ctdb
= talloc_get_type(p
, struct ctdb_context
);
4243 if (ctdb_kill(ctdb
, ctdb
->recoverd_pid
, 0) != 0) {
4244 DEBUG(DEBUG_ERR
,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb
->recoverd_pid
));
4246 event_add_timed(ctdb
->ev
, ctdb
, timeval_zero(),
4247 ctdb_restart_recd
, ctdb
);
4252 event_add_timed(ctdb
->ev
, ctdb
->recd_ctx
,
4253 timeval_current_ofs(30, 0),
4254 ctdb_check_recd
, ctdb
);
4257 static void recd_sig_child_handler(struct event_context
*ev
,
4258 struct signal_event
*se
, int signum
, int count
,
4262 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4267 pid
= waitpid(-1, &status
, WNOHANG
);
4269 if (errno
!= ECHILD
) {
4270 DEBUG(DEBUG_ERR
, (__location__
" waitpid() returned error. errno:%s(%d)\n", strerror(errno
),errno
));
4275 DEBUG(DEBUG_DEBUG
, ("RECD SIGCHLD from %d\n", (int)pid
));
4281 startup the recovery daemon as a child of the main ctdb daemon
4283 int ctdb_start_recoverd(struct ctdb_context
*ctdb
)
4286 struct signal_event
*se
;
4287 struct tevent_fd
*fde
;
4289 if (pipe(fd
) != 0) {
4293 ctdb
->recoverd_pid
= ctdb_fork_no_free_ringbuffer(ctdb
);
4294 if (ctdb
->recoverd_pid
== -1) {
4298 if (ctdb
->recoverd_pid
!= 0) {
4299 talloc_free(ctdb
->recd_ctx
);
4300 ctdb
->recd_ctx
= talloc_new(ctdb
);
4301 CTDB_NO_MEMORY(ctdb
, ctdb
->recd_ctx
);
4304 event_add_timed(ctdb
->ev
, ctdb
->recd_ctx
,
4305 timeval_current_ofs(30, 0),
4306 ctdb_check_recd
, ctdb
);
4312 srandom(getpid() ^ time(NULL
));
4314 /* Clear the log ringbuffer */
4315 ctdb_clear_log(ctdb
);
4317 ctdb_set_process_name("ctdb_recovered");
4318 if (switch_from_server_to_client(ctdb
, "recoverd") != 0) {
4319 DEBUG(DEBUG_CRIT
, (__location__
"ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
4323 DEBUG(DEBUG_DEBUG
, (__location__
" Created PIPE FD:%d to recovery daemon\n", fd
[0]));
4325 fde
= event_add_fd(ctdb
->ev
, ctdb
, fd
[0], EVENT_FD_READ
,
4326 ctdb_recoverd_parent
, &fd
[0]);
4327 tevent_fd_set_auto_close(fde
);
4329 /* set up a handler to pick up sigchld */
4330 se
= event_add_signal(ctdb
->ev
, ctdb
,
4332 recd_sig_child_handler
,
4335 DEBUG(DEBUG_CRIT
,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
4339 monitor_cluster(ctdb
);
4341 DEBUG(DEBUG_ALERT
,("ERROR: ctdb_recoverd finished!?\n"));
4346 shutdown the recovery daemon
4348 void ctdb_stop_recoverd(struct ctdb_context
*ctdb
)
4350 if (ctdb
->recoverd_pid
== 0) {
4354 DEBUG(DEBUG_NOTICE
,("Shutting down recovery daemon\n"));
4355 ctdb_kill(ctdb
, ctdb
->recoverd_pid
, SIGTERM
);
4357 TALLOC_FREE(ctdb
->recd_ctx
);
4358 TALLOC_FREE(ctdb
->recd_ping_count
);
4361 static void ctdb_restart_recd(struct event_context
*ev
, struct timed_event
*te
,
4362 struct timeval t
, void *private_data
)
4364 struct ctdb_context
*ctdb
= talloc_get_type(private_data
, struct ctdb_context
);
4366 DEBUG(DEBUG_ERR
,("Restarting recovery daemon\n"));
4367 ctdb_stop_recoverd(ctdb
);
4368 ctdb_start_recoverd(ctdb
);