4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
27 #include "../include/ctdb_client.h"
28 #include "../include/ctdb_private.h"
29 #include "lib/tdb_wrap/tdb_wrap.h"
30 #include "lib/util/dlinklist.h"
/*
 * One entry in the queue of pending SRVID requests: a doubly-linked
 * list node (next/prev, for the DLIST_* macros) carrying a single
 * srvid_request.
 * NOTE(review): the struct's opening/closing brace lines were lost in
 * extraction of this file.
 */
33 /* List of SRVID requests that need to be processed */
35 struct srvid_list
*next
, *prev
;
36 struct srvid_request
*request
;
39 struct srvid_requests
{
40 struct srvid_list
*requests
;
/*
 * Reply to a queued SRVID request by sending a message back to the node
 * that queued it (request->pnn) on the srvid it asked for.
 * NOTE(review): several original source lines (braces, the result/data
 * parameter and the early-return body) are missing from this extract;
 * only the visible logic is documented here.
 */
43 static void srvid_request_reply(struct ctdb_context
*ctdb
,
44 struct srvid_request
*request
,
47 /* Someone that sent srvid==0 does not want a reply */
48 if (request
->srvid
== 0) {
53 if (ctdb_client_send_message(ctdb
, request
->pnn
, request
->srvid
,
/* send succeeded: log at INFO level */
55 DEBUG(DEBUG_INFO
,("Sent SRVID reply to %u:%llu\n",
56 (unsigned)request
->pnn
,
57 (unsigned long long)request
->srvid
));
/* send failed: log at ERR level */
59 DEBUG(DEBUG_ERR
,("Failed to send SRVID reply to %u:%llu\n",
60 (unsigned)request
->pnn
,
61 (unsigned long long)request
->srvid
));
/*
 * Send a reply for every request queued on *requests, then free the
 * whole queue.  TALLOC_FREE() frees the list structure (and, as talloc
 * children, the queued entries) and resets the caller's *requests
 * pointer to NULL.
 * NOTE(review): the result parameter's declaration line is missing from
 * this extract.
 */
67 static void srvid_requests_reply(struct ctdb_context
*ctdb
,
68 struct srvid_requests
**requests
,
73 for (r
= (*requests
)->requests
; r
!= NULL
; r
= r
->next
) {
74 srvid_request_reply(ctdb
, r
->request
, result
);
77 /* Free the list structure... */
78 TALLOC_FREE(*requests
);
/*
 * Queue an SRVID request onto *requests, lazily allocating the
 * srvid_requests container on first use.  On allocation failure the
 * request cannot be queued, so a failure result is sent straight back
 * to the requester via srvid_request_reply().
 * Ownership: on success the request is talloc_steal()ed onto the new
 * list entry, so it is freed together with the queue.
 * NOTE(review): error-branch lines and some declarations (t, ret,
 * result) are missing from this extract.
 */
81 static void srvid_request_add(struct ctdb_context
*ctdb
,
82 struct srvid_requests
**requests
,
83 struct srvid_request
*request
)
89 if (*requests
== NULL
) {
/* first request: allocate the container */
90 *requests
= talloc_zero(ctdb
, struct srvid_requests
);
91 if (*requests
== NULL
) {
96 t
= talloc_zero(*requests
, struct srvid_list
);
98 /* If *requests was just allocated above then free it */
99 if ((*requests
)->requests
== NULL
) {
100 TALLOC_FREE(*requests
);
/* take ownership of the request and link it onto the queue */
105 t
->request
= (struct srvid_request
*)talloc_steal(t
, request
);
106 DLIST_ADD((*requests
)->requests
, t
);
111 /* Failed to add the request to the list. Send a fail. */
112 DEBUG(DEBUG_ERR
, (__location__
113 " Out of memory, failed to queue SRVID request\n"));
115 result
.dsize
= sizeof(ret
);
116 result
.dptr
= (uint8_t *)&ret
;
117 srvid_request_reply(ctdb
, request
, result
);
120 /* An abstraction to allow an operation (takeover runs, recoveries,
121 * ...) to be disabled for a given timeout */
122 struct ctdb_op_state
{
123 struct tevent_timer
*timer
;
128 static struct ctdb_op_state
*ctdb_op_init(TALLOC_CTX
*mem_ctx
, const char *name
)
130 struct ctdb_op_state
*state
= talloc_zero(mem_ctx
, struct ctdb_op_state
);
133 state
->in_progress
= false;
140 static bool ctdb_op_is_disabled(struct ctdb_op_state
*state
)
142 return state
->timer
!= NULL
;
/*
 * Try to mark the operation as in progress.  Refused (with an error
 * log) while the operation is disabled; otherwise in_progress is set.
 * NOTE(review): the DEBUG level line and the return statements are
 * missing from this extract.
 */
145 static bool ctdb_op_begin(struct ctdb_op_state
*state
)
147 if (ctdb_op_is_disabled(state
)) {
149 ("Unable to begin - %s are disabled\n", state
->name
));
153 state
->in_progress
= true;
/*
 * Mark the operation as no longer in progress.
 * NOTE(review): this is an assignment (=), not a comparison (==) - the
 * expression clears in_progress and evaluates to false, so the function
 * always returns false.  It reads as a deliberate clear-and-return
 * idiom, but confirm against the callers that a constant false return
 * is intended.
 */
157 static bool ctdb_op_end(struct ctdb_op_state
*state
)
159 return state
->in_progress
= false;
162 static bool ctdb_op_is_in_progress(struct ctdb_op_state
*state
)
164 return state
->in_progress
;
167 static void ctdb_op_enable(struct ctdb_op_state
*state
)
169 TALLOC_FREE(state
->timer
);
/*
 * Timer callback fired when a ctdb_op_disable() timeout expires:
 * logs and re-enables the operation.  The timer itself is freed by
 * ctdb_op_enable() clearing state->timer.
 */
172 static void ctdb_op_timeout_handler(struct event_context
*ev
,
173 struct timed_event
*te
,
174 struct timeval yt
, void *p
)
176 struct ctdb_op_state
*state
=
177 talloc_get_type(p
, struct ctdb_op_state
);
179 DEBUG(DEBUG_NOTICE
,("Reenabling %s after timeout\n", state
->name
))
;
180 ctdb_op_enable(state
);
183 static int ctdb_op_disable(struct ctdb_op_state
*state
,
184 struct tevent_context
*ev
,
188 DEBUG(DEBUG_NOTICE
,("Reenabling %s\n", state
->name
));
189 ctdb_op_enable(state
);
193 if (state
->in_progress
) {
195 ("Unable to disable %s - in progress\n", state
->name
));
199 DEBUG(DEBUG_NOTICE
,("Disabling %s for %u seconds\n",
200 state
->name
, timeout
));
202 /* Clear any old timers */
203 talloc_free(state
->timer
);
205 /* Arrange for the timeout to occur */
206 state
->timer
= tevent_add_timer(ev
, state
,
207 timeval_current_ofs(timeout
, 0),
208 ctdb_op_timeout_handler
, state
);
209 if (state
->timer
== NULL
) {
210 DEBUG(DEBUG_ERR
,(__location__
" Unable to setup timer\n"));
217 struct ctdb_banning_state
{
219 struct timeval last_reported_time
;
223 private state of recovery daemon
225 struct ctdb_recoverd
{
226 struct ctdb_context
*ctdb
;
228 uint32_t last_culprit_node
;
229 struct ctdb_node_map
*nodemap
;
230 struct timeval priority_time
;
231 bool need_takeover_run
;
234 struct timed_event
*send_election_te
;
235 struct timed_event
*election_timeout
;
236 struct vacuum_info
*vacuum_info
;
237 struct srvid_requests
*reallocate_requests
;
238 struct ctdb_op_state
*takeover_run
;
239 struct ctdb_op_state
*recovery
;
240 struct ctdb_control_get_ifaces
*ifaces
;
241 uint32_t *force_rebalance_nodes
;
242 struct ctdb_node_capabilities
*caps
;
245 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
246 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
248 static void ctdb_restart_recd(struct event_context
*ev
, struct timed_event
*te
, struct timeval t
, void *private_data
);
/*
 * Ban a node for ban_time seconds: validate the pnn, then issue a
 * SET_BAN control to that node.  Failure to set the ban is logged but
 * (from what is visible here) not otherwise propagated.
 * NOTE(review): the bantime.pnn assignment, ret declaration and error
 * branch lines are missing from this extract.
 */
251 ban a node for a period of time
253 static void ctdb_ban_node(struct ctdb_recoverd
*rec
, uint32_t pnn
, uint32_t ban_time
)
256 struct ctdb_context
*ctdb
= rec
->ctdb
;
257 struct ctdb_ban_time bantime
;
259 if (!ctdb_validate_pnn(ctdb
, pnn
)) {
260 DEBUG(DEBUG_ERR
,("Bad pnn %u in ctdb_ban_node\n", pnn
));
264 DEBUG(DEBUG_NOTICE
,("Banning node %u for %u seconds\n", pnn
, ban_time
));
267 bantime
.time
= ban_time
;
269 ret
= ctdb_ctrl_set_ban(ctdb
, CONTROL_TIMEOUT(), pnn
, &bantime
);
271 DEBUG(DEBUG_ERR
,(__location__
" Failed to ban node %d\n", pnn
));
/*
 * Outcome of a cluster-monitoring pass: everything healthy, a recovery
 * is required, a recmaster election is required, or the monitoring
 * itself failed.
 */
277 enum monitor_result
{ MONITOR_OK
, MONITOR_RECOVERY_NEEDED
, MONITOR_ELECTION_NEEDED
, MONITOR_FAILED
};
281 remember the trouble maker
283 static void ctdb_set_culprit_count(struct ctdb_recoverd
*rec
, uint32_t culprit
, uint32_t count
)
285 struct ctdb_context
*ctdb
= talloc_get_type(rec
->ctdb
, struct ctdb_context
);
286 struct ctdb_banning_state
*ban_state
;
288 if (culprit
> ctdb
->num_nodes
) {
289 DEBUG(DEBUG_ERR
,("Trying to set culprit %d but num_nodes is %d\n", culprit
, ctdb
->num_nodes
));
293 /* If we are banned or stopped, do not set other nodes as culprits */
294 if (rec
->node_flags
& NODE_FLAGS_INACTIVE
) {
295 DEBUG(DEBUG_NOTICE
, ("This node is INACTIVE, cannot set culprit node %d\n", culprit
));
299 if (ctdb
->nodes
[culprit
]->ban_state
== NULL
) {
300 ctdb
->nodes
[culprit
]->ban_state
= talloc_zero(ctdb
->nodes
[culprit
], struct ctdb_banning_state
);
301 CTDB_NO_MEMORY_VOID(ctdb
, ctdb
->nodes
[culprit
]->ban_state
);
305 ban_state
= ctdb
->nodes
[culprit
]->ban_state
;
306 if (timeval_elapsed(&ban_state
->last_reported_time
) > ctdb
->tunable
.recovery_grace_period
) {
307 /* this was the first time in a long while this node
308 misbehaved so we will forgive any old transgressions.
310 ban_state
->count
= 0;
313 ban_state
->count
+= count
;
314 ban_state
->last_reported_time
= timeval_current();
315 rec
->last_culprit_node
= culprit
;
319 remember the trouble maker
321 static void ctdb_set_culprit(struct ctdb_recoverd
*rec
, uint32_t culprit
)
323 ctdb_set_culprit_count(rec
, culprit
, 1);
/*
 * Async-control failure callback for the END_RECOVERY ("recovered")
 * event: a node that failed the event is charged as a recovery-failure
 * culprit (one credit via ctdb_set_culprit).
 */
327 /* this callback is called for every node that failed to execute the
330 static void recovered_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
332 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
334 DEBUG(DEBUG_ERR
, (__location__
" Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn
));
336 ctdb_set_culprit(rec
, node_pnn
);
340 run the "recovered" eventscript on all nodes
342 static int run_recovered_eventscript(struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
, const char *caller
)
346 struct ctdb_context
*ctdb
= rec
->ctdb
;
348 tmp_ctx
= talloc_new(ctdb
);
349 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
351 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
352 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_END_RECOVERY
,
354 CONTROL_TIMEOUT(), false, tdb_null
,
355 NULL
, recovered_fail_callback
,
357 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'recovered' event when called from %s\n", caller
));
359 talloc_free(tmp_ctx
);
363 talloc_free(tmp_ctx
);
/*
 * Async-control failure callback for the START_RECOVERY event: a node
 * that failed the event is charged as a recovery-failure culprit (one
 * credit via ctdb_set_culprit).
 */
367 /* this callback is called for every node that failed to execute the
370 static void startrecovery_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
372 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
374 DEBUG(DEBUG_ERR
, (__location__
" Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn
));
376 ctdb_set_culprit(rec
, node_pnn
);
380 run the "startrecovery" eventscript on all nodes
382 static int run_startrecovery_eventscript(struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
)
386 struct ctdb_context
*ctdb
= rec
->ctdb
;
388 tmp_ctx
= talloc_new(ctdb
);
389 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
391 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
392 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_START_RECOVERY
,
394 CONTROL_TIMEOUT(), false, tdb_null
,
396 startrecovery_fail_callback
,
398 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'startrecovery' event. Recovery failed.\n"));
399 talloc_free(tmp_ctx
);
403 talloc_free(tmp_ctx
);
408 update the node capabilities for all connected nodes
410 static int update_capabilities(struct ctdb_recoverd
*rec
,
411 struct ctdb_node_map
*nodemap
)
415 struct ctdb_node_capabilities
*caps
;
416 struct ctdb_context
*ctdb
= rec
->ctdb
;
418 tmp_ctx
= talloc_new(rec
);
419 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
421 caps
= ctdb_get_capabilities(ctdb
, tmp_ctx
,
422 CONTROL_TIMEOUT(), nodemap
);
426 (__location__
" Failed to get node capabilities\n"));
427 talloc_free(tmp_ctx
);
431 capp
= ctdb_get_node_capabilities(caps
, ctdb_get_pnn(ctdb
));
435 " Capabilities don't include current node.\n"));
436 talloc_free(tmp_ctx
);
439 ctdb
->capabilities
= *capp
;
441 TALLOC_FREE(rec
->caps
);
442 rec
->caps
= talloc_steal(rec
, caps
);
444 talloc_free(tmp_ctx
);
448 static void set_recmode_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
450 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
452 DEBUG(DEBUG_ERR
,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn
, rec
->nodemap
->num
));
453 ctdb_set_culprit_count(rec
, node_pnn
, rec
->nodemap
->num
);
456 static void transaction_start_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
458 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
460 DEBUG(DEBUG_ERR
,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn
, rec
->nodemap
->num
));
461 ctdb_set_culprit_count(rec
, node_pnn
, rec
->nodemap
->num
);
465 change recovery mode on all nodes
467 static int set_recovery_mode(struct ctdb_context
*ctdb
, struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
, uint32_t rec_mode
)
473 tmp_ctx
= talloc_new(ctdb
);
474 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
476 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
478 data
.dsize
= sizeof(uint32_t);
479 data
.dptr
= (unsigned char *)&rec_mode
;
481 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_SET_RECMODE
,
487 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode. Recovery failed.\n"));
488 talloc_free(tmp_ctx
);
492 /* freeze all nodes */
493 if (rec_mode
== CTDB_RECOVERY_ACTIVE
) {
496 for (i
=1; i
<=NUM_DB_PRIORITIES
; i
++) {
497 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_FREEZE
,
502 set_recmode_fail_callback
,
504 DEBUG(DEBUG_ERR
, (__location__
" Unable to freeze nodes. Recovery failed.\n"));
505 talloc_free(tmp_ctx
);
511 talloc_free(tmp_ctx
);
516 change recovery master on all node
518 static int set_recovery_master(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
, uint32_t pnn
)
524 tmp_ctx
= talloc_new(ctdb
);
525 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
527 data
.dsize
= sizeof(uint32_t);
528 data
.dptr
= (unsigned char *)&pnn
;
530 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
531 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_SET_RECMASTER
,
533 CONTROL_TIMEOUT(), false, data
,
536 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recmaster. Recovery failed.\n"));
537 talloc_free(tmp_ctx
);
541 talloc_free(tmp_ctx
);
545 /* update all remote nodes to use the same db priority that we have
546 this can fail if the remove node has not yet been upgraded to
547 support this function, so we always return success and never fail
548 a recovery if this call fails.
550 static int update_db_priority_on_remote_nodes(struct ctdb_context
*ctdb
,
551 struct ctdb_node_map
*nodemap
,
552 uint32_t pnn
, struct ctdb_dbid_map
*dbmap
, TALLOC_CTX
*mem_ctx
)
556 /* step through all local databases */
557 for (db
=0; db
<dbmap
->num
;db
++) {
558 struct ctdb_db_priority db_prio
;
561 db_prio
.db_id
= dbmap
->dbs
[db
].dbid
;
562 ret
= ctdb_ctrl_get_db_priority(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, dbmap
->dbs
[db
].dbid
, &db_prio
.priority
);
564 DEBUG(DEBUG_ERR
,(__location__
" Failed to read database priority from local node for db 0x%08x\n", dbmap
->dbs
[db
].dbid
));
568 DEBUG(DEBUG_INFO
,("Update DB priority for db 0x%08x to %u\n", dbmap
->dbs
[db
].dbid
, db_prio
.priority
));
570 ret
= ctdb_ctrl_set_db_priority(ctdb
, CONTROL_TIMEOUT(),
571 CTDB_CURRENT_NODE
, &db_prio
);
573 DEBUG(DEBUG_ERR
,(__location__
" Failed to set DB priority for 0x%08x\n",
582 ensure all other nodes have attached to any databases that we have
584 static int create_missing_remote_databases(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
,
585 uint32_t pnn
, struct ctdb_dbid_map
*dbmap
, TALLOC_CTX
*mem_ctx
)
588 struct ctdb_dbid_map
*remote_dbmap
;
590 /* verify that all other nodes have all our databases */
591 for (j
=0; j
<nodemap
->num
; j
++) {
592 /* we dont need to ourself ourselves */
593 if (nodemap
->nodes
[j
].pnn
== pnn
) {
596 /* dont check nodes that are unavailable */
597 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
601 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
602 mem_ctx
, &remote_dbmap
);
604 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbids from node %u\n", pnn
));
608 /* step through all local databases */
609 for (db
=0; db
<dbmap
->num
;db
++) {
613 for (i
=0;i
<remote_dbmap
->num
;i
++) {
614 if (dbmap
->dbs
[db
].dbid
== remote_dbmap
->dbs
[i
].dbid
) {
618 /* the remote node already have this database */
619 if (i
!=remote_dbmap
->num
) {
622 /* ok so we need to create this database */
623 ret
= ctdb_ctrl_getdbname(ctdb
, CONTROL_TIMEOUT(), pnn
,
624 dbmap
->dbs
[db
].dbid
, mem_ctx
,
627 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbname from node %u\n", pnn
));
630 ret
= ctdb_ctrl_createdb(ctdb
, CONTROL_TIMEOUT(),
631 nodemap
->nodes
[j
].pnn
,
633 dbmap
->dbs
[db
].flags
& CTDB_DB_FLAGS_PERSISTENT
);
635 DEBUG(DEBUG_ERR
, (__location__
" Unable to create remote db:%s\n", name
));
646 ensure we are attached to any databases that anyone else is attached to
648 static int create_missing_local_databases(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
,
649 uint32_t pnn
, struct ctdb_dbid_map
**dbmap
, TALLOC_CTX
*mem_ctx
)
652 struct ctdb_dbid_map
*remote_dbmap
;
654 /* verify that we have all database any other node has */
655 for (j
=0; j
<nodemap
->num
; j
++) {
656 /* we dont need to ourself ourselves */
657 if (nodemap
->nodes
[j
].pnn
== pnn
) {
660 /* dont check nodes that are unavailable */
661 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
665 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
666 mem_ctx
, &remote_dbmap
);
668 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbids from node %u\n", pnn
));
672 /* step through all databases on the remote node */
673 for (db
=0; db
<remote_dbmap
->num
;db
++) {
676 for (i
=0;i
<(*dbmap
)->num
;i
++) {
677 if (remote_dbmap
->dbs
[db
].dbid
== (*dbmap
)->dbs
[i
].dbid
) {
681 /* we already have this db locally */
682 if (i
!=(*dbmap
)->num
) {
685 /* ok so we need to create this database and
688 ctdb_ctrl_getdbname(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
689 remote_dbmap
->dbs
[db
].dbid
, mem_ctx
, &name
);
691 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbname from node %u\n",
692 nodemap
->nodes
[j
].pnn
));
695 ctdb_ctrl_createdb(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, name
,
696 remote_dbmap
->dbs
[db
].flags
& CTDB_DB_FLAGS_PERSISTENT
);
698 DEBUG(DEBUG_ERR
, (__location__
" Unable to create local db:%s\n", name
));
701 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, dbmap
);
703 DEBUG(DEBUG_ERR
, (__location__
" Unable to reread dbmap on node %u\n", pnn
));
714 pull the remote database contents from one node into the recdb
716 static int pull_one_remote_database(struct ctdb_context
*ctdb
, uint32_t srcnode
,
717 struct tdb_wrap
*recdb
, uint32_t dbid
)
721 struct ctdb_marshall_buffer
*reply
;
722 struct ctdb_rec_data
*recdata
;
724 TALLOC_CTX
*tmp_ctx
= talloc_new(recdb
);
726 ret
= ctdb_ctrl_pulldb(ctdb
, srcnode
, dbid
, CTDB_LMASTER_ANY
, tmp_ctx
,
727 CONTROL_TIMEOUT(), &outdata
);
729 DEBUG(DEBUG_ERR
,(__location__
" Unable to copy db from node %u\n", srcnode
));
730 talloc_free(tmp_ctx
);
734 reply
= (struct ctdb_marshall_buffer
*)outdata
.dptr
;
736 if (outdata
.dsize
< offsetof(struct ctdb_marshall_buffer
, data
)) {
737 DEBUG(DEBUG_ERR
,(__location__
" invalid data in pulldb reply\n"));
738 talloc_free(tmp_ctx
);
742 recdata
= (struct ctdb_rec_data
*)&reply
->data
[0];
746 recdata
= (struct ctdb_rec_data
*)(recdata
->length
+ (uint8_t *)recdata
), i
++) {
748 struct ctdb_ltdb_header
*hdr
;
751 key
.dptr
= &recdata
->data
[0];
752 key
.dsize
= recdata
->keylen
;
753 data
.dptr
= &recdata
->data
[key
.dsize
];
754 data
.dsize
= recdata
->datalen
;
756 hdr
= (struct ctdb_ltdb_header
*)data
.dptr
;
758 if (data
.dsize
< sizeof(struct ctdb_ltdb_header
)) {
759 DEBUG(DEBUG_CRIT
,(__location__
" bad ltdb record\n"));
760 talloc_free(tmp_ctx
);
764 /* fetch the existing record, if any */
765 existing
= tdb_fetch(recdb
->tdb
, key
);
767 if (existing
.dptr
!= NULL
) {
768 struct ctdb_ltdb_header header
;
769 if (existing
.dsize
< sizeof(struct ctdb_ltdb_header
)) {
770 DEBUG(DEBUG_CRIT
,(__location__
" Bad record size %u from node %u\n",
771 (unsigned)existing
.dsize
, srcnode
));
773 talloc_free(tmp_ctx
);
776 header
= *(struct ctdb_ltdb_header
*)existing
.dptr
;
778 if (!(header
.rsn
< hdr
->rsn
||
779 (header
.dmaster
!= ctdb_get_pnn(ctdb
) &&
780 header
.rsn
== hdr
->rsn
))) {
785 if (tdb_store(recdb
->tdb
, key
, data
, TDB_REPLACE
) != 0) {
786 DEBUG(DEBUG_CRIT
,(__location__
" Failed to store record\n"));
787 talloc_free(tmp_ctx
);
792 talloc_free(tmp_ctx
);
798 struct pull_seqnum_cbdata
{
804 static void pull_seqnum_cb(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
806 struct pull_seqnum_cbdata
*cb_data
= talloc_get_type(callback_data
, struct pull_seqnum_cbdata
);
809 if (cb_data
->failed
!= 0) {
810 DEBUG(DEBUG_ERR
, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn
));
815 DEBUG(DEBUG_ERR
, ("Error when pulling seqnum from node %d\n", node_pnn
));
820 if (outdata
.dsize
!= sizeof(uint64_t)) {
821 DEBUG(DEBUG_ERR
, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn
, (int)outdata
.dsize
, (int)sizeof(uint64_t)));
822 cb_data
->failed
= -1;
826 seqnum
= *((uint64_t *)outdata
.dptr
);
828 if (seqnum
> cb_data
->seqnum
||
829 (cb_data
->pnn
== -1 && seqnum
== 0)) {
830 cb_data
->seqnum
= seqnum
;
831 cb_data
->pnn
= node_pnn
;
/*
 * Async-control failure callback for GET_DB_SEQNUM: logs the failing
 * node.  NOTE(review): the line that marks cb_data->failed is missing
 * from this extract - presumably it is set here so the caller aborts;
 * confirm against the full source.
 */
835 static void pull_seqnum_fail_cb(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
837 struct pull_seqnum_cbdata
*cb_data
= talloc_get_type(callback_data
, struct pull_seqnum_cbdata
);
839 DEBUG(DEBUG_ERR
, ("Failed to pull db seqnum from node %d\n", node_pnn
));
843 static int pull_highest_seqnum_pdb(struct ctdb_context
*ctdb
,
844 struct ctdb_recoverd
*rec
,
845 struct ctdb_node_map
*nodemap
,
846 struct tdb_wrap
*recdb
, uint32_t dbid
)
848 TALLOC_CTX
*tmp_ctx
= talloc_new(NULL
);
852 struct pull_seqnum_cbdata
*cb_data
;
854 DEBUG(DEBUG_NOTICE
, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid
));
859 data
.dsize
= sizeof(outdata
);
860 data
.dptr
= (uint8_t *)&outdata
[0];
862 cb_data
= talloc(tmp_ctx
, struct pull_seqnum_cbdata
);
863 if (cb_data
== NULL
) {
864 DEBUG(DEBUG_ERR
, ("Failed to allocate pull highest seqnum cb_data structure\n"));
865 talloc_free(tmp_ctx
);
873 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
874 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_GET_DB_SEQNUM
,
876 CONTROL_TIMEOUT(), false, data
,
880 DEBUG(DEBUG_ERR
, (__location__
" Failed to run async GET_DB_SEQNUM\n"));
882 talloc_free(tmp_ctx
);
886 if (cb_data
->failed
!= 0) {
887 DEBUG(DEBUG_NOTICE
, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid
));
888 talloc_free(tmp_ctx
);
892 if (cb_data
->pnn
== -1) {
893 DEBUG(DEBUG_NOTICE
, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid
));
894 talloc_free(tmp_ctx
);
898 DEBUG(DEBUG_NOTICE
, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid
, cb_data
->pnn
, (long long)cb_data
->seqnum
));
900 if (pull_one_remote_database(ctdb
, cb_data
->pnn
, recdb
, dbid
) != 0) {
901 DEBUG(DEBUG_ERR
, ("Failed to pull higest seqnum database 0x%08x from node %d\n", dbid
, cb_data
->pnn
));
902 talloc_free(tmp_ctx
);
906 talloc_free(tmp_ctx
);
912 pull all the remote database contents into the recdb
914 static int pull_remote_database(struct ctdb_context
*ctdb
,
915 struct ctdb_recoverd
*rec
,
916 struct ctdb_node_map
*nodemap
,
917 struct tdb_wrap
*recdb
, uint32_t dbid
,
922 if (persistent
&& ctdb
->tunable
.recover_pdb_by_seqnum
!= 0) {
924 ret
= pull_highest_seqnum_pdb(ctdb
, rec
, nodemap
, recdb
, dbid
);
930 /* pull all records from all other nodes across onto this node
931 (this merges based on rsn)
933 for (j
=0; j
<nodemap
->num
; j
++) {
934 /* dont merge from nodes that are unavailable */
935 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
938 if (pull_one_remote_database(ctdb
, nodemap
->nodes
[j
].pnn
, recdb
, dbid
) != 0) {
939 DEBUG(DEBUG_ERR
,(__location__
" Failed to pull remote database from node %u\n",
940 nodemap
->nodes
[j
].pnn
));
941 ctdb_set_culprit_count(rec
, nodemap
->nodes
[j
].pnn
, nodemap
->num
);
/*
 * Push this node's flags for node pnn out to the cluster via the
 * MODFLAGS control: set exactly `flags`, clear everything else
 * (clear mask is ~flags).  Errors are logged; the visible fragment
 * suggests an int result but the return lines are missing from this
 * extract.
 */
951 update flags on all active nodes
953 static int update_flags_on_all_nodes(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
, uint32_t pnn
, uint32_t flags
)
957 ret
= ctdb_ctrl_modflags(ctdb
, CONTROL_TIMEOUT(), pnn
, flags
, ~flags
);
959 DEBUG(DEBUG_ERR
, (__location__
" Unable to update nodeflags on remote nodes\n"));
967 ensure all nodes have the same vnnmap we do
969 static int update_vnnmap_on_all_nodes(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
,
970 uint32_t pnn
, struct ctdb_vnn_map
*vnnmap
, TALLOC_CTX
*mem_ctx
)
974 /* push the new vnn map out to all the nodes */
975 for (j
=0; j
<nodemap
->num
; j
++) {
976 /* dont push to nodes that are unavailable */
977 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
981 ret
= ctdb_ctrl_setvnnmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
, mem_ctx
, vnnmap
);
983 DEBUG(DEBUG_ERR
, (__location__
" Unable to set vnnmap for node %u\n", pnn
));
993 struct vacuum_info
*next
, *prev
;
994 struct ctdb_recoverd
*rec
;
996 struct ctdb_db_context
*ctdb_db
;
997 struct ctdb_marshall_buffer
*recs
;
998 struct ctdb_rec_data
*r
;
1003 called when a vacuum fetch has completed - just free it and do the next one
1005 static void vacuum_fetch_callback(struct ctdb_client_call_state
*state
)
1012 process the next element from the vacuum list
1014 static void vacuum_fetch_next(struct vacuum_info
*v
)
1016 struct ctdb_call call
;
1017 struct ctdb_rec_data
*r
;
1019 while (v
->recs
->count
) {
1020 struct ctdb_client_call_state
*state
;
1022 struct ctdb_ltdb_header
*hdr
;
1025 call
.call_id
= CTDB_NULL_FUNC
;
1026 call
.flags
= CTDB_IMMEDIATE_MIGRATION
;
1027 call
.flags
|= CTDB_CALL_FLAG_VACUUM_MIGRATION
;
1030 v
->r
= (struct ctdb_rec_data
*)(r
->length
+ (uint8_t *)r
);
1033 call
.key
.dptr
= &r
->data
[0];
1034 call
.key
.dsize
= r
->keylen
;
1036 /* ensure we don't block this daemon - just skip a record if we can't get
1038 if (tdb_chainlock_nonblock(v
->ctdb_db
->ltdb
->tdb
, call
.key
) != 0) {
1042 data
= tdb_fetch(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
1043 if (data
.dptr
== NULL
) {
1044 tdb_chainunlock(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
1048 if (data
.dsize
< sizeof(struct ctdb_ltdb_header
)) {
1050 tdb_chainunlock(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
1054 hdr
= (struct ctdb_ltdb_header
*)data
.dptr
;
1055 if (hdr
->dmaster
== v
->rec
->ctdb
->pnn
) {
1056 /* its already local */
1058 tdb_chainunlock(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
1064 state
= ctdb_call_send(v
->ctdb_db
, &call
);
1065 tdb_chainunlock(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
1066 if (state
== NULL
) {
1067 DEBUG(DEBUG_ERR
,(__location__
" Failed to setup vacuum fetch call\n"));
1071 state
->async
.fn
= vacuum_fetch_callback
;
1072 state
->async
.private_data
= NULL
;
/*
 * talloc destructor for a vacuum_info: unlinks it from the recovery
 * daemon's vacuum_info list so the list never holds a dangling entry.
 * NOTE(review): the return statement is missing from this extract;
 * talloc destructors conventionally return 0 to allow the free -
 * confirm against the full source.
 */
1080 destroy a vacuum info structure
1082 static int vacuum_info_destructor(struct vacuum_info
*v
)
1084 DLIST_REMOVE(v
->rec
->vacuum_info
, v
);
1090 handler for vacuum fetch
1092 static void vacuum_fetch_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
1093 TDB_DATA data
, void *private_data
)
1095 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
1096 struct ctdb_marshall_buffer
*recs
;
1098 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
1100 struct ctdb_dbid_map
*dbmap
=NULL
;
1101 bool persistent
= false;
1102 struct ctdb_db_context
*ctdb_db
;
1103 struct ctdb_rec_data
*r
;
1105 struct vacuum_info
*v
;
1107 recs
= (struct ctdb_marshall_buffer
*)data
.dptr
;
1108 r
= (struct ctdb_rec_data
*)&recs
->data
[0];
1110 if (recs
->count
== 0) {
1111 talloc_free(tmp_ctx
);
1117 for (v
=rec
->vacuum_info
;v
;v
=v
->next
) {
1118 if (srcnode
== v
->srcnode
&& recs
->db_id
== v
->ctdb_db
->db_id
) {
1119 /* we're already working on records from this node */
1120 talloc_free(tmp_ctx
);
1125 /* work out if the database is persistent */
1126 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, tmp_ctx
, &dbmap
);
1128 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbids from local node\n"));
1129 talloc_free(tmp_ctx
);
1133 for (i
=0;i
<dbmap
->num
;i
++) {
1134 if (dbmap
->dbs
[i
].dbid
== recs
->db_id
) {
1135 persistent
= dbmap
->dbs
[i
].flags
& CTDB_DB_FLAGS_PERSISTENT
;
1139 if (i
== dbmap
->num
) {
1140 DEBUG(DEBUG_ERR
, (__location__
" Unable to find db_id 0x%x on local node\n", recs
->db_id
));
1141 talloc_free(tmp_ctx
);
1145 /* find the name of this database */
1146 if (ctdb_ctrl_getdbname(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, recs
->db_id
, tmp_ctx
, &name
) != 0) {
1147 DEBUG(DEBUG_ERR
,(__location__
" Failed to get name of db 0x%x\n", recs
->db_id
));
1148 talloc_free(tmp_ctx
);
1153 ctdb_db
= ctdb_attach(ctdb
, CONTROL_TIMEOUT(), name
, persistent
, 0);
1154 if (ctdb_db
== NULL
) {
1155 DEBUG(DEBUG_ERR
,(__location__
" Failed to attach to database '%s'\n", name
));
1156 talloc_free(tmp_ctx
);
1160 v
= talloc_zero(rec
, struct vacuum_info
);
1162 DEBUG(DEBUG_CRIT
,(__location__
" Out of memory\n"));
1163 talloc_free(tmp_ctx
);
1168 v
->srcnode
= srcnode
;
1169 v
->ctdb_db
= ctdb_db
;
1170 v
->recs
= talloc_memdup(v
, recs
, data
.dsize
);
1171 if (v
->recs
== NULL
) {
1172 DEBUG(DEBUG_CRIT
,(__location__
" Out of memory\n"));
1174 talloc_free(tmp_ctx
);
1177 v
->r
= (struct ctdb_rec_data
*)&v
->recs
->data
[0];
1179 DLIST_ADD(rec
->vacuum_info
, v
);
1181 talloc_set_destructor(v
, vacuum_info_destructor
);
1183 vacuum_fetch_next(v
);
1184 talloc_free(tmp_ctx
);
1189 * handler for database detach
1191 static void detach_database_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
1192 TDB_DATA data
, void *private_data
)
1194 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
,
1195 struct ctdb_recoverd
);
1197 struct vacuum_info
*v
, *vnext
;
1198 struct ctdb_db_context
*ctdb_db
;
1200 if (data
.dsize
!= sizeof(db_id
)) {
1203 db_id
= *(uint32_t *)data
.dptr
;
1205 ctdb_db
= find_ctdb_db(ctdb
, db_id
);
1206 if (ctdb_db
== NULL
) {
1207 /* database is not attached */
1211 /* Stop any active vacuum fetch */
1212 v
= rec
->vacuum_info
;
1216 if (v
->ctdb_db
->db_id
== db_id
) {
1222 DLIST_REMOVE(ctdb
->db_list
, ctdb_db
);
1224 DEBUG(DEBUG_NOTICE
, ("Detached from database '%s'\n",
1226 talloc_free(ctdb_db
);
/*
 * Timer callback used by ctdb_wait_timeout(): p points at the caller's
 * timed_out flag.  NOTE(review): the statement that sets *timed_out is
 * missing from this extract - the ctdb_wait_timeout() loop below spins
 * on that flag, so presumably it is set to non-zero here; confirm
 * against the full source.
 */
1230 called when ctdb_wait_timeout should finish
1232 static void ctdb_wait_handler(struct event_context
*ev
, struct timed_event
*te
,
1233 struct timeval yt
, void *p
)
1235 uint32_t *timed_out
= (uint32_t *)p
;
/*
 * Block for secs seconds while still servicing the event loop: arm a
 * one-shot timer (whole seconds plus the fractional part converted to
 * microseconds) that flips timed_out via ctdb_wait_handler, then pump
 * event_loop_once() until it fires.
 */
1240 wait for a given number of seconds
1242 static void ctdb_wait_timeout(struct ctdb_context
*ctdb
, double secs
)
1244 uint32_t timed_out
= 0;
1245 time_t usecs
= (secs
- (time_t)secs
) * 1000000;
1246 event_add_timed(ctdb
->ev
, ctdb
, timeval_current_ofs(secs
, usecs
), ctdb_wait_handler
, &timed_out
);
1247 while (!timed_out
) {
1248 event_loop_once(ctdb
->ev
);
/*
 * Timer callback fired when the election period ends: clears
 * rec->election_timeout (which ctdb_wait_election() polls) and logs.
 * NOTE(review): lines between the NULL assignment and the DEBUG call
 * (original 1260-1261) are missing from this extract.
 */
1253 called when an election times out (ends)
1255 static void ctdb_election_timeout(struct event_context
*ev
, struct timed_event
*te
,
1256 struct timeval t
, void *p
)
1258 struct ctdb_recoverd
*rec
= talloc_get_type(p
, struct ctdb_recoverd
);
1259 rec
->election_timeout
= NULL
;
1262 DEBUG(DEBUG_WARNING
,("Election period ended\n"));
/*
 * Block until the current election finishes: pump the event loop while
 * rec->election_timeout is armed; ctdb_election_timeout() NULLs it when
 * the election period ends.
 */
1267 wait for an election to finish. It finished election_timeout seconds after
1268 the last election packet is received
1270 static void ctdb_wait_election(struct ctdb_recoverd
*rec
)
1272 struct ctdb_context
*ctdb
= rec
->ctdb
;
1273 while (rec
->election_timeout
) {
1274 event_loop_once(ctdb
->ev
);
1279 Update our local flags from all remote connected nodes.
1280 This is only run when we are or we belive we are the recovery master
1282 static int update_local_flags(struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
)
1285 struct ctdb_context
*ctdb
= rec
->ctdb
;
1286 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
1288 /* get the nodemap for all active remote nodes and verify
1289 they are the same as for this node
1291 for (j
=0; j
<nodemap
->num
; j
++) {
1292 struct ctdb_node_map
*remote_nodemap
=NULL
;
1295 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_DISCONNECTED
) {
1298 if (nodemap
->nodes
[j
].pnn
== ctdb
->pnn
) {
1302 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
1303 mem_ctx
, &remote_nodemap
);
1305 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from remote node %u\n",
1306 nodemap
->nodes
[j
].pnn
));
1307 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
1308 talloc_free(mem_ctx
);
1309 return MONITOR_FAILED
;
1311 if (nodemap
->nodes
[j
].flags
!= remote_nodemap
->nodes
[j
].flags
) {
1312 /* We should tell our daemon about this so it
1313 updates its flags or else we will log the same
1314 message again in the next iteration of recovery.
1315 Since we are the recovery master we can just as
1316 well update the flags on all nodes.
1318 ret
= ctdb_ctrl_modflags(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
, remote_nodemap
->nodes
[j
].flags
, ~remote_nodemap
->nodes
[j
].flags
);
1320 DEBUG(DEBUG_ERR
, (__location__
" Unable to update nodeflags on remote nodes\n"));
1324 /* Update our local copy of the flags in the recovery
1327 DEBUG(DEBUG_NOTICE
,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1328 nodemap
->nodes
[j
].pnn
, remote_nodemap
->nodes
[j
].flags
,
1329 nodemap
->nodes
[j
].flags
));
1330 nodemap
->nodes
[j
].flags
= remote_nodemap
->nodes
[j
].flags
;
1332 talloc_free(remote_nodemap
);
1334 talloc_free(mem_ctx
);
1339 /* Create a new random generation ip.
1340 The generation id can not be the INVALID_GENERATION id
1342 static uint32_t new_generation(void)
1344 uint32_t generation
;
1347 generation
= random();
1349 if (generation
!= INVALID_GENERATION
) {
1359 create a temporary working database
1361 static struct tdb_wrap
*create_recdb(struct ctdb_context
*ctdb
, TALLOC_CTX
*mem_ctx
)
1364 struct tdb_wrap
*recdb
;
1367 /* open up the temporary recovery database */
1368 name
= talloc_asprintf(mem_ctx
, "%s/recdb.tdb.%u",
1369 ctdb
->db_directory_state
,
1376 tdb_flags
= TDB_NOLOCK
;
1377 if (ctdb
->valgrinding
) {
1378 tdb_flags
|= TDB_NOMMAP
;
1380 tdb_flags
|= (TDB_INCOMPATIBLE_HASH
| TDB_DISALLOW_NESTING
);
1382 recdb
= tdb_wrap_open(mem_ctx
, name
, ctdb
->tunable
.database_hash_size
,
1383 tdb_flags
, O_RDWR
|O_CREAT
|O_EXCL
, 0600);
1384 if (recdb
== NULL
) {
1385 DEBUG(DEBUG_CRIT
,(__location__
" Failed to create temp recovery database '%s'\n", name
));
/*
  a traverse function for pulling all relevant records from recdb
 */
struct recdb_data {
	struct ctdb_context *ctdb;
	struct ctdb_marshall_buffer *recdata; /* marshalled blob being built */
	uint32_t len;			/* bytes of recdata currently used */
	uint32_t allocated_len;		/* bytes allocated for recdata */
	bool failed;			/* set by traverse_recdb on error */
	bool persistent;		/* database is persistent - keep empty records */
};
1406 static int traverse_recdb(struct tdb_context
*tdb
, TDB_DATA key
, TDB_DATA data
, void *p
)
1408 struct recdb_data
*params
= (struct recdb_data
*)p
;
1409 struct ctdb_rec_data
*recdata
;
1410 struct ctdb_ltdb_header
*hdr
;
1413 * skip empty records - but NOT for persistent databases:
1415 * The record-by-record mode of recovery deletes empty records.
1416 * For persistent databases, this can lead to data corruption
1417 * by deleting records that should be there:
1419 * - Assume the cluster has been running for a while.
1421 * - A record R in a persistent database has been created and
1422 * deleted a couple of times, the last operation being deletion,
1423 * leaving an empty record with a high RSN, say 10.
1425 * - Now a node N is turned off.
1427 * - This leaves the local database copy of D on N with the empty
1428 * copy of R and RSN 10. On all other nodes, the recovery has deleted
1429 * the copy of record R.
1431 * - Now the record is created again while node N is turned off.
1432 * This creates R with RSN = 1 on all nodes except for N.
1434 * - Now node N is turned on again. The following recovery will chose
1435 * the older empty copy of R due to RSN 10 > RSN 1.
1437 * ==> Hence the record is gone after the recovery.
1439 * On databases like Samba's registry, this can damage the higher-level
1440 * data structures built from the various tdb-level records.
1442 if (!params
->persistent
&& data
.dsize
<= sizeof(struct ctdb_ltdb_header
)) {
1446 /* update the dmaster field to point to us */
1447 hdr
= (struct ctdb_ltdb_header
*)data
.dptr
;
1448 if (!params
->persistent
) {
1449 hdr
->dmaster
= params
->ctdb
->pnn
;
1450 hdr
->flags
|= CTDB_REC_FLAG_MIGRATED_WITH_DATA
;
1453 /* add the record to the blob ready to send to the nodes */
1454 recdata
= ctdb_marshall_record(params
->recdata
, 0, key
, NULL
, data
);
1455 if (recdata
== NULL
) {
1456 params
->failed
= true;
1459 if (params
->len
+ recdata
->length
>= params
->allocated_len
) {
1460 params
->allocated_len
= recdata
->length
+ params
->len
+ params
->ctdb
->tunable
.pulldb_preallocation_size
;
1461 params
->recdata
= talloc_realloc_size(NULL
, params
->recdata
, params
->allocated_len
);
1463 if (params
->recdata
== NULL
) {
1464 DEBUG(DEBUG_CRIT
,(__location__
" Failed to expand recdata to %u\n",
1465 recdata
->length
+ params
->len
));
1466 params
->failed
= true;
1469 params
->recdata
->count
++;
1470 memcpy(params
->len
+(uint8_t *)params
->recdata
, recdata
, recdata
->length
);
1471 params
->len
+= recdata
->length
;
1472 talloc_free(recdata
);
1478 push the recdb database out to all nodes
1480 static int push_recdb_database(struct ctdb_context
*ctdb
, uint32_t dbid
,
1482 struct tdb_wrap
*recdb
, struct ctdb_node_map
*nodemap
)
1484 struct recdb_data params
;
1485 struct ctdb_marshall_buffer
*recdata
;
1487 TALLOC_CTX
*tmp_ctx
;
1490 tmp_ctx
= talloc_new(ctdb
);
1491 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
1493 recdata
= talloc_zero(recdb
, struct ctdb_marshall_buffer
);
1494 CTDB_NO_MEMORY(ctdb
, recdata
);
1496 recdata
->db_id
= dbid
;
1499 params
.recdata
= recdata
;
1500 params
.len
= offsetof(struct ctdb_marshall_buffer
, data
);
1501 params
.allocated_len
= params
.len
;
1502 params
.failed
= false;
1503 params
.persistent
= persistent
;
1505 if (tdb_traverse_read(recdb
->tdb
, traverse_recdb
, ¶ms
) == -1) {
1506 DEBUG(DEBUG_ERR
,(__location__
" Failed to traverse recdb database\n"));
1507 talloc_free(params
.recdata
);
1508 talloc_free(tmp_ctx
);
1512 if (params
.failed
) {
1513 DEBUG(DEBUG_ERR
,(__location__
" Failed to traverse recdb database\n"));
1514 talloc_free(params
.recdata
);
1515 talloc_free(tmp_ctx
);
1519 recdata
= params
.recdata
;
1521 outdata
.dptr
= (void *)recdata
;
1522 outdata
.dsize
= params
.len
;
1524 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
1525 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_PUSH_DB
,
1527 CONTROL_TIMEOUT(), false, outdata
,
1530 DEBUG(DEBUG_ERR
,(__location__
" Failed to push recdb records to nodes for db 0x%x\n", dbid
));
1531 talloc_free(recdata
);
1532 talloc_free(tmp_ctx
);
1536 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - pushed remote database 0x%x of size %u\n",
1537 dbid
, recdata
->count
));
1539 talloc_free(recdata
);
1540 talloc_free(tmp_ctx
);
1547 go through a full recovery on one database
1549 static int recover_database(struct ctdb_recoverd
*rec
,
1550 TALLOC_CTX
*mem_ctx
,
1554 struct ctdb_node_map
*nodemap
,
1555 uint32_t transaction_id
)
1557 struct tdb_wrap
*recdb
;
1559 struct ctdb_context
*ctdb
= rec
->ctdb
;
1561 struct ctdb_control_wipe_database w
;
1564 recdb
= create_recdb(ctdb
, mem_ctx
);
1565 if (recdb
== NULL
) {
1569 /* pull all remote databases onto the recdb */
1570 ret
= pull_remote_database(ctdb
, rec
, nodemap
, recdb
, dbid
, persistent
);
1572 DEBUG(DEBUG_ERR
, (__location__
" Unable to pull remote database 0x%x\n", dbid
));
1576 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - pulled remote database 0x%x\n", dbid
));
1578 /* wipe all the remote databases. This is safe as we are in a transaction */
1580 w
.transaction_id
= transaction_id
;
1582 data
.dptr
= (void *)&w
;
1583 data
.dsize
= sizeof(w
);
1585 nodes
= list_of_active_nodes(ctdb
, nodemap
, recdb
, true);
1586 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_WIPE_DATABASE
,
1588 CONTROL_TIMEOUT(), false, data
,
1591 DEBUG(DEBUG_ERR
, (__location__
" Unable to wipe database. Recovery failed.\n"));
1596 /* push out the correct database. This sets the dmaster and skips
1597 the empty records */
1598 ret
= push_recdb_database(ctdb
, dbid
, persistent
, recdb
, nodemap
);
1604 /* all done with this database */
1610 static int ctdb_reload_remote_public_ips(struct ctdb_context
*ctdb
,
1611 struct ctdb_recoverd
*rec
,
1612 struct ctdb_node_map
*nodemap
,
1618 if (ctdb
->num_nodes
!= nodemap
->num
) {
1619 DEBUG(DEBUG_ERR
, (__location__
" ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1620 ctdb
->num_nodes
, nodemap
->num
));
1622 *culprit
= ctdb
->pnn
;
1627 for (j
=0; j
<nodemap
->num
; j
++) {
1628 /* For readability */
1629 struct ctdb_node
*node
= ctdb
->nodes
[j
];
1631 /* release any existing data */
1632 if (node
->known_public_ips
) {
1633 talloc_free(node
->known_public_ips
);
1634 node
->known_public_ips
= NULL
;
1636 if (node
->available_public_ips
) {
1637 talloc_free(node
->available_public_ips
);
1638 node
->available_public_ips
= NULL
;
1641 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
1645 /* Retrieve the list of known public IPs from the node */
1646 ret
= ctdb_ctrl_get_public_ips_flags(ctdb
,
1651 &node
->known_public_ips
);
1654 ("Failed to read known public IPs from node: %u\n",
1657 *culprit
= node
->pnn
;
1662 if (ctdb
->do_checkpublicip
&&
1663 !ctdb_op_is_disabled(rec
->takeover_run
) &&
1664 verify_remote_ip_allocation(ctdb
,
1665 node
->known_public_ips
,
1667 DEBUG(DEBUG_ERR
,("Trigger IP reallocation\n"));
1668 rec
->need_takeover_run
= true;
1671 /* Retrieve the list of available public IPs from the node */
1672 ret
= ctdb_ctrl_get_public_ips_flags(ctdb
,
1676 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE
,
1677 &node
->available_public_ips
);
1680 ("Failed to read available public IPs from node: %u\n",
1683 *culprit
= node
->pnn
;
1692 /* when we start a recovery, make sure all nodes use the same reclock file
1695 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd
*rec
)
1697 struct ctdb_context
*ctdb
= rec
->ctdb
;
1698 TALLOC_CTX
*tmp_ctx
= talloc_new(NULL
);
1702 if (ctdb
->recovery_lock_file
== NULL
) {
1706 data
.dsize
= strlen(ctdb
->recovery_lock_file
) + 1;
1707 data
.dptr
= (uint8_t *)ctdb
->recovery_lock_file
;
1710 nodes
= list_of_active_nodes(ctdb
, rec
->nodemap
, tmp_ctx
, true);
1711 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_SET_RECLOCK_FILE
,
1717 DEBUG(DEBUG_ERR
, (__location__
" Failed to sync reclock file settings\n"));
1718 talloc_free(tmp_ctx
);
1722 talloc_free(tmp_ctx
);
1728 * this callback is called for every node that failed to execute ctdb_takeover_run()
1729 * and set flag to re-run takeover run.
1731 static void takeover_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
1733 DEBUG(DEBUG_ERR
, ("Node %u failed the takeover run\n", node_pnn
));
1735 if (callback_data
!= NULL
) {
1736 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
1738 DEBUG(DEBUG_ERR
, ("Setting node %u as recovery fail culprit\n", node_pnn
));
1740 ctdb_set_culprit(rec
, node_pnn
);
1745 static void ban_misbehaving_nodes(struct ctdb_recoverd
*rec
, bool *self_ban
)
1747 struct ctdb_context
*ctdb
= rec
->ctdb
;
1749 struct ctdb_banning_state
*ban_state
;
1752 for (i
=0; i
<ctdb
->num_nodes
; i
++) {
1753 if (ctdb
->nodes
[i
]->ban_state
== NULL
) {
1756 ban_state
= (struct ctdb_banning_state
*)ctdb
->nodes
[i
]->ban_state
;
1757 if (ban_state
->count
< 2*ctdb
->num_nodes
) {
1761 DEBUG(DEBUG_NOTICE
,("Node %u reached %u banning credits - banning it for %u seconds\n",
1762 ctdb
->nodes
[i
]->pnn
, ban_state
->count
,
1763 ctdb
->tunable
.recovery_ban_period
));
1764 ctdb_ban_node(rec
, ctdb
->nodes
[i
]->pnn
, ctdb
->tunable
.recovery_ban_period
);
1765 ban_state
->count
= 0;
1767 /* Banning ourself? */
1768 if (ctdb
->nodes
[i
]->pnn
== rec
->ctdb
->pnn
) {
1774 static bool do_takeover_run(struct ctdb_recoverd
*rec
,
1775 struct ctdb_node_map
*nodemap
,
1776 bool banning_credits_on_fail
)
1778 uint32_t *nodes
= NULL
;
1779 struct srvid_request_data dtr
;
1782 uint32_t *rebalance_nodes
= rec
->force_rebalance_nodes
;
1786 DEBUG(DEBUG_NOTICE
, ("Takeover run starting\n"));
1788 if (ctdb_op_is_in_progress(rec
->takeover_run
)) {
1789 DEBUG(DEBUG_ERR
, (__location__
1790 " takeover run already in progress \n"));
1795 if (!ctdb_op_begin(rec
->takeover_run
)) {
1800 /* Disable IP checks (takeover runs, really) on other nodes
1801 * while doing this takeover run. This will stop those other
1802 * nodes from triggering takeover runs when think they should
1803 * be hosting an IP but it isn't yet on an interface. Don't
1804 * wait for replies since a failure here might cause some
1805 * noise in the logs but will not actually cause a problem.
1807 dtr
.srvid
= 0; /* No reply */
1810 data
.dptr
= (uint8_t*)&dtr
;
1811 data
.dsize
= sizeof(dtr
);
1813 nodes
= list_of_connected_nodes(rec
->ctdb
, nodemap
, rec
, false);
1815 /* Disable for 60 seconds. This can be a tunable later if
1819 for (i
= 0; i
< talloc_array_length(nodes
); i
++) {
1820 if (ctdb_client_send_message(rec
->ctdb
, nodes
[i
],
1821 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
1823 DEBUG(DEBUG_INFO
,("Failed to disable takeover runs\n"));
1827 ret
= ctdb_takeover_run(rec
->ctdb
, nodemap
,
1828 rec
->force_rebalance_nodes
,
1829 takeover_fail_callback
,
1830 banning_credits_on_fail
? rec
: NULL
);
1832 /* Reenable takeover runs and IP checks on other nodes */
1834 for (i
= 0; i
< talloc_array_length(nodes
); i
++) {
1835 if (ctdb_client_send_message(rec
->ctdb
, nodes
[i
],
1836 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
1838 DEBUG(DEBUG_INFO
,("Failed to reenable takeover runs\n"));
1843 DEBUG(DEBUG_ERR
, ("ctdb_takeover_run() failed\n"));
1849 /* Takeover run was successful so clear force rebalance targets */
1850 if (rebalance_nodes
== rec
->force_rebalance_nodes
) {
1851 TALLOC_FREE(rec
->force_rebalance_nodes
);
1853 DEBUG(DEBUG_WARNING
,
1854 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1857 rec
->need_takeover_run
= !ok
;
1859 ctdb_op_end(rec
->takeover_run
);
1861 DEBUG(DEBUG_NOTICE
, ("Takeover run %s\n", ok
? "completed successfully" : "unsuccessful"));
1867 we are the recmaster, and recovery is needed - start a recovery run
1869 static int do_recovery(struct ctdb_recoverd
*rec
,
1870 TALLOC_CTX
*mem_ctx
, uint32_t pnn
,
1871 struct ctdb_node_map
*nodemap
, struct ctdb_vnn_map
*vnnmap
)
1873 struct ctdb_context
*ctdb
= rec
->ctdb
;
1875 uint32_t generation
;
1876 struct ctdb_dbid_map
*dbmap
;
1879 struct timeval start_time
;
1880 uint32_t culprit
= (uint32_t)-1;
1883 DEBUG(DEBUG_NOTICE
, (__location__
" Starting do_recovery\n"));
1885 /* if recovery fails, force it again */
1886 rec
->need_recovery
= true;
1888 if (!ctdb_op_begin(rec
->recovery
)) {
1892 if (rec
->election_timeout
) {
1893 /* an election is in progress */
1894 DEBUG(DEBUG_ERR
, ("do_recovery called while election in progress - try again later\n"));
1898 ban_misbehaving_nodes(rec
, &self_ban
);
1900 DEBUG(DEBUG_NOTICE
, ("This node was banned, aborting recovery\n"));
1904 if (ctdb
->recovery_lock_file
!= NULL
) {
1905 if (ctdb_recovery_have_lock(ctdb
)) {
1906 DEBUG(DEBUG_NOTICE
, ("Already holding recovery lock\n"));
1908 start_time
= timeval_current();
1909 DEBUG(DEBUG_NOTICE
, ("Attempting to take recovery lock (%s)\n",
1910 ctdb
->recovery_lock_file
));
1911 if (!ctdb_recovery_lock(ctdb
)) {
1912 if (ctdb
->runstate
== CTDB_RUNSTATE_FIRST_RECOVERY
) {
1913 /* If ctdb is trying first recovery, it's
1914 * possible that current node does not know
1915 * yet who the recmaster is.
1917 DEBUG(DEBUG_ERR
, ("Unable to get recovery lock"
1918 " - retrying recovery\n"));
1922 DEBUG(DEBUG_ERR
,("Unable to get recovery lock - aborting recovery "
1923 "and ban ourself for %u seconds\n",
1924 ctdb
->tunable
.recovery_ban_period
));
1925 ctdb_ban_node(rec
, pnn
, ctdb
->tunable
.recovery_ban_period
);
1928 ctdb_ctrl_report_recd_lock_latency(ctdb
,
1930 timeval_elapsed(&start_time
));
1932 ("Recovery lock taken successfully by recovery daemon\n"));
1936 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery initiated due to problem with node %u\n", rec
->last_culprit_node
));
1938 /* get a list of all databases */
1939 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, &dbmap
);
1941 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbids from node :%u\n", pnn
));
1945 /* we do the db creation before we set the recovery mode, so the freeze happens
1946 on all databases we will be dealing with. */
1948 /* verify that we have all the databases any other node has */
1949 ret
= create_missing_local_databases(ctdb
, nodemap
, pnn
, &dbmap
, mem_ctx
);
1951 DEBUG(DEBUG_ERR
, (__location__
" Unable to create missing local databases\n"));
1955 /* verify that all other nodes have all our databases */
1956 ret
= create_missing_remote_databases(ctdb
, nodemap
, pnn
, dbmap
, mem_ctx
);
1958 DEBUG(DEBUG_ERR
, (__location__
" Unable to create missing remote databases\n"));
1961 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - created remote databases\n"));
1963 /* update the database priority for all remote databases */
1964 ret
= update_db_priority_on_remote_nodes(ctdb
, nodemap
, pnn
, dbmap
, mem_ctx
);
1966 DEBUG(DEBUG_ERR
, (__location__
" Unable to set db priority on remote nodes\n"));
1968 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - updated db priority for all databases\n"));
1971 /* update all other nodes to use the same setting for reclock files
1972 as the local recovery master.
1974 sync_recovery_lock_file_across_cluster(rec
);
1976 /* set recovery mode to active on all nodes */
1977 ret
= set_recovery_mode(ctdb
, rec
, nodemap
, CTDB_RECOVERY_ACTIVE
);
1979 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode to active on cluster\n"));
1983 /* execute the "startrecovery" event script on all nodes */
1984 ret
= run_startrecovery_eventscript(rec
, nodemap
);
1986 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'startrecovery' event on cluster\n"));
1991 update all nodes to have the same flags that we have
1993 for (i
=0;i
<nodemap
->num
;i
++) {
1994 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
) {
1998 ret
= update_flags_on_all_nodes(ctdb
, nodemap
, i
, nodemap
->nodes
[i
].flags
);
2000 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_INACTIVE
) {
2001 DEBUG(DEBUG_WARNING
, (__location__
"Unable to update flags on inactive node %d\n", i
));
2003 DEBUG(DEBUG_ERR
, (__location__
" Unable to update flags on all nodes for node %d\n", i
));
2009 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - updated flags\n"));
2011 /* pick a new generation number */
2012 generation
= new_generation();
2014 /* change the vnnmap on this node to use the new generation
2015 number but not on any other nodes.
2016 this guarantees that if we abort the recovery prematurely
2017 for some reason (a node stops responding?)
2018 that we can just return immediately and we will reenter
2019 recovery shortly again.
2020 I.e. we deliberately leave the cluster with an inconsistent
2021 generation id to allow us to abort recovery at any stage and
2022 just restart it from scratch.
2024 vnnmap
->generation
= generation
;
2025 ret
= ctdb_ctrl_setvnnmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, vnnmap
);
2027 DEBUG(DEBUG_ERR
, (__location__
" Unable to set vnnmap for node %u\n", pnn
));
2031 data
.dptr
= (void *)&generation
;
2032 data
.dsize
= sizeof(uint32_t);
2034 nodes
= list_of_active_nodes(ctdb
, nodemap
, mem_ctx
, true);
2035 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_TRANSACTION_START
,
2037 CONTROL_TIMEOUT(), false, data
,
2039 transaction_start_fail_callback
,
2041 DEBUG(DEBUG_ERR
, (__location__
" Unable to start transactions. Recovery failed.\n"));
2042 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_TRANSACTION_CANCEL
,
2044 CONTROL_TIMEOUT(), false, tdb_null
,
2048 DEBUG(DEBUG_ERR
,("Failed to cancel recovery transaction\n"));
2053 DEBUG(DEBUG_NOTICE
,(__location__
" started transactions on all nodes\n"));
2055 for (i
=0;i
<dbmap
->num
;i
++) {
2056 ret
= recover_database(rec
, mem_ctx
,
2058 dbmap
->dbs
[i
].flags
& CTDB_DB_FLAGS_PERSISTENT
,
2059 pnn
, nodemap
, generation
);
2061 DEBUG(DEBUG_ERR
, (__location__
" Failed to recover database 0x%x\n", dbmap
->dbs
[i
].dbid
));
2066 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - starting database commits\n"));
2068 /* commit all the changes */
2069 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_TRANSACTION_COMMIT
,
2071 CONTROL_TIMEOUT(), false, data
,
2074 DEBUG(DEBUG_ERR
, (__location__
" Unable to commit recovery changes. Recovery failed.\n"));
2078 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - committed databases\n"));
2081 /* update the capabilities for all nodes */
2082 ret
= update_capabilities(rec
, nodemap
);
2084 DEBUG(DEBUG_ERR
, (__location__
" Unable to update node capabilities.\n"));
2088 /* build a new vnn map with all the currently active and
2090 generation
= new_generation();
2091 vnnmap
= talloc(mem_ctx
, struct ctdb_vnn_map
);
2092 CTDB_NO_MEMORY(ctdb
, vnnmap
);
2093 vnnmap
->generation
= generation
;
2095 vnnmap
->map
= talloc_zero_array(vnnmap
, uint32_t, vnnmap
->size
);
2096 CTDB_NO_MEMORY(ctdb
, vnnmap
->map
);
2097 for (i
=j
=0;i
<nodemap
->num
;i
++) {
2098 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_INACTIVE
) {
2101 if (!ctdb_node_has_capabilities(rec
->caps
,
2102 ctdb
->nodes
[i
]->pnn
,
2103 CTDB_CAP_LMASTER
)) {
2104 /* this node can not be an lmaster */
2105 DEBUG(DEBUG_DEBUG
, ("Node %d cant be a LMASTER, skipping it\n", i
));
2110 vnnmap
->map
= talloc_realloc(vnnmap
, vnnmap
->map
, uint32_t, vnnmap
->size
);
2111 CTDB_NO_MEMORY(ctdb
, vnnmap
->map
);
2112 vnnmap
->map
[j
++] = nodemap
->nodes
[i
].pnn
;
2115 if (vnnmap
->size
== 0) {
2116 DEBUG(DEBUG_NOTICE
, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
2118 vnnmap
->map
= talloc_realloc(vnnmap
, vnnmap
->map
, uint32_t, vnnmap
->size
);
2119 CTDB_NO_MEMORY(ctdb
, vnnmap
->map
);
2120 vnnmap
->map
[0] = pnn
;
2123 /* update to the new vnnmap on all nodes */
2124 ret
= update_vnnmap_on_all_nodes(ctdb
, nodemap
, pnn
, vnnmap
, mem_ctx
);
2126 DEBUG(DEBUG_ERR
, (__location__
" Unable to update vnnmap on all nodes\n"));
2130 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - updated vnnmap\n"));
2132 /* update recmaster to point to us for all nodes */
2133 ret
= set_recovery_master(ctdb
, nodemap
, pnn
);
2135 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery master\n"));
2139 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - updated recmaster\n"));
2141 /* disable recovery mode */
2142 ret
= set_recovery_mode(ctdb
, rec
, nodemap
, CTDB_RECOVERY_NORMAL
);
2144 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode to normal on cluster\n"));
2148 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - disabled recovery mode\n"));
2150 /* Fetch known/available public IPs from each active node */
2151 ret
= ctdb_reload_remote_public_ips(ctdb
, rec
, nodemap
, &culprit
);
2153 DEBUG(DEBUG_ERR
,("Failed to read public ips from remote node %d\n",
2155 rec
->need_takeover_run
= true;
2159 do_takeover_run(rec
, nodemap
, false);
2161 /* execute the "recovered" event script on all nodes */
2162 ret
= run_recovered_eventscript(rec
, nodemap
, "do_recovery");
2164 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
2168 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - finished the recovered event\n"));
2170 /* send a message to all clients telling them that the cluster
2171 has been reconfigured */
2172 ret
= ctdb_client_send_message(ctdb
, CTDB_BROADCAST_CONNECTED
,
2173 CTDB_SRVID_RECONFIGURE
, tdb_null
);
2175 DEBUG(DEBUG_ERR
, (__location__
" Failed to send reconfigure message\n"));
2179 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery complete\n"));
2181 rec
->need_recovery
= false;
2182 ctdb_op_end(rec
->recovery
);
2184 /* we managed to complete a full recovery, make sure to forgive
2185 any past sins by the nodes that could now participate in the
2188 DEBUG(DEBUG_ERR
,("Resetting ban count to 0 for all nodes\n"));
2189 for (i
=0;i
<nodemap
->num
;i
++) {
2190 struct ctdb_banning_state
*ban_state
;
2192 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
) {
2196 ban_state
= (struct ctdb_banning_state
*)ctdb
->nodes
[nodemap
->nodes
[i
].pnn
]->ban_state
;
2197 if (ban_state
== NULL
) {
2201 ban_state
->count
= 0;
2204 /* We just finished a recovery successfully.
2205 We now wait for rerecovery_timeout before we allow
2206 another recovery to take place.
2208 DEBUG(DEBUG_NOTICE
, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb
->tunable
.rerecovery_timeout
));
2209 ctdb_op_disable(rec
->recovery
, ctdb
->ev
,
2210 ctdb
->tunable
.rerecovery_timeout
);
2214 ctdb_op_end(rec
->recovery
);
/*
  elections are won by first checking the number of connected nodes, then
  the priority time, then the pnn
 */
struct election_message {
	uint32_t num_connected;
	struct timeval priority_time;
	uint32_t pnn;		/* sender's node number - final tiebreaker */
	uint32_t node_flags;
};
2231 form this nodes election data
2233 static void ctdb_election_data(struct ctdb_recoverd
*rec
, struct election_message
*em
)
2236 struct ctdb_node_map
*nodemap
;
2237 struct ctdb_context
*ctdb
= rec
->ctdb
;
2241 em
->pnn
= rec
->ctdb
->pnn
;
2242 em
->priority_time
= rec
->priority_time
;
2244 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, rec
, &nodemap
);
2246 DEBUG(DEBUG_ERR
,(__location__
" unable to get node map\n"));
2250 rec
->node_flags
= nodemap
->nodes
[ctdb
->pnn
].flags
;
2251 em
->node_flags
= rec
->node_flags
;
2253 for (i
=0;i
<nodemap
->num
;i
++) {
2254 if (!(nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
)) {
2255 em
->num_connected
++;
2259 /* we shouldnt try to win this election if we cant be a recmaster */
2260 if ((ctdb
->capabilities
& CTDB_CAP_RECMASTER
) == 0) {
2261 em
->num_connected
= 0;
2262 em
->priority_time
= timeval_current();
2265 talloc_free(nodemap
);
2269 see if the given election data wins
2271 static bool ctdb_election_win(struct ctdb_recoverd
*rec
, struct election_message
*em
)
2273 struct election_message myem
;
2276 ctdb_election_data(rec
, &myem
);
2278 /* we cant win if we dont have the recmaster capability */
2279 if ((rec
->ctdb
->capabilities
& CTDB_CAP_RECMASTER
) == 0) {
2283 /* we cant win if we are banned */
2284 if (rec
->node_flags
& NODE_FLAGS_BANNED
) {
2288 /* we cant win if we are stopped */
2289 if (rec
->node_flags
& NODE_FLAGS_STOPPED
) {
2293 /* we will automatically win if the other node is banned */
2294 if (em
->node_flags
& NODE_FLAGS_BANNED
) {
2298 /* we will automatically win if the other node is banned */
2299 if (em
->node_flags
& NODE_FLAGS_STOPPED
) {
2303 /* try to use the most connected node */
2305 cmp
= (int)myem
.num_connected
- (int)em
->num_connected
;
2308 /* then the longest running node */
2310 cmp
= timeval_compare(&em
->priority_time
, &myem
.priority_time
);
2314 cmp
= (int)myem
.pnn
- (int)em
->pnn
;
2321 send out an election request
2323 static int send_election_request(struct ctdb_recoverd
*rec
, uint32_t pnn
)
2326 TDB_DATA election_data
;
2327 struct election_message emsg
;
2329 struct ctdb_context
*ctdb
= rec
->ctdb
;
2331 srvid
= CTDB_SRVID_RECOVERY
;
2333 ctdb_election_data(rec
, &emsg
);
2335 election_data
.dsize
= sizeof(struct election_message
);
2336 election_data
.dptr
= (unsigned char *)&emsg
;
2339 /* first we assume we will win the election and set
2340 recoverymaster to be ourself on the current node
2342 ret
= ctdb_ctrl_setrecmaster(ctdb
, CONTROL_TIMEOUT(), pnn
, pnn
);
2344 DEBUG(DEBUG_ERR
, (__location__
" failed to send recmaster election request\n"));
2349 /* send an election message to all active nodes */
2350 DEBUG(DEBUG_INFO
,(__location__
" Send election request to all active nodes\n"));
2351 return ctdb_client_send_message(ctdb
, CTDB_BROADCAST_ALL
, srvid
, election_data
);
2355 this function will unban all nodes in the cluster
2357 static void unban_all_nodes(struct ctdb_context
*ctdb
)
2360 struct ctdb_node_map
*nodemap
;
2361 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
2363 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, tmp_ctx
, &nodemap
);
2365 DEBUG(DEBUG_ERR
,(__location__
" failed to get nodemap to unban all nodes\n"));
2369 for (i
=0;i
<nodemap
->num
;i
++) {
2370 if ( (!(nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
))
2371 && (nodemap
->nodes
[i
].flags
& NODE_FLAGS_BANNED
) ) {
2372 ret
= ctdb_ctrl_modflags(ctdb
, CONTROL_TIMEOUT(),
2373 nodemap
->nodes
[i
].pnn
, 0,
2376 DEBUG(DEBUG_ERR
, (__location__
" failed to reset ban state\n"));
2381 talloc_free(tmp_ctx
);
2386 we think we are winning the election - send a broadcast election request
2388 static void election_send_request(struct event_context
*ev
, struct timed_event
*te
, struct timeval t
, void *p
)
2390 struct ctdb_recoverd
*rec
= talloc_get_type(p
, struct ctdb_recoverd
);
2393 ret
= send_election_request(rec
, ctdb_get_pnn(rec
->ctdb
));
2395 DEBUG(DEBUG_ERR
,("Failed to send election request!\n"));
2398 talloc_free(rec
->send_election_te
);
2399 rec
->send_election_te
= NULL
;
2403 handler for memory dumps
2405 static void mem_dump_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2406 TDB_DATA data
, void *private_data
)
2408 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
2411 struct srvid_request
*rd
;
2413 if (data
.dsize
!= sizeof(struct srvid_request
)) {
2414 DEBUG(DEBUG_ERR
, (__location__
" Wrong size of return address.\n"));
2415 talloc_free(tmp_ctx
);
2418 rd
= (struct srvid_request
*)data
.dptr
;
2420 dump
= talloc_zero(tmp_ctx
, TDB_DATA
);
2422 DEBUG(DEBUG_ERR
, (__location__
" Failed to allocate memory for memdump\n"));
2423 talloc_free(tmp_ctx
);
2426 ret
= ctdb_dump_memory(ctdb
, dump
);
2428 DEBUG(DEBUG_ERR
, (__location__
" ctdb_dump_memory() failed\n"));
2429 talloc_free(tmp_ctx
);
2433 DEBUG(DEBUG_ERR
, ("recovery master memory dump\n"));
2435 ret
= ctdb_client_send_message(ctdb
, rd
->pnn
, rd
->srvid
, *dump
);
2437 DEBUG(DEBUG_ERR
,("Failed to send rd memdump reply message\n"));
2438 talloc_free(tmp_ctx
);
2442 talloc_free(tmp_ctx
);
2446 handler for reload_nodes
2448 static void reload_nodes_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2449 TDB_DATA data
, void *private_data
)
2451 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2453 DEBUG(DEBUG_ERR
, (__location__
" Reload nodes file from recovery daemon\n"));
2455 ctdb_load_nodes_file(rec
->ctdb
);
2459 static void ctdb_rebalance_timeout(struct event_context
*ev
,
2460 struct timed_event
*te
,
2461 struct timeval t
, void *p
)
2463 struct ctdb_recoverd
*rec
= talloc_get_type(p
, struct ctdb_recoverd
);
2465 if (rec
->force_rebalance_nodes
== NULL
) {
2467 ("Rebalance timeout occurred - no nodes to rebalance\n"));
2472 ("Rebalance timeout occurred - do takeover run\n"));
2473 do_takeover_run(rec
, rec
->nodemap
, false);
2477 static void recd_node_rebalance_handler(struct ctdb_context
*ctdb
,
2479 TDB_DATA data
, void *private_data
)
2484 uint32_t deferred_rebalance
;
2485 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2487 if (rec
->recmaster
!= ctdb_get_pnn(ctdb
)) {
2491 if (data
.dsize
!= sizeof(uint32_t)) {
2492 DEBUG(DEBUG_ERR
,(__location__
" Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data
.dsize
, sizeof(uint32_t)));
2496 pnn
= *(uint32_t *)&data
.dptr
[0];
2498 DEBUG(DEBUG_NOTICE
,("Setting up rebalance of IPs to node %u\n", pnn
));
2500 /* Copy any existing list of nodes. There's probably some
2501 * sort of realloc variant that will do this but we need to
2502 * make sure that freeing the old array also cancels the timer
2503 * event for the timeout... not sure if realloc will do that.
2505 len
= (rec
->force_rebalance_nodes
!= NULL
) ?
2506 talloc_array_length(rec
->force_rebalance_nodes
) :
2509 /* This allows duplicates to be added but they don't cause
2510 * harm. A call to add a duplicate PNN arguably means that
2511 * the timeout should be reset, so this is the simplest
2514 t
= talloc_zero_array(rec
, uint32_t, len
+1);
2515 CTDB_NO_MEMORY_VOID(ctdb
, t
);
2517 memcpy(t
, rec
->force_rebalance_nodes
, sizeof(uint32_t) * len
);
2521 talloc_free(rec
->force_rebalance_nodes
);
2523 rec
->force_rebalance_nodes
= t
;
2525 /* If configured, setup a deferred takeover run to make sure
2526 * that certain nodes get IPs rebalanced to them. This will
2527 * be cancelled if a successful takeover run happens before
2528 * the timeout. Assign tunable value to variable for
2531 deferred_rebalance
= ctdb
->tunable
.deferred_rebalance_on_node_add
;
2532 if (deferred_rebalance
!= 0) {
2533 event_add_timed(ctdb
->ev
, rec
->force_rebalance_nodes
,
2534 timeval_current_ofs(deferred_rebalance
, 0),
2535 ctdb_rebalance_timeout
, rec
);
2541 static void recd_update_ip_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2542 TDB_DATA data
, void *private_data
)
2544 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2545 struct ctdb_public_ip
*ip
;
2547 if (rec
->recmaster
!= rec
->ctdb
->pnn
) {
2548 DEBUG(DEBUG_INFO
,("Not recmaster, ignore update ip message\n"));
2552 if (data
.dsize
!= sizeof(struct ctdb_public_ip
)) {
2553 DEBUG(DEBUG_ERR
,(__location__
" Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data
.dsize
, sizeof(struct ctdb_public_ip
)));
2557 ip
= (struct ctdb_public_ip
*)data
.dptr
;
2559 update_ip_assignment_tree(rec
->ctdb
, ip
);
2562 static void srvid_disable_and_reply(struct ctdb_context
*ctdb
,
2564 struct ctdb_op_state
*op_state
)
2566 struct srvid_request_data
*r
;
2571 /* Validate input data */
2572 if (data
.dsize
!= sizeof(struct srvid_request_data
)) {
2573 DEBUG(DEBUG_ERR
,(__location__
" Wrong size for data :%lu "
2574 "expecting %lu\n", (long unsigned)data
.dsize
,
2575 (long unsigned)sizeof(struct srvid_request
)));
2578 if (data
.dptr
== NULL
) {
2579 DEBUG(DEBUG_ERR
,(__location__
" No data received\n"));
2583 r
= (struct srvid_request_data
*)data
.dptr
;
2586 ret
= ctdb_op_disable(op_state
, ctdb
->ev
, timeout
);
2591 /* Returning our PNN tells the caller that we succeeded */
2592 ret
= ctdb_get_pnn(ctdb
);
2594 result
.dsize
= sizeof(int32_t);
2595 result
.dptr
= (uint8_t *)&ret
;
2596 srvid_request_reply(ctdb
, (struct srvid_request
*)r
, result
);
2599 static void disable_takeover_runs_handler(struct ctdb_context
*ctdb
,
2600 uint64_t srvid
, TDB_DATA data
,
2603 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
,
2604 struct ctdb_recoverd
);
2606 srvid_disable_and_reply(ctdb
, data
, rec
->takeover_run
);
2609 /* Backward compatibility for this SRVID */
2610 static void disable_ip_check_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2611 TDB_DATA data
, void *private_data
)
2613 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
,
2614 struct ctdb_recoverd
);
2617 if (data
.dsize
!= sizeof(uint32_t)) {
2618 DEBUG(DEBUG_ERR
,(__location__
" Wrong size for data :%lu "
2619 "expecting %lu\n", (long unsigned)data
.dsize
,
2620 (long unsigned)sizeof(uint32_t)));
2623 if (data
.dptr
== NULL
) {
2624 DEBUG(DEBUG_ERR
,(__location__
" No data received\n"));
2628 timeout
= *((uint32_t *)data
.dptr
);
2630 ctdb_op_disable(rec
->takeover_run
, ctdb
->ev
, timeout
);
2633 static void disable_recoveries_handler(struct ctdb_context
*ctdb
,
2634 uint64_t srvid
, TDB_DATA data
,
2637 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
,
2638 struct ctdb_recoverd
);
2640 srvid_disable_and_reply(ctdb
, data
, rec
->recovery
);
2644 handler for ip reallocate, just add it to the list of requests and
2645 handle this later in the monitor_cluster loop so we do not recurse
2646 with other requests to takeover_run()
2648 static void ip_reallocate_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2649 TDB_DATA data
, void *private_data
)
2651 struct srvid_request
*request
;
2652 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
,
2653 struct ctdb_recoverd
);
2655 if (data
.dsize
!= sizeof(struct srvid_request
)) {
2656 DEBUG(DEBUG_ERR
, (__location__
" Wrong size of return address.\n"));
2660 request
= (struct srvid_request
*)data
.dptr
;
2662 srvid_request_add(ctdb
, &rec
->reallocate_requests
, request
);
2665 static void process_ipreallocate_requests(struct ctdb_context
*ctdb
,
2666 struct ctdb_recoverd
*rec
)
2671 struct srvid_requests
*current
;
2673 DEBUG(DEBUG_INFO
, ("recovery master forced ip reallocation\n"));
2675 /* Only process requests that are currently pending. More
2676 * might come in while the takeover run is in progress and
2677 * they will need to be processed later since they might
2678 * be in response flag changes.
2680 current
= rec
->reallocate_requests
;
2681 rec
->reallocate_requests
= NULL
;
2683 /* update the list of public ips that a node can handle for
2686 ret
= ctdb_reload_remote_public_ips(ctdb
, rec
, rec
->nodemap
, &culprit
);
2688 DEBUG(DEBUG_ERR
,("Failed to read public ips from remote node %d\n",
2690 rec
->need_takeover_run
= true;
2693 if (do_takeover_run(rec
, rec
->nodemap
, false)) {
2694 ret
= ctdb_get_pnn(ctdb
);
2700 result
.dsize
= sizeof(int32_t);
2701 result
.dptr
= (uint8_t *)&ret
;
2703 srvid_requests_reply(ctdb
, ¤t
, result
);
2708 handler for recovery master elections
2710 static void election_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2711 TDB_DATA data
, void *private_data
)
2713 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2715 struct election_message
*em
= (struct election_message
*)data
.dptr
;
2717 /* Ignore election packets from ourself */
2718 if (ctdb
->pnn
== em
->pnn
) {
2722 /* we got an election packet - update the timeout for the election */
2723 talloc_free(rec
->election_timeout
);
2724 rec
->election_timeout
= event_add_timed(ctdb
->ev
, ctdb
,
2726 timeval_current_ofs(0, 500000) :
2727 timeval_current_ofs(ctdb
->tunable
.election_timeout
, 0),
2728 ctdb_election_timeout
, rec
);
2730 /* someone called an election. check their election data
2731 and if we disagree and we would rather be the elected node,
2732 send a new election message to all other nodes
2734 if (ctdb_election_win(rec
, em
)) {
2735 if (!rec
->send_election_te
) {
2736 rec
->send_election_te
= event_add_timed(ctdb
->ev
, rec
,
2737 timeval_current_ofs(0, 500000),
2738 election_send_request
, rec
);
2740 /*unban_all_nodes(ctdb);*/
2745 TALLOC_FREE(rec
->send_election_te
);
2747 /* Release the recovery lock file */
2748 if (ctdb_recovery_have_lock(ctdb
)) {
2749 ctdb_recovery_unlock(ctdb
);
2750 unban_all_nodes(ctdb
);
2753 /* ok, let that guy become recmaster then */
2754 ret
= ctdb_ctrl_setrecmaster(ctdb
, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb
), em
->pnn
);
2756 DEBUG(DEBUG_ERR
, (__location__
" failed to send recmaster election request"));
2765 force the start of the election process
2767 static void force_election(struct ctdb_recoverd
*rec
, uint32_t pnn
,
2768 struct ctdb_node_map
*nodemap
)
2771 struct ctdb_context
*ctdb
= rec
->ctdb
;
2773 DEBUG(DEBUG_INFO
,(__location__
" Force an election\n"));
2775 /* set all nodes to recovery mode to stop all internode traffic */
2776 ret
= set_recovery_mode(ctdb
, rec
, nodemap
, CTDB_RECOVERY_ACTIVE
);
2778 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode to active on cluster\n"));
2782 talloc_free(rec
->election_timeout
);
2783 rec
->election_timeout
= event_add_timed(ctdb
->ev
, ctdb
,
2785 timeval_current_ofs(0, 500000) :
2786 timeval_current_ofs(ctdb
->tunable
.election_timeout
, 0),
2787 ctdb_election_timeout
, rec
);
2789 ret
= send_election_request(rec
, pnn
);
2791 DEBUG(DEBUG_ERR
, (__location__
" failed to initiate recmaster election"));
2795 /* wait for a few seconds to collect all responses */
2796 ctdb_wait_election(rec
);
2802 handler for when a node changes its flags
2804 static void monitor_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2805 TDB_DATA data
, void *private_data
)
2808 struct ctdb_node_flag_change
*c
= (struct ctdb_node_flag_change
*)data
.dptr
;
2809 struct ctdb_node_map
*nodemap
=NULL
;
2810 TALLOC_CTX
*tmp_ctx
;
2812 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2813 int disabled_flag_changed
;
2815 if (data
.dsize
!= sizeof(*c
)) {
2816 DEBUG(DEBUG_ERR
,(__location__
"Invalid data in ctdb_node_flag_change\n"));
2820 tmp_ctx
= talloc_new(ctdb
);
2821 CTDB_NO_MEMORY_VOID(ctdb
, tmp_ctx
);
2823 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, tmp_ctx
, &nodemap
);
2825 DEBUG(DEBUG_ERR
,(__location__
"ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2826 talloc_free(tmp_ctx
);
2831 for (i
=0;i
<nodemap
->num
;i
++) {
2832 if (nodemap
->nodes
[i
].pnn
== c
->pnn
) break;
2835 if (i
== nodemap
->num
) {
2836 DEBUG(DEBUG_CRIT
,(__location__
"Flag change for non-existant node %u\n", c
->pnn
));
2837 talloc_free(tmp_ctx
);
2841 if (c
->old_flags
!= c
->new_flags
) {
2842 DEBUG(DEBUG_NOTICE
,("Node %u has changed flags - now 0x%x was 0x%x\n", c
->pnn
, c
->new_flags
, c
->old_flags
));
2845 disabled_flag_changed
= (nodemap
->nodes
[i
].flags
^ c
->new_flags
) & NODE_FLAGS_DISABLED
;
2847 nodemap
->nodes
[i
].flags
= c
->new_flags
;
2849 ret
= ctdb_ctrl_getrecmaster(ctdb
, tmp_ctx
, CONTROL_TIMEOUT(),
2850 CTDB_CURRENT_NODE
, &ctdb
->recovery_master
);
2853 ret
= ctdb_ctrl_getrecmode(ctdb
, tmp_ctx
, CONTROL_TIMEOUT(),
2854 CTDB_CURRENT_NODE
, &ctdb
->recovery_mode
);
2858 ctdb
->recovery_master
== ctdb
->pnn
&&
2859 ctdb
->recovery_mode
== CTDB_RECOVERY_NORMAL
) {
2860 /* Only do the takeover run if the perm disabled or unhealthy
2861 flags changed since these will cause an ip failover but not
2863 If the node became disconnected or banned this will also
2864 lead to an ip address failover but that is handled
2867 if (disabled_flag_changed
) {
2868 rec
->need_takeover_run
= true;
2872 talloc_free(tmp_ctx
);
2876 handler for when we need to push out flag changes ot all other nodes
2878 static void push_flags_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2879 TDB_DATA data
, void *private_data
)
2882 struct ctdb_node_flag_change
*c
= (struct ctdb_node_flag_change
*)data
.dptr
;
2883 struct ctdb_node_map
*nodemap
=NULL
;
2884 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
2888 /* find the recovery master */
2889 ret
= ctdb_ctrl_getrecmaster(ctdb
, tmp_ctx
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, &recmaster
);
2891 DEBUG(DEBUG_ERR
, (__location__
" Unable to get recmaster from local node\n"));
2892 talloc_free(tmp_ctx
);
2896 /* read the node flags from the recmaster */
2897 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), recmaster
, tmp_ctx
, &nodemap
);
2899 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from node %u\n", c
->pnn
));
2900 talloc_free(tmp_ctx
);
2903 if (c
->pnn
>= nodemap
->num
) {
2904 DEBUG(DEBUG_ERR
,(__location__
" Nodemap from recmaster does not contain node %d\n", c
->pnn
));
2905 talloc_free(tmp_ctx
);
2909 /* send the flags update to all connected nodes */
2910 nodes
= list_of_connected_nodes(ctdb
, nodemap
, tmp_ctx
, true);
2912 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_MODIFY_FLAGS
,
2913 nodes
, 0, CONTROL_TIMEOUT(),
2917 DEBUG(DEBUG_ERR
, (__location__
" ctdb_control to modify node flags failed\n"));
2919 talloc_free(tmp_ctx
);
2923 talloc_free(tmp_ctx
);
2927 struct verify_recmode_normal_data
{
2929 enum monitor_result status
;
2932 static void verify_recmode_normal_callback(struct ctdb_client_control_state
*state
)
2934 struct verify_recmode_normal_data
*rmdata
= talloc_get_type(state
->async
.private_data
, struct verify_recmode_normal_data
);
2937 /* one more node has responded with recmode data*/
2940 /* if we failed to get the recmode, then return an error and let
2941 the main loop try again.
2943 if (state
->state
!= CTDB_CONTROL_DONE
) {
2944 if (rmdata
->status
== MONITOR_OK
) {
2945 rmdata
->status
= MONITOR_FAILED
;
2950 /* if we got a response, then the recmode will be stored in the
2953 if (state
->status
!= CTDB_RECOVERY_NORMAL
) {
2954 DEBUG(DEBUG_NOTICE
, ("Node:%u was in recovery mode. Start recovery process\n", state
->c
->hdr
.destnode
));
2955 rmdata
->status
= MONITOR_RECOVERY_NEEDED
;
2962 /* verify that all nodes are in normal recovery mode */
2963 static enum monitor_result
verify_recmode(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
)
2965 struct verify_recmode_normal_data
*rmdata
;
2966 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
2967 struct ctdb_client_control_state
*state
;
2968 enum monitor_result status
;
2971 rmdata
= talloc(mem_ctx
, struct verify_recmode_normal_data
);
2972 CTDB_NO_MEMORY_FATAL(ctdb
, rmdata
);
2974 rmdata
->status
= MONITOR_OK
;
2976 /* loop over all active nodes and send an async getrecmode call to
2978 for (j
=0; j
<nodemap
->num
; j
++) {
2979 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
2982 state
= ctdb_ctrl_getrecmode_send(ctdb
, mem_ctx
,
2984 nodemap
->nodes
[j
].pnn
);
2985 if (state
== NULL
) {
2986 /* we failed to send the control, treat this as
2987 an error and try again next iteration
2989 DEBUG(DEBUG_ERR
,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2990 talloc_free(mem_ctx
);
2991 return MONITOR_FAILED
;
2994 /* set up the callback functions */
2995 state
->async
.fn
= verify_recmode_normal_callback
;
2996 state
->async
.private_data
= rmdata
;
2998 /* one more control to wait for to complete */
3003 /* now wait for up to the maximum number of seconds allowed
3004 or until all nodes we expect a response from has replied
3006 while (rmdata
->count
> 0) {
3007 event_loop_once(ctdb
->ev
);
3010 status
= rmdata
->status
;
3011 talloc_free(mem_ctx
);
3016 struct verify_recmaster_data
{
3017 struct ctdb_recoverd
*rec
;
3020 enum monitor_result status
;
3023 static void verify_recmaster_callback(struct ctdb_client_control_state
*state
)
3025 struct verify_recmaster_data
*rmdata
= talloc_get_type(state
->async
.private_data
, struct verify_recmaster_data
);
3028 /* one more node has responded with recmaster data*/
3031 /* if we failed to get the recmaster, then return an error and let
3032 the main loop try again.
3034 if (state
->state
!= CTDB_CONTROL_DONE
) {
3035 if (rmdata
->status
== MONITOR_OK
) {
3036 rmdata
->status
= MONITOR_FAILED
;
3041 /* if we got a response, then the recmaster will be stored in the
3044 if (state
->status
!= rmdata
->pnn
) {
3045 DEBUG(DEBUG_ERR
,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state
->c
->hdr
.destnode
, state
->status
));
3046 ctdb_set_culprit(rmdata
->rec
, state
->c
->hdr
.destnode
);
3047 rmdata
->status
= MONITOR_ELECTION_NEEDED
;
3054 /* verify that all nodes agree that we are the recmaster */
3055 static enum monitor_result
verify_recmaster(struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
, uint32_t pnn
)
3057 struct ctdb_context
*ctdb
= rec
->ctdb
;
3058 struct verify_recmaster_data
*rmdata
;
3059 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
3060 struct ctdb_client_control_state
*state
;
3061 enum monitor_result status
;
3064 rmdata
= talloc(mem_ctx
, struct verify_recmaster_data
);
3065 CTDB_NO_MEMORY_FATAL(ctdb
, rmdata
);
3069 rmdata
->status
= MONITOR_OK
;
3071 /* loop over all active nodes and send an async getrecmaster call to
3073 for (j
=0; j
<nodemap
->num
; j
++) {
3074 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3077 state
= ctdb_ctrl_getrecmaster_send(ctdb
, mem_ctx
,
3079 nodemap
->nodes
[j
].pnn
);
3080 if (state
== NULL
) {
3081 /* we failed to send the control, treat this as
3082 an error and try again next iteration
3084 DEBUG(DEBUG_ERR
,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
3085 talloc_free(mem_ctx
);
3086 return MONITOR_FAILED
;
3089 /* set up the callback functions */
3090 state
->async
.fn
= verify_recmaster_callback
;
3091 state
->async
.private_data
= rmdata
;
3093 /* one more control to wait for to complete */
3098 /* now wait for up to the maximum number of seconds allowed
3099 or until all nodes we expect a response from has replied
3101 while (rmdata
->count
> 0) {
3102 event_loop_once(ctdb
->ev
);
3105 status
= rmdata
->status
;
3106 talloc_free(mem_ctx
);
3110 static bool interfaces_have_changed(struct ctdb_context
*ctdb
,
3111 struct ctdb_recoverd
*rec
)
3113 struct ctdb_control_get_ifaces
*ifaces
= NULL
;
3114 TALLOC_CTX
*mem_ctx
;
3117 mem_ctx
= talloc_new(NULL
);
3119 /* Read the interfaces from the local node */
3120 if (ctdb_ctrl_get_ifaces(ctdb
, CONTROL_TIMEOUT(),
3121 CTDB_CURRENT_NODE
, mem_ctx
, &ifaces
) != 0) {
3122 DEBUG(DEBUG_ERR
, ("Unable to get interfaces from local node %u\n", ctdb
->pnn
));
3123 /* We could return an error. However, this will be
3124 * rare so we'll decide that the interfaces have
3125 * actually changed, just in case.
3127 talloc_free(mem_ctx
);
3132 /* We haven't been here before so things have changed */
3133 DEBUG(DEBUG_NOTICE
, ("Initial interface fetched\n"));
3135 } else if (rec
->ifaces
->num
!= ifaces
->num
) {
3136 /* Number of interfaces has changed */
3137 DEBUG(DEBUG_NOTICE
, ("Interface count changed from %d to %d\n",
3138 rec
->ifaces
->num
, ifaces
->num
));
3141 /* See if interface names or link states have changed */
3143 for (i
= 0; i
< rec
->ifaces
->num
; i
++) {
3144 struct ctdb_control_iface_info
* iface
= &rec
->ifaces
->ifaces
[i
];
3145 if (strcmp(iface
->name
, ifaces
->ifaces
[i
].name
) != 0) {
3147 ("Interface in slot %d changed: %s => %s\n",
3148 i
, iface
->name
, ifaces
->ifaces
[i
].name
));
3152 if (iface
->link_state
!= ifaces
->ifaces
[i
].link_state
) {
3154 ("Interface %s changed state: %d => %d\n",
3155 iface
->name
, iface
->link_state
,
3156 ifaces
->ifaces
[i
].link_state
));
3163 talloc_free(rec
->ifaces
);
3164 rec
->ifaces
= talloc_steal(rec
, ifaces
);
3166 talloc_free(mem_ctx
);
3170 /* called to check that the local allocation of public ip addresses is ok.
3172 static int verify_local_ip_allocation(struct ctdb_context
*ctdb
, struct ctdb_recoverd
*rec
, uint32_t pnn
, struct ctdb_node_map
*nodemap
)
3174 TALLOC_CTX
*mem_ctx
= talloc_new(NULL
);
3175 struct ctdb_uptime
*uptime1
= NULL
;
3176 struct ctdb_uptime
*uptime2
= NULL
;
3178 bool need_takeover_run
= false;
3180 ret
= ctdb_ctrl_uptime(ctdb
, mem_ctx
, CONTROL_TIMEOUT(),
3181 CTDB_CURRENT_NODE
, &uptime1
);
3183 DEBUG(DEBUG_ERR
, ("Unable to get uptime from local node %u\n", pnn
));
3184 talloc_free(mem_ctx
);
3188 if (interfaces_have_changed(ctdb
, rec
)) {
3189 DEBUG(DEBUG_NOTICE
, ("The interfaces status has changed on "
3190 "local node %u - force takeover run\n",
3192 need_takeover_run
= true;
3195 ret
= ctdb_ctrl_uptime(ctdb
, mem_ctx
, CONTROL_TIMEOUT(),
3196 CTDB_CURRENT_NODE
, &uptime2
);
3198 DEBUG(DEBUG_ERR
, ("Unable to get uptime from local node %u\n", pnn
));
3199 talloc_free(mem_ctx
);
3203 /* skip the check if the startrecovery time has changed */
3204 if (timeval_compare(&uptime1
->last_recovery_started
,
3205 &uptime2
->last_recovery_started
) != 0) {
3206 DEBUG(DEBUG_NOTICE
, (__location__
" last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3207 talloc_free(mem_ctx
);
3211 /* skip the check if the endrecovery time has changed */
3212 if (timeval_compare(&uptime1
->last_recovery_finished
,
3213 &uptime2
->last_recovery_finished
) != 0) {
3214 DEBUG(DEBUG_NOTICE
, (__location__
" last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3215 talloc_free(mem_ctx
);
3219 /* skip the check if we have started but not finished recovery */
3220 if (timeval_compare(&uptime1
->last_recovery_finished
,
3221 &uptime1
->last_recovery_started
) != 1) {
3222 DEBUG(DEBUG_INFO
, (__location__
" in the middle of recovery or ip reallocation. skipping public ip address check\n"));
3223 talloc_free(mem_ctx
);
3228 /* verify that we have the ip addresses we should have
3229 and we dont have ones we shouldnt have.
3230 if we find an inconsistency we set recmode to
3231 active on the local node and wait for the recmaster
3232 to do a full blown recovery.
3233 also if the pnn is -1 and we are healthy and can host the ip
3234 we also request a ip reallocation.
3236 if (ctdb
->tunable
.disable_ip_failover
== 0) {
3237 struct ctdb_all_public_ips
*ips
= NULL
;
3239 /* read the *available* IPs from the local node */
3240 ret
= ctdb_ctrl_get_public_ips_flags(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, mem_ctx
, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE
, &ips
);
3242 DEBUG(DEBUG_ERR
, ("Unable to get available public IPs from local node %u\n", pnn
));
3243 talloc_free(mem_ctx
);
3247 for (j
=0; j
<ips
->num
; j
++) {
3248 if (ips
->ips
[j
].pnn
== -1 &&
3249 nodemap
->nodes
[pnn
].flags
== 0) {
3250 DEBUG(DEBUG_CRIT
,("Public IP '%s' is not assigned and we could serve it\n",
3251 ctdb_addr_to_str(&ips
->ips
[j
].addr
)));
3252 need_takeover_run
= true;
3258 /* read the *known* IPs from the local node */
3259 ret
= ctdb_ctrl_get_public_ips_flags(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, mem_ctx
, 0, &ips
);
3261 DEBUG(DEBUG_ERR
, ("Unable to get known public IPs from local node %u\n", pnn
));
3262 talloc_free(mem_ctx
);
3266 for (j
=0; j
<ips
->num
; j
++) {
3267 if (ips
->ips
[j
].pnn
== pnn
) {
3268 if (ctdb
->do_checkpublicip
&& !ctdb_sys_have_ip(&ips
->ips
[j
].addr
)) {
3269 DEBUG(DEBUG_CRIT
,("Public IP '%s' is assigned to us but not on an interface\n",
3270 ctdb_addr_to_str(&ips
->ips
[j
].addr
)));
3271 need_takeover_run
= true;
3274 if (ctdb
->do_checkpublicip
&&
3275 ctdb_sys_have_ip(&ips
->ips
[j
].addr
)) {
3277 DEBUG(DEBUG_CRIT
,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
3278 ctdb_addr_to_str(&ips
->ips
[j
].addr
)));
3280 if (ctdb_ctrl_release_ip(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, &ips
->ips
[j
]) != 0) {
3281 DEBUG(DEBUG_ERR
,("Failed to release local IP address\n"));
3288 if (need_takeover_run
) {
3289 struct srvid_request rd
;
3292 DEBUG(DEBUG_CRIT
,("Trigger takeoverrun\n"));
3296 data
.dptr
= (uint8_t *)&rd
;
3297 data
.dsize
= sizeof(rd
);
3299 ret
= ctdb_client_send_message(ctdb
, rec
->recmaster
, CTDB_SRVID_TAKEOVER_RUN
, data
);
3301 DEBUG(DEBUG_ERR
,(__location__
" Failed to send ipreallocate to recmaster :%d\n", (int)rec
->recmaster
));
3304 talloc_free(mem_ctx
);
3309 static void async_getnodemap_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
3311 struct ctdb_node_map
**remote_nodemaps
= callback_data
;
3313 if (node_pnn
>= ctdb
->num_nodes
) {
3314 DEBUG(DEBUG_ERR
,(__location__
" pnn from invalid node\n"));
3318 remote_nodemaps
[node_pnn
] = (struct ctdb_node_map
*)talloc_steal(remote_nodemaps
, outdata
.dptr
);
3322 static int get_remote_nodemaps(struct ctdb_context
*ctdb
, TALLOC_CTX
*mem_ctx
,
3323 struct ctdb_node_map
*nodemap
,
3324 struct ctdb_node_map
**remote_nodemaps
)
3328 nodes
= list_of_active_nodes(ctdb
, nodemap
, mem_ctx
, true);
3329 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_GET_NODEMAP
,
3331 CONTROL_TIMEOUT(), false, tdb_null
,
3332 async_getnodemap_callback
,
3334 remote_nodemaps
) != 0) {
3335 DEBUG(DEBUG_ERR
, (__location__
" Unable to pull all remote nodemaps\n"));
3343 static int update_recovery_lock_file(struct ctdb_context
*ctdb
)
3345 TALLOC_CTX
*tmp_ctx
= talloc_new(NULL
);
3346 const char *reclockfile
;
3348 if (ctdb_ctrl_getreclock(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, tmp_ctx
, &reclockfile
) != 0) {
3349 DEBUG(DEBUG_ERR
,("Failed to read reclock file from daemon\n"));
3350 talloc_free(tmp_ctx
);
3354 if (reclockfile
== NULL
) {
3355 if (ctdb
->recovery_lock_file
!= NULL
) {
3356 DEBUG(DEBUG_NOTICE
,("Recovery lock file disabled\n"));
3357 talloc_free(ctdb
->recovery_lock_file
);
3358 ctdb
->recovery_lock_file
= NULL
;
3359 ctdb_recovery_unlock(ctdb
);
3361 talloc_free(tmp_ctx
);
3365 if (ctdb
->recovery_lock_file
== NULL
) {
3367 ("Recovery lock file enabled (%s)\n", reclockfile
));
3368 ctdb
->recovery_lock_file
= talloc_strdup(ctdb
, reclockfile
);
3369 ctdb_recovery_unlock(ctdb
);
3370 talloc_free(tmp_ctx
);
3375 if (!strcmp(reclockfile
, ctdb
->recovery_lock_file
)) {
3376 talloc_free(tmp_ctx
);
3381 ("Recovery lock file changed (now %s)\n", reclockfile
));
3382 talloc_free(ctdb
->recovery_lock_file
);
3383 ctdb
->recovery_lock_file
= talloc_strdup(ctdb
, reclockfile
);
3384 ctdb_recovery_unlock(ctdb
);
3386 talloc_free(tmp_ctx
);
3390 static void main_loop(struct ctdb_context
*ctdb
, struct ctdb_recoverd
*rec
,
3391 TALLOC_CTX
*mem_ctx
)
3394 struct ctdb_node_map
*nodemap
=NULL
;
3395 struct ctdb_node_map
*recmaster_nodemap
=NULL
;
3396 struct ctdb_node_map
**remote_nodemaps
=NULL
;
3397 struct ctdb_vnn_map
*vnnmap
=NULL
;
3398 struct ctdb_vnn_map
*remote_vnnmap
=NULL
;
3399 uint32_t num_lmasters
;
3400 int32_t debug_level
;
3405 /* verify that the main daemon is still running */
3406 if (ctdb_kill(ctdb
, ctdb
->ctdbd_pid
, 0) != 0) {
3407 DEBUG(DEBUG_CRIT
,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3411 /* ping the local daemon to tell it we are alive */
3412 ctdb_ctrl_recd_ping(ctdb
);
3414 if (rec
->election_timeout
) {
3415 /* an election is in progress */
3419 /* read the debug level from the parent and update locally */
3420 ret
= ctdb_ctrl_get_debuglevel(ctdb
, CTDB_CURRENT_NODE
, &debug_level
);
3422 DEBUG(DEBUG_ERR
, (__location__
" Failed to read debuglevel from parent\n"));
3425 DEBUGLEVEL
= debug_level
;
3427 /* get relevant tunables */
3428 ret
= ctdb_ctrl_get_all_tunables(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, &ctdb
->tunable
);
3430 DEBUG(DEBUG_ERR
,("Failed to get tunables - retrying\n"));
3435 ret
= ctdb_ctrl_get_runstate(ctdb
, CONTROL_TIMEOUT(),
3436 CTDB_CURRENT_NODE
, &ctdb
->runstate
);
3438 DEBUG(DEBUG_ERR
, ("Failed to get runstate - retrying\n"));
3442 /* get the current recovery lock file from the server */
3443 if (update_recovery_lock_file(ctdb
) != 0) {
3444 DEBUG(DEBUG_ERR
,("Failed to update the recovery lock file\n"));
3448 /* Make sure that if recovery lock verification becomes disabled when
3451 if (ctdb
->recovery_lock_file
== NULL
) {
3452 ctdb_recovery_unlock(ctdb
);
3455 pnn
= ctdb_get_pnn(ctdb
);
3457 /* get the vnnmap */
3458 ret
= ctdb_ctrl_getvnnmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, &vnnmap
);
3460 DEBUG(DEBUG_ERR
, (__location__
" Unable to get vnnmap from node %u\n", pnn
));
3465 /* get number of nodes */
3467 talloc_free(rec
->nodemap
);
3468 rec
->nodemap
= NULL
;
3471 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), pnn
, rec
, &rec
->nodemap
);
3473 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from node %u\n", pnn
));
3476 nodemap
= rec
->nodemap
;
3478 /* remember our own node flags */
3479 rec
->node_flags
= nodemap
->nodes
[pnn
].flags
;
3481 ban_misbehaving_nodes(rec
, &self_ban
);
3483 DEBUG(DEBUG_NOTICE
, ("This node was banned, restart main_loop\n"));
3487 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
3488 also frozen and that the recmode is set to active.
3490 if (rec
->node_flags
& (NODE_FLAGS_STOPPED
| NODE_FLAGS_BANNED
)) {
3491 /* If this node has become inactive then we want to
3492 * reduce the chances of it taking over the recovery
3493 * master role when it becomes active again. This
3494 * helps to stabilise the recovery master role so that
3495 * it stays on the most stable node.
3497 rec
->priority_time
= timeval_current();
3499 ret
= ctdb_ctrl_getrecmode(ctdb
, mem_ctx
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, &ctdb
->recovery_mode
);
3501 DEBUG(DEBUG_ERR
,(__location__
" Failed to read recmode from local node\n"));
3503 if (ctdb
->recovery_mode
== CTDB_RECOVERY_NORMAL
) {
3504 DEBUG(DEBUG_ERR
,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
3506 ret
= ctdb_ctrl_setrecmode(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, CTDB_RECOVERY_ACTIVE
);
3508 DEBUG(DEBUG_ERR
,(__location__
" Failed to activate recovery mode in STOPPED or BANNED state\n"));
3512 ret
= ctdb_ctrl_freeze(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
);
3514 DEBUG(DEBUG_ERR
,(__location__
" Failed to freeze node in STOPPED or BANNED state\n"));
3519 /* If this node is stopped or banned then it is not the recovery
3520 * master, so don't do anything. This prevents stopped or banned
3521 * node from starting election and sending unnecessary controls.
3526 /* check which node is the recovery master */
3527 ret
= ctdb_ctrl_getrecmaster(ctdb
, mem_ctx
, CONTROL_TIMEOUT(), pnn
, &rec
->recmaster
);
3529 DEBUG(DEBUG_ERR
, (__location__
" Unable to get recmaster from node %u\n", pnn
));
3533 /* If we are not the recmaster then do some housekeeping */
3534 if (rec
->recmaster
!= pnn
) {
3535 /* Ignore any IP reallocate requests - only recmaster
3538 TALLOC_FREE(rec
->reallocate_requests
);
3539 /* Clear any nodes that should be force rebalanced in
3540 * the next takeover run. If the recovery master role
3541 * has moved then we don't want to process these some
3542 * time in the future.
3544 TALLOC_FREE(rec
->force_rebalance_nodes
);
3547 /* This is a special case. When recovery daemon is started, recmaster
3548 * is set to -1. If a node is not started in stopped state, then
3549 * start election to decide recovery master
3551 if (rec
->recmaster
== (uint32_t)-1) {
3552 DEBUG(DEBUG_NOTICE
,(__location__
" Initial recovery master set - forcing election\n"));
3553 force_election(rec
, pnn
, nodemap
);
3557 /* update the capabilities for all nodes */
3558 ret
= update_capabilities(rec
, nodemap
);
3560 DEBUG(DEBUG_ERR
, (__location__
" Unable to update node capabilities.\n"));
3565 * If the current recmaster does not have CTDB_CAP_RECMASTER,
3566 * but we have, then force an election and try to become the new
3569 if (!ctdb_node_has_capabilities(rec
->caps
,
3571 CTDB_CAP_RECMASTER
) &&
3572 (rec
->ctdb
->capabilities
& CTDB_CAP_RECMASTER
) &&
3573 !(nodemap
->nodes
[pnn
].flags
& NODE_FLAGS_INACTIVE
)) {
3574 DEBUG(DEBUG_ERR
, (__location__
" Current recmaster node %u does not have CAP_RECMASTER,"
3575 " but we (node %u) have - force an election\n",
3576 rec
->recmaster
, pnn
));
3577 force_election(rec
, pnn
, nodemap
);
3581 /* verify that the recmaster node is still active */
3582 for (j
=0; j
<nodemap
->num
; j
++) {
3583 if (nodemap
->nodes
[j
].pnn
==rec
->recmaster
) {
3588 if (j
== nodemap
->num
) {
3589 DEBUG(DEBUG_ERR
, ("Recmaster node %u not in list. Force reelection\n", rec
->recmaster
));
3590 force_election(rec
, pnn
, nodemap
);
3594 /* if recovery master is disconnected we must elect a new recmaster */
3595 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_DISCONNECTED
) {
3596 DEBUG(DEBUG_NOTICE
, ("Recmaster node %u is disconnected. Force reelection\n", nodemap
->nodes
[j
].pnn
));
3597 force_election(rec
, pnn
, nodemap
);
3601 /* get nodemap from the recovery master to check if it is inactive */
3602 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
3603 mem_ctx
, &recmaster_nodemap
);
3605 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from recovery master %u\n",
3606 nodemap
->nodes
[j
].pnn
));
3611 if ((recmaster_nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) &&
3612 (rec
->node_flags
& NODE_FLAGS_INACTIVE
) == 0) {
3613 DEBUG(DEBUG_NOTICE
, ("Recmaster node %u no longer available. Force reelection\n", nodemap
->nodes
[j
].pnn
));
3615 * update our nodemap to carry the recmaster's notion of
3616 * its own flags, so that we don't keep freezing the
3617 * inactive recmaster node...
3619 nodemap
->nodes
[j
].flags
= recmaster_nodemap
->nodes
[j
].flags
;
3620 force_election(rec
, pnn
, nodemap
);
3624 /* verify that we have all ip addresses we should have and we dont
3625 * have addresses we shouldnt have.
3627 if (ctdb
->tunable
.disable_ip_failover
== 0 &&
3628 !ctdb_op_is_disabled(rec
->takeover_run
)) {
3629 if (verify_local_ip_allocation(ctdb
, rec
, pnn
, nodemap
) != 0) {
3630 DEBUG(DEBUG_ERR
, (__location__
" Public IPs were inconsistent.\n"));
3635 /* if we are not the recmaster then we do not need to check
3636 if recovery is needed
3638 if (pnn
!= rec
->recmaster
) {
3643 /* ensure our local copies of flags are right */
3644 ret
= update_local_flags(rec
, nodemap
);
3645 if (ret
== MONITOR_ELECTION_NEEDED
) {
3646 DEBUG(DEBUG_NOTICE
,("update_local_flags() called for a re-election.\n"));
3647 force_election(rec
, pnn
, nodemap
);
3650 if (ret
!= MONITOR_OK
) {
3651 DEBUG(DEBUG_ERR
,("Unable to update local flags\n"));
3655 if (ctdb
->num_nodes
!= nodemap
->num
) {
3656 DEBUG(DEBUG_ERR
, (__location__
" ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb
->num_nodes
, nodemap
->num
));
3657 ctdb_load_nodes_file(ctdb
);
3661 /* verify that all active nodes agree that we are the recmaster */
3662 switch (verify_recmaster(rec
, nodemap
, pnn
)) {
3663 case MONITOR_RECOVERY_NEEDED
:
3664 /* can not happen */
3666 case MONITOR_ELECTION_NEEDED
:
3667 force_election(rec
, pnn
, nodemap
);
3671 case MONITOR_FAILED
:
3676 if (rec
->need_recovery
) {
3677 /* a previous recovery didn't finish */
3678 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3682 /* verify that all active nodes are in normal mode
3683 and not in recovery mode
3685 switch (verify_recmode(ctdb
, nodemap
)) {
3686 case MONITOR_RECOVERY_NEEDED
:
3687 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3689 case MONITOR_FAILED
:
3691 case MONITOR_ELECTION_NEEDED
:
3692 /* can not happen */
3698 if (ctdb
->recovery_lock_file
!= NULL
) {
3699 /* We must already hold the recovery lock */
3700 if (!ctdb_recovery_have_lock(ctdb
)) {
3701 DEBUG(DEBUG_ERR
,("Failed recovery lock sanity check. Force a recovery\n"));
3702 ctdb_set_culprit(rec
, ctdb
->pnn
);
3703 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3709 /* if there are takeovers requested, perform it and notify the waiters */
3710 if (!ctdb_op_is_disabled(rec
->takeover_run
) &&
3711 rec
->reallocate_requests
) {
3712 process_ipreallocate_requests(ctdb
, rec
);
3715 /* If recoveries are disabled then there is no use doing any
3716 * nodemap or flags checks. Recoveries might be disabled due
3717 * to "reloadnodes", so doing these checks might cause an
3718 * unnecessary recovery. */
3719 if (ctdb_op_is_disabled(rec
->recovery
)) {
3723 /* get the nodemap for all active remote nodes
3725 remote_nodemaps
= talloc_array(mem_ctx
, struct ctdb_node_map
*, nodemap
->num
);
3726 if (remote_nodemaps
== NULL
) {
3727 DEBUG(DEBUG_ERR
, (__location__
" failed to allocate remote nodemap array\n"));
3730 for(i
=0; i
<nodemap
->num
; i
++) {
3731 remote_nodemaps
[i
] = NULL
;
3733 if (get_remote_nodemaps(ctdb
, mem_ctx
, nodemap
, remote_nodemaps
) != 0) {
3734 DEBUG(DEBUG_ERR
,(__location__
" Failed to read remote nodemaps\n"));
3738 /* verify that all other nodes have the same nodemap as we have
3740 for (j
=0; j
<nodemap
->num
; j
++) {
3741 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3745 if (remote_nodemaps
[j
] == NULL
) {
3746 DEBUG(DEBUG_ERR
,(__location__
" Did not get a remote nodemap for node %d, restarting monitoring\n", j
));
3747 ctdb_set_culprit(rec
, j
);
3752 /* if the nodes disagree on how many nodes there are
3753 then this is a good reason to try recovery
3755 if (remote_nodemaps
[j
]->num
!= nodemap
->num
) {
3756 DEBUG(DEBUG_ERR
, (__location__
" Remote node:%u has different node count. %u vs %u of the local node\n",
3757 nodemap
->nodes
[j
].pnn
, remote_nodemaps
[j
]->num
, nodemap
->num
));
3758 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3759 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3763 /* if the nodes disagree on which nodes exist and are
3764 active, then that is also a good reason to do recovery
3766 for (i
=0;i
<nodemap
->num
;i
++) {
3767 if (remote_nodemaps
[j
]->nodes
[i
].pnn
!= nodemap
->nodes
[i
].pnn
) {
3768 DEBUG(DEBUG_ERR
, (__location__
" Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3769 nodemap
->nodes
[j
].pnn
, i
,
3770 remote_nodemaps
[j
]->nodes
[i
].pnn
, nodemap
->nodes
[i
].pnn
));
3771 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3772 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
3780 * Update node flags obtained from each active node. This ensure we have
3781 * up-to-date information for all the nodes.
3783 for (j
=0; j
<nodemap
->num
; j
++) {
3784 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3787 nodemap
->nodes
[j
].flags
= remote_nodemaps
[j
]->nodes
[j
].flags
;
3790 for (j
=0; j
<nodemap
->num
; j
++) {
3791 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3795 /* verify the flags are consistent
3797 for (i
=0; i
<nodemap
->num
; i
++) {
3798 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
) {
3802 if (nodemap
->nodes
[i
].flags
!= remote_nodemaps
[j
]->nodes
[i
].flags
) {
3803 DEBUG(DEBUG_ERR
, (__location__
" Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3804 nodemap
->nodes
[j
].pnn
,
3805 nodemap
->nodes
[i
].pnn
,
3806 remote_nodemaps
[j
]->nodes
[i
].flags
,
3807 nodemap
->nodes
[i
].flags
));
3809 DEBUG(DEBUG_ERR
,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps
[j
]->nodes
[i
].flags
, j
));
3810 update_flags_on_all_nodes(ctdb
, nodemap
, nodemap
->nodes
[i
].pnn
, remote_nodemaps
[j
]->nodes
[i
].flags
);
3811 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3812 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
3816 DEBUG(DEBUG_ERR
,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap
->nodes
[i
].flags
, i
));
3817 update_flags_on_all_nodes(ctdb
, nodemap
, nodemap
->nodes
[i
].pnn
, nodemap
->nodes
[i
].flags
);
3818 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3819 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
3828 /* count how many active nodes there are */
3830 for (i
=0; i
<nodemap
->num
; i
++) {
3831 if (!(nodemap
->nodes
[i
].flags
& NODE_FLAGS_INACTIVE
)) {
3832 if (ctdb_node_has_capabilities(rec
->caps
,
3833 ctdb
->nodes
[i
]->pnn
,
3834 CTDB_CAP_LMASTER
)) {
3841 /* There must be the same number of lmasters in the vnn map as
3842 * there are active nodes with the lmaster capability... or
3845 if (vnnmap
->size
!= num_lmasters
) {
3846 DEBUG(DEBUG_ERR
, (__location__
" The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
3847 vnnmap
->size
, num_lmasters
));
3848 ctdb_set_culprit(rec
, ctdb
->pnn
);
3849 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3853 /* verify that all active nodes in the nodemap also exist in
3856 for (j
=0; j
<nodemap
->num
; j
++) {
3857 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3860 if (nodemap
->nodes
[j
].pnn
== pnn
) {
3864 for (i
=0; i
<vnnmap
->size
; i
++) {
3865 if (vnnmap
->map
[i
] == nodemap
->nodes
[j
].pnn
) {
3869 if (i
== vnnmap
->size
) {
3870 DEBUG(DEBUG_ERR
, (__location__
" Node %u is active in the nodemap but did not exist in the vnnmap\n",
3871 nodemap
->nodes
[j
].pnn
));
3872 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3873 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3879 /* verify that all other nodes have the same vnnmap
3880 and are from the same generation
3882 for (j
=0; j
<nodemap
->num
; j
++) {
3883 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3886 if (nodemap
->nodes
[j
].pnn
== pnn
) {
3890 ret
= ctdb_ctrl_getvnnmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
3891 mem_ctx
, &remote_vnnmap
);
3893 DEBUG(DEBUG_ERR
, (__location__
" Unable to get vnnmap from remote node %u\n",
3894 nodemap
->nodes
[j
].pnn
));
3898 /* verify the vnnmap generation is the same */
3899 if (vnnmap
->generation
!= remote_vnnmap
->generation
) {
3900 DEBUG(DEBUG_ERR
, (__location__
" Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3901 nodemap
->nodes
[j
].pnn
, remote_vnnmap
->generation
, vnnmap
->generation
));
3902 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3903 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3907 /* verify the vnnmap size is the same */
3908 if (vnnmap
->size
!= remote_vnnmap
->size
) {
3909 DEBUG(DEBUG_ERR
, (__location__
" Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3910 nodemap
->nodes
[j
].pnn
, remote_vnnmap
->size
, vnnmap
->size
));
3911 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3912 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3916 /* verify the vnnmap is the same */
3917 for (i
=0;i
<vnnmap
->size
;i
++) {
3918 if (remote_vnnmap
->map
[i
] != vnnmap
->map
[i
]) {
3919 DEBUG(DEBUG_ERR
, (__location__
" Remote node %u has different vnnmap.\n",
3920 nodemap
->nodes
[j
].pnn
));
3921 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3922 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
3929 /* we might need to change who has what IP assigned */
3930 if (rec
->need_takeover_run
) {
3931 uint32_t culprit
= (uint32_t)-1;
3933 rec
->need_takeover_run
= false;
3935 /* update the list of public ips that a node can handle for
3938 ret
= ctdb_reload_remote_public_ips(ctdb
, rec
, nodemap
, &culprit
);
3940 DEBUG(DEBUG_ERR
,("Failed to read public ips from remote node %d\n",
3942 rec
->need_takeover_run
= true;
3946 /* execute the "startrecovery" event script on all nodes */
3947 ret
= run_startrecovery_eventscript(rec
, nodemap
);
3949 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'startrecovery' event on cluster\n"));
3950 ctdb_set_culprit(rec
, ctdb
->pnn
);
3951 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3955 /* If takeover run fails, then the offending nodes are
3956 * assigned ban culprit counts. And we re-try takeover.
3957 * If takeover run fails repeatedly, the node would get
3960 * If rec->need_takeover_run is not set to true at this
3961 * failure, monitoring is disabled cluster-wide (via
3962 * startrecovery eventscript) and will not get enabled.
3964 if (!do_takeover_run(rec
, nodemap
, true)) {
3968 /* execute the "recovered" event script on all nodes */
3969 ret
= run_recovered_eventscript(rec
, nodemap
, "monitor_cluster");
3971 // we cant check whether the event completed successfully
3972 // since this script WILL fail if the node is in recovery mode
3973 // and if that race happens, the code here would just cause a second
3974 // cascading recovery.
3976 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3977 ctdb_set_culprit(rec
, ctdb
->pnn
);
3978 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3985 the main monitoring loop
3987 static void monitor_cluster(struct ctdb_context
*ctdb
)
3989 struct ctdb_recoverd
*rec
;
3991 DEBUG(DEBUG_NOTICE
,("monitor_cluster starting\n"));
3993 rec
= talloc_zero(ctdb
, struct ctdb_recoverd
);
3994 CTDB_NO_MEMORY_FATAL(ctdb
, rec
);
3998 rec
->takeover_run
= ctdb_op_init(rec
, "takeover runs");
3999 CTDB_NO_MEMORY_FATAL(ctdb
, rec
->takeover_run
);
4001 rec
->recovery
= ctdb_op_init(rec
, "recoveries");
4002 CTDB_NO_MEMORY_FATAL(ctdb
, rec
->recovery
);
4004 rec
->priority_time
= timeval_current();
4006 /* register a message port for sending memory dumps */
4007 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_MEM_DUMP
, mem_dump_handler
, rec
);
4009 /* register a message port for recovery elections */
4010 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_RECOVERY
, election_handler
, rec
);
4012 /* when nodes are disabled/enabled */
4013 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_SET_NODE_FLAGS
, monitor_handler
, rec
);
4015 /* when we are asked to puch out a flag change */
4016 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_PUSH_NODE_FLAGS
, push_flags_handler
, rec
);
4018 /* register a message port for vacuum fetch */
4019 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_VACUUM_FETCH
, vacuum_fetch_handler
, rec
);
4021 /* register a message port for reloadnodes */
4022 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_RELOAD_NODES
, reload_nodes_handler
, rec
);
4024 /* register a message port for performing a takeover run */
4025 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_TAKEOVER_RUN
, ip_reallocate_handler
, rec
);
4027 /* register a message port for disabling the ip check for a short while */
4028 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_DISABLE_IP_CHECK
, disable_ip_check_handler
, rec
);
4030 /* register a message port for updating the recovery daemons node assignment for an ip */
4031 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_RECD_UPDATE_IP
, recd_update_ip_handler
, rec
);
4033 /* register a message port for forcing a rebalance of a node next
4035 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_REBALANCE_NODE
, recd_node_rebalance_handler
, rec
);
4037 /* Register a message port for disabling takeover runs */
4038 ctdb_client_set_message_handler(ctdb
,
4039 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
4040 disable_takeover_runs_handler
, rec
);
4042 /* Register a message port for disabling recoveries */
4043 ctdb_client_set_message_handler(ctdb
,
4044 CTDB_SRVID_DISABLE_RECOVERIES
,
4045 disable_recoveries_handler
, rec
);
4047 /* register a message port for detaching database */
4048 ctdb_client_set_message_handler(ctdb
,
4049 CTDB_SRVID_DETACH_DATABASE
,
4050 detach_database_handler
, rec
);
4053 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
4054 struct timeval start
;
4058 DEBUG(DEBUG_CRIT
,(__location__
4059 " Failed to create temp context\n"));
4063 start
= timeval_current();
4064 main_loop(ctdb
, rec
, mem_ctx
);
4065 talloc_free(mem_ctx
);
4067 /* we only check for recovery once every second */
4068 elapsed
= timeval_elapsed(&start
);
4069 if (elapsed
< ctdb
->tunable
.recover_interval
) {
4070 ctdb_wait_timeout(ctdb
, ctdb
->tunable
.recover_interval
4077 event handler for when the main ctdbd dies
4079 static void ctdb_recoverd_parent(struct event_context
*ev
, struct fd_event
*fde
,
4080 uint16_t flags
, void *private_data
)
4082 DEBUG(DEBUG_ALERT
,("recovery daemon parent died - exiting\n"));
4087 called regularly to verify that the recovery daemon is still running
4089 static void ctdb_check_recd(struct event_context
*ev
, struct timed_event
*te
,
4090 struct timeval yt
, void *p
)
4092 struct ctdb_context
*ctdb
= talloc_get_type(p
, struct ctdb_context
);
4094 if (ctdb_kill(ctdb
, ctdb
->recoverd_pid
, 0) != 0) {
4095 DEBUG(DEBUG_ERR
,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb
->recoverd_pid
));
4097 event_add_timed(ctdb
->ev
, ctdb
, timeval_zero(),
4098 ctdb_restart_recd
, ctdb
);
4103 event_add_timed(ctdb
->ev
, ctdb
->recd_ctx
,
4104 timeval_current_ofs(30, 0),
4105 ctdb_check_recd
, ctdb
);
4108 static void recd_sig_child_handler(struct event_context
*ev
,
4109 struct signal_event
*se
, int signum
, int count
,
4113 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4118 pid
= waitpid(-1, &status
, WNOHANG
);
4120 if (errno
!= ECHILD
) {
4121 DEBUG(DEBUG_ERR
, (__location__
" waitpid() returned error. errno:%s(%d)\n", strerror(errno
),errno
));
4126 DEBUG(DEBUG_DEBUG
, ("RECD SIGCHLD from %d\n", (int)pid
));
4132 startup the recovery daemon as a child of the main ctdb daemon
4134 int ctdb_start_recoverd(struct ctdb_context
*ctdb
)
4137 struct signal_event
*se
;
4138 struct tevent_fd
*fde
;
4140 if (pipe(fd
) != 0) {
4144 ctdb
->recoverd_pid
= ctdb_fork(ctdb
);
4145 if (ctdb
->recoverd_pid
== -1) {
4149 if (ctdb
->recoverd_pid
!= 0) {
4150 talloc_free(ctdb
->recd_ctx
);
4151 ctdb
->recd_ctx
= talloc_new(ctdb
);
4152 CTDB_NO_MEMORY(ctdb
, ctdb
->recd_ctx
);
4155 event_add_timed(ctdb
->ev
, ctdb
->recd_ctx
,
4156 timeval_current_ofs(30, 0),
4157 ctdb_check_recd
, ctdb
);
4163 srandom(getpid() ^ time(NULL
));
4165 ctdb_set_process_name("ctdb_recovered");
4166 if (switch_from_server_to_client(ctdb
, "recoverd") != 0) {
4167 DEBUG(DEBUG_CRIT
, (__location__
"ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
4171 DEBUG(DEBUG_DEBUG
, (__location__
" Created PIPE FD:%d to recovery daemon\n", fd
[0]));
4173 fde
= event_add_fd(ctdb
->ev
, ctdb
, fd
[0], EVENT_FD_READ
,
4174 ctdb_recoverd_parent
, &fd
[0]);
4175 tevent_fd_set_auto_close(fde
);
4177 /* set up a handler to pick up sigchld */
4178 se
= event_add_signal(ctdb
->ev
, ctdb
,
4180 recd_sig_child_handler
,
4183 DEBUG(DEBUG_CRIT
,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
4187 monitor_cluster(ctdb
);
4189 DEBUG(DEBUG_ALERT
,("ERROR: ctdb_recoverd finished!?\n"));
4194 shutdown the recovery daemon
4196 void ctdb_stop_recoverd(struct ctdb_context
*ctdb
)
4198 if (ctdb
->recoverd_pid
== 0) {
4202 DEBUG(DEBUG_NOTICE
,("Shutting down recovery daemon\n"));
4203 ctdb_kill(ctdb
, ctdb
->recoverd_pid
, SIGTERM
);
4205 TALLOC_FREE(ctdb
->recd_ctx
);
4206 TALLOC_FREE(ctdb
->recd_ping_count
);
4209 static void ctdb_restart_recd(struct event_context
*ev
, struct timed_event
*te
,
4210 struct timeval t
, void *private_data
)
4212 struct ctdb_context
*ctdb
= talloc_get_type(private_data
, struct ctdb_context
);
4214 DEBUG(DEBUG_ERR
,("Restarting recovery daemon\n"));
4215 ctdb_stop_recoverd(ctdb
);
4216 ctdb_start_recoverd(ctdb
);