4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
27 #include "../include/ctdb_client.h"
28 #include "../include/ctdb_private.h"
29 #include "lib/tdb_wrap/tdb_wrap.h"
30 #include "lib/util/dlinklist.h"
33 /* List of SRVID requests that need to be processed */
35 struct srvid_list
*next
, *prev
;
36 struct srvid_request
*request
;
39 struct srvid_requests
{
40 struct srvid_list
*requests
;
43 static void srvid_request_reply(struct ctdb_context
*ctdb
,
44 struct srvid_request
*request
,
47 /* Someone that sent srvid==0 does not want a reply */
48 if (request
->srvid
== 0) {
53 if (ctdb_client_send_message(ctdb
, request
->pnn
, request
->srvid
,
55 DEBUG(DEBUG_INFO
,("Sent SRVID reply to %u:%llu\n",
56 (unsigned)request
->pnn
,
57 (unsigned long long)request
->srvid
));
59 DEBUG(DEBUG_ERR
,("Failed to send SRVID reply to %u:%llu\n",
60 (unsigned)request
->pnn
,
61 (unsigned long long)request
->srvid
));
67 static void srvid_requests_reply(struct ctdb_context
*ctdb
,
68 struct srvid_requests
**requests
,
73 for (r
= (*requests
)->requests
; r
!= NULL
; r
= r
->next
) {
74 srvid_request_reply(ctdb
, r
->request
, result
);
77 /* Free the list structure... */
78 TALLOC_FREE(*requests
);
81 static void srvid_request_add(struct ctdb_context
*ctdb
,
82 struct srvid_requests
**requests
,
83 struct srvid_request
*request
)
89 if (*requests
== NULL
) {
90 *requests
= talloc_zero(ctdb
, struct srvid_requests
);
91 if (*requests
== NULL
) {
96 t
= talloc_zero(*requests
, struct srvid_list
);
98 /* If *requests was just allocated above then free it */
99 if ((*requests
)->requests
== NULL
) {
100 TALLOC_FREE(*requests
);
105 t
->request
= (struct srvid_request
*)talloc_steal(t
, request
);
106 DLIST_ADD((*requests
)->requests
, t
);
111 /* Failed to add the request to the list. Send a fail. */
112 DEBUG(DEBUG_ERR
, (__location__
113 " Out of memory, failed to queue SRVID request\n"));
115 result
.dsize
= sizeof(ret
);
116 result
.dptr
= (uint8_t *)&ret
;
117 srvid_request_reply(ctdb
, request
, result
);
120 /* An abstraction to allow an operation (takeover runs, recoveries,
121 * ...) to be disabled for a given timeout */
122 struct ctdb_op_state
{
123 struct tevent_timer
*timer
;
128 static struct ctdb_op_state
*ctdb_op_init(TALLOC_CTX
*mem_ctx
, const char *name
)
130 struct ctdb_op_state
*state
= talloc_zero(mem_ctx
, struct ctdb_op_state
);
133 state
->in_progress
= false;
140 static bool ctdb_op_is_disabled(struct ctdb_op_state
*state
)
142 return state
->timer
!= NULL
;
145 static bool ctdb_op_begin(struct ctdb_op_state
*state
)
147 if (ctdb_op_is_disabled(state
)) {
149 ("Unable to begin - %s are disabled\n", state
->name
));
153 state
->in_progress
= true;
157 static bool ctdb_op_end(struct ctdb_op_state
*state
)
159 return state
->in_progress
= false;
162 static bool ctdb_op_is_in_progress(struct ctdb_op_state
*state
)
164 return state
->in_progress
;
167 static void ctdb_op_enable(struct ctdb_op_state
*state
)
169 TALLOC_FREE(state
->timer
);
172 static void ctdb_op_timeout_handler(struct event_context
*ev
,
173 struct timed_event
*te
,
174 struct timeval yt
, void *p
)
176 struct ctdb_op_state
*state
=
177 talloc_get_type(p
, struct ctdb_op_state
);
179 DEBUG(DEBUG_NOTICE
,("Reenabling %s after timeout\n", state
->name
));
180 ctdb_op_enable(state
);
183 static int ctdb_op_disable(struct ctdb_op_state
*state
,
184 struct tevent_context
*ev
,
188 DEBUG(DEBUG_NOTICE
,("Reenabling %s\n", state
->name
));
189 ctdb_op_enable(state
);
193 if (state
->in_progress
) {
195 ("Unable to disable %s - in progress\n", state
->name
));
199 DEBUG(DEBUG_NOTICE
,("Disabling %s for %u seconds\n",
200 state
->name
, timeout
));
202 /* Clear any old timers */
203 talloc_free(state
->timer
);
205 /* Arrange for the timeout to occur */
206 state
->timer
= tevent_add_timer(ev
, state
,
207 timeval_current_ofs(timeout
, 0),
208 ctdb_op_timeout_handler
, state
);
209 if (state
->timer
== NULL
) {
210 DEBUG(DEBUG_ERR
,(__location__
" Unable to setup timer\n"));
217 struct ctdb_banning_state
{
219 struct timeval last_reported_time
;
223 private state of recovery daemon
225 struct ctdb_recoverd
{
226 struct ctdb_context
*ctdb
;
228 uint32_t last_culprit_node
;
229 struct ctdb_node_map
*nodemap
;
230 struct timeval priority_time
;
231 bool need_takeover_run
;
234 struct timed_event
*send_election_te
;
235 struct timed_event
*election_timeout
;
236 struct vacuum_info
*vacuum_info
;
237 struct srvid_requests
*reallocate_requests
;
238 struct ctdb_op_state
*takeover_run
;
239 struct ctdb_op_state
*recovery
;
240 struct ctdb_control_get_ifaces
*ifaces
;
241 uint32_t *force_rebalance_nodes
;
242 struct ctdb_node_capabilities
*caps
;
245 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
246 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
248 static void ctdb_restart_recd(struct event_context
*ev
, struct timed_event
*te
, struct timeval t
, void *private_data
);
251 ban a node for a period of time
253 static void ctdb_ban_node(struct ctdb_recoverd
*rec
, uint32_t pnn
, uint32_t ban_time
)
256 struct ctdb_context
*ctdb
= rec
->ctdb
;
257 struct ctdb_ban_time bantime
;
259 if (!ctdb_validate_pnn(ctdb
, pnn
)) {
260 DEBUG(DEBUG_ERR
,("Bad pnn %u in ctdb_ban_node\n", pnn
));
264 DEBUG(DEBUG_NOTICE
,("Banning node %u for %u seconds\n", pnn
, ban_time
));
267 bantime
.time
= ban_time
;
269 ret
= ctdb_ctrl_set_ban(ctdb
, CONTROL_TIMEOUT(), pnn
, &bantime
);
271 DEBUG(DEBUG_ERR
,(__location__
" Failed to ban node %d\n", pnn
));
277 enum monitor_result
{ MONITOR_OK
, MONITOR_RECOVERY_NEEDED
, MONITOR_ELECTION_NEEDED
, MONITOR_FAILED
};
281 remember the trouble maker
283 static void ctdb_set_culprit_count(struct ctdb_recoverd
*rec
, uint32_t culprit
, uint32_t count
)
285 struct ctdb_context
*ctdb
= talloc_get_type(rec
->ctdb
, struct ctdb_context
);
286 struct ctdb_banning_state
*ban_state
;
288 if (culprit
> ctdb
->num_nodes
) {
289 DEBUG(DEBUG_ERR
,("Trying to set culprit %d but num_nodes is %d\n", culprit
, ctdb
->num_nodes
));
293 /* If we are banned or stopped, do not set other nodes as culprits */
294 if (rec
->node_flags
& NODE_FLAGS_INACTIVE
) {
295 DEBUG(DEBUG_NOTICE
, ("This node is INACTIVE, cannot set culprit node %d\n", culprit
));
299 if (ctdb
->nodes
[culprit
]->ban_state
== NULL
) {
300 ctdb
->nodes
[culprit
]->ban_state
= talloc_zero(ctdb
->nodes
[culprit
], struct ctdb_banning_state
);
301 CTDB_NO_MEMORY_VOID(ctdb
, ctdb
->nodes
[culprit
]->ban_state
);
305 ban_state
= ctdb
->nodes
[culprit
]->ban_state
;
306 if (timeval_elapsed(&ban_state
->last_reported_time
) > ctdb
->tunable
.recovery_grace_period
) {
307 /* this was the first time in a long while this node
308 misbehaved so we will forgive any old transgressions.
310 ban_state
->count
= 0;
313 ban_state
->count
+= count
;
314 ban_state
->last_reported_time
= timeval_current();
315 rec
->last_culprit_node
= culprit
;
/*
  remember the trouble maker

  Convenience wrapper around ctdb_set_culprit_count() that charges the
  given node with a single credit.
 */
static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
{
	ctdb_set_culprit_count(rec, culprit, 1);
}
327 /* this callback is called for every node that failed to execute the
330 static void recovered_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
332 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
334 DEBUG(DEBUG_ERR
, (__location__
" Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn
));
336 ctdb_set_culprit(rec
, node_pnn
);
340 run the "recovered" eventscript on all nodes
342 static int run_recovered_eventscript(struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
, const char *caller
)
346 struct ctdb_context
*ctdb
= rec
->ctdb
;
348 tmp_ctx
= talloc_new(ctdb
);
349 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
351 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
352 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_END_RECOVERY
,
354 CONTROL_TIMEOUT(), false, tdb_null
,
355 NULL
, recovered_fail_callback
,
357 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'recovered' event when called from %s\n", caller
));
359 talloc_free(tmp_ctx
);
363 talloc_free(tmp_ctx
);
367 /* this callback is called for every node that failed to execute the
370 static void startrecovery_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
372 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
374 DEBUG(DEBUG_ERR
, (__location__
" Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn
));
376 ctdb_set_culprit(rec
, node_pnn
);
380 run the "startrecovery" eventscript on all nodes
382 static int run_startrecovery_eventscript(struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
)
386 struct ctdb_context
*ctdb
= rec
->ctdb
;
388 tmp_ctx
= talloc_new(ctdb
);
389 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
391 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
392 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_START_RECOVERY
,
394 CONTROL_TIMEOUT(), false, tdb_null
,
396 startrecovery_fail_callback
,
398 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'startrecovery' event. Recovery failed.\n"));
399 talloc_free(tmp_ctx
);
403 talloc_free(tmp_ctx
);
408 update the node capabilities for all connected nodes
410 static int update_capabilities(struct ctdb_recoverd
*rec
,
411 struct ctdb_node_map
*nodemap
)
415 struct ctdb_node_capabilities
*caps
;
416 struct ctdb_context
*ctdb
= rec
->ctdb
;
418 tmp_ctx
= talloc_new(rec
);
419 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
421 caps
= ctdb_get_capabilities(ctdb
, tmp_ctx
,
422 CONTROL_TIMEOUT(), nodemap
);
426 (__location__
" Failed to get node capabilities\n"));
427 talloc_free(tmp_ctx
);
431 capp
= ctdb_get_node_capabilities(caps
, ctdb_get_pnn(ctdb
));
435 " Capabilities don't include current node.\n"));
436 talloc_free(tmp_ctx
);
439 ctdb
->capabilities
= *capp
;
441 TALLOC_FREE(rec
->caps
);
442 rec
->caps
= talloc_steal(rec
, caps
);
444 talloc_free(tmp_ctx
);
448 static void set_recmode_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
450 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
452 DEBUG(DEBUG_ERR
,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn
, rec
->nodemap
->num
));
453 ctdb_set_culprit_count(rec
, node_pnn
, rec
->nodemap
->num
);
456 static void transaction_start_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
458 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
460 DEBUG(DEBUG_ERR
,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn
, rec
->nodemap
->num
));
461 ctdb_set_culprit_count(rec
, node_pnn
, rec
->nodemap
->num
);
465 change recovery mode on all nodes
467 static int set_recovery_mode(struct ctdb_context
*ctdb
, struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
, uint32_t rec_mode
)
473 tmp_ctx
= talloc_new(ctdb
);
474 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
476 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
478 data
.dsize
= sizeof(uint32_t);
479 data
.dptr
= (unsigned char *)&rec_mode
;
481 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_SET_RECMODE
,
487 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode. Recovery failed.\n"));
488 talloc_free(tmp_ctx
);
492 /* freeze all nodes */
493 if (rec_mode
== CTDB_RECOVERY_ACTIVE
) {
496 for (i
=1; i
<=NUM_DB_PRIORITIES
; i
++) {
497 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_FREEZE
,
502 set_recmode_fail_callback
,
504 DEBUG(DEBUG_ERR
, (__location__
" Unable to freeze nodes. Recovery failed.\n"));
505 talloc_free(tmp_ctx
);
511 talloc_free(tmp_ctx
);
516 change recovery master on all node
518 static int set_recovery_master(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
, uint32_t pnn
)
524 tmp_ctx
= talloc_new(ctdb
);
525 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
527 data
.dsize
= sizeof(uint32_t);
528 data
.dptr
= (unsigned char *)&pnn
;
530 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
531 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_SET_RECMASTER
,
533 CONTROL_TIMEOUT(), false, data
,
536 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recmaster. Recovery failed.\n"));
537 talloc_free(tmp_ctx
);
541 talloc_free(tmp_ctx
);
545 /* update all remote nodes to use the same db priority that we have
546 this can fail if the remove node has not yet been upgraded to
547 support this function, so we always return success and never fail
548 a recovery if this call fails.
550 static int update_db_priority_on_remote_nodes(struct ctdb_context
*ctdb
,
551 struct ctdb_node_map
*nodemap
,
552 uint32_t pnn
, struct ctdb_dbid_map
*dbmap
, TALLOC_CTX
*mem_ctx
)
556 /* step through all local databases */
557 for (db
=0; db
<dbmap
->num
;db
++) {
558 struct ctdb_db_priority db_prio
;
561 db_prio
.db_id
= dbmap
->dbs
[db
].dbid
;
562 ret
= ctdb_ctrl_get_db_priority(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, dbmap
->dbs
[db
].dbid
, &db_prio
.priority
);
564 DEBUG(DEBUG_ERR
,(__location__
" Failed to read database priority from local node for db 0x%08x\n", dbmap
->dbs
[db
].dbid
));
568 DEBUG(DEBUG_INFO
,("Update DB priority for db 0x%08x to %u\n", dbmap
->dbs
[db
].dbid
, db_prio
.priority
));
570 ret
= ctdb_ctrl_set_db_priority(ctdb
, CONTROL_TIMEOUT(),
571 CTDB_CURRENT_NODE
, &db_prio
);
573 DEBUG(DEBUG_ERR
,(__location__
" Failed to set DB priority for 0x%08x\n",
582 ensure all other nodes have attached to any databases that we have
584 static int create_missing_remote_databases(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
,
585 uint32_t pnn
, struct ctdb_dbid_map
*dbmap
, TALLOC_CTX
*mem_ctx
)
588 struct ctdb_dbid_map
*remote_dbmap
;
590 /* verify that all other nodes have all our databases */
591 for (j
=0; j
<nodemap
->num
; j
++) {
592 /* we dont need to ourself ourselves */
593 if (nodemap
->nodes
[j
].pnn
== pnn
) {
596 /* dont check nodes that are unavailable */
597 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
601 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
602 mem_ctx
, &remote_dbmap
);
604 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbids from node %u\n", pnn
));
608 /* step through all local databases */
609 for (db
=0; db
<dbmap
->num
;db
++) {
613 for (i
=0;i
<remote_dbmap
->num
;i
++) {
614 if (dbmap
->dbs
[db
].dbid
== remote_dbmap
->dbs
[i
].dbid
) {
618 /* the remote node already have this database */
619 if (i
!=remote_dbmap
->num
) {
622 /* ok so we need to create this database */
623 ret
= ctdb_ctrl_getdbname(ctdb
, CONTROL_TIMEOUT(), pnn
,
624 dbmap
->dbs
[db
].dbid
, mem_ctx
,
627 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbname from node %u\n", pnn
));
630 ret
= ctdb_ctrl_createdb(ctdb
, CONTROL_TIMEOUT(),
631 nodemap
->nodes
[j
].pnn
,
633 dbmap
->dbs
[db
].flags
& CTDB_DB_FLAGS_PERSISTENT
);
635 DEBUG(DEBUG_ERR
, (__location__
" Unable to create remote db:%s\n", name
));
646 ensure we are attached to any databases that anyone else is attached to
648 static int create_missing_local_databases(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
,
649 uint32_t pnn
, struct ctdb_dbid_map
**dbmap
, TALLOC_CTX
*mem_ctx
)
652 struct ctdb_dbid_map
*remote_dbmap
;
654 /* verify that we have all database any other node has */
655 for (j
=0; j
<nodemap
->num
; j
++) {
656 /* we dont need to ourself ourselves */
657 if (nodemap
->nodes
[j
].pnn
== pnn
) {
660 /* dont check nodes that are unavailable */
661 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
665 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
666 mem_ctx
, &remote_dbmap
);
668 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbids from node %u\n", pnn
));
672 /* step through all databases on the remote node */
673 for (db
=0; db
<remote_dbmap
->num
;db
++) {
676 for (i
=0;i
<(*dbmap
)->num
;i
++) {
677 if (remote_dbmap
->dbs
[db
].dbid
== (*dbmap
)->dbs
[i
].dbid
) {
681 /* we already have this db locally */
682 if (i
!=(*dbmap
)->num
) {
685 /* ok so we need to create this database and
688 ctdb_ctrl_getdbname(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
689 remote_dbmap
->dbs
[db
].dbid
, mem_ctx
, &name
);
691 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbname from node %u\n",
692 nodemap
->nodes
[j
].pnn
));
695 ctdb_ctrl_createdb(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, name
,
696 remote_dbmap
->dbs
[db
].flags
& CTDB_DB_FLAGS_PERSISTENT
);
698 DEBUG(DEBUG_ERR
, (__location__
" Unable to create local db:%s\n", name
));
701 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, dbmap
);
703 DEBUG(DEBUG_ERR
, (__location__
" Unable to reread dbmap on node %u\n", pnn
));
714 pull the remote database contents from one node into the recdb
716 static int pull_one_remote_database(struct ctdb_context
*ctdb
, uint32_t srcnode
,
717 struct tdb_wrap
*recdb
, uint32_t dbid
)
721 struct ctdb_marshall_buffer
*reply
;
722 struct ctdb_rec_data
*recdata
;
724 TALLOC_CTX
*tmp_ctx
= talloc_new(recdb
);
726 ret
= ctdb_ctrl_pulldb(ctdb
, srcnode
, dbid
, CTDB_LMASTER_ANY
, tmp_ctx
,
727 CONTROL_TIMEOUT(), &outdata
);
729 DEBUG(DEBUG_ERR
,(__location__
" Unable to copy db from node %u\n", srcnode
));
730 talloc_free(tmp_ctx
);
734 reply
= (struct ctdb_marshall_buffer
*)outdata
.dptr
;
736 if (outdata
.dsize
< offsetof(struct ctdb_marshall_buffer
, data
)) {
737 DEBUG(DEBUG_ERR
,(__location__
" invalid data in pulldb reply\n"));
738 talloc_free(tmp_ctx
);
742 recdata
= (struct ctdb_rec_data
*)&reply
->data
[0];
746 recdata
= (struct ctdb_rec_data
*)(recdata
->length
+ (uint8_t *)recdata
), i
++) {
748 struct ctdb_ltdb_header
*hdr
;
751 key
.dptr
= &recdata
->data
[0];
752 key
.dsize
= recdata
->keylen
;
753 data
.dptr
= &recdata
->data
[key
.dsize
];
754 data
.dsize
= recdata
->datalen
;
756 hdr
= (struct ctdb_ltdb_header
*)data
.dptr
;
758 if (data
.dsize
< sizeof(struct ctdb_ltdb_header
)) {
759 DEBUG(DEBUG_CRIT
,(__location__
" bad ltdb record\n"));
760 talloc_free(tmp_ctx
);
764 /* fetch the existing record, if any */
765 existing
= tdb_fetch(recdb
->tdb
, key
);
767 if (existing
.dptr
!= NULL
) {
768 struct ctdb_ltdb_header header
;
769 if (existing
.dsize
< sizeof(struct ctdb_ltdb_header
)) {
770 DEBUG(DEBUG_CRIT
,(__location__
" Bad record size %u from node %u\n",
771 (unsigned)existing
.dsize
, srcnode
));
773 talloc_free(tmp_ctx
);
776 header
= *(struct ctdb_ltdb_header
*)existing
.dptr
;
778 if (!(header
.rsn
< hdr
->rsn
||
779 (header
.dmaster
!= ctdb_get_pnn(ctdb
) &&
780 header
.rsn
== hdr
->rsn
))) {
785 if (tdb_store(recdb
->tdb
, key
, data
, TDB_REPLACE
) != 0) {
786 DEBUG(DEBUG_CRIT
,(__location__
" Failed to store record\n"));
787 talloc_free(tmp_ctx
);
792 talloc_free(tmp_ctx
);
798 struct pull_seqnum_cbdata
{
804 static void pull_seqnum_cb(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
806 struct pull_seqnum_cbdata
*cb_data
= talloc_get_type(callback_data
, struct pull_seqnum_cbdata
);
809 if (cb_data
->failed
!= 0) {
810 DEBUG(DEBUG_ERR
, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn
));
815 DEBUG(DEBUG_ERR
, ("Error when pulling seqnum from node %d\n", node_pnn
));
820 if (outdata
.dsize
!= sizeof(uint64_t)) {
821 DEBUG(DEBUG_ERR
, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn
, (int)outdata
.dsize
, (int)sizeof(uint64_t)));
822 cb_data
->failed
= -1;
826 seqnum
= *((uint64_t *)outdata
.dptr
);
828 if (seqnum
> cb_data
->seqnum
||
829 (cb_data
->pnn
== -1 && seqnum
== 0)) {
830 cb_data
->seqnum
= seqnum
;
831 cb_data
->pnn
= node_pnn
;
835 static void pull_seqnum_fail_cb(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
837 struct pull_seqnum_cbdata
*cb_data
= talloc_get_type(callback_data
, struct pull_seqnum_cbdata
);
839 DEBUG(DEBUG_ERR
, ("Failed to pull db seqnum from node %d\n", node_pnn
));
843 static int pull_highest_seqnum_pdb(struct ctdb_context
*ctdb
,
844 struct ctdb_recoverd
*rec
,
845 struct ctdb_node_map
*nodemap
,
846 struct tdb_wrap
*recdb
, uint32_t dbid
)
848 TALLOC_CTX
*tmp_ctx
= talloc_new(NULL
);
852 struct pull_seqnum_cbdata
*cb_data
;
854 DEBUG(DEBUG_NOTICE
, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid
));
859 data
.dsize
= sizeof(outdata
);
860 data
.dptr
= (uint8_t *)&outdata
[0];
862 cb_data
= talloc(tmp_ctx
, struct pull_seqnum_cbdata
);
863 if (cb_data
== NULL
) {
864 DEBUG(DEBUG_ERR
, ("Failed to allocate pull highest seqnum cb_data structure\n"));
865 talloc_free(tmp_ctx
);
873 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
874 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_GET_DB_SEQNUM
,
876 CONTROL_TIMEOUT(), false, data
,
880 DEBUG(DEBUG_ERR
, (__location__
" Failed to run async GET_DB_SEQNUM\n"));
882 talloc_free(tmp_ctx
);
886 if (cb_data
->failed
!= 0) {
887 DEBUG(DEBUG_NOTICE
, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid
));
888 talloc_free(tmp_ctx
);
892 if (cb_data
->pnn
== -1) {
893 DEBUG(DEBUG_NOTICE
, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid
));
894 talloc_free(tmp_ctx
);
898 DEBUG(DEBUG_NOTICE
, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid
, cb_data
->pnn
, (long long)cb_data
->seqnum
));
900 if (pull_one_remote_database(ctdb
, cb_data
->pnn
, recdb
, dbid
) != 0) {
901 DEBUG(DEBUG_ERR
, ("Failed to pull higest seqnum database 0x%08x from node %d\n", dbid
, cb_data
->pnn
));
902 talloc_free(tmp_ctx
);
906 talloc_free(tmp_ctx
);
912 pull all the remote database contents into the recdb
914 static int pull_remote_database(struct ctdb_context
*ctdb
,
915 struct ctdb_recoverd
*rec
,
916 struct ctdb_node_map
*nodemap
,
917 struct tdb_wrap
*recdb
, uint32_t dbid
,
922 if (persistent
&& ctdb
->tunable
.recover_pdb_by_seqnum
!= 0) {
924 ret
= pull_highest_seqnum_pdb(ctdb
, rec
, nodemap
, recdb
, dbid
);
930 /* pull all records from all other nodes across onto this node
931 (this merges based on rsn)
933 for (j
=0; j
<nodemap
->num
; j
++) {
934 /* dont merge from nodes that are unavailable */
935 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
938 if (pull_one_remote_database(ctdb
, nodemap
->nodes
[j
].pnn
, recdb
, dbid
) != 0) {
939 DEBUG(DEBUG_ERR
,(__location__
" Failed to pull remote database from node %u\n",
940 nodemap
->nodes
[j
].pnn
));
941 ctdb_set_culprit_count(rec
, nodemap
->nodes
[j
].pnn
, nodemap
->num
);
951 update flags on all active nodes
953 static int update_flags_on_all_nodes(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
, uint32_t pnn
, uint32_t flags
)
957 ret
= ctdb_ctrl_modflags(ctdb
, CONTROL_TIMEOUT(), pnn
, flags
, ~flags
);
959 DEBUG(DEBUG_ERR
, (__location__
" Unable to update nodeflags on remote nodes\n"));
967 ensure all nodes have the same vnnmap we do
969 static int update_vnnmap_on_all_nodes(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
,
970 uint32_t pnn
, struct ctdb_vnn_map
*vnnmap
, TALLOC_CTX
*mem_ctx
)
974 /* push the new vnn map out to all the nodes */
975 for (j
=0; j
<nodemap
->num
; j
++) {
976 /* dont push to nodes that are unavailable */
977 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
981 ret
= ctdb_ctrl_setvnnmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
, mem_ctx
, vnnmap
);
983 DEBUG(DEBUG_ERR
, (__location__
" Unable to set vnnmap for node %u\n", pnn
));
993 struct vacuum_info
*next
, *prev
;
994 struct ctdb_recoverd
*rec
;
996 struct ctdb_db_context
*ctdb_db
;
997 struct ctdb_marshall_buffer
*recs
;
998 struct ctdb_rec_data
*r
;
1001 static void vacuum_fetch_next(struct vacuum_info
*v
);
1004 called when a vacuum fetch has completed - just free it and do the next one
1006 static void vacuum_fetch_callback(struct ctdb_client_call_state
*state
)
1013 process the next element from the vacuum list
1015 static void vacuum_fetch_next(struct vacuum_info
*v
)
1017 struct ctdb_call call
;
1018 struct ctdb_rec_data
*r
;
1020 while (v
->recs
->count
) {
1021 struct ctdb_client_call_state
*state
;
1023 struct ctdb_ltdb_header
*hdr
;
1026 call
.call_id
= CTDB_NULL_FUNC
;
1027 call
.flags
= CTDB_IMMEDIATE_MIGRATION
;
1028 call
.flags
|= CTDB_CALL_FLAG_VACUUM_MIGRATION
;
1031 v
->r
= (struct ctdb_rec_data
*)(r
->length
+ (uint8_t *)r
);
1034 call
.key
.dptr
= &r
->data
[0];
1035 call
.key
.dsize
= r
->keylen
;
1037 /* ensure we don't block this daemon - just skip a record if we can't get
1039 if (tdb_chainlock_nonblock(v
->ctdb_db
->ltdb
->tdb
, call
.key
) != 0) {
1043 data
= tdb_fetch(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
1044 if (data
.dptr
== NULL
) {
1045 tdb_chainunlock(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
1049 if (data
.dsize
< sizeof(struct ctdb_ltdb_header
)) {
1051 tdb_chainunlock(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
1055 hdr
= (struct ctdb_ltdb_header
*)data
.dptr
;
1056 if (hdr
->dmaster
== v
->rec
->ctdb
->pnn
) {
1057 /* its already local */
1059 tdb_chainunlock(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
1065 state
= ctdb_call_send(v
->ctdb_db
, &call
);
1066 tdb_chainunlock(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
1067 if (state
== NULL
) {
1068 DEBUG(DEBUG_ERR
,(__location__
" Failed to setup vacuum fetch call\n"));
1072 state
->async
.fn
= vacuum_fetch_callback
;
1073 state
->async
.private_data
= NULL
;
1081 destroy a vacuum info structure
1083 static int vacuum_info_destructor(struct vacuum_info
*v
)
1085 DLIST_REMOVE(v
->rec
->vacuum_info
, v
);
1091 handler for vacuum fetch
1093 static void vacuum_fetch_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
1094 TDB_DATA data
, void *private_data
)
1096 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
1097 struct ctdb_marshall_buffer
*recs
;
1099 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
1101 struct ctdb_dbid_map
*dbmap
=NULL
;
1102 bool persistent
= false;
1103 struct ctdb_db_context
*ctdb_db
;
1104 struct ctdb_rec_data
*r
;
1106 struct vacuum_info
*v
;
1108 recs
= (struct ctdb_marshall_buffer
*)data
.dptr
;
1109 r
= (struct ctdb_rec_data
*)&recs
->data
[0];
1111 if (recs
->count
== 0) {
1112 talloc_free(tmp_ctx
);
1118 for (v
=rec
->vacuum_info
;v
;v
=v
->next
) {
1119 if (srcnode
== v
->srcnode
&& recs
->db_id
== v
->ctdb_db
->db_id
) {
1120 /* we're already working on records from this node */
1121 talloc_free(tmp_ctx
);
1126 /* work out if the database is persistent */
1127 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, tmp_ctx
, &dbmap
);
1129 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbids from local node\n"));
1130 talloc_free(tmp_ctx
);
1134 for (i
=0;i
<dbmap
->num
;i
++) {
1135 if (dbmap
->dbs
[i
].dbid
== recs
->db_id
) {
1136 persistent
= dbmap
->dbs
[i
].flags
& CTDB_DB_FLAGS_PERSISTENT
;
1140 if (i
== dbmap
->num
) {
1141 DEBUG(DEBUG_ERR
, (__location__
" Unable to find db_id 0x%x on local node\n", recs
->db_id
));
1142 talloc_free(tmp_ctx
);
1146 /* find the name of this database */
1147 if (ctdb_ctrl_getdbname(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, recs
->db_id
, tmp_ctx
, &name
) != 0) {
1148 DEBUG(DEBUG_ERR
,(__location__
" Failed to get name of db 0x%x\n", recs
->db_id
));
1149 talloc_free(tmp_ctx
);
1154 ctdb_db
= ctdb_attach(ctdb
, CONTROL_TIMEOUT(), name
, persistent
, 0);
1155 if (ctdb_db
== NULL
) {
1156 DEBUG(DEBUG_ERR
,(__location__
" Failed to attach to database '%s'\n", name
));
1157 talloc_free(tmp_ctx
);
1161 v
= talloc_zero(rec
, struct vacuum_info
);
1163 DEBUG(DEBUG_CRIT
,(__location__
" Out of memory\n"));
1164 talloc_free(tmp_ctx
);
1169 v
->srcnode
= srcnode
;
1170 v
->ctdb_db
= ctdb_db
;
1171 v
->recs
= talloc_memdup(v
, recs
, data
.dsize
);
1172 if (v
->recs
== NULL
) {
1173 DEBUG(DEBUG_CRIT
,(__location__
" Out of memory\n"));
1175 talloc_free(tmp_ctx
);
1178 v
->r
= (struct ctdb_rec_data
*)&v
->recs
->data
[0];
1180 DLIST_ADD(rec
->vacuum_info
, v
);
1182 talloc_set_destructor(v
, vacuum_info_destructor
);
1184 vacuum_fetch_next(v
);
1185 talloc_free(tmp_ctx
);
1190 * handler for database detach
1192 static void detach_database_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
1193 TDB_DATA data
, void *private_data
)
1195 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
,
1196 struct ctdb_recoverd
);
1198 struct vacuum_info
*v
, *vnext
;
1199 struct ctdb_db_context
*ctdb_db
;
1201 if (data
.dsize
!= sizeof(db_id
)) {
1204 db_id
= *(uint32_t *)data
.dptr
;
1206 ctdb_db
= find_ctdb_db(ctdb
, db_id
);
1207 if (ctdb_db
== NULL
) {
1208 /* database is not attached */
1212 /* Stop any active vacuum fetch */
1213 v
= rec
->vacuum_info
;
1217 if (v
->ctdb_db
->db_id
== db_id
) {
1223 DLIST_REMOVE(ctdb
->db_list
, ctdb_db
);
1225 DEBUG(DEBUG_NOTICE
, ("Detached from database '%s'\n",
1227 talloc_free(ctdb_db
);
1231 called when ctdb_wait_timeout should finish
1233 static void ctdb_wait_handler(struct event_context
*ev
, struct timed_event
*te
,
1234 struct timeval yt
, void *p
)
1236 uint32_t *timed_out
= (uint32_t *)p
;
1241 wait for a given number of seconds
1243 static void ctdb_wait_timeout(struct ctdb_context
*ctdb
, double secs
)
1245 uint32_t timed_out
= 0;
1246 time_t usecs
= (secs
- (time_t)secs
) * 1000000;
1247 event_add_timed(ctdb
->ev
, ctdb
, timeval_current_ofs(secs
, usecs
), ctdb_wait_handler
, &timed_out
);
1248 while (!timed_out
) {
1249 event_loop_once(ctdb
->ev
);
1254 called when an election times out (ends)
1256 static void ctdb_election_timeout(struct event_context
*ev
, struct timed_event
*te
,
1257 struct timeval t
, void *p
)
1259 struct ctdb_recoverd
*rec
= talloc_get_type(p
, struct ctdb_recoverd
);
1260 rec
->election_timeout
= NULL
;
1263 DEBUG(DEBUG_WARNING
,("Election period ended\n"));
1268 wait for an election to finish. It finished election_timeout seconds after
1269 the last election packet is received
1271 static void ctdb_wait_election(struct ctdb_recoverd
*rec
)
1273 struct ctdb_context
*ctdb
= rec
->ctdb
;
1274 while (rec
->election_timeout
) {
1275 event_loop_once(ctdb
->ev
);
1280 Update our local flags from all remote connected nodes.
1281 This is only run when we are or we belive we are the recovery master
1283 static int update_local_flags(struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
)
1286 struct ctdb_context
*ctdb
= rec
->ctdb
;
1287 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
1289 /* get the nodemap for all active remote nodes and verify
1290 they are the same as for this node
1292 for (j
=0; j
<nodemap
->num
; j
++) {
1293 struct ctdb_node_map
*remote_nodemap
=NULL
;
1296 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_DISCONNECTED
) {
1299 if (nodemap
->nodes
[j
].pnn
== ctdb
->pnn
) {
1303 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
1304 mem_ctx
, &remote_nodemap
);
1306 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from remote node %u\n",
1307 nodemap
->nodes
[j
].pnn
));
1308 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
1309 talloc_free(mem_ctx
);
1310 return MONITOR_FAILED
;
1312 if (nodemap
->nodes
[j
].flags
!= remote_nodemap
->nodes
[j
].flags
) {
1313 /* We should tell our daemon about this so it
1314 updates its flags or else we will log the same
1315 message again in the next iteration of recovery.
1316 Since we are the recovery master we can just as
1317 well update the flags on all nodes.
1319 ret
= ctdb_ctrl_modflags(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
, remote_nodemap
->nodes
[j
].flags
, ~remote_nodemap
->nodes
[j
].flags
);
1321 DEBUG(DEBUG_ERR
, (__location__
" Unable to update nodeflags on remote nodes\n"));
1325 /* Update our local copy of the flags in the recovery
1328 DEBUG(DEBUG_NOTICE
,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1329 nodemap
->nodes
[j
].pnn
, remote_nodemap
->nodes
[j
].flags
,
1330 nodemap
->nodes
[j
].flags
));
1331 nodemap
->nodes
[j
].flags
= remote_nodemap
->nodes
[j
].flags
;
1333 talloc_free(remote_nodemap
);
1335 talloc_free(mem_ctx
);
1340 /* Create a new random generation ip.
1341 The generation id can not be the INVALID_GENERATION id
1343 static uint32_t new_generation(void)
1345 uint32_t generation
;
1348 generation
= random();
1350 if (generation
!= INVALID_GENERATION
) {
1360 create a temporary working database
1362 static struct tdb_wrap
*create_recdb(struct ctdb_context
*ctdb
, TALLOC_CTX
*mem_ctx
)
1365 struct tdb_wrap
*recdb
;
1368 /* open up the temporary recovery database */
1369 name
= talloc_asprintf(mem_ctx
, "%s/recdb.tdb.%u",
1370 ctdb
->db_directory_state
,
1377 tdb_flags
= TDB_NOLOCK
;
1378 if (ctdb
->valgrinding
) {
1379 tdb_flags
|= TDB_NOMMAP
;
1381 tdb_flags
|= (TDB_INCOMPATIBLE_HASH
| TDB_DISALLOW_NESTING
);
1383 recdb
= tdb_wrap_open(mem_ctx
, name
, ctdb
->tunable
.database_hash_size
,
1384 tdb_flags
, O_RDWR
|O_CREAT
|O_EXCL
, 0600);
1385 if (recdb
== NULL
) {
1386 DEBUG(DEBUG_CRIT
,(__location__
" Failed to create temp recovery database '%s'\n", name
));
/*
  state shared with traverse_recdb(), the traverse function for pulling
  all relevant records from recdb into a marshall buffer
 */
struct recdb_data {
	struct ctdb_context *ctdb;
	struct ctdb_marshall_buffer *recdata;	/* blob being built up */
	uint32_t len;				/* bytes used in recdata */
	uint32_t allocated_len;			/* bytes allocated for recdata */
	bool failed;				/* set when marshalling fails */
	bool persistent;			/* db is persistent: keep empty records */
};
1407 static int traverse_recdb(struct tdb_context
*tdb
, TDB_DATA key
, TDB_DATA data
, void *p
)
1409 struct recdb_data
*params
= (struct recdb_data
*)p
;
1410 struct ctdb_rec_data
*recdata
;
1411 struct ctdb_ltdb_header
*hdr
;
1414 * skip empty records - but NOT for persistent databases:
1416 * The record-by-record mode of recovery deletes empty records.
1417 * For persistent databases, this can lead to data corruption
1418 * by deleting records that should be there:
1420 * - Assume the cluster has been running for a while.
1422 * - A record R in a persistent database has been created and
1423 * deleted a couple of times, the last operation being deletion,
1424 * leaving an empty record with a high RSN, say 10.
1426 * - Now a node N is turned off.
1428 * - This leaves the local database copy of D on N with the empty
1429 * copy of R and RSN 10. On all other nodes, the recovery has deleted
1430 * the copy of record R.
1432 * - Now the record is created again while node N is turned off.
1433 * This creates R with RSN = 1 on all nodes except for N.
1435 * - Now node N is turned on again. The following recovery will chose
1436 * the older empty copy of R due to RSN 10 > RSN 1.
1438 * ==> Hence the record is gone after the recovery.
1440 * On databases like Samba's registry, this can damage the higher-level
1441 * data structures built from the various tdb-level records.
1443 if (!params
->persistent
&& data
.dsize
<= sizeof(struct ctdb_ltdb_header
)) {
1447 /* update the dmaster field to point to us */
1448 hdr
= (struct ctdb_ltdb_header
*)data
.dptr
;
1449 if (!params
->persistent
) {
1450 hdr
->dmaster
= params
->ctdb
->pnn
;
1451 hdr
->flags
|= CTDB_REC_FLAG_MIGRATED_WITH_DATA
;
1454 /* add the record to the blob ready to send to the nodes */
1455 recdata
= ctdb_marshall_record(params
->recdata
, 0, key
, NULL
, data
);
1456 if (recdata
== NULL
) {
1457 params
->failed
= true;
1460 if (params
->len
+ recdata
->length
>= params
->allocated_len
) {
1461 params
->allocated_len
= recdata
->length
+ params
->len
+ params
->ctdb
->tunable
.pulldb_preallocation_size
;
1462 params
->recdata
= talloc_realloc_size(NULL
, params
->recdata
, params
->allocated_len
);
1464 if (params
->recdata
== NULL
) {
1465 DEBUG(DEBUG_CRIT
,(__location__
" Failed to expand recdata to %u\n",
1466 recdata
->length
+ params
->len
));
1467 params
->failed
= true;
1470 params
->recdata
->count
++;
1471 memcpy(params
->len
+(uint8_t *)params
->recdata
, recdata
, recdata
->length
);
1472 params
->len
+= recdata
->length
;
1473 talloc_free(recdata
);
1479 push the recdb database out to all nodes
1481 static int push_recdb_database(struct ctdb_context
*ctdb
, uint32_t dbid
,
1483 struct tdb_wrap
*recdb
, struct ctdb_node_map
*nodemap
)
1485 struct recdb_data params
;
1486 struct ctdb_marshall_buffer
*recdata
;
1488 TALLOC_CTX
*tmp_ctx
;
1491 tmp_ctx
= talloc_new(ctdb
);
1492 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
1494 recdata
= talloc_zero(recdb
, struct ctdb_marshall_buffer
);
1495 CTDB_NO_MEMORY(ctdb
, recdata
);
1497 recdata
->db_id
= dbid
;
1500 params
.recdata
= recdata
;
1501 params
.len
= offsetof(struct ctdb_marshall_buffer
, data
);
1502 params
.allocated_len
= params
.len
;
1503 params
.failed
= false;
1504 params
.persistent
= persistent
;
1506 if (tdb_traverse_read(recdb
->tdb
, traverse_recdb
, ¶ms
) == -1) {
1507 DEBUG(DEBUG_ERR
,(__location__
" Failed to traverse recdb database\n"));
1508 talloc_free(params
.recdata
);
1509 talloc_free(tmp_ctx
);
1513 if (params
.failed
) {
1514 DEBUG(DEBUG_ERR
,(__location__
" Failed to traverse recdb database\n"));
1515 talloc_free(params
.recdata
);
1516 talloc_free(tmp_ctx
);
1520 recdata
= params
.recdata
;
1522 outdata
.dptr
= (void *)recdata
;
1523 outdata
.dsize
= params
.len
;
1525 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
1526 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_PUSH_DB
,
1528 CONTROL_TIMEOUT(), false, outdata
,
1531 DEBUG(DEBUG_ERR
,(__location__
" Failed to push recdb records to nodes for db 0x%x\n", dbid
));
1532 talloc_free(recdata
);
1533 talloc_free(tmp_ctx
);
1537 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - pushed remote database 0x%x of size %u\n",
1538 dbid
, recdata
->count
));
1540 talloc_free(recdata
);
1541 talloc_free(tmp_ctx
);
1548 go through a full recovery on one database
1550 static int recover_database(struct ctdb_recoverd
*rec
,
1551 TALLOC_CTX
*mem_ctx
,
1555 struct ctdb_node_map
*nodemap
,
1556 uint32_t transaction_id
)
1558 struct tdb_wrap
*recdb
;
1560 struct ctdb_context
*ctdb
= rec
->ctdb
;
1562 struct ctdb_control_wipe_database w
;
1565 recdb
= create_recdb(ctdb
, mem_ctx
);
1566 if (recdb
== NULL
) {
1570 /* pull all remote databases onto the recdb */
1571 ret
= pull_remote_database(ctdb
, rec
, nodemap
, recdb
, dbid
, persistent
);
1573 DEBUG(DEBUG_ERR
, (__location__
" Unable to pull remote database 0x%x\n", dbid
));
1577 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - pulled remote database 0x%x\n", dbid
));
1579 /* wipe all the remote databases. This is safe as we are in a transaction */
1581 w
.transaction_id
= transaction_id
;
1583 data
.dptr
= (void *)&w
;
1584 data
.dsize
= sizeof(w
);
1586 nodes
= list_of_active_nodes(ctdb
, nodemap
, recdb
, true);
1587 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_WIPE_DATABASE
,
1589 CONTROL_TIMEOUT(), false, data
,
1592 DEBUG(DEBUG_ERR
, (__location__
" Unable to wipe database. Recovery failed.\n"));
1597 /* push out the correct database. This sets the dmaster and skips
1598 the empty records */
1599 ret
= push_recdb_database(ctdb
, dbid
, persistent
, recdb
, nodemap
);
1605 /* all done with this database */
1611 static int ctdb_reload_remote_public_ips(struct ctdb_context
*ctdb
,
1612 struct ctdb_recoverd
*rec
,
1613 struct ctdb_node_map
*nodemap
,
1619 if (ctdb
->num_nodes
!= nodemap
->num
) {
1620 DEBUG(DEBUG_ERR
, (__location__
" ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1621 ctdb
->num_nodes
, nodemap
->num
));
1623 *culprit
= ctdb
->pnn
;
1628 for (j
=0; j
<nodemap
->num
; j
++) {
1629 /* For readability */
1630 struct ctdb_node
*node
= ctdb
->nodes
[j
];
1632 /* release any existing data */
1633 if (node
->known_public_ips
) {
1634 talloc_free(node
->known_public_ips
);
1635 node
->known_public_ips
= NULL
;
1637 if (node
->available_public_ips
) {
1638 talloc_free(node
->available_public_ips
);
1639 node
->available_public_ips
= NULL
;
1642 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
1646 /* Retrieve the list of known public IPs from the node */
1647 ret
= ctdb_ctrl_get_public_ips_flags(ctdb
,
1652 &node
->known_public_ips
);
1655 ("Failed to read known public IPs from node: %u\n",
1658 *culprit
= node
->pnn
;
1663 if (ctdb
->do_checkpublicip
&&
1664 !ctdb_op_is_disabled(rec
->takeover_run
) &&
1665 verify_remote_ip_allocation(ctdb
,
1666 node
->known_public_ips
,
1668 DEBUG(DEBUG_ERR
,("Trigger IP reallocation\n"));
1669 rec
->need_takeover_run
= true;
1672 /* Retrieve the list of available public IPs from the node */
1673 ret
= ctdb_ctrl_get_public_ips_flags(ctdb
,
1677 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE
,
1678 &node
->available_public_ips
);
1681 ("Failed to read available public IPs from node: %u\n",
1684 *culprit
= node
->pnn
;
1693 /* when we start a recovery, make sure all nodes use the same reclock file
1696 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd
*rec
)
1698 struct ctdb_context
*ctdb
= rec
->ctdb
;
1699 TALLOC_CTX
*tmp_ctx
= talloc_new(NULL
);
1703 if (ctdb
->recovery_lock_file
== NULL
) {
1707 data
.dsize
= strlen(ctdb
->recovery_lock_file
) + 1;
1708 data
.dptr
= (uint8_t *)ctdb
->recovery_lock_file
;
1711 nodes
= list_of_active_nodes(ctdb
, rec
->nodemap
, tmp_ctx
, true);
1712 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_SET_RECLOCK_FILE
,
1718 DEBUG(DEBUG_ERR
, (__location__
" Failed to sync reclock file settings\n"));
1719 talloc_free(tmp_ctx
);
1723 talloc_free(tmp_ctx
);
1729 * this callback is called for every node that failed to execute ctdb_takeover_run()
1730 * and set flag to re-run takeover run.
1732 static void takeover_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
1734 DEBUG(DEBUG_ERR
, ("Node %u failed the takeover run\n", node_pnn
));
1736 if (callback_data
!= NULL
) {
1737 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
1739 DEBUG(DEBUG_ERR
, ("Setting node %u as recovery fail culprit\n", node_pnn
));
1741 ctdb_set_culprit(rec
, node_pnn
);
1746 static void ban_misbehaving_nodes(struct ctdb_recoverd
*rec
, bool *self_ban
)
1748 struct ctdb_context
*ctdb
= rec
->ctdb
;
1750 struct ctdb_banning_state
*ban_state
;
1753 for (i
=0; i
<ctdb
->num_nodes
; i
++) {
1754 if (ctdb
->nodes
[i
]->ban_state
== NULL
) {
1757 ban_state
= (struct ctdb_banning_state
*)ctdb
->nodes
[i
]->ban_state
;
1758 if (ban_state
->count
< 2*ctdb
->num_nodes
) {
1762 DEBUG(DEBUG_NOTICE
,("Node %u reached %u banning credits - banning it for %u seconds\n",
1763 ctdb
->nodes
[i
]->pnn
, ban_state
->count
,
1764 ctdb
->tunable
.recovery_ban_period
));
1765 ctdb_ban_node(rec
, ctdb
->nodes
[i
]->pnn
, ctdb
->tunable
.recovery_ban_period
);
1766 ban_state
->count
= 0;
1768 /* Banning ourself? */
1769 if (ctdb
->nodes
[i
]->pnn
== rec
->ctdb
->pnn
) {
1775 static bool do_takeover_run(struct ctdb_recoverd
*rec
,
1776 struct ctdb_node_map
*nodemap
,
1777 bool banning_credits_on_fail
)
1779 uint32_t *nodes
= NULL
;
1780 struct srvid_request_data dtr
;
1783 uint32_t *rebalance_nodes
= rec
->force_rebalance_nodes
;
1787 DEBUG(DEBUG_NOTICE
, ("Takeover run starting\n"));
1789 if (ctdb_op_is_in_progress(rec
->takeover_run
)) {
1790 DEBUG(DEBUG_ERR
, (__location__
1791 " takeover run already in progress \n"));
1796 if (!ctdb_op_begin(rec
->takeover_run
)) {
1801 /* Disable IP checks (takeover runs, really) on other nodes
1802 * while doing this takeover run. This will stop those other
1803 * nodes from triggering takeover runs when think they should
1804 * be hosting an IP but it isn't yet on an interface. Don't
1805 * wait for replies since a failure here might cause some
1806 * noise in the logs but will not actually cause a problem.
1808 dtr
.srvid
= 0; /* No reply */
1811 data
.dptr
= (uint8_t*)&dtr
;
1812 data
.dsize
= sizeof(dtr
);
1814 nodes
= list_of_connected_nodes(rec
->ctdb
, nodemap
, rec
, false);
1816 /* Disable for 60 seconds. This can be a tunable later if
1820 for (i
= 0; i
< talloc_array_length(nodes
); i
++) {
1821 if (ctdb_client_send_message(rec
->ctdb
, nodes
[i
],
1822 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
1824 DEBUG(DEBUG_INFO
,("Failed to disable takeover runs\n"));
1828 ret
= ctdb_takeover_run(rec
->ctdb
, nodemap
,
1829 rec
->force_rebalance_nodes
,
1830 takeover_fail_callback
,
1831 banning_credits_on_fail
? rec
: NULL
);
1833 /* Reenable takeover runs and IP checks on other nodes */
1835 for (i
= 0; i
< talloc_array_length(nodes
); i
++) {
1836 if (ctdb_client_send_message(rec
->ctdb
, nodes
[i
],
1837 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
1839 DEBUG(DEBUG_INFO
,("Failed to reenable takeover runs\n"));
1844 DEBUG(DEBUG_ERR
, ("ctdb_takeover_run() failed\n"));
1850 /* Takeover run was successful so clear force rebalance targets */
1851 if (rebalance_nodes
== rec
->force_rebalance_nodes
) {
1852 TALLOC_FREE(rec
->force_rebalance_nodes
);
1854 DEBUG(DEBUG_WARNING
,
1855 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1858 rec
->need_takeover_run
= !ok
;
1860 ctdb_op_end(rec
->takeover_run
);
1862 DEBUG(DEBUG_NOTICE
, ("Takeover run %s\n", ok
? "completed successfully" : "unsuccessful"));
1868 we are the recmaster, and recovery is needed - start a recovery run
1870 static int do_recovery(struct ctdb_recoverd
*rec
,
1871 TALLOC_CTX
*mem_ctx
, uint32_t pnn
,
1872 struct ctdb_node_map
*nodemap
, struct ctdb_vnn_map
*vnnmap
)
1874 struct ctdb_context
*ctdb
= rec
->ctdb
;
1876 uint32_t generation
;
1877 struct ctdb_dbid_map
*dbmap
;
1880 struct timeval start_time
;
1881 uint32_t culprit
= (uint32_t)-1;
1884 DEBUG(DEBUG_NOTICE
, (__location__
" Starting do_recovery\n"));
1886 /* if recovery fails, force it again */
1887 rec
->need_recovery
= true;
1889 if (!ctdb_op_begin(rec
->recovery
)) {
1893 if (rec
->election_timeout
) {
1894 /* an election is in progress */
1895 DEBUG(DEBUG_ERR
, ("do_recovery called while election in progress - try again later\n"));
1899 ban_misbehaving_nodes(rec
, &self_ban
);
1901 DEBUG(DEBUG_NOTICE
, ("This node was banned, aborting recovery\n"));
1905 if (ctdb
->recovery_lock_file
!= NULL
) {
1906 if (ctdb_recovery_have_lock(ctdb
)) {
1907 DEBUG(DEBUG_NOTICE
, ("Already holding recovery lock\n"));
1909 start_time
= timeval_current();
1910 DEBUG(DEBUG_NOTICE
, ("Attempting to take recovery lock (%s)\n",
1911 ctdb
->recovery_lock_file
));
1912 if (!ctdb_recovery_lock(ctdb
)) {
1913 if (ctdb
->runstate
== CTDB_RUNSTATE_FIRST_RECOVERY
) {
1914 /* If ctdb is trying first recovery, it's
1915 * possible that current node does not know
1916 * yet who the recmaster is.
1918 DEBUG(DEBUG_ERR
, ("Unable to get recovery lock"
1919 " - retrying recovery\n"));
1923 DEBUG(DEBUG_ERR
,("Unable to get recovery lock - aborting recovery "
1924 "and ban ourself for %u seconds\n",
1925 ctdb
->tunable
.recovery_ban_period
));
1926 ctdb_ban_node(rec
, pnn
, ctdb
->tunable
.recovery_ban_period
);
1929 ctdb_ctrl_report_recd_lock_latency(ctdb
,
1931 timeval_elapsed(&start_time
));
1933 ("Recovery lock taken successfully by recovery daemon\n"));
1937 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery initiated due to problem with node %u\n", rec
->last_culprit_node
));
1939 /* get a list of all databases */
1940 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, &dbmap
);
1942 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbids from node :%u\n", pnn
));
1946 /* we do the db creation before we set the recovery mode, so the freeze happens
1947 on all databases we will be dealing with. */
1949 /* verify that we have all the databases any other node has */
1950 ret
= create_missing_local_databases(ctdb
, nodemap
, pnn
, &dbmap
, mem_ctx
);
1952 DEBUG(DEBUG_ERR
, (__location__
" Unable to create missing local databases\n"));
1956 /* verify that all other nodes have all our databases */
1957 ret
= create_missing_remote_databases(ctdb
, nodemap
, pnn
, dbmap
, mem_ctx
);
1959 DEBUG(DEBUG_ERR
, (__location__
" Unable to create missing remote databases\n"));
1962 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - created remote databases\n"));
1964 /* update the database priority for all remote databases */
1965 ret
= update_db_priority_on_remote_nodes(ctdb
, nodemap
, pnn
, dbmap
, mem_ctx
);
1967 DEBUG(DEBUG_ERR
, (__location__
" Unable to set db priority on remote nodes\n"));
1969 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - updated db priority for all databases\n"));
1972 /* update all other nodes to use the same setting for reclock files
1973 as the local recovery master.
1975 sync_recovery_lock_file_across_cluster(rec
);
1977 /* set recovery mode to active on all nodes */
1978 ret
= set_recovery_mode(ctdb
, rec
, nodemap
, CTDB_RECOVERY_ACTIVE
);
1980 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode to active on cluster\n"));
1984 /* execute the "startrecovery" event script on all nodes */
1985 ret
= run_startrecovery_eventscript(rec
, nodemap
);
1987 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'startrecovery' event on cluster\n"));
1992 update all nodes to have the same flags that we have
1994 for (i
=0;i
<nodemap
->num
;i
++) {
1995 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
) {
1999 ret
= update_flags_on_all_nodes(ctdb
, nodemap
, i
, nodemap
->nodes
[i
].flags
);
2001 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_INACTIVE
) {
2002 DEBUG(DEBUG_WARNING
, (__location__
"Unable to update flags on inactive node %d\n", i
));
2004 DEBUG(DEBUG_ERR
, (__location__
" Unable to update flags on all nodes for node %d\n", i
));
2010 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - updated flags\n"));
2012 /* pick a new generation number */
2013 generation
= new_generation();
2015 /* change the vnnmap on this node to use the new generation
2016 number but not on any other nodes.
2017 this guarantees that if we abort the recovery prematurely
2018 for some reason (a node stops responding?)
2019 that we can just return immediately and we will reenter
2020 recovery shortly again.
2021 I.e. we deliberately leave the cluster with an inconsistent
2022 generation id to allow us to abort recovery at any stage and
2023 just restart it from scratch.
2025 vnnmap
->generation
= generation
;
2026 ret
= ctdb_ctrl_setvnnmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, vnnmap
);
2028 DEBUG(DEBUG_ERR
, (__location__
" Unable to set vnnmap for node %u\n", pnn
));
2032 data
.dptr
= (void *)&generation
;
2033 data
.dsize
= sizeof(uint32_t);
2035 nodes
= list_of_active_nodes(ctdb
, nodemap
, mem_ctx
, true);
2036 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_TRANSACTION_START
,
2038 CONTROL_TIMEOUT(), false, data
,
2040 transaction_start_fail_callback
,
2042 DEBUG(DEBUG_ERR
, (__location__
" Unable to start transactions. Recovery failed.\n"));
2043 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_TRANSACTION_CANCEL
,
2045 CONTROL_TIMEOUT(), false, tdb_null
,
2049 DEBUG(DEBUG_ERR
,("Failed to cancel recovery transaction\n"));
2054 DEBUG(DEBUG_NOTICE
,(__location__
" started transactions on all nodes\n"));
2056 for (i
=0;i
<dbmap
->num
;i
++) {
2057 ret
= recover_database(rec
, mem_ctx
,
2059 dbmap
->dbs
[i
].flags
& CTDB_DB_FLAGS_PERSISTENT
,
2060 pnn
, nodemap
, generation
);
2062 DEBUG(DEBUG_ERR
, (__location__
" Failed to recover database 0x%x\n", dbmap
->dbs
[i
].dbid
));
2067 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - starting database commits\n"));
2069 /* commit all the changes */
2070 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_TRANSACTION_COMMIT
,
2072 CONTROL_TIMEOUT(), false, data
,
2075 DEBUG(DEBUG_ERR
, (__location__
" Unable to commit recovery changes. Recovery failed.\n"));
2079 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - committed databases\n"));
2082 /* update the capabilities for all nodes */
2083 ret
= update_capabilities(rec
, nodemap
);
2085 DEBUG(DEBUG_ERR
, (__location__
" Unable to update node capabilities.\n"));
2089 /* build a new vnn map with all the currently active and
2091 generation
= new_generation();
2092 vnnmap
= talloc(mem_ctx
, struct ctdb_vnn_map
);
2093 CTDB_NO_MEMORY(ctdb
, vnnmap
);
2094 vnnmap
->generation
= generation
;
2096 vnnmap
->map
= talloc_zero_array(vnnmap
, uint32_t, vnnmap
->size
);
2097 CTDB_NO_MEMORY(ctdb
, vnnmap
->map
);
2098 for (i
=j
=0;i
<nodemap
->num
;i
++) {
2099 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_INACTIVE
) {
2102 if (!ctdb_node_has_capabilities(rec
->caps
,
2103 ctdb
->nodes
[i
]->pnn
,
2104 CTDB_CAP_LMASTER
)) {
2105 /* this node can not be an lmaster */
2106 DEBUG(DEBUG_DEBUG
, ("Node %d cant be a LMASTER, skipping it\n", i
));
2111 vnnmap
->map
= talloc_realloc(vnnmap
, vnnmap
->map
, uint32_t, vnnmap
->size
);
2112 CTDB_NO_MEMORY(ctdb
, vnnmap
->map
);
2113 vnnmap
->map
[j
++] = nodemap
->nodes
[i
].pnn
;
2116 if (vnnmap
->size
== 0) {
2117 DEBUG(DEBUG_NOTICE
, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
2119 vnnmap
->map
= talloc_realloc(vnnmap
, vnnmap
->map
, uint32_t, vnnmap
->size
);
2120 CTDB_NO_MEMORY(ctdb
, vnnmap
->map
);
2121 vnnmap
->map
[0] = pnn
;
2124 /* update to the new vnnmap on all nodes */
2125 ret
= update_vnnmap_on_all_nodes(ctdb
, nodemap
, pnn
, vnnmap
, mem_ctx
);
2127 DEBUG(DEBUG_ERR
, (__location__
" Unable to update vnnmap on all nodes\n"));
2131 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - updated vnnmap\n"));
2133 /* update recmaster to point to us for all nodes */
2134 ret
= set_recovery_master(ctdb
, nodemap
, pnn
);
2136 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery master\n"));
2140 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - updated recmaster\n"));
2142 /* disable recovery mode */
2143 ret
= set_recovery_mode(ctdb
, rec
, nodemap
, CTDB_RECOVERY_NORMAL
);
2145 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode to normal on cluster\n"));
2149 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - disabled recovery mode\n"));
2151 /* Fetch known/available public IPs from each active node */
2152 ret
= ctdb_reload_remote_public_ips(ctdb
, rec
, nodemap
, &culprit
);
2154 DEBUG(DEBUG_ERR
,("Failed to read public ips from remote node %d\n",
2156 rec
->need_takeover_run
= true;
2160 do_takeover_run(rec
, nodemap
, false);
2162 /* execute the "recovered" event script on all nodes */
2163 ret
= run_recovered_eventscript(rec
, nodemap
, "do_recovery");
2165 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
2169 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - finished the recovered event\n"));
2171 /* send a message to all clients telling them that the cluster
2172 has been reconfigured */
2173 ret
= ctdb_client_send_message(ctdb
, CTDB_BROADCAST_CONNECTED
,
2174 CTDB_SRVID_RECONFIGURE
, tdb_null
);
2176 DEBUG(DEBUG_ERR
, (__location__
" Failed to send reconfigure message\n"));
2180 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery complete\n"));
2182 rec
->need_recovery
= false;
2183 ctdb_op_end(rec
->recovery
);
2185 /* we managed to complete a full recovery, make sure to forgive
2186 any past sins by the nodes that could now participate in the
2189 DEBUG(DEBUG_ERR
,("Resetting ban count to 0 for all nodes\n"));
2190 for (i
=0;i
<nodemap
->num
;i
++) {
2191 struct ctdb_banning_state
*ban_state
;
2193 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
) {
2197 ban_state
= (struct ctdb_banning_state
*)ctdb
->nodes
[nodemap
->nodes
[i
].pnn
]->ban_state
;
2198 if (ban_state
== NULL
) {
2202 ban_state
->count
= 0;
2205 /* We just finished a recovery successfully.
2206 We now wait for rerecovery_timeout before we allow
2207 another recovery to take place.
2209 DEBUG(DEBUG_NOTICE
, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb
->tunable
.rerecovery_timeout
));
2210 ctdb_op_disable(rec
->recovery
, ctdb
->ev
,
2211 ctdb
->tunable
.rerecovery_timeout
);
2215 ctdb_op_end(rec
->recovery
);
/*
  elections are won by first checking the number of connected nodes, then
  the priority time, then the pnn
 */
struct election_message {
	uint32_t num_connected;		/* nodes this candidate can see */
	struct timeval priority_time;	/* when the daemon started; older wins */
	uint32_t pnn;			/* candidate node number; tie-breaker */
	uint32_t node_flags;		/* candidate's node flags (banned/stopped) */
};
2232 form this nodes election data
2234 static void ctdb_election_data(struct ctdb_recoverd
*rec
, struct election_message
*em
)
2237 struct ctdb_node_map
*nodemap
;
2238 struct ctdb_context
*ctdb
= rec
->ctdb
;
2242 em
->pnn
= rec
->ctdb
->pnn
;
2243 em
->priority_time
= rec
->priority_time
;
2245 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, rec
, &nodemap
);
2247 DEBUG(DEBUG_ERR
,(__location__
" unable to get node map\n"));
2251 rec
->node_flags
= nodemap
->nodes
[ctdb
->pnn
].flags
;
2252 em
->node_flags
= rec
->node_flags
;
2254 for (i
=0;i
<nodemap
->num
;i
++) {
2255 if (!(nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
)) {
2256 em
->num_connected
++;
2260 /* we shouldnt try to win this election if we cant be a recmaster */
2261 if ((ctdb
->capabilities
& CTDB_CAP_RECMASTER
) == 0) {
2262 em
->num_connected
= 0;
2263 em
->priority_time
= timeval_current();
2266 talloc_free(nodemap
);
2270 see if the given election data wins
2272 static bool ctdb_election_win(struct ctdb_recoverd
*rec
, struct election_message
*em
)
2274 struct election_message myem
;
2277 ctdb_election_data(rec
, &myem
);
2279 /* we cant win if we dont have the recmaster capability */
2280 if ((rec
->ctdb
->capabilities
& CTDB_CAP_RECMASTER
) == 0) {
2284 /* we cant win if we are banned */
2285 if (rec
->node_flags
& NODE_FLAGS_BANNED
) {
2289 /* we cant win if we are stopped */
2290 if (rec
->node_flags
& NODE_FLAGS_STOPPED
) {
2294 /* we will automatically win if the other node is banned */
2295 if (em
->node_flags
& NODE_FLAGS_BANNED
) {
2299 /* we will automatically win if the other node is banned */
2300 if (em
->node_flags
& NODE_FLAGS_STOPPED
) {
2304 /* try to use the most connected node */
2306 cmp
= (int)myem
.num_connected
- (int)em
->num_connected
;
2309 /* then the longest running node */
2311 cmp
= timeval_compare(&em
->priority_time
, &myem
.priority_time
);
2315 cmp
= (int)myem
.pnn
- (int)em
->pnn
;
2322 send out an election request
2324 static int send_election_request(struct ctdb_recoverd
*rec
, uint32_t pnn
)
2327 TDB_DATA election_data
;
2328 struct election_message emsg
;
2330 struct ctdb_context
*ctdb
= rec
->ctdb
;
2332 srvid
= CTDB_SRVID_RECOVERY
;
2334 ctdb_election_data(rec
, &emsg
);
2336 election_data
.dsize
= sizeof(struct election_message
);
2337 election_data
.dptr
= (unsigned char *)&emsg
;
2340 /* first we assume we will win the election and set
2341 recoverymaster to be ourself on the current node
2343 ret
= ctdb_ctrl_setrecmaster(ctdb
, CONTROL_TIMEOUT(), pnn
, pnn
);
2345 DEBUG(DEBUG_ERR
, (__location__
" failed to send recmaster election request\n"));
2350 /* send an election message to all active nodes */
2351 DEBUG(DEBUG_INFO
,(__location__
" Send election request to all active nodes\n"));
2352 return ctdb_client_send_message(ctdb
, CTDB_BROADCAST_ALL
, srvid
, election_data
);
2356 this function will unban all nodes in the cluster
2358 static void unban_all_nodes(struct ctdb_context
*ctdb
)
2361 struct ctdb_node_map
*nodemap
;
2362 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
2364 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, tmp_ctx
, &nodemap
);
2366 DEBUG(DEBUG_ERR
,(__location__
" failed to get nodemap to unban all nodes\n"));
2370 for (i
=0;i
<nodemap
->num
;i
++) {
2371 if ( (!(nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
))
2372 && (nodemap
->nodes
[i
].flags
& NODE_FLAGS_BANNED
) ) {
2373 ret
= ctdb_ctrl_modflags(ctdb
, CONTROL_TIMEOUT(),
2374 nodemap
->nodes
[i
].pnn
, 0,
2377 DEBUG(DEBUG_ERR
, (__location__
" failed to reset ban state\n"));
2382 talloc_free(tmp_ctx
);
2387 we think we are winning the election - send a broadcast election request
2389 static void election_send_request(struct event_context
*ev
, struct timed_event
*te
, struct timeval t
, void *p
)
2391 struct ctdb_recoverd
*rec
= talloc_get_type(p
, struct ctdb_recoverd
);
2394 ret
= send_election_request(rec
, ctdb_get_pnn(rec
->ctdb
));
2396 DEBUG(DEBUG_ERR
,("Failed to send election request!\n"));
2399 talloc_free(rec
->send_election_te
);
2400 rec
->send_election_te
= NULL
;
2404 handler for memory dumps
2406 static void mem_dump_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2407 TDB_DATA data
, void *private_data
)
2409 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
2412 struct srvid_request
*rd
;
2414 if (data
.dsize
!= sizeof(struct srvid_request
)) {
2415 DEBUG(DEBUG_ERR
, (__location__
" Wrong size of return address.\n"));
2416 talloc_free(tmp_ctx
);
2419 rd
= (struct srvid_request
*)data
.dptr
;
2421 dump
= talloc_zero(tmp_ctx
, TDB_DATA
);
2423 DEBUG(DEBUG_ERR
, (__location__
" Failed to allocate memory for memdump\n"));
2424 talloc_free(tmp_ctx
);
2427 ret
= ctdb_dump_memory(ctdb
, dump
);
2429 DEBUG(DEBUG_ERR
, (__location__
" ctdb_dump_memory() failed\n"));
2430 talloc_free(tmp_ctx
);
2434 DEBUG(DEBUG_ERR
, ("recovery master memory dump\n"));
2436 ret
= ctdb_client_send_message(ctdb
, rd
->pnn
, rd
->srvid
, *dump
);
2438 DEBUG(DEBUG_ERR
,("Failed to send rd memdump reply message\n"));
2439 talloc_free(tmp_ctx
);
2443 talloc_free(tmp_ctx
);
2447 handler for reload_nodes
2449 static void reload_nodes_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2450 TDB_DATA data
, void *private_data
)
2452 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2454 DEBUG(DEBUG_ERR
, (__location__
" Reload nodes file from recovery daemon\n"));
2456 ctdb_load_nodes_file(rec
->ctdb
);
2460 static void ctdb_rebalance_timeout(struct event_context
*ev
,
2461 struct timed_event
*te
,
2462 struct timeval t
, void *p
)
2464 struct ctdb_recoverd
*rec
= talloc_get_type(p
, struct ctdb_recoverd
);
2466 if (rec
->force_rebalance_nodes
== NULL
) {
2468 ("Rebalance timeout occurred - no nodes to rebalance\n"));
2473 ("Rebalance timeout occurred - do takeover run\n"));
2474 do_takeover_run(rec
, rec
->nodemap
, false);
/* SRVID message handler: queue a node for IP rebalancing.
 * Only acts on the recovery master (other nodes ignore the message).
 * Validates that the payload is exactly one uint32_t PNN, appends it
 * to rec->force_rebalance_nodes (duplicates are tolerated), and — if
 * the DeferredRebalanceOnNodeAdd tunable is non-zero — arms a timer
 * (ctdb_rebalance_timeout) parented on the node array so that freeing
 * the array also cancels the timer.
 */
2478 static void recd_node_rebalance_handler(struct ctdb_context
*ctdb
,
2480 TDB_DATA data
, void *private_data
)
2485 uint32_t deferred_rebalance
;
2486 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2488 if (rec
->recmaster
!= ctdb_get_pnn(ctdb
)) {
2492 if (data
.dsize
!= sizeof(uint32_t)) {
2493 DEBUG(DEBUG_ERR
,(__location__
" Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data
.dsize
, sizeof(uint32_t)));
2497 pnn
= *(uint32_t *)&data
.dptr
[0];
2499 DEBUG(DEBUG_NOTICE
,("Setting up rebalance of IPs to node %u\n", pnn
));
2501 /* Copy any existing list of nodes. There's probably some
2502 * sort of realloc variant that will do this but we need to
2503 * make sure that freeing the old array also cancels the timer
2504 * event for the timeout... not sure if realloc will do that.
2506 len
= (rec
->force_rebalance_nodes
!= NULL
) ?
2507 talloc_array_length(rec
->force_rebalance_nodes
) :
2510 /* This allows duplicates to be added but they don't cause
2511 * harm. A call to add a duplicate PNN arguably means that
2512 * the timeout should be reset, so this is the simplest
2515 t
= talloc_zero_array(rec
, uint32_t, len
+1);
2516 CTDB_NO_MEMORY_VOID(ctdb
, t
);
/* Copy the old PNN list into the new, one-longer array; freeing the
 * old array below also cancels any timer parented on it. */
2518 memcpy(t
, rec
->force_rebalance_nodes
, sizeof(uint32_t) * len
);
2522 talloc_free(rec
->force_rebalance_nodes
);
2524 rec
->force_rebalance_nodes
= t
;
2526 /* If configured, setup a deferred takeover run to make sure
2527 * that certain nodes get IPs rebalanced to them. This will
2528 * be cancelled if a successful takeover run happens before
2529 * the timeout. Assign tunable value to variable for
2532 deferred_rebalance
= ctdb
->tunable
.deferred_rebalance_on_node_add
;
2533 if (deferred_rebalance
!= 0) {
2534 event_add_timed(ctdb
->ev
, rec
->force_rebalance_nodes
,
2535 timeval_current_ofs(deferred_rebalance
, 0),
2536 ctdb_rebalance_timeout
, rec
);
/* SRVID message handler: update the recovery master's IP assignment
 * tree with a single public IP record. Ignored on non-recmaster nodes.
 * The payload must be exactly one struct ctdb_public_ip.
 */
2542 static void recd_update_ip_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2543 TDB_DATA data
, void *private_data
)
2545 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2546 struct ctdb_public_ip
*ip
;
2548 if (rec
->recmaster
!= rec
->ctdb
->pnn
) {
2549 DEBUG(DEBUG_INFO
,("Not recmaster, ignore update ip message\n"));
2553 if (data
.dsize
!= sizeof(struct ctdb_public_ip
)) {
2554 DEBUG(DEBUG_ERR
,(__location__
" Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data
.dsize
, sizeof(struct ctdb_public_ip
)));
2558 ip
= (struct ctdb_public_ip
*)data
.dptr
;
2560 update_ip_assignment_tree(rec
->ctdb
, ip
);
/* Common helper for "disable <operation> for N seconds" SRVID messages
 * (takeover runs, recoveries). Validates the srvid_request_data
 * payload, disables the given operation state via ctdb_op_disable(),
 * and replies to the sender with our PNN on success (a non-negative
 * PNN tells the caller the disable took effect).
 */
2563 static void srvid_disable_and_reply(struct ctdb_context
*ctdb
,
2565 struct ctdb_op_state
*op_state
)
2567 struct srvid_request_data
*r
;
2572 /* Validate input data */
2573 if (data
.dsize
!= sizeof(struct srvid_request_data
)) {
/* NOTE(review): the size check uses sizeof(struct srvid_request_data)
 * but the error message below prints sizeof(struct srvid_request) —
 * the "expecting" value in the log looks wrong; confirm upstream. */
2574 DEBUG(DEBUG_ERR
,(__location__
" Wrong size for data :%lu "
2575 "expecting %lu\n", (long unsigned)data
.dsize
,
2576 (long unsigned)sizeof(struct srvid_request
)));
2579 if (data
.dptr
== NULL
) {
2580 DEBUG(DEBUG_ERR
,(__location__
" No data received\n"));
2584 r
= (struct srvid_request_data
*)data
.dptr
;
2587 ret
= ctdb_op_disable(op_state
, ctdb
->ev
, timeout
);
2592 /* Returning our PNN tells the caller that we succeeded */
2593 ret
= ctdb_get_pnn(ctdb
);
2595 result
.dsize
= sizeof(int32_t);
2596 result
.dptr
= (uint8_t *)&ret
;
2597 srvid_request_reply(ctdb
, (struct srvid_request
*)r
, result
);
/* SRVID message handler: disable takeover runs for the requested
 * duration. Thin wrapper around srvid_disable_and_reply() using the
 * rec->takeover_run operation state.
 */
2600 static void disable_takeover_runs_handler(struct ctdb_context
*ctdb
,
2601 uint64_t srvid
, TDB_DATA data
,
2604 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
,
2605 struct ctdb_recoverd
);
2607 srvid_disable_and_reply(ctdb
, data
, rec
->takeover_run
);
2610 /* Backward compatibility for this SRVID */
/* Legacy SRVID message handler: disable IP verification (takeover
 * runs) for a uint32_t timeout taken directly from the payload.
 * Unlike srvid_disable_and_reply() no reply is sent to the caller.
 */
2611 static void disable_ip_check_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2612 TDB_DATA data
, void *private_data
)
2614 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
,
2615 struct ctdb_recoverd
);
2618 if (data
.dsize
!= sizeof(uint32_t)) {
2619 DEBUG(DEBUG_ERR
,(__location__
" Wrong size for data :%lu "
2620 "expecting %lu\n", (long unsigned)data
.dsize
,
2621 (long unsigned)sizeof(uint32_t)));
2624 if (data
.dptr
== NULL
) {
2625 DEBUG(DEBUG_ERR
,(__location__
" No data received\n"));
2629 timeout
= *((uint32_t *)data
.dptr
);
2631 ctdb_op_disable(rec
->takeover_run
, ctdb
->ev
, timeout
);
/* SRVID message handler: disable recoveries for the requested
 * duration. Thin wrapper around srvid_disable_and_reply() using the
 * rec->recovery operation state.
 */
2634 static void disable_recoveries_handler(struct ctdb_context
*ctdb
,
2635 uint64_t srvid
, TDB_DATA data
,
2638 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
,
2639 struct ctdb_recoverd
);
2641 srvid_disable_and_reply(ctdb
, data
, rec
->recovery
)
;
2645 handler for ip reallocate, just add it to the list of requests and
2646 handle this later in the monitor_cluster loop so we do not recurse
2647 with other requests to takeover_run()
/* SRVID message handler: queue an IP-reallocation request. The
 * srvid_request payload (identifying the requester) is appended to
 * rec->reallocate_requests via srvid_request_add(); the queue is
 * drained later by process_ipreallocate_requests().
 */
2649 static void ip_reallocate_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2650 TDB_DATA data
, void *private_data
)
2652 struct srvid_request
*request
;
2653 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
,
2654 struct ctdb_recoverd
);
2656 if (data
.dsize
!= sizeof(struct srvid_request
)) {
2657 DEBUG(DEBUG_ERR
, (__location__
" Wrong size of return address.\n"));
2661 request
= (struct srvid_request
*)data
.dptr
;
2663 srvid_request_add(ctdb
, &rec
->reallocate_requests
, request
);
/* Drain the queued IP-reallocation requests: detach the current
 * request list (new requests arriving during the takeover run are
 * processed on a later pass), refresh the remote public IP lists,
 * perform a takeover run, and reply to every queued requester with
 * our PNN on success.
 */
2666 static void process_ipreallocate_requests(struct ctdb_context
*ctdb
,
2667 struct ctdb_recoverd
*rec
)
2672 struct srvid_requests
*current
;
2674 DEBUG(DEBUG_INFO
, ("recovery master forced ip reallocation\n"));
2676 /* Only process requests that are currently pending. More
2677 * might come in while the takeover run is in progress and
2678 * they will need to be processed later since they might
2679 * be in response flag changes.
2681 current
= rec
->reallocate_requests
;
2682 rec
->reallocate_requests
= NULL
;
2684 /* update the list of public ips that a node can handle for
2687 ret
= ctdb_reload_remote_public_ips(ctdb
, rec
, rec
->nodemap
, &culprit
);
2689 DEBUG(DEBUG_ERR
,("Failed to read public ips from remote node %d\n",
2691 rec
->need_takeover_run
= true;
2694 if (do_takeover_run(rec
, rec
->nodemap
, false)) {
2695 ret
= ctdb_get_pnn(ctdb
);
2701 result
.dsize
= sizeof(int32_t);
2702 result
.dptr
= (uint8_t *)&ret
;
/* NOTE(review): "¤t" below appears to be mis-encoded "&current"
 * (the HTML entity for "&curren"); the call should pass the address
 * of the detached request list. */
2704 srvid_requests_reply(ctdb
, ¤t
, result
);
2709 handler for recovery master elections
/* SRVID message handler for recmaster election packets. Ignores our
 * own packets, restarts the election timeout, and then either (a)
 * answers with our own (better) election data after a short delay if
 * ctdb_election_win() says we should win, or (b) concedes: cancels any
 * pending election send, releases the recovery lock if we hold it,
 * unbans all nodes, and records the sender as recmaster.
 */
2711 static void election_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2712 TDB_DATA data
, void *private_data
)
2714 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2716 struct election_message
*em
= (struct election_message
*)data
.dptr
;
2718 /* Ignore election packets from ourself */
2719 if (ctdb
->pnn
== em
->pnn
) {
2723 /* we got an election packet - update the timeout for the election */
2724 talloc_free(rec
->election_timeout
);
2725 rec
->election_timeout
= event_add_timed(ctdb
->ev
, ctdb
,
2727 timeval_current_ofs(0, 500000) :
2728 timeval_current_ofs(ctdb
->tunable
.election_timeout
, 0),
2729 ctdb_election_timeout
, rec
);
2731 /* someone called an election. check their election data
2732 and if we disagree and we would rather be the elected node,
2733 send a new election message to all other nodes
2735 if (ctdb_election_win(rec
, em
)) {
2736 if (!rec
->send_election_te
) {
2737 rec
->send_election_te
= event_add_timed(ctdb
->ev
, rec
,
2738 timeval_current_ofs(0, 500000),
2739 election_send_request
, rec
);
2741 /*unban_all_nodes(ctdb);*/
/* We lost (or conceded) — stop any pending election broadcast. */
2746 TALLOC_FREE(rec
->send_election_te
);
2748 if (ctdb
->recovery_lock_file
!= NULL
) {
2749 /* Release the recovery lock file */
2750 if (ctdb_recovery_have_lock(ctdb
)) {
2751 ctdb_recovery_unlock(ctdb
);
2752 unban_all_nodes(ctdb
);
2756 /* ok, let that guy become recmaster then */
2757 ret
= ctdb_ctrl_setrecmaster(ctdb
, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb
), em
->pnn
);
2759 DEBUG(DEBUG_ERR
, (__location__
" failed to send recmaster election request"));
2768 force the start of the election process
/* Start a recmaster election: put the whole cluster into recovery mode
 * (stopping internode traffic), arm the election timeout, broadcast
 * our election request, then wait a few seconds for responses.
 */
2770 static void force_election(struct ctdb_recoverd
*rec
, uint32_t pnn
,
2771 struct ctdb_node_map
*nodemap
)
2774 struct ctdb_context
*ctdb
= rec
->ctdb
;
2776 DEBUG(DEBUG_INFO
,(__location__
" Force an election\n"));
2778 /* set all nodes to recovery mode to stop all internode traffic */
2779 ret
= set_recovery_mode(ctdb
, rec
, nodemap
, CTDB_RECOVERY_ACTIVE
);
2781 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode to active on cluster\n"));
2785 talloc_free(rec
->election_timeout
);
2786 rec
->election_timeout
= event_add_timed(ctdb
->ev
, ctdb
,
2788 timeval_current_ofs(0, 500000) :
2789 timeval_current_ofs(ctdb
->tunable
.election_timeout
, 0),
2790 ctdb_election_timeout
, rec
);
2792 ret
= send_election_request(rec
, pnn
);
2794 DEBUG(DEBUG_ERR
, (__location__
" failed to initiate recmaster election"));
2798 /* wait for a few seconds to collect all responses */
2799 ctdb_wait_election(rec
);
2805 handler for when a node changes its flags
/* SRVID message handler for node flag changes. Fetches the local
 * nodemap, locates the node named in the ctdb_node_flag_change
 * payload, records its new flags, and — when we are recmaster and the
 * cluster is in normal mode — schedules a takeover run if the
 * DISABLED/UNHEALTHY flag bits changed (those affect IP placement but
 * do not trigger a full recovery by themselves).
 */
2807 static void monitor_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2808 TDB_DATA data
, void *private_data
)
2811 struct ctdb_node_flag_change
*c
= (struct ctdb_node_flag_change
*)data
.dptr
;
2812 struct ctdb_node_map
*nodemap
=NULL
;
2813 TALLOC_CTX
*tmp_ctx
;
2815 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2816 int disabled_flag_changed
;
2818 if (data
.dsize
!= sizeof(*c
)) {
2819 DEBUG(DEBUG_ERR
,(__location__
"Invalid data in ctdb_node_flag_change\n"));
2823 tmp_ctx
= talloc_new(ctdb
);
2824 CTDB_NO_MEMORY_VOID(ctdb
, tmp_ctx
);
2826 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, tmp_ctx
, &nodemap
);
2828 DEBUG(DEBUG_ERR
,(__location__
"ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2829 talloc_free(tmp_ctx
);
/* Find the index of the node whose flags changed. */
2834 for (i
=0;i
<nodemap
->num
;i
++) {
2835 if (nodemap
->nodes
[i
].pnn
== c
->pnn
) break;
2838 if (i
== nodemap
->num
) {
2839 DEBUG(DEBUG_CRIT
,(__location__
"Flag change for non-existant node %u\n", c
->pnn
));
2840 talloc_free(tmp_ctx
);
2844 if (c
->old_flags
!= c
->new_flags
) {
2845 DEBUG(DEBUG_NOTICE
,("Node %u has changed flags - now 0x%x was 0x%x\n", c
->pnn
, c
->new_flags
, c
->old_flags
));
/* Did the DISABLED bits flip relative to our cached view? */
2848 disabled_flag_changed
= (nodemap
->nodes
[i
].flags
^ c
->new_flags
) & NODE_FLAGS_DISABLED
;
2850 nodemap
->nodes
[i
].flags
= c
->new_flags
;
2852 ret
= ctdb_ctrl_getrecmaster(ctdb
, tmp_ctx
, CONTROL_TIMEOUT(),
2853 CTDB_CURRENT_NODE
, &ctdb
->recovery_master
);
2856 ret
= ctdb_ctrl_getrecmode(ctdb
, tmp_ctx
, CONTROL_TIMEOUT(),
2857 CTDB_CURRENT_NODE
, &ctdb
->recovery_mode
);
2861 ctdb
->recovery_master
== ctdb
->pnn
&&
2862 ctdb
->recovery_mode
== CTDB_RECOVERY_NORMAL
) {
2863 /* Only do the takeover run if the perm disabled or unhealthy
2864 flags changed since these will cause an ip failover but not
2866 If the node became disconnected or banned this will also
2867 lead to an ip address failover but that is handled
2870 if (disabled_flag_changed
) {
2871 rec
->need_takeover_run
= true;
2875 talloc_free(tmp_ctx
);
2879 handler for when we need to push out flag changes ot all other nodes
/* SRVID message handler: broadcast a node's flag change to all
 * connected nodes. Reads the authoritative nodemap from the recmaster,
 * sanity-checks the PNN from the payload against it, then sends a
 * CTDB_CONTROL_MODIFY_FLAGS control to every connected node.
 */
2881 static void push_flags_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2882 TDB_DATA data
, void *private_data
)
2885 struct ctdb_node_flag_change
*c
= (struct ctdb_node_flag_change
*)data
.dptr
;
2886 struct ctdb_node_map
*nodemap
=NULL
;
2887 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
2891 /* find the recovery master */
2892 ret
= ctdb_ctrl_getrecmaster(ctdb
, tmp_ctx
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, &recmaster
);
2894 DEBUG(DEBUG_ERR
, (__location__
" Unable to get recmaster from local node\n"));
2895 talloc_free(tmp_ctx
);
2899 /* read the node flags from the recmaster */
2900 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), recmaster
, tmp_ctx
, &nodemap
);
2902 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from node %u\n", c
->pnn
));
2903 talloc_free(tmp_ctx
);
2906 if (c
->pnn
>= nodemap
->num
) {
2907 DEBUG(DEBUG_ERR
,(__location__
" Nodemap from recmaster does not contain node %d\n", c
->pnn
));
2908 talloc_free(tmp_ctx
);
2912 /* send the flags update to all connected nodes */
2913 nodes
= list_of_connected_nodes(ctdb
, nodemap
, tmp_ctx
, true);
2915 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_MODIFY_FLAGS
,
2916 nodes
, 0, CONTROL_TIMEOUT(),
2920 DEBUG(DEBUG_ERR
, (__location__
" ctdb_control to modify node flags failed\n"));
2922 talloc_free(tmp_ctx
);
2926 talloc_free(tmp_ctx
);
/* Shared state for the async recmode verification: the aggregate
 * result (status) is updated by each per-node callback.
 * NOTE(review): a pending-reply counter field appears to be missing
 * from this view of the struct — the wait loop below uses ->count. */
2930 struct verify_recmode_normal_data
{
2932 enum monitor_result status
;
/* Per-node completion callback for the async getrecmode controls sent
 * by verify_recmode(). Degrades the aggregate status to MONITOR_FAILED
 * on control failure, or to MONITOR_RECOVERY_NEEDED if the node
 * reports it is not in normal recovery mode.
 */
2935 static void verify_recmode_normal_callback(struct ctdb_client_control_state
*state
)
2937 struct verify_recmode_normal_data
*rmdata
= talloc_get_type(state
->async
.private_data
, struct verify_recmode_normal_data
);
2940 /* one more node has responded with recmode data*/
2943 /* if we failed to get the recmode, then return an error and let
2944 the main loop try again.
2946 if (state
->state
!= CTDB_CONTROL_DONE
) {
2947 if (rmdata
->status
== MONITOR_OK
) {
2948 rmdata
->status
= MONITOR_FAILED
;
2953 /* if we got a response, then the recmode will be stored in the
2956 if (state
->status
!= CTDB_RECOVERY_NORMAL
) {
2957 DEBUG(DEBUG_NOTICE
, ("Node:%u was in recovery mode. Start recovery process\n", state
->c
->hdr
.destnode
));
2958 rmdata
->status
= MONITOR_RECOVERY_NEEDED
;
2965 /* verify that all nodes are in normal recovery mode */
/* Send an async getrecmode control to every active node, pump the
 * event loop until all replies arrive, and return the aggregate
 * monitor_result (OK / FAILED / RECOVERY_NEEDED) built up by
 * verify_recmode_normal_callback(). Inactive nodes are skipped.
 */
2966 static enum monitor_result
verify_recmode(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
)
2968 struct verify_recmode_normal_data
*rmdata
;
2969 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
2970 struct ctdb_client_control_state
*state
;
2971 enum monitor_result status
;
2974 rmdata
= talloc(mem_ctx
, struct verify_recmode_normal_data
);
2975 CTDB_NO_MEMORY_FATAL(ctdb
, rmdata
);
2977 rmdata
->status
= MONITOR_OK
;
2979 /* loop over all active nodes and send an async getrecmode call to
2981 for (j
=0; j
<nodemap
->num
; j
++) {
2982 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
2985 state
= ctdb_ctrl_getrecmode_send(ctdb
, mem_ctx
,
2987 nodemap
->nodes
[j
].pnn
);
2988 if (state
== NULL
) {
2989 /* we failed to send the control, treat this as
2990 an error and try again next iteration
2992 DEBUG(DEBUG_ERR
,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2993 talloc_free(mem_ctx
);
2994 return MONITOR_FAILED
;
2997 /* set up the callback functions */
2998 state
->async
.fn
= verify_recmode_normal_callback
;
2999 state
->async
.private_data
= rmdata
;
3001 /* one more control to wait for to complete */
3006 /* now wait for up to the maximum number of seconds allowed
3007 or until all nodes we expect a response from has replied
3009 while (rmdata
->count
> 0) {
3010 event_loop_once(ctdb
->ev
);
3013 status
= rmdata
->status
;
3014 talloc_free(mem_ctx
);
/* Shared state for the async recmaster verification: the recoverd
 * context (for culprit tracking) and the aggregate status.
 * NOTE(review): the expected-PNN and pending-reply count fields appear
 * to be missing from this view — the callback uses ->pnn and the wait
 * loop uses ->count. */
3019 struct verify_recmaster_data
{
3020 struct ctdb_recoverd
*rec
;
3023 enum monitor_result status
;
/* Per-node completion callback for the async getrecmaster controls
 * sent by verify_recmaster(). Degrades the aggregate status to
 * MONITOR_FAILED on control failure; if a node disagrees about who the
 * recmaster is, marks that node as culprit and sets
 * MONITOR_ELECTION_NEEDED.
 */
3026 static void verify_recmaster_callback(struct ctdb_client_control_state
*state
)
3028 struct verify_recmaster_data
*rmdata
= talloc_get_type(state
->async
.private_data
, struct verify_recmaster_data
);
3031 /* one more node has responded with recmaster data*/
3034 /* if we failed to get the recmaster, then return an error and let
3035 the main loop try again.
3037 if (state
->state
!= CTDB_CONTROL_DONE
) {
3038 if (rmdata
->status
== MONITOR_OK
) {
3039 rmdata
->status
= MONITOR_FAILED
;
3044 /* if we got a response, then the recmaster will be stored in the
3047 if (state
->status
!= rmdata
->pnn
) {
3048 DEBUG(DEBUG_ERR
,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state
->c
->hdr
.destnode
, state
->status
));
3049 ctdb_set_culprit(rmdata
->rec
, state
->c
->hdr
.destnode
);
3050 rmdata
->status
= MONITOR_ELECTION_NEEDED
;
3057 /* verify that all nodes agree that we are the recmaster */
/* Send an async getrecmaster control to every active node, pump the
 * event loop until all replies arrive, and return the aggregate
 * monitor_result (OK / FAILED / ELECTION_NEEDED) built up by
 * verify_recmaster_callback(). Inactive nodes are skipped.
 */
3058 static enum monitor_result
verify_recmaster(struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
, uint32_t pnn
)
3060 struct ctdb_context
*ctdb
= rec
->ctdb
;
3061 struct verify_recmaster_data
*rmdata
;
3062 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
3063 struct ctdb_client_control_state
*state
;
3064 enum monitor_result status
;
3067 rmdata
= talloc(mem_ctx
, struct verify_recmaster_data
);
3068 CTDB_NO_MEMORY_FATAL(ctdb
, rmdata
);
3072 rmdata
->status
= MONITOR_OK
;
3074 /* loop over all active nodes and send an async getrecmaster call to
3076 for (j
=0; j
<nodemap
->num
; j
++) {
3077 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3080 state
= ctdb_ctrl_getrecmaster_send(ctdb
, mem_ctx
,
3082 nodemap
->nodes
[j
].pnn
);
3083 if (state
== NULL
) {
3084 /* we failed to send the control, treat this as
3085 an error and try again next iteration
3087 DEBUG(DEBUG_ERR
,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
3088 talloc_free(mem_ctx
);
3089 return MONITOR_FAILED
;
3092 /* set up the callback functions */
3093 state
->async
.fn
= verify_recmaster_callback
;
3094 state
->async
.private_data
= rmdata
;
3096 /* one more control to wait for to complete */
3101 /* now wait for up to the maximum number of seconds allowed
3102 or until all nodes we expect a response from has replied
3104 while (rmdata
->count
> 0) {
3105 event_loop_once(ctdb
->ev
);
3108 status
= rmdata
->status
;
3109 talloc_free(mem_ctx
);
/* Compare the local node's current interface list against the cached
 * copy in rec->ifaces. Returns true on first call (no cache), when the
 * interface count differs, when any name in the same slot differs, or
 * when any link state changed. On a fetch error it assumes "changed"
 * as the safe default. Caches the freshly-fetched list on rec before
 * returning.
 */
3113 static bool interfaces_have_changed(struct ctdb_context
*ctdb
,
3114 struct ctdb_recoverd
*rec
)
3116 struct ctdb_control_get_ifaces
*ifaces
= NULL
;
3117 TALLOC_CTX
*mem_ctx
;
3120 mem_ctx
= talloc_new(NULL
);
3122 /* Read the interfaces from the local node */
3123 if (ctdb_ctrl_get_ifaces(ctdb
, CONTROL_TIMEOUT(),
3124 CTDB_CURRENT_NODE
, mem_ctx
, &ifaces
) != 0) {
3125 DEBUG(DEBUG_ERR
, ("Unable to get interfaces from local node %u\n", ctdb
->pnn
));
3126 /* We could return an error. However, this will be
3127 * rare so we'll decide that the interfaces have
3128 * actually changed, just in case.
3130 talloc_free(mem_ctx
);
3135 /* We haven't been here before so things have changed */
3136 DEBUG(DEBUG_NOTICE
, ("Initial interface fetched\n"));
3138 } else if (rec
->ifaces
->num
!= ifaces
->num
) {
3139 /* Number of interfaces has changed */
3140 DEBUG(DEBUG_NOTICE
, ("Interface count changed from %d to %d\n",
3141 rec
->ifaces
->num
, ifaces
->num
));
3144 /* See if interface names or link states have changed */
3146 for (i
= 0; i
< rec
->ifaces
->num
; i
++) {
3147 struct ctdb_control_iface_info
* iface
= &rec
->ifaces
->ifaces
[i
];
3148 if (strcmp(iface
->name
, ifaces
->ifaces
[i
].name
) != 0) {
3150 ("Interface in slot %d changed: %s => %s\n",
3151 i
, iface
->name
, ifaces
->ifaces
[i
].name
));
3155 if (iface
->link_state
!= ifaces
->ifaces
[i
].link_state
) {
3157 ("Interface %s changed state: %d => %d\n",
3158 iface
->name
, iface
->link_state
,
3159 ifaces
->ifaces
[i
].link_state
));
/* Replace the cached list with the fresh one (reparented onto rec). */
3166 talloc_free(rec
->ifaces
);
3167 rec
->ifaces
= talloc_steal(rec
, ifaces
);
3169 talloc_free(mem_ctx
);
3173 /* called to check that the local allocation of public ip addresses is ok.
/* Verify the local public-IP allocation. Skips the check when a
 * recovery started/finished between the two uptime snapshots (i.e. the
 * IP list may be stale), or while a recovery is in progress. Flags a
 * takeover run if: interfaces changed, an unassigned IP could be
 * served by this (healthy) node, or an IP assigned to us is not
 * actually on an interface. Releases any IP we are serving but should
 * not be. Finally, if needed, asks the recmaster (via
 * CTDB_SRVID_TAKEOVER_RUN) to do a takeover run.
 */
3175 static int verify_local_ip_allocation(struct ctdb_context
*ctdb
, struct ctdb_recoverd
*rec
, uint32_t pnn
, struct ctdb_node_map
*nodemap
)
3177 TALLOC_CTX
*mem_ctx
= talloc_new(NULL
);
3178 struct ctdb_uptime
*uptime1
= NULL
;
3179 struct ctdb_uptime
*uptime2
= NULL
;
3181 bool need_takeover_run
= false;
3183 ret
= ctdb_ctrl_uptime(ctdb
, mem_ctx
, CONTROL_TIMEOUT(),
3184 CTDB_CURRENT_NODE
, &uptime1
);
3186 DEBUG(DEBUG_ERR
, ("Unable to get uptime from local node %u\n", pnn
));
3187 talloc_free(mem_ctx
);
3191 if (interfaces_have_changed(ctdb
, rec
)) {
3192 DEBUG(DEBUG_NOTICE
, ("The interfaces status has changed on "
3193 "local node %u - force takeover run\n",
3195 need_takeover_run
= true;
/* Second uptime snapshot: used to detect a recovery racing with us. */
3198 ret
= ctdb_ctrl_uptime(ctdb
, mem_ctx
, CONTROL_TIMEOUT(),
3199 CTDB_CURRENT_NODE
, &uptime2
);
3201 DEBUG(DEBUG_ERR
, ("Unable to get uptime from local node %u\n", pnn
));
3202 talloc_free(mem_ctx
);
3206 /* skip the check if the startrecovery time has changed */
3207 if (timeval_compare(&uptime1
->last_recovery_started
,
3208 &uptime2
->last_recovery_started
) != 0) {
3209 DEBUG(DEBUG_NOTICE
, (__location__
" last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3210 talloc_free(mem_ctx
);
3214 /* skip the check if the endrecovery time has changed */
3215 if (timeval_compare(&uptime1
->last_recovery_finished
,
3216 &uptime2
->last_recovery_finished
) != 0) {
3217 DEBUG(DEBUG_NOTICE
, (__location__
" last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3218 talloc_free(mem_ctx
);
3222 /* skip the check if we have started but not finished recovery */
3223 if (timeval_compare(&uptime1
->last_recovery_finished
,
3224 &uptime1
->last_recovery_started
) != 1) {
3225 DEBUG(DEBUG_INFO
, (__location__
" in the middle of recovery or ip reallocation. skipping public ip address check\n"));
3226 talloc_free(mem_ctx
);
3231 /* verify that we have the ip addresses we should have
3232 and we dont have ones we shouldnt have.
3233 if we find an inconsistency we set recmode to
3234 active on the local node and wait for the recmaster
3235 to do a full blown recovery.
3236 also if the pnn is -1 and we are healthy and can host the ip
3237 we also request a ip reallocation.
3239 if (ctdb
->tunable
.disable_ip_failover
== 0) {
3240 struct ctdb_all_public_ips
*ips
= NULL
;
3242 /* read the *available* IPs from the local node */
3243 ret
= ctdb_ctrl_get_public_ips_flags(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, mem_ctx
, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE
, &ips
);
3245 DEBUG(DEBUG_ERR
, ("Unable to get available public IPs from local node %u\n", pnn
));
3246 talloc_free(mem_ctx
);
3250 for (j
=0; j
<ips
->num
; j
++) {
/* pnn == -1 means the IP is unassigned; flags == 0 means we are
 * healthy and could host it. */
3251 if (ips
->ips
[j
].pnn
== -1 &&
3252 nodemap
->nodes
[pnn
].flags
== 0) {
3253 DEBUG(DEBUG_CRIT
,("Public IP '%s' is not assigned and we could serve it\n",
3254 ctdb_addr_to_str(&ips
->ips
[j
].addr
)));
3255 need_takeover_run
= true;
3261 /* read the *known* IPs from the local node */
3262 ret
= ctdb_ctrl_get_public_ips_flags(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, mem_ctx
, 0, &ips
);
3264 DEBUG(DEBUG_ERR
, ("Unable to get known public IPs from local node %u\n", pnn
));
3265 talloc_free(mem_ctx
);
3269 for (j
=0; j
<ips
->num
; j
++) {
3270 if (ips
->ips
[j
].pnn
== pnn
) {
3271 if (ctdb
->do_checkpublicip
&& !ctdb_sys_have_ip(&ips
->ips
[j
].addr
)) {
3272 DEBUG(DEBUG_CRIT
,("Public IP '%s' is assigned to us but not on an interface\n",
3273 ctdb_addr_to_str(&ips
->ips
[j
].addr
)));
3274 need_takeover_run
= true;
3277 if (ctdb
->do_checkpublicip
&&
3278 ctdb_sys_have_ip(&ips
->ips
[j
].addr
)) {
3280 DEBUG(DEBUG_CRIT
,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
3281 ctdb_addr_to_str(&ips
->ips
[j
].addr
)));
3283 if (ctdb_ctrl_release_ip(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, &ips
->ips
[j
]) != 0) {
3284 DEBUG(DEBUG_ERR
,("Failed to release local IP address\n"));
3291 if (need_takeover_run
) {
3292 struct srvid_request rd
;
3295 DEBUG(DEBUG_CRIT
,("Trigger takeoverrun\n"));
3299 data
.dptr
= (uint8_t *)&rd
;
3300 data
.dsize
= sizeof(rd
);
3302 ret
= ctdb_client_send_message(ctdb
, rec
->recmaster
, CTDB_SRVID_TAKEOVER_RUN
, data
);
3304 DEBUG(DEBUG_ERR
,(__location__
" Failed to send ipreallocate to recmaster :%d\n", (int)rec
->recmaster
));
3307 talloc_free(mem_ctx
);
/* Completion callback for the async GET_NODEMAP controls sent by
 * get_remote_nodemaps(): stores the returned nodemap (reparented onto
 * the remote_nodemaps array) in the slot indexed by the replying
 * node's PNN, after bounds-checking the PNN.
 */
3312 static void async_getnodemap_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
3314 struct ctdb_node_map
**remote_nodemaps
= callback_data
;
3316 if (node_pnn
>= ctdb
->num_nodes
) {
3317 DEBUG(DEBUG_ERR
,(__location__
" pnn from invalid node\n"));
3321 remote_nodemaps
[node_pnn
] = (struct ctdb_node_map
*)talloc_steal(remote_nodemaps
, outdata
.dptr
);
/* Fetch the nodemap from every active node in parallel via
 * CTDB_CONTROL_GET_NODEMAP; results land in remote_nodemaps[] through
 * async_getnodemap_callback(). Logs and fails if any node could not be
 * pulled.
 */
3325 static int get_remote_nodemaps(struct ctdb_context
*ctdb
, TALLOC_CTX
*mem_ctx
,
3326 struct ctdb_node_map
*nodemap
,
3327 struct ctdb_node_map
**remote_nodemaps
)
3331 nodes
= list_of_active_nodes(ctdb
, nodemap
, mem_ctx
, true);
3332 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_GET_NODEMAP
,
3334 CONTROL_TIMEOUT(), false, tdb_null
,
3335 async_getnodemap_callback
,
3337 remote_nodemaps
) != 0) {
3338 DEBUG(DEBUG_ERR
, (__location__
" Unable to pull all remote nodemaps\n"));
/* Sync ctdb->recovery_lock_file with the reclock path configured in
 * the main daemon. Handles three transitions: reclock disabled (free
 * cached path, drop any held lock), reclock newly enabled (cache the
 * path, drop the lock so it is re-taken on the new file), and reclock
 * path changed (replace cached path, drop the lock). An unchanged path
 * is a no-op.
 */
3346 static int update_recovery_lock_file(struct ctdb_context
*ctdb
)
3348 TALLOC_CTX
*tmp_ctx
= talloc_new(NULL
);
3349 const char *reclockfile
;
3351 if (ctdb_ctrl_getreclock(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, tmp_ctx
, &reclockfile
) != 0) {
3352 DEBUG(DEBUG_ERR
,("Failed to read reclock file from daemon\n"));
3353 talloc_free(tmp_ctx
);
3357 if (reclockfile
== NULL
) {
3358 if (ctdb
->recovery_lock_file
!= NULL
) {
3359 DEBUG(DEBUG_NOTICE
,("Recovery lock file disabled\n"));
3360 talloc_free(ctdb
->recovery_lock_file
);
3361 ctdb
->recovery_lock_file
= NULL
;
3362 ctdb_recovery_unlock(ctdb
);
3364 talloc_free(tmp_ctx
);
3368 if (ctdb
->recovery_lock_file
== NULL
) {
3370 ("Recovery lock file enabled (%s)\n", reclockfile
));
3371 ctdb
->recovery_lock_file
= talloc_strdup(ctdb
, reclockfile
);
3372 ctdb_recovery_unlock(ctdb
);
3373 talloc_free(tmp_ctx
);
/* Unchanged path: nothing to do. */
3378 if (!strcmp(reclockfile
, ctdb
->recovery_lock_file
)) {
3379 talloc_free(tmp_ctx
);
3384 ("Recovery lock file changed (now %s)\n", reclockfile
));
3385 talloc_free(ctdb
->recovery_lock_file
);
3386 ctdb
->recovery_lock_file
= talloc_strdup(ctdb
, reclockfile
);
3387 ctdb_recovery_unlock(ctdb
);
3389 talloc_free(tmp_ctx
);
3393 static void main_loop(struct ctdb_context
*ctdb
, struct ctdb_recoverd
*rec
,
3394 TALLOC_CTX
*mem_ctx
)
3397 struct ctdb_node_map
*nodemap
=NULL
;
3398 struct ctdb_node_map
*recmaster_nodemap
=NULL
;
3399 struct ctdb_node_map
**remote_nodemaps
=NULL
;
3400 struct ctdb_vnn_map
*vnnmap
=NULL
;
3401 struct ctdb_vnn_map
*remote_vnnmap
=NULL
;
3402 uint32_t num_lmasters
;
3403 int32_t debug_level
;
3408 /* verify that the main daemon is still running */
3409 if (ctdb_kill(ctdb
, ctdb
->ctdbd_pid
, 0) != 0) {
3410 DEBUG(DEBUG_CRIT
,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3414 /* ping the local daemon to tell it we are alive */
3415 ctdb_ctrl_recd_ping(ctdb
);
3417 if (rec
->election_timeout
) {
3418 /* an election is in progress */
3422 /* read the debug level from the parent and update locally */
3423 ret
= ctdb_ctrl_get_debuglevel(ctdb
, CTDB_CURRENT_NODE
, &debug_level
);
3425 DEBUG(DEBUG_ERR
, (__location__
" Failed to read debuglevel from parent\n"));
3428 DEBUGLEVEL
= debug_level
;
3430 /* get relevant tunables */
3431 ret
= ctdb_ctrl_get_all_tunables(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, &ctdb
->tunable
);
3433 DEBUG(DEBUG_ERR
,("Failed to get tunables - retrying\n"));
3438 ret
= ctdb_ctrl_get_runstate(ctdb
, CONTROL_TIMEOUT(),
3439 CTDB_CURRENT_NODE
, &ctdb
->runstate
);
3441 DEBUG(DEBUG_ERR
, ("Failed to get runstate - retrying\n"));
3445 /* get the current recovery lock file from the server */
3446 if (update_recovery_lock_file(ctdb
) != 0) {
3447 DEBUG(DEBUG_ERR
,("Failed to update the recovery lock file\n"));
3451 /* Make sure that if recovery lock verification becomes disabled when
3454 if (ctdb
->recovery_lock_file
== NULL
) {
3455 ctdb_recovery_unlock(ctdb
);
3458 pnn
= ctdb_get_pnn(ctdb
);
3460 /* get the vnnmap */
3461 ret
= ctdb_ctrl_getvnnmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, &vnnmap
);
3463 DEBUG(DEBUG_ERR
, (__location__
" Unable to get vnnmap from node %u\n", pnn
));
3468 /* get number of nodes */
3470 talloc_free(rec
->nodemap
);
3471 rec
->nodemap
= NULL
;
3474 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), pnn
, rec
, &rec
->nodemap
);
3476 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from node %u\n", pnn
));
3479 nodemap
= rec
->nodemap
;
3481 /* remember our own node flags */
3482 rec
->node_flags
= nodemap
->nodes
[pnn
].flags
;
3484 ban_misbehaving_nodes(rec
, &self_ban
);
3486 DEBUG(DEBUG_NOTICE
, ("This node was banned, restart main_loop\n"));
3490 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
3491 also frozen and that the recmode is set to active.
3493 if (rec
->node_flags
& (NODE_FLAGS_STOPPED
| NODE_FLAGS_BANNED
)) {
3494 /* If this node has become inactive then we want to
3495 * reduce the chances of it taking over the recovery
3496 * master role when it becomes active again. This
3497 * helps to stabilise the recovery master role so that
3498 * it stays on the most stable node.
3500 rec
->priority_time
= timeval_current();
3502 ret
= ctdb_ctrl_getrecmode(ctdb
, mem_ctx
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, &ctdb
->recovery_mode
);
3504 DEBUG(DEBUG_ERR
,(__location__
" Failed to read recmode from local node\n"));
3506 if (ctdb
->recovery_mode
== CTDB_RECOVERY_NORMAL
) {
3507 DEBUG(DEBUG_ERR
,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
3509 ret
= ctdb_ctrl_setrecmode(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, CTDB_RECOVERY_ACTIVE
);
3511 DEBUG(DEBUG_ERR
,(__location__
" Failed to activate recovery mode in STOPPED or BANNED state\n"));
3515 ret
= ctdb_ctrl_freeze(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
);
3517 DEBUG(DEBUG_ERR
,(__location__
" Failed to freeze node in STOPPED or BANNED state\n"));
3522 /* If this node is stopped or banned then it is not the recovery
3523 * master, so don't do anything. This prevents stopped or banned
3524 * node from starting election and sending unnecessary controls.
3529 /* check which node is the recovery master */
3530 ret
= ctdb_ctrl_getrecmaster(ctdb
, mem_ctx
, CONTROL_TIMEOUT(), pnn
, &rec
->recmaster
);
3532 DEBUG(DEBUG_ERR
, (__location__
" Unable to get recmaster from node %u\n", pnn
));
3536 /* If we are not the recmaster then do some housekeeping */
3537 if (rec
->recmaster
!= pnn
) {
3538 /* Ignore any IP reallocate requests - only recmaster
3541 TALLOC_FREE(rec
->reallocate_requests
);
3542 /* Clear any nodes that should be force rebalanced in
3543 * the next takeover run. If the recovery master role
3544 * has moved then we don't want to process these some
3545 * time in the future.
3547 TALLOC_FREE(rec
->force_rebalance_nodes
);
3550 /* This is a special case. When recovery daemon is started, recmaster
3551 * is set to -1. If a node is not started in stopped state, then
3552 * start election to decide recovery master
3554 if (rec
->recmaster
== (uint32_t)-1) {
3555 DEBUG(DEBUG_NOTICE
,(__location__
" Initial recovery master set - forcing election\n"));
3556 force_election(rec
, pnn
, nodemap
);
3560 /* update the capabilities for all nodes */
3561 ret
= update_capabilities(rec
, nodemap
);
3563 DEBUG(DEBUG_ERR
, (__location__
" Unable to update node capabilities.\n"));
3568 * If the current recmaster does not have CTDB_CAP_RECMASTER,
3569 * but we have, then force an election and try to become the new
3572 if (!ctdb_node_has_capabilities(rec
->caps
,
3574 CTDB_CAP_RECMASTER
) &&
3575 (rec
->ctdb
->capabilities
& CTDB_CAP_RECMASTER
) &&
3576 !(nodemap
->nodes
[pnn
].flags
& NODE_FLAGS_INACTIVE
)) {
3577 DEBUG(DEBUG_ERR
, (__location__
" Current recmaster node %u does not have CAP_RECMASTER,"
3578 " but we (node %u) have - force an election\n",
3579 rec
->recmaster
, pnn
));
3580 force_election(rec
, pnn
, nodemap
);
3584 /* verify that the recmaster node is still active */
3585 for (j
=0; j
<nodemap
->num
; j
++) {
3586 if (nodemap
->nodes
[j
].pnn
==rec
->recmaster
) {
3591 if (j
== nodemap
->num
) {
3592 DEBUG(DEBUG_ERR
, ("Recmaster node %u not in list. Force reelection\n", rec
->recmaster
));
3593 force_election(rec
, pnn
, nodemap
);
3597 /* if recovery master is disconnected we must elect a new recmaster */
3598 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_DISCONNECTED
) {
3599 DEBUG(DEBUG_NOTICE
, ("Recmaster node %u is disconnected. Force reelection\n", nodemap
->nodes
[j
].pnn
));
3600 force_election(rec
, pnn
, nodemap
);
3604 /* get nodemap from the recovery master to check if it is inactive */
3605 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
3606 mem_ctx
, &recmaster_nodemap
);
3608 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from recovery master %u\n",
3609 nodemap
->nodes
[j
].pnn
));
3614 if ((recmaster_nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) &&
3615 (rec
->node_flags
& NODE_FLAGS_INACTIVE
) == 0) {
3616 DEBUG(DEBUG_NOTICE
, ("Recmaster node %u no longer available. Force reelection\n", nodemap
->nodes
[j
].pnn
));
3618 * update our nodemap to carry the recmaster's notion of
3619 * its own flags, so that we don't keep freezing the
3620 * inactive recmaster node...
3622 nodemap
->nodes
[j
].flags
= recmaster_nodemap
->nodes
[j
].flags
;
3623 force_election(rec
, pnn
, nodemap
);
3627 /* verify that we have all ip addresses we should have and we dont
3628 * have addresses we shouldnt have.
3630 if (ctdb
->tunable
.disable_ip_failover
== 0 &&
3631 !ctdb_op_is_disabled(rec
->takeover_run
)) {
3632 if (verify_local_ip_allocation(ctdb
, rec
, pnn
, nodemap
) != 0) {
3633 DEBUG(DEBUG_ERR
, (__location__
" Public IPs were inconsistent.\n"));
3638 /* if we are not the recmaster then we do not need to check
3639 if recovery is needed
3641 if (pnn
!= rec
->recmaster
) {
3646 /* ensure our local copies of flags are right */
3647 ret
= update_local_flags(rec
, nodemap
);
3648 if (ret
== MONITOR_ELECTION_NEEDED
) {
3649 DEBUG(DEBUG_NOTICE
,("update_local_flags() called for a re-election.\n"));
3650 force_election(rec
, pnn
, nodemap
);
3653 if (ret
!= MONITOR_OK
) {
3654 DEBUG(DEBUG_ERR
,("Unable to update local flags\n"));
3658 if (ctdb
->num_nodes
!= nodemap
->num
) {
3659 DEBUG(DEBUG_ERR
, (__location__
" ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb
->num_nodes
, nodemap
->num
));
3660 ctdb_load_nodes_file(ctdb
);
3664 /* verify that all active nodes agree that we are the recmaster */
3665 switch (verify_recmaster(rec
, nodemap
, pnn
)) {
3666 case MONITOR_RECOVERY_NEEDED
:
3667 /* can not happen */
3669 case MONITOR_ELECTION_NEEDED
:
3670 force_election(rec
, pnn
, nodemap
);
3674 case MONITOR_FAILED
:
3679 if (rec
->need_recovery
) {
3680 /* a previous recovery didn't finish */
3681 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3685 /* verify that all active nodes are in normal mode
3686 and not in recovery mode
3688 switch (verify_recmode(ctdb
, nodemap
)) {
3689 case MONITOR_RECOVERY_NEEDED
:
3690 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3692 case MONITOR_FAILED
:
3694 case MONITOR_ELECTION_NEEDED
:
3695 /* can not happen */
3701 if (ctdb
->recovery_lock_file
!= NULL
) {
3702 /* We must already hold the recovery lock */
3703 if (!ctdb_recovery_have_lock(ctdb
)) {
3704 DEBUG(DEBUG_ERR
,("Failed recovery lock sanity check. Force a recovery\n"));
3705 ctdb_set_culprit(rec
, ctdb
->pnn
);
3706 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3712 /* if there are takeovers requested, perform it and notify the waiters */
3713 if (!ctdb_op_is_disabled(rec
->takeover_run
) &&
3714 rec
->reallocate_requests
) {
3715 process_ipreallocate_requests(ctdb
, rec
);
3718 /* If recoveries are disabled then there is no use doing any
3719 * nodemap or flags checks. Recoveries might be disabled due
3720 * to "reloadnodes", so doing these checks might cause an
3721 * unnecessary recovery. */
3722 if (ctdb_op_is_disabled(rec
->recovery
)) {
3726 /* get the nodemap for all active remote nodes
3728 remote_nodemaps
= talloc_array(mem_ctx
, struct ctdb_node_map
*, nodemap
->num
);
3729 if (remote_nodemaps
== NULL
) {
3730 DEBUG(DEBUG_ERR
, (__location__
" failed to allocate remote nodemap array\n"));
3733 for(i
=0; i
<nodemap
->num
; i
++) {
3734 remote_nodemaps
[i
] = NULL
;
3736 if (get_remote_nodemaps(ctdb
, mem_ctx
, nodemap
, remote_nodemaps
) != 0) {
3737 DEBUG(DEBUG_ERR
,(__location__
" Failed to read remote nodemaps\n"));
3741 /* verify that all other nodes have the same nodemap as we have
3743 for (j
=0; j
<nodemap
->num
; j
++) {
3744 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3748 if (remote_nodemaps
[j
] == NULL
) {
3749 DEBUG(DEBUG_ERR
,(__location__
" Did not get a remote nodemap for node %d, restarting monitoring\n", j
));
3750 ctdb_set_culprit(rec
, j
);
3755 /* if the nodes disagree on how many nodes there are
3756 then this is a good reason to try recovery
3758 if (remote_nodemaps
[j
]->num
!= nodemap
->num
) {
3759 DEBUG(DEBUG_ERR
, (__location__
" Remote node:%u has different node count. %u vs %u of the local node\n",
3760 nodemap
->nodes
[j
].pnn
, remote_nodemaps
[j
]->num
, nodemap
->num
));
3761 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3762 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3766 /* if the nodes disagree on which nodes exist and are
3767 active, then that is also a good reason to do recovery
3769 for (i
=0;i
<nodemap
->num
;i
++) {
3770 if (remote_nodemaps
[j
]->nodes
[i
].pnn
!= nodemap
->nodes
[i
].pnn
) {
3771 DEBUG(DEBUG_ERR
, (__location__
" Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3772 nodemap
->nodes
[j
].pnn
, i
,
3773 remote_nodemaps
[j
]->nodes
[i
].pnn
, nodemap
->nodes
[i
].pnn
));
3774 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3775 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
3783 * Update node flags obtained from each active node. This ensure we have
3784 * up-to-date information for all the nodes.
3786 for (j
=0; j
<nodemap
->num
; j
++) {
3787 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3790 nodemap
->nodes
[j
].flags
= remote_nodemaps
[j
]->nodes
[j
].flags
;
3793 for (j
=0; j
<nodemap
->num
; j
++) {
3794 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3798 /* verify the flags are consistent
3800 for (i
=0; i
<nodemap
->num
; i
++) {
3801 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
) {
3805 if (nodemap
->nodes
[i
].flags
!= remote_nodemaps
[j
]->nodes
[i
].flags
) {
3806 DEBUG(DEBUG_ERR
, (__location__
" Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3807 nodemap
->nodes
[j
].pnn
,
3808 nodemap
->nodes
[i
].pnn
,
3809 remote_nodemaps
[j
]->nodes
[i
].flags
,
3810 nodemap
->nodes
[i
].flags
));
3812 DEBUG(DEBUG_ERR
,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps
[j
]->nodes
[i
].flags
, j
));
3813 update_flags_on_all_nodes(ctdb
, nodemap
, nodemap
->nodes
[i
].pnn
, remote_nodemaps
[j
]->nodes
[i
].flags
);
3814 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3815 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
3819 DEBUG(DEBUG_ERR
,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap
->nodes
[i
].flags
, i
));
3820 update_flags_on_all_nodes(ctdb
, nodemap
, nodemap
->nodes
[i
].pnn
, nodemap
->nodes
[i
].flags
);
3821 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3822 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
3831 /* count how many active nodes there are */
3833 for (i
=0; i
<nodemap
->num
; i
++) {
3834 if (!(nodemap
->nodes
[i
].flags
& NODE_FLAGS_INACTIVE
)) {
3835 if (ctdb_node_has_capabilities(rec
->caps
,
3836 ctdb
->nodes
[i
]->pnn
,
3837 CTDB_CAP_LMASTER
)) {
3844 /* There must be the same number of lmasters in the vnn map as
3845 * there are active nodes with the lmaster capability... or
3848 if (vnnmap
->size
!= num_lmasters
) {
3849 DEBUG(DEBUG_ERR
, (__location__
" The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
3850 vnnmap
->size
, num_lmasters
));
3851 ctdb_set_culprit(rec
, ctdb
->pnn
);
3852 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3856 /* verify that all active nodes in the nodemap also exist in
3859 for (j
=0; j
<nodemap
->num
; j
++) {
3860 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3863 if (nodemap
->nodes
[j
].pnn
== pnn
) {
3867 for (i
=0; i
<vnnmap
->size
; i
++) {
3868 if (vnnmap
->map
[i
] == nodemap
->nodes
[j
].pnn
) {
3872 if (i
== vnnmap
->size
) {
3873 DEBUG(DEBUG_ERR
, (__location__
" Node %u is active in the nodemap but did not exist in the vnnmap\n",
3874 nodemap
->nodes
[j
].pnn
));
3875 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3876 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3882 /* verify that all other nodes have the same vnnmap
3883 and are from the same generation
3885 for (j
=0; j
<nodemap
->num
; j
++) {
3886 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3889 if (nodemap
->nodes
[j
].pnn
== pnn
) {
3893 ret
= ctdb_ctrl_getvnnmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
3894 mem_ctx
, &remote_vnnmap
);
3896 DEBUG(DEBUG_ERR
, (__location__
" Unable to get vnnmap from remote node %u\n",
3897 nodemap
->nodes
[j
].pnn
));
3901 /* verify the vnnmap generation is the same */
3902 if (vnnmap
->generation
!= remote_vnnmap
->generation
) {
3903 DEBUG(DEBUG_ERR
, (__location__
" Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3904 nodemap
->nodes
[j
].pnn
, remote_vnnmap
->generation
, vnnmap
->generation
));
3905 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3906 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3910 /* verify the vnnmap size is the same */
3911 if (vnnmap
->size
!= remote_vnnmap
->size
) {
3912 DEBUG(DEBUG_ERR
, (__location__
" Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3913 nodemap
->nodes
[j
].pnn
, remote_vnnmap
->size
, vnnmap
->size
));
3914 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3915 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3919 /* verify the vnnmap is the same */
3920 for (i
=0;i
<vnnmap
->size
;i
++) {
3921 if (remote_vnnmap
->map
[i
] != vnnmap
->map
[i
]) {
3922 DEBUG(DEBUG_ERR
, (__location__
" Remote node %u has different vnnmap.\n",
3923 nodemap
->nodes
[j
].pnn
));
3924 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3925 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
3932 /* we might need to change who has what IP assigned */
3933 if (rec
->need_takeover_run
) {
3934 uint32_t culprit
= (uint32_t)-1;
3936 rec
->need_takeover_run
= false;
3938 /* update the list of public ips that a node can handle for
3941 ret
= ctdb_reload_remote_public_ips(ctdb
, rec
, nodemap
, &culprit
);
3943 DEBUG(DEBUG_ERR
,("Failed to read public ips from remote node %d\n",
3945 rec
->need_takeover_run
= true;
3949 /* execute the "startrecovery" event script on all nodes */
3950 ret
= run_startrecovery_eventscript(rec
, nodemap
);
3952 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'startrecovery' event on cluster\n"));
3953 ctdb_set_culprit(rec
, ctdb
->pnn
);
3954 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3958 /* If takeover run fails, then the offending nodes are
3959 * assigned ban culprit counts. And we re-try takeover.
3960 * If takeover run fails repeatedly, the node would get
3963 * If rec->need_takeover_run is not set to true at this
3964 * failure, monitoring is disabled cluster-wide (via
3965 * startrecovery eventscript) and will not get enabled.
3967 if (!do_takeover_run(rec
, nodemap
, true)) {
3971 /* execute the "recovered" event script on all nodes */
3972 ret
= run_recovered_eventscript(rec
, nodemap
, "monitor_cluster");
3974 // we cant check whether the event completed successfully
3975 // since this script WILL fail if the node is in recovery mode
3976 // and if that race happens, the code here would just cause a second
3977 // cascading recovery.
3979 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3980 ctdb_set_culprit(rec
, ctdb
->pnn
);
3981 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3988 the main monitoring loop
3990 static void monitor_cluster(struct ctdb_context
*ctdb
)
3992 struct ctdb_recoverd
*rec
;
3994 DEBUG(DEBUG_NOTICE
,("monitor_cluster starting\n"));
3996 rec
= talloc_zero(ctdb
, struct ctdb_recoverd
);
3997 CTDB_NO_MEMORY_FATAL(ctdb
, rec
);
4001 rec
->takeover_run
= ctdb_op_init(rec
, "takeover runs");
4002 CTDB_NO_MEMORY_FATAL(ctdb
, rec
->takeover_run
);
4004 rec
->recovery
= ctdb_op_init(rec
, "recoveries");
4005 CTDB_NO_MEMORY_FATAL(ctdb
, rec
->recovery
);
4007 rec
->priority_time
= timeval_current();
4009 /* register a message port for sending memory dumps */
4010 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_MEM_DUMP
, mem_dump_handler
, rec
);
4012 /* register a message port for recovery elections */
4013 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_RECOVERY
, election_handler
, rec
);
4015 /* when nodes are disabled/enabled */
4016 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_SET_NODE_FLAGS
, monitor_handler
, rec
);
4018 /* when we are asked to puch out a flag change */
4019 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_PUSH_NODE_FLAGS
, push_flags_handler
, rec
);
4021 /* register a message port for vacuum fetch */
4022 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_VACUUM_FETCH
, vacuum_fetch_handler
, rec
);
4024 /* register a message port for reloadnodes */
4025 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_RELOAD_NODES
, reload_nodes_handler
, rec
);
4027 /* register a message port for performing a takeover run */
4028 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_TAKEOVER_RUN
, ip_reallocate_handler
, rec
);
4030 /* register a message port for disabling the ip check for a short while */
4031 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_DISABLE_IP_CHECK
, disable_ip_check_handler
, rec
);
4033 /* register a message port for updating the recovery daemons node assignment for an ip */
4034 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_RECD_UPDATE_IP
, recd_update_ip_handler
, rec
);
4036 /* register a message port for forcing a rebalance of a node next
4038 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_REBALANCE_NODE
, recd_node_rebalance_handler
, rec
);
4040 /* Register a message port for disabling takeover runs */
4041 ctdb_client_set_message_handler(ctdb
,
4042 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
4043 disable_takeover_runs_handler
, rec
);
4045 /* Register a message port for disabling recoveries */
4046 ctdb_client_set_message_handler(ctdb
,
4047 CTDB_SRVID_DISABLE_RECOVERIES
,
4048 disable_recoveries_handler
, rec
);
4050 /* register a message port for detaching database */
4051 ctdb_client_set_message_handler(ctdb
,
4052 CTDB_SRVID_DETACH_DATABASE
,
4053 detach_database_handler
, rec
);
4056 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
4057 struct timeval start
;
4061 DEBUG(DEBUG_CRIT
,(__location__
4062 " Failed to create temp context\n"));
4066 start
= timeval_current();
4067 main_loop(ctdb
, rec
, mem_ctx
);
4068 talloc_free(mem_ctx
);
4070 /* we only check for recovery once every second */
4071 elapsed
= timeval_elapsed(&start
);
4072 if (elapsed
< ctdb
->tunable
.recover_interval
) {
4073 ctdb_wait_timeout(ctdb
, ctdb
->tunable
.recover_interval
4080 event handler for when the main ctdbd dies
4082 static void ctdb_recoverd_parent(struct event_context
*ev
, struct fd_event
*fde
,
4083 uint16_t flags
, void *private_data
)
4085 DEBUG(DEBUG_ALERT
,("recovery daemon parent died - exiting\n"));
4090 called regularly to verify that the recovery daemon is still running
4092 static void ctdb_check_recd(struct event_context
*ev
, struct timed_event
*te
,
4093 struct timeval yt
, void *p
)
4095 struct ctdb_context
*ctdb
= talloc_get_type(p
, struct ctdb_context
);
4097 if (ctdb_kill(ctdb
, ctdb
->recoverd_pid
, 0) != 0) {
4098 DEBUG(DEBUG_ERR
,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb
->recoverd_pid
));
4100 event_add_timed(ctdb
->ev
, ctdb
, timeval_zero(),
4101 ctdb_restart_recd
, ctdb
);
4106 event_add_timed(ctdb
->ev
, ctdb
->recd_ctx
,
4107 timeval_current_ofs(30, 0),
4108 ctdb_check_recd
, ctdb
);
4111 static void recd_sig_child_handler(struct event_context
*ev
,
4112 struct signal_event
*se
, int signum
, int count
,
4116 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4121 pid
= waitpid(-1, &status
, WNOHANG
);
4123 if (errno
!= ECHILD
) {
4124 DEBUG(DEBUG_ERR
, (__location__
" waitpid() returned error. errno:%s(%d)\n", strerror(errno
),errno
));
4129 DEBUG(DEBUG_DEBUG
, ("RECD SIGCHLD from %d\n", (int)pid
));
4135 startup the recovery daemon as a child of the main ctdb daemon
4137 int ctdb_start_recoverd(struct ctdb_context
*ctdb
)
4140 struct signal_event
*se
;
4141 struct tevent_fd
*fde
;
4143 if (pipe(fd
) != 0) {
4147 ctdb
->recoverd_pid
= ctdb_fork(ctdb
);
4148 if (ctdb
->recoverd_pid
== -1) {
4152 if (ctdb
->recoverd_pid
!= 0) {
4153 talloc_free(ctdb
->recd_ctx
);
4154 ctdb
->recd_ctx
= talloc_new(ctdb
);
4155 CTDB_NO_MEMORY(ctdb
, ctdb
->recd_ctx
);
4158 event_add_timed(ctdb
->ev
, ctdb
->recd_ctx
,
4159 timeval_current_ofs(30, 0),
4160 ctdb_check_recd
, ctdb
);
4166 srandom(getpid() ^ time(NULL
));
4168 ctdb_set_process_name("ctdb_recovered");
4169 if (switch_from_server_to_client(ctdb
, "recoverd") != 0) {
4170 DEBUG(DEBUG_CRIT
, (__location__
"ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
4174 DEBUG(DEBUG_DEBUG
, (__location__
" Created PIPE FD:%d to recovery daemon\n", fd
[0]));
4176 fde
= event_add_fd(ctdb
->ev
, ctdb
, fd
[0], EVENT_FD_READ
,
4177 ctdb_recoverd_parent
, &fd
[0]);
4178 tevent_fd_set_auto_close(fde
);
4180 /* set up a handler to pick up sigchld */
4181 se
= event_add_signal(ctdb
->ev
, ctdb
,
4183 recd_sig_child_handler
,
4186 DEBUG(DEBUG_CRIT
,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
4190 monitor_cluster(ctdb
);
4192 DEBUG(DEBUG_ALERT
,("ERROR: ctdb_recoverd finished!?\n"));
4197 shutdown the recovery daemon
4199 void ctdb_stop_recoverd(struct ctdb_context
*ctdb
)
4201 if (ctdb
->recoverd_pid
== 0) {
4205 DEBUG(DEBUG_NOTICE
,("Shutting down recovery daemon\n"));
4206 ctdb_kill(ctdb
, ctdb
->recoverd_pid
, SIGTERM
);
4208 TALLOC_FREE(ctdb
->recd_ctx
);
4209 TALLOC_FREE(ctdb
->recd_ping_count
);
4212 static void ctdb_restart_recd(struct event_context
*ev
, struct timed_event
*te
,
4213 struct timeval t
, void *private_data
)
4215 struct ctdb_context
*ctdb
= talloc_get_type(private_data
, struct ctdb_context
);
4217 DEBUG(DEBUG_ERR
,("Restarting recovery daemon\n"));
4218 ctdb_stop_recoverd(ctdb
);
4219 ctdb_start_recoverd(ctdb
);