4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
27 #include "../include/ctdb_client.h"
28 #include "../include/ctdb_private.h"
29 #include "lib/tdb_wrap/tdb_wrap.h"
30 #include "lib/util/dlinklist.h"
33 /* List of SRVID requests that need to be processed */
35 struct srvid_list
*next
, *prev
;
36 struct srvid_request
*request
;
39 struct srvid_requests
{
40 struct srvid_list
*requests
;
43 static void srvid_request_reply(struct ctdb_context
*ctdb
,
44 struct srvid_request
*request
,
47 /* Someone that sent srvid==0 does not want a reply */
48 if (request
->srvid
== 0) {
53 if (ctdb_client_send_message(ctdb
, request
->pnn
, request
->srvid
,
55 DEBUG(DEBUG_INFO
,("Sent SRVID reply to %u:%llu\n",
56 (unsigned)request
->pnn
,
57 (unsigned long long)request
->srvid
));
59 DEBUG(DEBUG_ERR
,("Failed to send SRVID reply to %u:%llu\n",
60 (unsigned)request
->pnn
,
61 (unsigned long long)request
->srvid
));
67 static void srvid_requests_reply(struct ctdb_context
*ctdb
,
68 struct srvid_requests
**requests
,
73 for (r
= (*requests
)->requests
; r
!= NULL
; r
= r
->next
) {
74 srvid_request_reply(ctdb
, r
->request
, result
);
77 /* Free the list structure... */
78 TALLOC_FREE(*requests
);
81 static void srvid_request_add(struct ctdb_context
*ctdb
,
82 struct srvid_requests
**requests
,
83 struct srvid_request
*request
)
89 if (*requests
== NULL
) {
90 *requests
= talloc_zero(ctdb
, struct srvid_requests
);
91 if (*requests
== NULL
) {
96 t
= talloc_zero(*requests
, struct srvid_list
);
98 /* If *requests was just allocated above then free it */
99 if ((*requests
)->requests
== NULL
) {
100 TALLOC_FREE(*requests
);
105 t
->request
= (struct srvid_request
*)talloc_steal(t
, request
);
106 DLIST_ADD((*requests
)->requests
, t
);
111 /* Failed to add the request to the list. Send a fail. */
112 DEBUG(DEBUG_ERR
, (__location__
113 " Out of memory, failed to queue SRVID request\n"));
115 result
.dsize
= sizeof(ret
);
116 result
.dptr
= (uint8_t *)&ret
;
117 srvid_request_reply(ctdb
, request
, result
);
120 /* An abstraction to allow an operation (takeover runs, recoveries,
121 * ...) to be disabled for a given timeout */
122 struct ctdb_op_state
{
123 struct tevent_timer
*timer
;
128 static struct ctdb_op_state
*ctdb_op_init(TALLOC_CTX
*mem_ctx
, const char *name
)
130 struct ctdb_op_state
*state
= talloc_zero(mem_ctx
, struct ctdb_op_state
);
133 state
->in_progress
= false;
140 static bool ctdb_op_is_disabled(struct ctdb_op_state
*state
)
142 return state
->timer
!= NULL
;
145 static bool ctdb_op_begin(struct ctdb_op_state
*state
)
147 if (ctdb_op_is_disabled(state
)) {
149 ("Unable to begin - %s are disabled\n", state
->name
));
153 state
->in_progress
= true;
157 static bool ctdb_op_end(struct ctdb_op_state
*state
)
159 return state
->in_progress
= false;
162 static bool ctdb_op_is_in_progress(struct ctdb_op_state
*state
)
164 return state
->in_progress
;
167 static void ctdb_op_enable(struct ctdb_op_state
*state
)
169 TALLOC_FREE(state
->timer
);
172 static void ctdb_op_timeout_handler(struct event_context
*ev
,
173 struct timed_event
*te
,
174 struct timeval yt
, void *p
)
176 struct ctdb_op_state
*state
=
177 talloc_get_type(p
, struct ctdb_op_state
);
179 DEBUG(DEBUG_NOTICE
,("Reenabling %s after timeout\n", state
->name
));
180 ctdb_op_enable(state
);
183 static int ctdb_op_disable(struct ctdb_op_state
*state
,
184 struct tevent_context
*ev
,
188 DEBUG(DEBUG_NOTICE
,("Reenabling %s\n", state
->name
));
189 ctdb_op_enable(state
);
193 if (state
->in_progress
) {
195 ("Unable to disable %s - in progress\n", state
->name
));
199 DEBUG(DEBUG_NOTICE
,("Disabling %s for %u seconds\n",
200 state
->name
, timeout
));
202 /* Clear any old timers */
203 talloc_free(state
->timer
);
205 /* Arrange for the timeout to occur */
206 state
->timer
= tevent_add_timer(ev
, state
,
207 timeval_current_ofs(timeout
, 0),
208 ctdb_op_timeout_handler
, state
);
209 if (state
->timer
== NULL
) {
210 DEBUG(DEBUG_ERR
,(__location__
" Unable to setup timer\n"));
217 struct ctdb_banning_state
{
219 struct timeval last_reported_time
;
223 private state of recovery daemon
225 struct ctdb_recoverd
{
226 struct ctdb_context
*ctdb
;
228 uint32_t last_culprit_node
;
229 struct ctdb_node_map
*nodemap
;
230 struct timeval priority_time
;
231 bool need_takeover_run
;
234 struct timed_event
*send_election_te
;
235 struct timed_event
*election_timeout
;
236 struct vacuum_info
*vacuum_info
;
237 struct srvid_requests
*reallocate_requests
;
238 struct ctdb_op_state
*takeover_run
;
239 struct ctdb_op_state
*recovery
;
240 struct ctdb_control_get_ifaces
*ifaces
;
241 uint32_t *force_rebalance_nodes
;
242 struct ctdb_node_capabilities
*caps
;
245 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
246 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
248 static void ctdb_restart_recd(struct event_context
*ev
, struct timed_event
*te
, struct timeval t
, void *private_data
);
251 ban a node for a period of time
253 static void ctdb_ban_node(struct ctdb_recoverd
*rec
, uint32_t pnn
, uint32_t ban_time
)
256 struct ctdb_context
*ctdb
= rec
->ctdb
;
257 struct ctdb_ban_time bantime
;
259 if (!ctdb_validate_pnn(ctdb
, pnn
)) {
260 DEBUG(DEBUG_ERR
,("Bad pnn %u in ctdb_ban_node\n", pnn
));
264 DEBUG(DEBUG_NOTICE
,("Banning node %u for %u seconds\n", pnn
, ban_time
));
267 bantime
.time
= ban_time
;
269 ret
= ctdb_ctrl_set_ban(ctdb
, CONTROL_TIMEOUT(), pnn
, &bantime
);
271 DEBUG(DEBUG_ERR
,(__location__
" Failed to ban node %d\n", pnn
));
277 enum monitor_result
{ MONITOR_OK
, MONITOR_RECOVERY_NEEDED
, MONITOR_ELECTION_NEEDED
, MONITOR_FAILED
};
281 remember the trouble maker
283 static void ctdb_set_culprit_count(struct ctdb_recoverd
*rec
, uint32_t culprit
, uint32_t count
)
285 struct ctdb_context
*ctdb
= talloc_get_type(rec
->ctdb
, struct ctdb_context
);
286 struct ctdb_banning_state
*ban_state
;
288 if (culprit
> ctdb
->num_nodes
) {
289 DEBUG(DEBUG_ERR
,("Trying to set culprit %d but num_nodes is %d\n", culprit
, ctdb
->num_nodes
));
293 /* If we are banned or stopped, do not set other nodes as culprits */
294 if (rec
->node_flags
& NODE_FLAGS_INACTIVE
) {
295 DEBUG(DEBUG_NOTICE
, ("This node is INACTIVE, cannot set culprit node %d\n", culprit
));
299 if (ctdb
->nodes
[culprit
]->ban_state
== NULL
) {
300 ctdb
->nodes
[culprit
]->ban_state
= talloc_zero(ctdb
->nodes
[culprit
], struct ctdb_banning_state
);
301 CTDB_NO_MEMORY_VOID(ctdb
, ctdb
->nodes
[culprit
]->ban_state
);
305 ban_state
= ctdb
->nodes
[culprit
]->ban_state
;
306 if (timeval_elapsed(&ban_state
->last_reported_time
) > ctdb
->tunable
.recovery_grace_period
) {
307 /* this was the first time in a long while this node
308 misbehaved so we will forgive any old transgressions.
310 ban_state
->count
= 0;
313 ban_state
->count
+= count
;
314 ban_state
->last_reported_time
= timeval_current();
315 rec
->last_culprit_node
= culprit
;
319 remember the trouble maker
321 static void ctdb_set_culprit(struct ctdb_recoverd
*rec
, uint32_t culprit
)
323 ctdb_set_culprit_count(rec
, culprit
, 1);
327 /* this callback is called for every node that failed to execute the
330 static void recovered_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
332 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
334 DEBUG(DEBUG_ERR
, (__location__
" Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn
));
336 ctdb_set_culprit(rec
, node_pnn
);
340 run the "recovered" eventscript on all nodes
342 static int run_recovered_eventscript(struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
, const char *caller
)
346 struct ctdb_context
*ctdb
= rec
->ctdb
;
348 tmp_ctx
= talloc_new(ctdb
);
349 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
351 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
352 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_END_RECOVERY
,
354 CONTROL_TIMEOUT(), false, tdb_null
,
355 NULL
, recovered_fail_callback
,
357 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'recovered' event when called from %s\n", caller
));
359 talloc_free(tmp_ctx
);
363 talloc_free(tmp_ctx
);
367 /* this callback is called for every node that failed to execute the
370 static void startrecovery_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
372 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
374 DEBUG(DEBUG_ERR
, (__location__
" Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn
));
376 ctdb_set_culprit(rec
, node_pnn
);
380 run the "startrecovery" eventscript on all nodes
382 static int run_startrecovery_eventscript(struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
)
386 struct ctdb_context
*ctdb
= rec
->ctdb
;
388 tmp_ctx
= talloc_new(ctdb
);
389 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
391 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
392 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_START_RECOVERY
,
394 CONTROL_TIMEOUT(), false, tdb_null
,
396 startrecovery_fail_callback
,
398 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'startrecovery' event. Recovery failed.\n"));
399 talloc_free(tmp_ctx
);
403 talloc_free(tmp_ctx
);
408 update the node capabilities for all connected nodes
410 static int update_capabilities(struct ctdb_recoverd
*rec
,
411 struct ctdb_node_map
*nodemap
)
415 struct ctdb_node_capabilities
*caps
;
416 struct ctdb_context
*ctdb
= rec
->ctdb
;
418 tmp_ctx
= talloc_new(rec
);
419 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
421 caps
= ctdb_get_capabilities(ctdb
, tmp_ctx
,
422 CONTROL_TIMEOUT(), nodemap
);
426 (__location__
" Failed to get node capabilities\n"));
427 talloc_free(tmp_ctx
);
431 capp
= ctdb_get_node_capabilities(caps
, ctdb_get_pnn(ctdb
));
435 " Capabilities don't include current node.\n"));
436 talloc_free(tmp_ctx
);
439 ctdb
->capabilities
= *capp
;
441 TALLOC_FREE(rec
->caps
);
442 rec
->caps
= talloc_steal(rec
, caps
);
444 talloc_free(tmp_ctx
);
448 static void set_recmode_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
450 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
452 DEBUG(DEBUG_ERR
,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn
, rec
->nodemap
->num
));
453 ctdb_set_culprit_count(rec
, node_pnn
, rec
->nodemap
->num
);
456 static void transaction_start_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
458 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
460 DEBUG(DEBUG_ERR
,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn
, rec
->nodemap
->num
));
461 ctdb_set_culprit_count(rec
, node_pnn
, rec
->nodemap
->num
);
465 change recovery mode on all nodes
467 static int set_recovery_mode(struct ctdb_context
*ctdb
, struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
, uint32_t rec_mode
)
473 tmp_ctx
= talloc_new(ctdb
);
474 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
476 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
478 data
.dsize
= sizeof(uint32_t);
479 data
.dptr
= (unsigned char *)&rec_mode
;
481 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_SET_RECMODE
,
487 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode. Recovery failed.\n"));
488 talloc_free(tmp_ctx
);
492 /* freeze all nodes */
493 if (rec_mode
== CTDB_RECOVERY_ACTIVE
) {
496 for (i
=1; i
<=NUM_DB_PRIORITIES
; i
++) {
497 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_FREEZE
,
502 set_recmode_fail_callback
,
504 DEBUG(DEBUG_ERR
, (__location__
" Unable to freeze nodes. Recovery failed.\n"));
505 talloc_free(tmp_ctx
);
511 talloc_free(tmp_ctx
);
516 change recovery master on all node
518 static int set_recovery_master(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
, uint32_t pnn
)
524 tmp_ctx
= talloc_new(ctdb
);
525 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
527 data
.dsize
= sizeof(uint32_t);
528 data
.dptr
= (unsigned char *)&pnn
;
530 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
531 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_SET_RECMASTER
,
533 CONTROL_TIMEOUT(), false, data
,
536 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recmaster. Recovery failed.\n"));
537 talloc_free(tmp_ctx
);
541 talloc_free(tmp_ctx
);
545 /* update all remote nodes to use the same db priority that we have
546 this can fail if the remote node has not yet been upgraded to
547 support this function, so we always return success and never fail
548 a recovery if this call fails.
550 static int update_db_priority_on_remote_nodes(struct ctdb_context
*ctdb
,
551 struct ctdb_node_map
*nodemap
,
552 uint32_t pnn
, struct ctdb_dbid_map
*dbmap
, TALLOC_CTX
*mem_ctx
)
556 /* step through all local databases */
557 for (db
=0; db
<dbmap
->num
;db
++) {
558 struct ctdb_db_priority db_prio
;
561 db_prio
.db_id
= dbmap
->dbs
[db
].dbid
;
562 ret
= ctdb_ctrl_get_db_priority(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, dbmap
->dbs
[db
].dbid
, &db_prio
.priority
);
564 DEBUG(DEBUG_ERR
,(__location__
" Failed to read database priority from local node for db 0x%08x\n", dbmap
->dbs
[db
].dbid
));
568 DEBUG(DEBUG_INFO
,("Update DB priority for db 0x%08x to %u\n", dbmap
->dbs
[db
].dbid
, db_prio
.priority
));
570 ret
= ctdb_ctrl_set_db_priority(ctdb
, CONTROL_TIMEOUT(),
571 CTDB_CURRENT_NODE
, &db_prio
);
573 DEBUG(DEBUG_ERR
,(__location__
" Failed to set DB priority for 0x%08x\n",
582 ensure all other nodes have attached to any databases that we have
584 static int create_missing_remote_databases(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
,
585 uint32_t pnn
, struct ctdb_dbid_map
*dbmap
, TALLOC_CTX
*mem_ctx
)
588 struct ctdb_dbid_map
*remote_dbmap
;
590 /* verify that all other nodes have all our databases */
591 for (j
=0; j
<nodemap
->num
; j
++) {
592 /* we don't need to check ourselves */
593 if (nodemap
->nodes
[j
].pnn
== pnn
) {
596 /* dont check nodes that are unavailable */
597 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
601 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
602 mem_ctx
, &remote_dbmap
);
604 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbids from node %u\n", pnn
));
608 /* step through all local databases */
609 for (db
=0; db
<dbmap
->num
;db
++) {
613 for (i
=0;i
<remote_dbmap
->num
;i
++) {
614 if (dbmap
->dbs
[db
].dbid
== remote_dbmap
->dbs
[i
].dbid
) {
618 /* the remote node already have this database */
619 if (i
!=remote_dbmap
->num
) {
622 /* ok so we need to create this database */
623 ret
= ctdb_ctrl_getdbname(ctdb
, CONTROL_TIMEOUT(), pnn
,
624 dbmap
->dbs
[db
].dbid
, mem_ctx
,
627 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbname from node %u\n", pnn
));
630 ret
= ctdb_ctrl_createdb(ctdb
, CONTROL_TIMEOUT(),
631 nodemap
->nodes
[j
].pnn
,
633 dbmap
->dbs
[db
].flags
& CTDB_DB_FLAGS_PERSISTENT
);
635 DEBUG(DEBUG_ERR
, (__location__
" Unable to create remote db:%s\n", name
));
646 ensure we are attached to any databases that anyone else is attached to
648 static int create_missing_local_databases(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
,
649 uint32_t pnn
, struct ctdb_dbid_map
**dbmap
, TALLOC_CTX
*mem_ctx
)
652 struct ctdb_dbid_map
*remote_dbmap
;
654 /* verify that we have all database any other node has */
655 for (j
=0; j
<nodemap
->num
; j
++) {
656 /* we don't need to check ourselves */
657 if (nodemap
->nodes
[j
].pnn
== pnn
) {
660 /* dont check nodes that are unavailable */
661 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
665 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
666 mem_ctx
, &remote_dbmap
);
668 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbids from node %u\n", pnn
));
672 /* step through all databases on the remote node */
673 for (db
=0; db
<remote_dbmap
->num
;db
++) {
676 for (i
=0;i
<(*dbmap
)->num
;i
++) {
677 if (remote_dbmap
->dbs
[db
].dbid
== (*dbmap
)->dbs
[i
].dbid
) {
681 /* we already have this db locally */
682 if (i
!=(*dbmap
)->num
) {
685 /* ok so we need to create this database and
688 ctdb_ctrl_getdbname(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
689 remote_dbmap
->dbs
[db
].dbid
, mem_ctx
, &name
);
691 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbname from node %u\n",
692 nodemap
->nodes
[j
].pnn
));
695 ctdb_ctrl_createdb(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, name
,
696 remote_dbmap
->dbs
[db
].flags
& CTDB_DB_FLAGS_PERSISTENT
);
698 DEBUG(DEBUG_ERR
, (__location__
" Unable to create local db:%s\n", name
));
701 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, dbmap
);
703 DEBUG(DEBUG_ERR
, (__location__
" Unable to reread dbmap on node %u\n", pnn
));
714 pull the remote database contents from one node into the recdb
716 static int pull_one_remote_database(struct ctdb_context
*ctdb
, uint32_t srcnode
,
717 struct tdb_wrap
*recdb
, uint32_t dbid
)
721 struct ctdb_marshall_buffer
*reply
;
722 struct ctdb_rec_data
*recdata
;
724 TALLOC_CTX
*tmp_ctx
= talloc_new(recdb
);
726 ret
= ctdb_ctrl_pulldb(ctdb
, srcnode
, dbid
, CTDB_LMASTER_ANY
, tmp_ctx
,
727 CONTROL_TIMEOUT(), &outdata
);
729 DEBUG(DEBUG_ERR
,(__location__
" Unable to copy db from node %u\n", srcnode
));
730 talloc_free(tmp_ctx
);
734 reply
= (struct ctdb_marshall_buffer
*)outdata
.dptr
;
736 if (outdata
.dsize
< offsetof(struct ctdb_marshall_buffer
, data
)) {
737 DEBUG(DEBUG_ERR
,(__location__
" invalid data in pulldb reply\n"));
738 talloc_free(tmp_ctx
);
742 recdata
= (struct ctdb_rec_data
*)&reply
->data
[0];
746 recdata
= (struct ctdb_rec_data
*)(recdata
->length
+ (uint8_t *)recdata
), i
++) {
748 struct ctdb_ltdb_header
*hdr
;
751 key
.dptr
= &recdata
->data
[0];
752 key
.dsize
= recdata
->keylen
;
753 data
.dptr
= &recdata
->data
[key
.dsize
];
754 data
.dsize
= recdata
->datalen
;
756 hdr
= (struct ctdb_ltdb_header
*)data
.dptr
;
758 if (data
.dsize
< sizeof(struct ctdb_ltdb_header
)) {
759 DEBUG(DEBUG_CRIT
,(__location__
" bad ltdb record\n"));
760 talloc_free(tmp_ctx
);
764 /* fetch the existing record, if any */
765 existing
= tdb_fetch(recdb
->tdb
, key
);
767 if (existing
.dptr
!= NULL
) {
768 struct ctdb_ltdb_header header
;
769 if (existing
.dsize
< sizeof(struct ctdb_ltdb_header
)) {
770 DEBUG(DEBUG_CRIT
,(__location__
" Bad record size %u from node %u\n",
771 (unsigned)existing
.dsize
, srcnode
));
773 talloc_free(tmp_ctx
);
776 header
= *(struct ctdb_ltdb_header
*)existing
.dptr
;
778 if (!(header
.rsn
< hdr
->rsn
||
779 (header
.dmaster
!= ctdb_get_pnn(ctdb
) &&
780 header
.rsn
== hdr
->rsn
))) {
785 if (tdb_store(recdb
->tdb
, key
, data
, TDB_REPLACE
) != 0) {
786 DEBUG(DEBUG_CRIT
,(__location__
" Failed to store record\n"));
787 talloc_free(tmp_ctx
);
792 talloc_free(tmp_ctx
);
798 struct pull_seqnum_cbdata
{
804 static void pull_seqnum_cb(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
806 struct pull_seqnum_cbdata
*cb_data
= talloc_get_type(callback_data
, struct pull_seqnum_cbdata
);
809 if (cb_data
->failed
!= 0) {
810 DEBUG(DEBUG_ERR
, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn
));
815 DEBUG(DEBUG_ERR
, ("Error when pulling seqnum from node %d\n", node_pnn
));
820 if (outdata
.dsize
!= sizeof(uint64_t)) {
821 DEBUG(DEBUG_ERR
, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn
, (int)outdata
.dsize
, (int)sizeof(uint64_t)));
822 cb_data
->failed
= -1;
826 seqnum
= *((uint64_t *)outdata
.dptr
);
828 if (seqnum
> cb_data
->seqnum
||
829 (cb_data
->pnn
== -1 && seqnum
== 0)) {
830 cb_data
->seqnum
= seqnum
;
831 cb_data
->pnn
= node_pnn
;
835 static void pull_seqnum_fail_cb(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
837 struct pull_seqnum_cbdata
*cb_data
= talloc_get_type(callback_data
, struct pull_seqnum_cbdata
);
839 DEBUG(DEBUG_ERR
, ("Failed to pull db seqnum from node %d\n", node_pnn
));
843 static int pull_highest_seqnum_pdb(struct ctdb_context
*ctdb
,
844 struct ctdb_recoverd
*rec
,
845 struct ctdb_node_map
*nodemap
,
846 struct tdb_wrap
*recdb
, uint32_t dbid
)
848 TALLOC_CTX
*tmp_ctx
= talloc_new(NULL
);
852 struct pull_seqnum_cbdata
*cb_data
;
854 DEBUG(DEBUG_NOTICE
, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid
));
859 data
.dsize
= sizeof(outdata
);
860 data
.dptr
= (uint8_t *)&outdata
[0];
862 cb_data
= talloc(tmp_ctx
, struct pull_seqnum_cbdata
);
863 if (cb_data
== NULL
) {
864 DEBUG(DEBUG_ERR
, ("Failed to allocate pull highest seqnum cb_data structure\n"));
865 talloc_free(tmp_ctx
);
873 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
874 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_GET_DB_SEQNUM
,
876 CONTROL_TIMEOUT(), false, data
,
880 DEBUG(DEBUG_ERR
, (__location__
" Failed to run async GET_DB_SEQNUM\n"));
882 talloc_free(tmp_ctx
);
886 if (cb_data
->failed
!= 0) {
887 DEBUG(DEBUG_NOTICE
, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid
));
888 talloc_free(tmp_ctx
);
892 if (cb_data
->pnn
== -1) {
893 DEBUG(DEBUG_NOTICE
, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid
));
894 talloc_free(tmp_ctx
);
898 DEBUG(DEBUG_NOTICE
, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid
, cb_data
->pnn
, (long long)cb_data
->seqnum
));
900 if (pull_one_remote_database(ctdb
, cb_data
->pnn
, recdb
, dbid
) != 0) {
901 DEBUG(DEBUG_ERR
, ("Failed to pull higest seqnum database 0x%08x from node %d\n", dbid
, cb_data
->pnn
));
902 talloc_free(tmp_ctx
);
906 talloc_free(tmp_ctx
);
912 pull all the remote database contents into the recdb
914 static int pull_remote_database(struct ctdb_context
*ctdb
,
915 struct ctdb_recoverd
*rec
,
916 struct ctdb_node_map
*nodemap
,
917 struct tdb_wrap
*recdb
, uint32_t dbid
,
922 if (persistent
&& ctdb
->tunable
.recover_pdb_by_seqnum
!= 0) {
924 ret
= pull_highest_seqnum_pdb(ctdb
, rec
, nodemap
, recdb
, dbid
);
930 /* pull all records from all other nodes across onto this node
931 (this merges based on rsn)
933 for (j
=0; j
<nodemap
->num
; j
++) {
934 /* dont merge from nodes that are unavailable */
935 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
938 if (pull_one_remote_database(ctdb
, nodemap
->nodes
[j
].pnn
, recdb
, dbid
) != 0) {
939 DEBUG(DEBUG_ERR
,(__location__
" Failed to pull remote database from node %u\n",
940 nodemap
->nodes
[j
].pnn
));
941 ctdb_set_culprit_count(rec
, nodemap
->nodes
[j
].pnn
, nodemap
->num
);
951 update flags on all active nodes
953 static int update_flags_on_all_nodes(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
, uint32_t pnn
, uint32_t flags
)
957 ret
= ctdb_ctrl_modflags(ctdb
, CONTROL_TIMEOUT(), pnn
, flags
, ~flags
);
959 DEBUG(DEBUG_ERR
, (__location__
" Unable to update nodeflags on remote nodes\n"));
967 ensure all nodes have the same vnnmap we do
969 static int update_vnnmap_on_all_nodes(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
,
970 uint32_t pnn
, struct ctdb_vnn_map
*vnnmap
, TALLOC_CTX
*mem_ctx
)
974 /* push the new vnn map out to all the nodes */
975 for (j
=0; j
<nodemap
->num
; j
++) {
976 /* dont push to nodes that are unavailable */
977 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
981 ret
= ctdb_ctrl_setvnnmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
, mem_ctx
, vnnmap
);
983 DEBUG(DEBUG_ERR
, (__location__
" Unable to set vnnmap for node %u\n", pnn
));
993 struct vacuum_info
*next
, *prev
;
994 struct ctdb_recoverd
*rec
;
996 struct ctdb_db_context
*ctdb_db
;
997 struct ctdb_marshall_buffer
*recs
;
998 struct ctdb_rec_data
*r
;
1001 static void vacuum_fetch_next(struct vacuum_info
*v
);
1004 called when a vacuum fetch has completed - just free it and do the next one
1006 static void vacuum_fetch_callback(struct ctdb_client_call_state
*state
)
1013 process the next element from the vacuum list
1015 static void vacuum_fetch_next(struct vacuum_info
*v
)
1017 struct ctdb_call call
;
1018 struct ctdb_rec_data
*r
;
1020 while (v
->recs
->count
) {
1021 struct ctdb_client_call_state
*state
;
1023 struct ctdb_ltdb_header
*hdr
;
1026 call
.call_id
= CTDB_NULL_FUNC
;
1027 call
.flags
= CTDB_IMMEDIATE_MIGRATION
;
1028 call
.flags
|= CTDB_CALL_FLAG_VACUUM_MIGRATION
;
1031 v
->r
= (struct ctdb_rec_data
*)(r
->length
+ (uint8_t *)r
);
1034 call
.key
.dptr
= &r
->data
[0];
1035 call
.key
.dsize
= r
->keylen
;
1037 /* ensure we don't block this daemon - just skip a record if we can't get
1039 if (tdb_chainlock_nonblock(v
->ctdb_db
->ltdb
->tdb
, call
.key
) != 0) {
1043 data
= tdb_fetch(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
1044 if (data
.dptr
== NULL
) {
1045 tdb_chainunlock(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
1049 if (data
.dsize
< sizeof(struct ctdb_ltdb_header
)) {
1051 tdb_chainunlock(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
1055 hdr
= (struct ctdb_ltdb_header
*)data
.dptr
;
1056 if (hdr
->dmaster
== v
->rec
->ctdb
->pnn
) {
1057 /* its already local */
1059 tdb_chainunlock(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
1065 state
= ctdb_call_send(v
->ctdb_db
, &call
);
1066 tdb_chainunlock(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
1067 if (state
== NULL
) {
1068 DEBUG(DEBUG_ERR
,(__location__
" Failed to setup vacuum fetch call\n"));
1072 state
->async
.fn
= vacuum_fetch_callback
;
1073 state
->async
.private_data
= NULL
;
1081 destroy a vacuum info structure
1083 static int vacuum_info_destructor(struct vacuum_info
*v
)
1085 DLIST_REMOVE(v
->rec
->vacuum_info
, v
);
1091 handler for vacuum fetch
1093 static void vacuum_fetch_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
1094 TDB_DATA data
, void *private_data
)
1096 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
1097 struct ctdb_marshall_buffer
*recs
;
1099 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
1101 struct ctdb_dbid_map
*dbmap
=NULL
;
1102 bool persistent
= false;
1103 struct ctdb_db_context
*ctdb_db
;
1104 struct ctdb_rec_data
*r
;
1106 struct vacuum_info
*v
;
1108 recs
= (struct ctdb_marshall_buffer
*)data
.dptr
;
1109 r
= (struct ctdb_rec_data
*)&recs
->data
[0];
1111 if (recs
->count
== 0) {
1112 talloc_free(tmp_ctx
);
1118 for (v
=rec
->vacuum_info
;v
;v
=v
->next
) {
1119 if (srcnode
== v
->srcnode
&& recs
->db_id
== v
->ctdb_db
->db_id
) {
1120 /* we're already working on records from this node */
1121 talloc_free(tmp_ctx
);
1126 /* work out if the database is persistent */
1127 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, tmp_ctx
, &dbmap
);
1129 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbids from local node\n"));
1130 talloc_free(tmp_ctx
);
1134 for (i
=0;i
<dbmap
->num
;i
++) {
1135 if (dbmap
->dbs
[i
].dbid
== recs
->db_id
) {
1136 persistent
= dbmap
->dbs
[i
].flags
& CTDB_DB_FLAGS_PERSISTENT
;
1140 if (i
== dbmap
->num
) {
1141 DEBUG(DEBUG_ERR
, (__location__
" Unable to find db_id 0x%x on local node\n", recs
->db_id
));
1142 talloc_free(tmp_ctx
);
1146 /* find the name of this database */
1147 if (ctdb_ctrl_getdbname(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, recs
->db_id
, tmp_ctx
, &name
) != 0) {
1148 DEBUG(DEBUG_ERR
,(__location__
" Failed to get name of db 0x%x\n", recs
->db_id
));
1149 talloc_free(tmp_ctx
);
1154 ctdb_db
= ctdb_attach(ctdb
, CONTROL_TIMEOUT(), name
, persistent
, 0);
1155 if (ctdb_db
== NULL
) {
1156 DEBUG(DEBUG_ERR
,(__location__
" Failed to attach to database '%s'\n", name
));
1157 talloc_free(tmp_ctx
);
1161 v
= talloc_zero(rec
, struct vacuum_info
);
1163 DEBUG(DEBUG_CRIT
,(__location__
" Out of memory\n"));
1164 talloc_free(tmp_ctx
);
1169 v
->srcnode
= srcnode
;
1170 v
->ctdb_db
= ctdb_db
;
1171 v
->recs
= talloc_memdup(v
, recs
, data
.dsize
);
1172 if (v
->recs
== NULL
) {
1173 DEBUG(DEBUG_CRIT
,(__location__
" Out of memory\n"));
1175 talloc_free(tmp_ctx
);
1178 v
->r
= (struct ctdb_rec_data
*)&v
->recs
->data
[0];
1180 DLIST_ADD(rec
->vacuum_info
, v
);
1182 talloc_set_destructor(v
, vacuum_info_destructor
);
1184 vacuum_fetch_next(v
);
1185 talloc_free(tmp_ctx
);
1190 * handler for database detach
1192 static void detach_database_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
1193 TDB_DATA data
, void *private_data
)
1195 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
,
1196 struct ctdb_recoverd
);
1198 struct vacuum_info
*v
, *vnext
;
1199 struct ctdb_db_context
*ctdb_db
;
1201 if (data
.dsize
!= sizeof(db_id
)) {
1204 db_id
= *(uint32_t *)data
.dptr
;
1206 ctdb_db
= find_ctdb_db(ctdb
, db_id
);
1207 if (ctdb_db
== NULL
) {
1208 /* database is not attached */
1212 /* Stop any active vacuum fetch */
1213 v
= rec
->vacuum_info
;
1217 if (v
->ctdb_db
->db_id
== db_id
) {
1223 DLIST_REMOVE(ctdb
->db_list
, ctdb_db
);
1225 DEBUG(DEBUG_NOTICE
, ("Detached from database '%s'\n",
1227 talloc_free(ctdb_db
);
1231 called when ctdb_wait_timeout should finish
1233 static void ctdb_wait_handler(struct event_context
*ev
, struct timed_event
*te
,
1234 struct timeval yt
, void *p
)
1236 uint32_t *timed_out
= (uint32_t *)p
;
1241 wait for a given number of seconds
1243 static void ctdb_wait_timeout(struct ctdb_context
*ctdb
, double secs
)
1245 uint32_t timed_out
= 0;
1246 time_t usecs
= (secs
- (time_t)secs
) * 1000000;
1247 event_add_timed(ctdb
->ev
, ctdb
, timeval_current_ofs(secs
, usecs
), ctdb_wait_handler
, &timed_out
);
1248 while (!timed_out
) {
1249 event_loop_once(ctdb
->ev
);
1254 called when an election times out (ends)
1256 static void ctdb_election_timeout(struct event_context
*ev
, struct timed_event
*te
,
1257 struct timeval t
, void *p
)
1259 struct ctdb_recoverd
*rec
= talloc_get_type(p
, struct ctdb_recoverd
);
1260 rec
->election_timeout
= NULL
;
1263 DEBUG(DEBUG_WARNING
,("Election period ended\n"));
1268 wait for an election to finish. It finished election_timeout seconds after
1269 the last election packet is received
1271 static void ctdb_wait_election(struct ctdb_recoverd
*rec
)
1273 struct ctdb_context
*ctdb
= rec
->ctdb
;
1274 while (rec
->election_timeout
) {
1275 event_loop_once(ctdb
->ev
);
1280 Update our local flags from all remote connected nodes.
1281 This is only run when we are, or we believe we are, the recovery master
1283 static int update_local_flags(struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
)
1286 struct ctdb_context
*ctdb
= rec
->ctdb
;
1287 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
1289 /* get the nodemap for all active remote nodes and verify
1290 they are the same as for this node
1292 for (j
=0; j
<nodemap
->num
; j
++) {
1293 struct ctdb_node_map
*remote_nodemap
=NULL
;
1296 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_DISCONNECTED
) {
1299 if (nodemap
->nodes
[j
].pnn
== ctdb
->pnn
) {
1303 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
1304 mem_ctx
, &remote_nodemap
);
1306 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from remote node %u\n",
1307 nodemap
->nodes
[j
].pnn
));
1308 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
1309 talloc_free(mem_ctx
);
1310 return MONITOR_FAILED
;
1312 if (nodemap
->nodes
[j
].flags
!= remote_nodemap
->nodes
[j
].flags
) {
1313 /* We should tell our daemon about this so it
1314 updates its flags or else we will log the same
1315 message again in the next iteration of recovery.
1316 Since we are the recovery master we can just as
1317 well update the flags on all nodes.
1319 ret
= ctdb_ctrl_modflags(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
, remote_nodemap
->nodes
[j
].flags
, ~remote_nodemap
->nodes
[j
].flags
);
1321 DEBUG(DEBUG_ERR
, (__location__
" Unable to update nodeflags on remote nodes\n"));
1325 /* Update our local copy of the flags in the recovery
1328 DEBUG(DEBUG_NOTICE
,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1329 nodemap
->nodes
[j
].pnn
, remote_nodemap
->nodes
[j
].flags
,
1330 nodemap
->nodes
[j
].flags
));
1331 nodemap
->nodes
[j
].flags
= remote_nodemap
->nodes
[j
].flags
;
1333 talloc_free(remote_nodemap
);
1335 talloc_free(mem_ctx
);
1340 /* Create a new random generation ip.
1341 The generation id can not be the INVALID_GENERATION id
1343 static uint32_t new_generation(void)
1345 uint32_t generation
;
1348 generation
= random();
1350 if (generation
!= INVALID_GENERATION
) {
1360 create a temporary working database
1362 static struct tdb_wrap
*create_recdb(struct ctdb_context
*ctdb
, TALLOC_CTX
*mem_ctx
)
1365 struct tdb_wrap
*recdb
;
1368 /* open up the temporary recovery database */
1369 name
= talloc_asprintf(mem_ctx
, "%s/recdb.tdb.%u",
1370 ctdb
->db_directory_state
,
1377 tdb_flags
= TDB_NOLOCK
;
1378 if (ctdb
->valgrinding
) {
1379 tdb_flags
|= TDB_NOMMAP
;
1381 tdb_flags
|= (TDB_INCOMPATIBLE_HASH
| TDB_DISALLOW_NESTING
);
1383 recdb
= tdb_wrap_open(mem_ctx
, name
, ctdb
->tunable
.database_hash_size
,
1384 tdb_flags
, O_RDWR
|O_CREAT
|O_EXCL
, 0600);
1385 if (recdb
== NULL
) {
1386 DEBUG(DEBUG_CRIT
,(__location__
" Failed to create temp recovery database '%s'\n", name
));
/*
  State for the recdb traverse (traverse_recdb): collects all relevant
  records from the temporary recovery database into one marshall buffer.
 */
struct recdb_data {
	struct ctdb_context *ctdb;
	struct ctdb_marshall_buffer *recdata; /* accumulated record blob */
	uint32_t len;		/* bytes of recdata currently used */
	uint32_t allocated_len;	/* bytes allocated for recdata */
	bool failed;		/* set by the traverse callback on error */
	bool persistent;	/* true for persistent DBs: keep empty records */
};
1407 static int traverse_recdb(struct tdb_context
*tdb
, TDB_DATA key
, TDB_DATA data
, void *p
)
1409 struct recdb_data
*params
= (struct recdb_data
*)p
;
1410 struct ctdb_rec_data
*recdata
;
1411 struct ctdb_ltdb_header
*hdr
;
1414 * skip empty records - but NOT for persistent databases:
1416 * The record-by-record mode of recovery deletes empty records.
1417 * For persistent databases, this can lead to data corruption
1418 * by deleting records that should be there:
1420 * - Assume the cluster has been running for a while.
1422 * - A record R in a persistent database has been created and
1423 * deleted a couple of times, the last operation being deletion,
1424 * leaving an empty record with a high RSN, say 10.
1426 * - Now a node N is turned off.
1428 * - This leaves the local database copy of D on N with the empty
1429 * copy of R and RSN 10. On all other nodes, the recovery has deleted
1430 * the copy of record R.
1432 * - Now the record is created again while node N is turned off.
1433 * This creates R with RSN = 1 on all nodes except for N.
1435 * - Now node N is turned on again. The following recovery will chose
1436 * the older empty copy of R due to RSN 10 > RSN 1.
1438 * ==> Hence the record is gone after the recovery.
1440 * On databases like Samba's registry, this can damage the higher-level
1441 * data structures built from the various tdb-level records.
1443 if (!params
->persistent
&& data
.dsize
<= sizeof(struct ctdb_ltdb_header
)) {
1447 /* update the dmaster field to point to us */
1448 hdr
= (struct ctdb_ltdb_header
*)data
.dptr
;
1449 if (!params
->persistent
) {
1450 hdr
->dmaster
= params
->ctdb
->pnn
;
1451 hdr
->flags
|= CTDB_REC_FLAG_MIGRATED_WITH_DATA
;
1454 /* add the record to the blob ready to send to the nodes */
1455 recdata
= ctdb_marshall_record(params
->recdata
, 0, key
, NULL
, data
);
1456 if (recdata
== NULL
) {
1457 params
->failed
= true;
1460 if (params
->len
+ recdata
->length
>= params
->allocated_len
) {
1461 params
->allocated_len
= recdata
->length
+ params
->len
+ params
->ctdb
->tunable
.pulldb_preallocation_size
;
1462 params
->recdata
= talloc_realloc_size(NULL
, params
->recdata
, params
->allocated_len
);
1464 if (params
->recdata
== NULL
) {
1465 DEBUG(DEBUG_CRIT
,(__location__
" Failed to expand recdata to %u\n",
1466 recdata
->length
+ params
->len
));
1467 params
->failed
= true;
1470 params
->recdata
->count
++;
1471 memcpy(params
->len
+(uint8_t *)params
->recdata
, recdata
, recdata
->length
);
1472 params
->len
+= recdata
->length
;
1473 talloc_free(recdata
);
1479 push the recdb database out to all nodes
1481 static int push_recdb_database(struct ctdb_context
*ctdb
, uint32_t dbid
,
1483 struct tdb_wrap
*recdb
, struct ctdb_node_map
*nodemap
)
1485 struct recdb_data params
;
1486 struct ctdb_marshall_buffer
*recdata
;
1488 TALLOC_CTX
*tmp_ctx
;
1491 tmp_ctx
= talloc_new(ctdb
);
1492 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
1494 recdata
= talloc_zero(recdb
, struct ctdb_marshall_buffer
);
1495 CTDB_NO_MEMORY(ctdb
, recdata
);
1497 recdata
->db_id
= dbid
;
1500 params
.recdata
= recdata
;
1501 params
.len
= offsetof(struct ctdb_marshall_buffer
, data
);
1502 params
.allocated_len
= params
.len
;
1503 params
.failed
= false;
1504 params
.persistent
= persistent
;
1506 if (tdb_traverse_read(recdb
->tdb
, traverse_recdb
, ¶ms
) == -1) {
1507 DEBUG(DEBUG_ERR
,(__location__
" Failed to traverse recdb database\n"));
1508 talloc_free(params
.recdata
);
1509 talloc_free(tmp_ctx
);
1513 if (params
.failed
) {
1514 DEBUG(DEBUG_ERR
,(__location__
" Failed to traverse recdb database\n"));
1515 talloc_free(params
.recdata
);
1516 talloc_free(tmp_ctx
);
1520 recdata
= params
.recdata
;
1522 outdata
.dptr
= (void *)recdata
;
1523 outdata
.dsize
= params
.len
;
1525 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
1526 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_PUSH_DB
,
1528 CONTROL_TIMEOUT(), false, outdata
,
1531 DEBUG(DEBUG_ERR
,(__location__
" Failed to push recdb records to nodes for db 0x%x\n", dbid
));
1532 talloc_free(recdata
);
1533 talloc_free(tmp_ctx
);
1537 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - pushed remote database 0x%x of size %u\n",
1538 dbid
, recdata
->count
));
1540 talloc_free(recdata
);
1541 talloc_free(tmp_ctx
);
1548 go through a full recovery on one database
1550 static int recover_database(struct ctdb_recoverd
*rec
,
1551 TALLOC_CTX
*mem_ctx
,
1555 struct ctdb_node_map
*nodemap
,
1556 uint32_t transaction_id
)
1558 struct tdb_wrap
*recdb
;
1560 struct ctdb_context
*ctdb
= rec
->ctdb
;
1562 struct ctdb_control_wipe_database w
;
1565 recdb
= create_recdb(ctdb
, mem_ctx
);
1566 if (recdb
== NULL
) {
1570 /* pull all remote databases onto the recdb */
1571 ret
= pull_remote_database(ctdb
, rec
, nodemap
, recdb
, dbid
, persistent
);
1573 DEBUG(DEBUG_ERR
, (__location__
" Unable to pull remote database 0x%x\n", dbid
));
1577 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - pulled remote database 0x%x\n", dbid
));
1579 /* wipe all the remote databases. This is safe as we are in a transaction */
1581 w
.transaction_id
= transaction_id
;
1583 data
.dptr
= (void *)&w
;
1584 data
.dsize
= sizeof(w
);
1586 nodes
= list_of_active_nodes(ctdb
, nodemap
, recdb
, true);
1587 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_WIPE_DATABASE
,
1589 CONTROL_TIMEOUT(), false, data
,
1592 DEBUG(DEBUG_ERR
, (__location__
" Unable to wipe database. Recovery failed.\n"));
1597 /* push out the correct database. This sets the dmaster and skips
1598 the empty records */
1599 ret
= push_recdb_database(ctdb
, dbid
, persistent
, recdb
, nodemap
);
1605 /* all done with this database */
1611 static int ctdb_reload_remote_public_ips(struct ctdb_context
*ctdb
,
1612 struct ctdb_recoverd
*rec
,
1613 struct ctdb_node_map
*nodemap
,
1619 if (ctdb
->num_nodes
!= nodemap
->num
) {
1620 DEBUG(DEBUG_ERR
, (__location__
" ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1621 ctdb
->num_nodes
, nodemap
->num
));
1623 *culprit
= ctdb
->pnn
;
1628 for (j
=0; j
<nodemap
->num
; j
++) {
1629 /* For readability */
1630 struct ctdb_node
*node
= ctdb
->nodes
[j
];
1632 /* release any existing data */
1633 if (node
->known_public_ips
) {
1634 talloc_free(node
->known_public_ips
);
1635 node
->known_public_ips
= NULL
;
1637 if (node
->available_public_ips
) {
1638 talloc_free(node
->available_public_ips
);
1639 node
->available_public_ips
= NULL
;
1642 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
1646 /* Retrieve the list of known public IPs from the node */
1647 ret
= ctdb_ctrl_get_public_ips_flags(ctdb
,
1652 &node
->known_public_ips
);
1655 ("Failed to read known public IPs from node: %u\n",
1658 *culprit
= node
->pnn
;
1663 if (ctdb
->do_checkpublicip
&&
1664 !ctdb_op_is_disabled(rec
->takeover_run
) &&
1665 verify_remote_ip_allocation(ctdb
,
1666 node
->known_public_ips
,
1668 DEBUG(DEBUG_ERR
,("Trigger IP reallocation\n"));
1669 rec
->need_takeover_run
= true;
1672 /* Retrieve the list of available public IPs from the node */
1673 ret
= ctdb_ctrl_get_public_ips_flags(ctdb
,
1677 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE
,
1678 &node
->available_public_ips
);
1681 ("Failed to read available public IPs from node: %u\n",
1684 *culprit
= node
->pnn
;
1693 /* when we start a recovery, make sure all nodes use the same reclock file
1696 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd
*rec
)
1698 struct ctdb_context
*ctdb
= rec
->ctdb
;
1699 TALLOC_CTX
*tmp_ctx
= talloc_new(NULL
);
1703 if (ctdb
->recovery_lock_file
== NULL
) {
1707 data
.dsize
= strlen(ctdb
->recovery_lock_file
) + 1;
1708 data
.dptr
= (uint8_t *)ctdb
->recovery_lock_file
;
1711 nodes
= list_of_active_nodes(ctdb
, rec
->nodemap
, tmp_ctx
, true);
1712 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_SET_RECLOCK_FILE
,
1718 DEBUG(DEBUG_ERR
, (__location__
" Failed to sync reclock file settings\n"));
1719 talloc_free(tmp_ctx
);
1723 talloc_free(tmp_ctx
);
1729 * this callback is called for every node that failed to execute ctdb_takeover_run()
1730 * and set flag to re-run takeover run.
1732 static void takeover_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
1734 DEBUG(DEBUG_ERR
, ("Node %u failed the takeover run\n", node_pnn
));
1736 if (callback_data
!= NULL
) {
1737 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
1739 DEBUG(DEBUG_ERR
, ("Setting node %u as recovery fail culprit\n", node_pnn
));
1741 ctdb_set_culprit(rec
, node_pnn
);
1746 static void ban_misbehaving_nodes(struct ctdb_recoverd
*rec
, bool *self_ban
)
1748 struct ctdb_context
*ctdb
= rec
->ctdb
;
1750 struct ctdb_banning_state
*ban_state
;
1753 for (i
=0; i
<ctdb
->num_nodes
; i
++) {
1754 if (ctdb
->nodes
[i
]->ban_state
== NULL
) {
1757 ban_state
= (struct ctdb_banning_state
*)ctdb
->nodes
[i
]->ban_state
;
1758 if (ban_state
->count
< 2*ctdb
->num_nodes
) {
1762 DEBUG(DEBUG_NOTICE
,("Node %u reached %u banning credits - banning it for %u seconds\n",
1763 ctdb
->nodes
[i
]->pnn
, ban_state
->count
,
1764 ctdb
->tunable
.recovery_ban_period
));
1765 ctdb_ban_node(rec
, ctdb
->nodes
[i
]->pnn
, ctdb
->tunable
.recovery_ban_period
);
1766 ban_state
->count
= 0;
1768 /* Banning ourself? */
1769 if (ctdb
->nodes
[i
]->pnn
== rec
->ctdb
->pnn
) {
1775 static bool do_takeover_run(struct ctdb_recoverd
*rec
,
1776 struct ctdb_node_map
*nodemap
,
1777 bool banning_credits_on_fail
)
1779 uint32_t *nodes
= NULL
;
1780 struct srvid_request_data dtr
;
1783 uint32_t *rebalance_nodes
= rec
->force_rebalance_nodes
;
1787 DEBUG(DEBUG_NOTICE
, ("Takeover run starting\n"));
1789 if (ctdb_op_is_in_progress(rec
->takeover_run
)) {
1790 DEBUG(DEBUG_ERR
, (__location__
1791 " takeover run already in progress \n"));
1796 if (!ctdb_op_begin(rec
->takeover_run
)) {
1801 /* Disable IP checks (takeover runs, really) on other nodes
1802 * while doing this takeover run. This will stop those other
1803 * nodes from triggering takeover runs when think they should
1804 * be hosting an IP but it isn't yet on an interface. Don't
1805 * wait for replies since a failure here might cause some
1806 * noise in the logs but will not actually cause a problem.
1808 dtr
.srvid
= 0; /* No reply */
1811 data
.dptr
= (uint8_t*)&dtr
;
1812 data
.dsize
= sizeof(dtr
);
1814 nodes
= list_of_connected_nodes(rec
->ctdb
, nodemap
, rec
, false);
1816 /* Disable for 60 seconds. This can be a tunable later if
1820 for (i
= 0; i
< talloc_array_length(nodes
); i
++) {
1821 if (ctdb_client_send_message(rec
->ctdb
, nodes
[i
],
1822 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
1824 DEBUG(DEBUG_INFO
,("Failed to disable takeover runs\n"));
1828 ret
= ctdb_takeover_run(rec
->ctdb
, nodemap
,
1829 rec
->force_rebalance_nodes
,
1830 takeover_fail_callback
,
1831 banning_credits_on_fail
? rec
: NULL
);
1833 /* Reenable takeover runs and IP checks on other nodes */
1835 for (i
= 0; i
< talloc_array_length(nodes
); i
++) {
1836 if (ctdb_client_send_message(rec
->ctdb
, nodes
[i
],
1837 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
1839 DEBUG(DEBUG_INFO
,("Failed to reenable takeover runs\n"));
1844 DEBUG(DEBUG_ERR
, ("ctdb_takeover_run() failed\n"));
1850 /* Takeover run was successful so clear force rebalance targets */
1851 if (rebalance_nodes
== rec
->force_rebalance_nodes
) {
1852 TALLOC_FREE(rec
->force_rebalance_nodes
);
1854 DEBUG(DEBUG_WARNING
,
1855 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1858 rec
->need_takeover_run
= !ok
;
1860 ctdb_op_end(rec
->takeover_run
);
1862 DEBUG(DEBUG_NOTICE
, ("Takeover run %s\n", ok
? "completed successfully" : "unsuccessful"));
1868 we are the recmaster, and recovery is needed - start a recovery run
1870 static int do_recovery(struct ctdb_recoverd
*rec
,
1871 TALLOC_CTX
*mem_ctx
, uint32_t pnn
,
1872 struct ctdb_node_map
*nodemap
, struct ctdb_vnn_map
*vnnmap
)
1874 struct ctdb_context
*ctdb
= rec
->ctdb
;
1876 uint32_t generation
;
1877 struct ctdb_dbid_map
*dbmap
;
1880 struct timeval start_time
;
1881 uint32_t culprit
= (uint32_t)-1;
1884 DEBUG(DEBUG_NOTICE
, (__location__
" Starting do_recovery\n"));
1886 /* if recovery fails, force it again */
1887 rec
->need_recovery
= true;
1889 if (!ctdb_op_begin(rec
->recovery
)) {
1893 if (rec
->election_timeout
) {
1894 /* an election is in progress */
1895 DEBUG(DEBUG_ERR
, ("do_recovery called while election in progress - try again later\n"));
1899 ban_misbehaving_nodes(rec
, &self_ban
);
1901 DEBUG(DEBUG_NOTICE
, ("This node was banned, aborting recovery\n"));
1905 if (ctdb
->recovery_lock_file
!= NULL
) {
1906 if (ctdb_recovery_have_lock(ctdb
)) {
1907 DEBUG(DEBUG_NOTICE
, ("Already holding recovery lock\n"));
1909 start_time
= timeval_current();
1910 DEBUG(DEBUG_NOTICE
, ("Attempting to take recovery lock (%s)\n",
1911 ctdb
->recovery_lock_file
));
1912 if (!ctdb_recovery_lock(ctdb
)) {
1913 if (ctdb
->runstate
== CTDB_RUNSTATE_FIRST_RECOVERY
) {
1914 /* If ctdb is trying first recovery, it's
1915 * possible that current node does not know
1916 * yet who the recmaster is.
1918 DEBUG(DEBUG_ERR
, ("Unable to get recovery lock"
1919 " - retrying recovery\n"));
1923 DEBUG(DEBUG_ERR
,("Unable to get recovery lock - aborting recovery "
1924 "and ban ourself for %u seconds\n",
1925 ctdb
->tunable
.recovery_ban_period
));
1926 ctdb_ban_node(rec
, pnn
, ctdb
->tunable
.recovery_ban_period
);
1929 ctdb_ctrl_report_recd_lock_latency(ctdb
,
1931 timeval_elapsed(&start_time
));
1933 ("Recovery lock taken successfully by recovery daemon\n"));
1937 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery initiated due to problem with node %u\n", rec
->last_culprit_node
));
1939 /* get a list of all databases */
1940 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, &dbmap
);
1942 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbids from node :%u\n", pnn
));
1946 /* we do the db creation before we set the recovery mode, so the freeze happens
1947 on all databases we will be dealing with. */
1949 /* verify that we have all the databases any other node has */
1950 ret
= create_missing_local_databases(ctdb
, nodemap
, pnn
, &dbmap
, mem_ctx
);
1952 DEBUG(DEBUG_ERR
, (__location__
" Unable to create missing local databases\n"));
1956 /* verify that all other nodes have all our databases */
1957 ret
= create_missing_remote_databases(ctdb
, nodemap
, pnn
, dbmap
, mem_ctx
);
1959 DEBUG(DEBUG_ERR
, (__location__
" Unable to create missing remote databases\n"));
1962 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - created remote databases\n"));
1964 /* update the database priority for all remote databases */
1965 ret
= update_db_priority_on_remote_nodes(ctdb
, nodemap
, pnn
, dbmap
, mem_ctx
);
1967 DEBUG(DEBUG_ERR
, (__location__
" Unable to set db priority on remote nodes\n"));
1969 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - updated db priority for all databases\n"));
1972 /* update all other nodes to use the same setting for reclock files
1973 as the local recovery master.
1975 sync_recovery_lock_file_across_cluster(rec
);
1977 /* set recovery mode to active on all nodes */
1978 ret
= set_recovery_mode(ctdb
, rec
, nodemap
, CTDB_RECOVERY_ACTIVE
);
1980 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode to active on cluster\n"));
1984 /* execute the "startrecovery" event script on all nodes */
1985 ret
= run_startrecovery_eventscript(rec
, nodemap
);
1987 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'startrecovery' event on cluster\n"));
1992 update all nodes to have the same flags that we have
1994 for (i
=0;i
<nodemap
->num
;i
++) {
1995 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
) {
1999 ret
= update_flags_on_all_nodes(ctdb
, nodemap
, i
, nodemap
->nodes
[i
].flags
);
2001 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_INACTIVE
) {
2002 DEBUG(DEBUG_WARNING
, (__location__
"Unable to update flags on inactive node %d\n", i
));
2004 DEBUG(DEBUG_ERR
, (__location__
" Unable to update flags on all nodes for node %d\n", i
));
2010 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - updated flags\n"));
2012 /* pick a new generation number */
2013 generation
= new_generation();
2015 /* change the vnnmap on this node to use the new generation
2016 number but not on any other nodes.
2017 this guarantees that if we abort the recovery prematurely
2018 for some reason (a node stops responding?)
2019 that we can just return immediately and we will reenter
2020 recovery shortly again.
2021 I.e. we deliberately leave the cluster with an inconsistent
2022 generation id to allow us to abort recovery at any stage and
2023 just restart it from scratch.
2025 vnnmap
->generation
= generation
;
2026 ret
= ctdb_ctrl_setvnnmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, vnnmap
);
2028 DEBUG(DEBUG_ERR
, (__location__
" Unable to set vnnmap for node %u\n", pnn
));
2032 data
.dptr
= (void *)&generation
;
2033 data
.dsize
= sizeof(uint32_t);
2035 nodes
= list_of_active_nodes(ctdb
, nodemap
, mem_ctx
, true);
2036 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_TRANSACTION_START
,
2038 CONTROL_TIMEOUT(), false, data
,
2040 transaction_start_fail_callback
,
2042 DEBUG(DEBUG_ERR
, (__location__
" Unable to start transactions. Recovery failed.\n"));
2043 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_TRANSACTION_CANCEL
,
2045 CONTROL_TIMEOUT(), false, tdb_null
,
2049 DEBUG(DEBUG_ERR
,("Failed to cancel recovery transaction\n"));
2054 DEBUG(DEBUG_NOTICE
,(__location__
" started transactions on all nodes\n"));
2056 for (i
=0;i
<dbmap
->num
;i
++) {
2057 ret
= recover_database(rec
, mem_ctx
,
2059 dbmap
->dbs
[i
].flags
& CTDB_DB_FLAGS_PERSISTENT
,
2060 pnn
, nodemap
, generation
);
2062 DEBUG(DEBUG_ERR
, (__location__
" Failed to recover database 0x%x\n", dbmap
->dbs
[i
].dbid
));
2067 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - starting database commits\n"));
2069 /* commit all the changes */
2070 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_TRANSACTION_COMMIT
,
2072 CONTROL_TIMEOUT(), false, data
,
2075 DEBUG(DEBUG_ERR
, (__location__
" Unable to commit recovery changes. Recovery failed.\n"));
2079 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - committed databases\n"));
2082 /* update the capabilities for all nodes */
2083 ret
= update_capabilities(rec
, nodemap
);
2085 DEBUG(DEBUG_ERR
, (__location__
" Unable to update node capabilities.\n"));
2089 /* build a new vnn map with all the currently active and
2091 generation
= new_generation();
2092 vnnmap
= talloc(mem_ctx
, struct ctdb_vnn_map
);
2093 CTDB_NO_MEMORY(ctdb
, vnnmap
);
2094 vnnmap
->generation
= generation
;
2096 vnnmap
->map
= talloc_zero_array(vnnmap
, uint32_t, vnnmap
->size
);
2097 CTDB_NO_MEMORY(ctdb
, vnnmap
->map
);
2098 for (i
=j
=0;i
<nodemap
->num
;i
++) {
2099 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_INACTIVE
) {
2102 if (!ctdb_node_has_capabilities(rec
->caps
,
2103 ctdb
->nodes
[i
]->pnn
,
2104 CTDB_CAP_LMASTER
)) {
2105 /* this node can not be an lmaster */
2106 DEBUG(DEBUG_DEBUG
, ("Node %d cant be a LMASTER, skipping it\n", i
));
2111 vnnmap
->map
= talloc_realloc(vnnmap
, vnnmap
->map
, uint32_t, vnnmap
->size
);
2112 CTDB_NO_MEMORY(ctdb
, vnnmap
->map
);
2113 vnnmap
->map
[j
++] = nodemap
->nodes
[i
].pnn
;
2116 if (vnnmap
->size
== 0) {
2117 DEBUG(DEBUG_NOTICE
, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
2119 vnnmap
->map
= talloc_realloc(vnnmap
, vnnmap
->map
, uint32_t, vnnmap
->size
);
2120 CTDB_NO_MEMORY(ctdb
, vnnmap
->map
);
2121 vnnmap
->map
[0] = pnn
;
2124 /* update to the new vnnmap on all nodes */
2125 ret
= update_vnnmap_on_all_nodes(ctdb
, nodemap
, pnn
, vnnmap
, mem_ctx
);
2127 DEBUG(DEBUG_ERR
, (__location__
" Unable to update vnnmap on all nodes\n"));
2131 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - updated vnnmap\n"));
2133 /* update recmaster to point to us for all nodes */
2134 ret
= set_recovery_master(ctdb
, nodemap
, pnn
);
2136 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery master\n"));
2140 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - updated recmaster\n"));
2142 /* disable recovery mode */
2143 ret
= set_recovery_mode(ctdb
, rec
, nodemap
, CTDB_RECOVERY_NORMAL
);
2145 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode to normal on cluster\n"));
2149 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - disabled recovery mode\n"));
2151 /* Fetch known/available public IPs from each active node */
2152 ret
= ctdb_reload_remote_public_ips(ctdb
, rec
, nodemap
, &culprit
);
2154 DEBUG(DEBUG_ERR
,("Failed to read public ips from remote node %d\n",
2156 rec
->need_takeover_run
= true;
2160 do_takeover_run(rec
, nodemap
, false);
2162 /* execute the "recovered" event script on all nodes */
2163 ret
= run_recovered_eventscript(rec
, nodemap
, "do_recovery");
2165 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
2169 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - finished the recovered event\n"));
2171 /* send a message to all clients telling them that the cluster
2172 has been reconfigured */
2173 ret
= ctdb_client_send_message(ctdb
, CTDB_BROADCAST_CONNECTED
,
2174 CTDB_SRVID_RECONFIGURE
, tdb_null
);
2176 DEBUG(DEBUG_ERR
, (__location__
" Failed to send reconfigure message\n"));
2180 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery complete\n"));
2182 rec
->need_recovery
= false;
2183 ctdb_op_end(rec
->recovery
);
2185 /* we managed to complete a full recovery, make sure to forgive
2186 any past sins by the nodes that could now participate in the
2189 DEBUG(DEBUG_ERR
,("Resetting ban count to 0 for all nodes\n"));
2190 for (i
=0;i
<nodemap
->num
;i
++) {
2191 struct ctdb_banning_state
*ban_state
;
2193 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
) {
2197 ban_state
= (struct ctdb_banning_state
*)ctdb
->nodes
[nodemap
->nodes
[i
].pnn
]->ban_state
;
2198 if (ban_state
== NULL
) {
2202 ban_state
->count
= 0;
2205 /* We just finished a recovery successfully.
2206 We now wait for rerecovery_timeout before we allow
2207 another recovery to take place.
2209 DEBUG(DEBUG_NOTICE
, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb
->tunable
.rerecovery_timeout
));
2210 ctdb_op_disable(rec
->recovery
, ctdb
->ev
,
2211 ctdb
->tunable
.rerecovery_timeout
);
2215 ctdb_op_end(rec
->recovery
);
/*
  Payload exchanged during recmaster elections.  Elections are won by
  first comparing the number of connected nodes, then the priority
  time, then the pnn.
 */
struct election_message {
	uint32_t num_connected;		/* connected-node count of the sender */
	struct timeval priority_time;	/* tie-breaker: longest-running wins */
	uint32_t pnn;			/* sender's node number (final tie-breaker) */
	uint32_t node_flags;		/* sender's node flags (banned/stopped lose) */
};
2232 form this nodes election data
2234 static void ctdb_election_data(struct ctdb_recoverd
*rec
, struct election_message
*em
)
2237 struct ctdb_node_map
*nodemap
;
2238 struct ctdb_context
*ctdb
= rec
->ctdb
;
2242 em
->pnn
= rec
->ctdb
->pnn
;
2243 em
->priority_time
= rec
->priority_time
;
2245 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, rec
, &nodemap
);
2247 DEBUG(DEBUG_ERR
,(__location__
" unable to get node map\n"));
2251 rec
->node_flags
= nodemap
->nodes
[ctdb
->pnn
].flags
;
2252 em
->node_flags
= rec
->node_flags
;
2254 for (i
=0;i
<nodemap
->num
;i
++) {
2255 if (!(nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
)) {
2256 em
->num_connected
++;
2260 /* we shouldnt try to win this election if we cant be a recmaster */
2261 if ((ctdb
->capabilities
& CTDB_CAP_RECMASTER
) == 0) {
2262 em
->num_connected
= 0;
2263 em
->priority_time
= timeval_current();
2266 talloc_free(nodemap
);
2270 see if the given election data wins
2272 static bool ctdb_election_win(struct ctdb_recoverd
*rec
, struct election_message
*em
)
2274 struct election_message myem
;
2277 ctdb_election_data(rec
, &myem
);
2279 /* we cant win if we dont have the recmaster capability */
2280 if ((rec
->ctdb
->capabilities
& CTDB_CAP_RECMASTER
) == 0) {
2284 /* we cant win if we are banned */
2285 if (rec
->node_flags
& NODE_FLAGS_BANNED
) {
2289 /* we cant win if we are stopped */
2290 if (rec
->node_flags
& NODE_FLAGS_STOPPED
) {
2294 /* we will automatically win if the other node is banned */
2295 if (em
->node_flags
& NODE_FLAGS_BANNED
) {
2299 /* we will automatically win if the other node is banned */
2300 if (em
->node_flags
& NODE_FLAGS_STOPPED
) {
2304 /* try to use the most connected node */
2306 cmp
= (int)myem
.num_connected
- (int)em
->num_connected
;
2309 /* then the longest running node */
2311 cmp
= timeval_compare(&em
->priority_time
, &myem
.priority_time
);
2315 cmp
= (int)myem
.pnn
- (int)em
->pnn
;
2322 send out an election request
2324 static int send_election_request(struct ctdb_recoverd
*rec
, uint32_t pnn
)
2327 TDB_DATA election_data
;
2328 struct election_message emsg
;
2330 struct ctdb_context
*ctdb
= rec
->ctdb
;
2332 srvid
= CTDB_SRVID_RECOVERY
;
2334 ctdb_election_data(rec
, &emsg
);
2336 election_data
.dsize
= sizeof(struct election_message
);
2337 election_data
.dptr
= (unsigned char *)&emsg
;
2340 /* first we assume we will win the election and set
2341 recoverymaster to be ourself on the current node
2343 ret
= ctdb_ctrl_setrecmaster(ctdb
, CONTROL_TIMEOUT(), pnn
, pnn
);
2345 DEBUG(DEBUG_ERR
, (__location__
" failed to send recmaster election request\n"));
2350 /* send an election message to all active nodes */
2351 DEBUG(DEBUG_INFO
,(__location__
" Send election request to all active nodes\n"));
2352 return ctdb_client_send_message(ctdb
, CTDB_BROADCAST_ALL
, srvid
, election_data
);
2356 this function will unban all nodes in the cluster
2358 static void unban_all_nodes(struct ctdb_context
*ctdb
)
2361 struct ctdb_node_map
*nodemap
;
2362 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
2364 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, tmp_ctx
, &nodemap
);
2366 DEBUG(DEBUG_ERR
,(__location__
" failed to get nodemap to unban all nodes\n"));
2370 for (i
=0;i
<nodemap
->num
;i
++) {
2371 if ( (!(nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
))
2372 && (nodemap
->nodes
[i
].flags
& NODE_FLAGS_BANNED
) ) {
2373 ret
= ctdb_ctrl_modflags(ctdb
, CONTROL_TIMEOUT(),
2374 nodemap
->nodes
[i
].pnn
, 0,
2377 DEBUG(DEBUG_ERR
, (__location__
" failed to reset ban state\n"));
2382 talloc_free(tmp_ctx
);
2387 we think we are winning the election - send a broadcast election request
2389 static void election_send_request(struct event_context
*ev
, struct timed_event
*te
, struct timeval t
, void *p
)
2391 struct ctdb_recoverd
*rec
= talloc_get_type(p
, struct ctdb_recoverd
);
2394 ret
= send_election_request(rec
, ctdb_get_pnn(rec
->ctdb
));
2396 DEBUG(DEBUG_ERR
,("Failed to send election request!\n"));
2399 talloc_free(rec
->send_election_te
);
2400 rec
->send_election_te
= NULL
;
2404 handler for memory dumps
2406 static void mem_dump_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2407 TDB_DATA data
, void *private_data
)
2409 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
2412 struct srvid_request
*rd
;
2414 if (data
.dsize
!= sizeof(struct srvid_request
)) {
2415 DEBUG(DEBUG_ERR
, (__location__
" Wrong size of return address.\n"));
2416 talloc_free(tmp_ctx
);
2419 rd
= (struct srvid_request
*)data
.dptr
;
2421 dump
= talloc_zero(tmp_ctx
, TDB_DATA
);
2423 DEBUG(DEBUG_ERR
, (__location__
" Failed to allocate memory for memdump\n"));
2424 talloc_free(tmp_ctx
);
2427 ret
= ctdb_dump_memory(ctdb
, dump
);
2429 DEBUG(DEBUG_ERR
, (__location__
" ctdb_dump_memory() failed\n"));
2430 talloc_free(tmp_ctx
);
2434 DEBUG(DEBUG_ERR
, ("recovery master memory dump\n"));
2436 ret
= ctdb_client_send_message(ctdb
, rd
->pnn
, rd
->srvid
, *dump
);
2438 DEBUG(DEBUG_ERR
,("Failed to send rd memdump reply message\n"));
2439 talloc_free(tmp_ctx
);
2443 talloc_free(tmp_ctx
);
2447 handler for reload_nodes
2449 static void reload_nodes_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2450 TDB_DATA data
, void *private_data
)
2452 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2454 DEBUG(DEBUG_ERR
, (__location__
" Reload nodes file from recovery daemon\n"));
2456 ctdb_load_nodes_file(rec
->ctdb
);
2460 static void ctdb_rebalance_timeout(struct event_context
*ev
,
2461 struct timed_event
*te
,
2462 struct timeval t
, void *p
)
2464 struct ctdb_recoverd
*rec
= talloc_get_type(p
, struct ctdb_recoverd
);
2466 if (rec
->force_rebalance_nodes
== NULL
) {
2468 ("Rebalance timeout occurred - no nodes to rebalance\n"));
2473 ("Rebalance timeout occurred - do takeover run\n"));
2474 do_takeover_run(rec
, rec
->nodemap
, false);
2478 static void recd_node_rebalance_handler(struct ctdb_context
*ctdb
,
2480 TDB_DATA data
, void *private_data
)
2485 uint32_t deferred_rebalance
;
2486 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2488 if (rec
->recmaster
!= ctdb_get_pnn(ctdb
)) {
2492 if (data
.dsize
!= sizeof(uint32_t)) {
2493 DEBUG(DEBUG_ERR
,(__location__
" Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data
.dsize
, sizeof(uint32_t)));
2497 pnn
= *(uint32_t *)&data
.dptr
[0];
2499 DEBUG(DEBUG_NOTICE
,("Setting up rebalance of IPs to node %u\n", pnn
));
2501 /* Copy any existing list of nodes. There's probably some
2502 * sort of realloc variant that will do this but we need to
2503 * make sure that freeing the old array also cancels the timer
2504 * event for the timeout... not sure if realloc will do that.
2506 len
= (rec
->force_rebalance_nodes
!= NULL
) ?
2507 talloc_array_length(rec
->force_rebalance_nodes
) :
2510 /* This allows duplicates to be added but they don't cause
2511 * harm. A call to add a duplicate PNN arguably means that
2512 * the timeout should be reset, so this is the simplest
2515 t
= talloc_zero_array(rec
, uint32_t, len
+1);
2516 CTDB_NO_MEMORY_VOID(ctdb
, t
);
2518 memcpy(t
, rec
->force_rebalance_nodes
, sizeof(uint32_t) * len
);
2522 talloc_free(rec
->force_rebalance_nodes
);
2524 rec
->force_rebalance_nodes
= t
;
2526 /* If configured, setup a deferred takeover run to make sure
2527 * that certain nodes get IPs rebalanced to them. This will
2528 * be cancelled if a successful takeover run happens before
2529 * the timeout. Assign tunable value to variable for
2532 deferred_rebalance
= ctdb
->tunable
.deferred_rebalance_on_node_add
;
2533 if (deferred_rebalance
!= 0) {
2534 event_add_timed(ctdb
->ev
, rec
->force_rebalance_nodes
,
2535 timeval_current_ofs(deferred_rebalance
, 0),
2536 ctdb_rebalance_timeout
, rec
);
2542 static void recd_update_ip_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2543 TDB_DATA data
, void *private_data
)
2545 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2546 struct ctdb_public_ip
*ip
;
2548 if (rec
->recmaster
!= rec
->ctdb
->pnn
) {
2549 DEBUG(DEBUG_INFO
,("Not recmaster, ignore update ip message\n"));
2553 if (data
.dsize
!= sizeof(struct ctdb_public_ip
)) {
2554 DEBUG(DEBUG_ERR
,(__location__
" Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data
.dsize
, sizeof(struct ctdb_public_ip
)));
2558 ip
= (struct ctdb_public_ip
*)data
.dptr
;
2560 update_ip_assignment_tree(rec
->ctdb
, ip
);
2563 static void srvid_disable_and_reply(struct ctdb_context
*ctdb
,
2565 struct ctdb_op_state
*op_state
)
2567 struct srvid_request_data
*r
;
2572 /* Validate input data */
2573 if (data
.dsize
!= sizeof(struct srvid_request_data
)) {
2574 DEBUG(DEBUG_ERR
,(__location__
" Wrong size for data :%lu "
2575 "expecting %lu\n", (long unsigned)data
.dsize
,
2576 (long unsigned)sizeof(struct srvid_request
)));
2579 if (data
.dptr
== NULL
) {
2580 DEBUG(DEBUG_ERR
,(__location__
" No data received\n"));
2584 r
= (struct srvid_request_data
*)data
.dptr
;
2587 ret
= ctdb_op_disable(op_state
, ctdb
->ev
, timeout
);
2592 /* Returning our PNN tells the caller that we succeeded */
2593 ret
= ctdb_get_pnn(ctdb
);
2595 result
.dsize
= sizeof(int32_t);
2596 result
.dptr
= (uint8_t *)&ret
;
2597 srvid_request_reply(ctdb
, (struct srvid_request
*)r
, result
);
2600 static void disable_takeover_runs_handler(struct ctdb_context
*ctdb
,
2601 uint64_t srvid
, TDB_DATA data
,
2604 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
,
2605 struct ctdb_recoverd
);
2607 srvid_disable_and_reply(ctdb
, data
, rec
->takeover_run
);
2610 /* Backward compatibility for this SRVID */
2611 static void disable_ip_check_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2612 TDB_DATA data
, void *private_data
)
2614 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
,
2615 struct ctdb_recoverd
);
2618 if (data
.dsize
!= sizeof(uint32_t)) {
2619 DEBUG(DEBUG_ERR
,(__location__
" Wrong size for data :%lu "
2620 "expecting %lu\n", (long unsigned)data
.dsize
,
2621 (long unsigned)sizeof(uint32_t)));
2624 if (data
.dptr
== NULL
) {
2625 DEBUG(DEBUG_ERR
,(__location__
" No data received\n"));
2629 timeout
= *((uint32_t *)data
.dptr
);
2631 ctdb_op_disable(rec
->takeover_run
, ctdb
->ev
, timeout
);
2634 static void disable_recoveries_handler(struct ctdb_context
*ctdb
,
2635 uint64_t srvid
, TDB_DATA data
,
2638 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
,
2639 struct ctdb_recoverd
);
2641 srvid_disable_and_reply(ctdb
, data
, rec
->recovery
);
2645 handler for ip reallocate, just add it to the list of requests and
2646 handle this later in the monitor_cluster loop so we do not recurse
2647 with other requests to takeover_run()
2649 static void ip_reallocate_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2650 TDB_DATA data
, void *private_data
)
2652 struct srvid_request
*request
;
2653 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
,
2654 struct ctdb_recoverd
);
2656 if (data
.dsize
!= sizeof(struct srvid_request
)) {
2657 DEBUG(DEBUG_ERR
, (__location__
" Wrong size of return address.\n"));
2661 request
= (struct srvid_request
*)data
.dptr
;
2663 srvid_request_add(ctdb
, &rec
->reallocate_requests
, request
);
2666 static void process_ipreallocate_requests(struct ctdb_context
*ctdb
,
2667 struct ctdb_recoverd
*rec
)
2672 struct srvid_requests
*current
;
2674 DEBUG(DEBUG_INFO
, ("recovery master forced ip reallocation\n"));
2676 /* Only process requests that are currently pending. More
2677 * might come in while the takeover run is in progress and
2678 * they will need to be processed later since they might
2679 * be in response flag changes.
2681 current
= rec
->reallocate_requests
;
2682 rec
->reallocate_requests
= NULL
;
2684 /* update the list of public ips that a node can handle for
2687 ret
= ctdb_reload_remote_public_ips(ctdb
, rec
, rec
->nodemap
, &culprit
);
2689 DEBUG(DEBUG_ERR
,("Failed to read public ips from remote node %d\n",
2691 rec
->need_takeover_run
= true;
2694 if (do_takeover_run(rec
, rec
->nodemap
, false)) {
2695 ret
= ctdb_get_pnn(ctdb
);
2701 result
.dsize
= sizeof(int32_t);
2702 result
.dptr
= (uint8_t *)&ret
;
2704 srvid_requests_reply(ctdb
, ¤t
, result
);
2709 handler for recovery master elections
2711 static void election_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2712 TDB_DATA data
, void *private_data
)
2714 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2716 struct election_message
*em
= (struct election_message
*)data
.dptr
;
2718 /* Ignore election packets from ourself */
2719 if (ctdb
->pnn
== em
->pnn
) {
2723 /* we got an election packet - update the timeout for the election */
2724 talloc_free(rec
->election_timeout
);
2725 rec
->election_timeout
= event_add_timed(ctdb
->ev
, ctdb
,
2727 timeval_current_ofs(0, 500000) :
2728 timeval_current_ofs(ctdb
->tunable
.election_timeout
, 0),
2729 ctdb_election_timeout
, rec
);
2731 /* someone called an election. check their election data
2732 and if we disagree and we would rather be the elected node,
2733 send a new election message to all other nodes
2735 if (ctdb_election_win(rec
, em
)) {
2736 if (!rec
->send_election_te
) {
2737 rec
->send_election_te
= event_add_timed(ctdb
->ev
, rec
,
2738 timeval_current_ofs(0, 500000),
2739 election_send_request
, rec
);
2741 /*unban_all_nodes(ctdb);*/
2746 TALLOC_FREE(rec
->send_election_te
);
2748 /* Release the recovery lock file */
2749 if (ctdb_recovery_have_lock(ctdb
)) {
2750 ctdb_recovery_unlock(ctdb
);
2751 unban_all_nodes(ctdb
);
2754 /* ok, let that guy become recmaster then */
2755 ret
= ctdb_ctrl_setrecmaster(ctdb
, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb
), em
->pnn
);
2757 DEBUG(DEBUG_ERR
, (__location__
" failed to send recmaster election request"));
2766 force the start of the election process
2768 static void force_election(struct ctdb_recoverd
*rec
, uint32_t pnn
,
2769 struct ctdb_node_map
*nodemap
)
2772 struct ctdb_context
*ctdb
= rec
->ctdb
;
2774 DEBUG(DEBUG_INFO
,(__location__
" Force an election\n"));
2776 /* set all nodes to recovery mode to stop all internode traffic */
2777 ret
= set_recovery_mode(ctdb
, rec
, nodemap
, CTDB_RECOVERY_ACTIVE
);
2779 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode to active on cluster\n"));
2783 talloc_free(rec
->election_timeout
);
2784 rec
->election_timeout
= event_add_timed(ctdb
->ev
, ctdb
,
2786 timeval_current_ofs(0, 500000) :
2787 timeval_current_ofs(ctdb
->tunable
.election_timeout
, 0),
2788 ctdb_election_timeout
, rec
);
2790 ret
= send_election_request(rec
, pnn
);
2792 DEBUG(DEBUG_ERR
, (__location__
" failed to initiate recmaster election"));
2796 /* wait for a few seconds to collect all responses */
2797 ctdb_wait_election(rec
);
2803 handler for when a node changes its flags
2805 static void monitor_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2806 TDB_DATA data
, void *private_data
)
2809 struct ctdb_node_flag_change
*c
= (struct ctdb_node_flag_change
*)data
.dptr
;
2810 struct ctdb_node_map
*nodemap
=NULL
;
2811 TALLOC_CTX
*tmp_ctx
;
2813 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2814 int disabled_flag_changed
;
2816 if (data
.dsize
!= sizeof(*c
)) {
2817 DEBUG(DEBUG_ERR
,(__location__
"Invalid data in ctdb_node_flag_change\n"));
2821 tmp_ctx
= talloc_new(ctdb
);
2822 CTDB_NO_MEMORY_VOID(ctdb
, tmp_ctx
);
2824 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, tmp_ctx
, &nodemap
);
2826 DEBUG(DEBUG_ERR
,(__location__
"ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2827 talloc_free(tmp_ctx
);
2832 for (i
=0;i
<nodemap
->num
;i
++) {
2833 if (nodemap
->nodes
[i
].pnn
== c
->pnn
) break;
2836 if (i
== nodemap
->num
) {
2837 DEBUG(DEBUG_CRIT
,(__location__
"Flag change for non-existant node %u\n", c
->pnn
));
2838 talloc_free(tmp_ctx
);
2842 if (c
->old_flags
!= c
->new_flags
) {
2843 DEBUG(DEBUG_NOTICE
,("Node %u has changed flags - now 0x%x was 0x%x\n", c
->pnn
, c
->new_flags
, c
->old_flags
));
2846 disabled_flag_changed
= (nodemap
->nodes
[i
].flags
^ c
->new_flags
) & NODE_FLAGS_DISABLED
;
2848 nodemap
->nodes
[i
].flags
= c
->new_flags
;
2850 ret
= ctdb_ctrl_getrecmaster(ctdb
, tmp_ctx
, CONTROL_TIMEOUT(),
2851 CTDB_CURRENT_NODE
, &ctdb
->recovery_master
);
2854 ret
= ctdb_ctrl_getrecmode(ctdb
, tmp_ctx
, CONTROL_TIMEOUT(),
2855 CTDB_CURRENT_NODE
, &ctdb
->recovery_mode
);
2859 ctdb
->recovery_master
== ctdb
->pnn
&&
2860 ctdb
->recovery_mode
== CTDB_RECOVERY_NORMAL
) {
2861 /* Only do the takeover run if the perm disabled or unhealthy
2862 flags changed since these will cause an ip failover but not
2864 If the node became disconnected or banned this will also
2865 lead to an ip address failover but that is handled
2868 if (disabled_flag_changed
) {
2869 rec
->need_takeover_run
= true;
2873 talloc_free(tmp_ctx
);
2877 handler for when we need to push out flag changes ot all other nodes
2879 static void push_flags_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2880 TDB_DATA data
, void *private_data
)
2883 struct ctdb_node_flag_change
*c
= (struct ctdb_node_flag_change
*)data
.dptr
;
2884 struct ctdb_node_map
*nodemap
=NULL
;
2885 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
2889 /* find the recovery master */
2890 ret
= ctdb_ctrl_getrecmaster(ctdb
, tmp_ctx
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, &recmaster
);
2892 DEBUG(DEBUG_ERR
, (__location__
" Unable to get recmaster from local node\n"));
2893 talloc_free(tmp_ctx
);
2897 /* read the node flags from the recmaster */
2898 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), recmaster
, tmp_ctx
, &nodemap
);
2900 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from node %u\n", c
->pnn
));
2901 talloc_free(tmp_ctx
);
2904 if (c
->pnn
>= nodemap
->num
) {
2905 DEBUG(DEBUG_ERR
,(__location__
" Nodemap from recmaster does not contain node %d\n", c
->pnn
));
2906 talloc_free(tmp_ctx
);
2910 /* send the flags update to all connected nodes */
2911 nodes
= list_of_connected_nodes(ctdb
, nodemap
, tmp_ctx
, true);
2913 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_MODIFY_FLAGS
,
2914 nodes
, 0, CONTROL_TIMEOUT(),
2918 DEBUG(DEBUG_ERR
, (__location__
" ctdb_control to modify node flags failed\n"));
2920 talloc_free(tmp_ctx
);
2924 talloc_free(tmp_ctx
);
2928 struct verify_recmode_normal_data
{
2930 enum monitor_result status
;
2933 static void verify_recmode_normal_callback(struct ctdb_client_control_state
*state
)
2935 struct verify_recmode_normal_data
*rmdata
= talloc_get_type(state
->async
.private_data
, struct verify_recmode_normal_data
);
2938 /* one more node has responded with recmode data*/
2941 /* if we failed to get the recmode, then return an error and let
2942 the main loop try again.
2944 if (state
->state
!= CTDB_CONTROL_DONE
) {
2945 if (rmdata
->status
== MONITOR_OK
) {
2946 rmdata
->status
= MONITOR_FAILED
;
2951 /* if we got a response, then the recmode will be stored in the
2954 if (state
->status
!= CTDB_RECOVERY_NORMAL
) {
2955 DEBUG(DEBUG_NOTICE
, ("Node:%u was in recovery mode. Start recovery process\n", state
->c
->hdr
.destnode
));
2956 rmdata
->status
= MONITOR_RECOVERY_NEEDED
;
2963 /* verify that all nodes are in normal recovery mode */
2964 static enum monitor_result
verify_recmode(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
)
2966 struct verify_recmode_normal_data
*rmdata
;
2967 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
2968 struct ctdb_client_control_state
*state
;
2969 enum monitor_result status
;
2972 rmdata
= talloc(mem_ctx
, struct verify_recmode_normal_data
);
2973 CTDB_NO_MEMORY_FATAL(ctdb
, rmdata
);
2975 rmdata
->status
= MONITOR_OK
;
2977 /* loop over all active nodes and send an async getrecmode call to
2979 for (j
=0; j
<nodemap
->num
; j
++) {
2980 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
2983 state
= ctdb_ctrl_getrecmode_send(ctdb
, mem_ctx
,
2985 nodemap
->nodes
[j
].pnn
);
2986 if (state
== NULL
) {
2987 /* we failed to send the control, treat this as
2988 an error and try again next iteration
2990 DEBUG(DEBUG_ERR
,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2991 talloc_free(mem_ctx
);
2992 return MONITOR_FAILED
;
2995 /* set up the callback functions */
2996 state
->async
.fn
= verify_recmode_normal_callback
;
2997 state
->async
.private_data
= rmdata
;
2999 /* one more control to wait for to complete */
3004 /* now wait for up to the maximum number of seconds allowed
3005 or until all nodes we expect a response from has replied
3007 while (rmdata
->count
> 0) {
3008 event_loop_once(ctdb
->ev
);
3011 status
= rmdata
->status
;
3012 talloc_free(mem_ctx
);
3017 struct verify_recmaster_data
{
3018 struct ctdb_recoverd
*rec
;
3021 enum monitor_result status
;
3024 static void verify_recmaster_callback(struct ctdb_client_control_state
*state
)
3026 struct verify_recmaster_data
*rmdata
= talloc_get_type(state
->async
.private_data
, struct verify_recmaster_data
);
3029 /* one more node has responded with recmaster data*/
3032 /* if we failed to get the recmaster, then return an error and let
3033 the main loop try again.
3035 if (state
->state
!= CTDB_CONTROL_DONE
) {
3036 if (rmdata
->status
== MONITOR_OK
) {
3037 rmdata
->status
= MONITOR_FAILED
;
3042 /* if we got a response, then the recmaster will be stored in the
3045 if (state
->status
!= rmdata
->pnn
) {
3046 DEBUG(DEBUG_ERR
,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state
->c
->hdr
.destnode
, state
->status
));
3047 ctdb_set_culprit(rmdata
->rec
, state
->c
->hdr
.destnode
);
3048 rmdata
->status
= MONITOR_ELECTION_NEEDED
;
3055 /* verify that all nodes agree that we are the recmaster */
3056 static enum monitor_result
verify_recmaster(struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
, uint32_t pnn
)
3058 struct ctdb_context
*ctdb
= rec
->ctdb
;
3059 struct verify_recmaster_data
*rmdata
;
3060 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
3061 struct ctdb_client_control_state
*state
;
3062 enum monitor_result status
;
3065 rmdata
= talloc(mem_ctx
, struct verify_recmaster_data
);
3066 CTDB_NO_MEMORY_FATAL(ctdb
, rmdata
);
3070 rmdata
->status
= MONITOR_OK
;
3072 /* loop over all active nodes and send an async getrecmaster call to
3074 for (j
=0; j
<nodemap
->num
; j
++) {
3075 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3078 state
= ctdb_ctrl_getrecmaster_send(ctdb
, mem_ctx
,
3080 nodemap
->nodes
[j
].pnn
);
3081 if (state
== NULL
) {
3082 /* we failed to send the control, treat this as
3083 an error and try again next iteration
3085 DEBUG(DEBUG_ERR
,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
3086 talloc_free(mem_ctx
);
3087 return MONITOR_FAILED
;
3090 /* set up the callback functions */
3091 state
->async
.fn
= verify_recmaster_callback
;
3092 state
->async
.private_data
= rmdata
;
3094 /* one more control to wait for to complete */
3099 /* now wait for up to the maximum number of seconds allowed
3100 or until all nodes we expect a response from has replied
3102 while (rmdata
->count
> 0) {
3103 event_loop_once(ctdb
->ev
);
3106 status
= rmdata
->status
;
3107 talloc_free(mem_ctx
);
3111 static bool interfaces_have_changed(struct ctdb_context
*ctdb
,
3112 struct ctdb_recoverd
*rec
)
3114 struct ctdb_control_get_ifaces
*ifaces
= NULL
;
3115 TALLOC_CTX
*mem_ctx
;
3118 mem_ctx
= talloc_new(NULL
);
3120 /* Read the interfaces from the local node */
3121 if (ctdb_ctrl_get_ifaces(ctdb
, CONTROL_TIMEOUT(),
3122 CTDB_CURRENT_NODE
, mem_ctx
, &ifaces
) != 0) {
3123 DEBUG(DEBUG_ERR
, ("Unable to get interfaces from local node %u\n", ctdb
->pnn
));
3124 /* We could return an error. However, this will be
3125 * rare so we'll decide that the interfaces have
3126 * actually changed, just in case.
3128 talloc_free(mem_ctx
);
3133 /* We haven't been here before so things have changed */
3134 DEBUG(DEBUG_NOTICE
, ("Initial interface fetched\n"));
3136 } else if (rec
->ifaces
->num
!= ifaces
->num
) {
3137 /* Number of interfaces has changed */
3138 DEBUG(DEBUG_NOTICE
, ("Interface count changed from %d to %d\n",
3139 rec
->ifaces
->num
, ifaces
->num
));
3142 /* See if interface names or link states have changed */
3144 for (i
= 0; i
< rec
->ifaces
->num
; i
++) {
3145 struct ctdb_control_iface_info
* iface
= &rec
->ifaces
->ifaces
[i
];
3146 if (strcmp(iface
->name
, ifaces
->ifaces
[i
].name
) != 0) {
3148 ("Interface in slot %d changed: %s => %s\n",
3149 i
, iface
->name
, ifaces
->ifaces
[i
].name
));
3153 if (iface
->link_state
!= ifaces
->ifaces
[i
].link_state
) {
3155 ("Interface %s changed state: %d => %d\n",
3156 iface
->name
, iface
->link_state
,
3157 ifaces
->ifaces
[i
].link_state
));
3164 talloc_free(rec
->ifaces
);
3165 rec
->ifaces
= talloc_steal(rec
, ifaces
);
3167 talloc_free(mem_ctx
);
3171 /* called to check that the local allocation of public ip addresses is ok.
3173 static int verify_local_ip_allocation(struct ctdb_context
*ctdb
, struct ctdb_recoverd
*rec
, uint32_t pnn
, struct ctdb_node_map
*nodemap
)
3175 TALLOC_CTX
*mem_ctx
= talloc_new(NULL
);
3176 struct ctdb_uptime
*uptime1
= NULL
;
3177 struct ctdb_uptime
*uptime2
= NULL
;
3179 bool need_takeover_run
= false;
3181 ret
= ctdb_ctrl_uptime(ctdb
, mem_ctx
, CONTROL_TIMEOUT(),
3182 CTDB_CURRENT_NODE
, &uptime1
);
3184 DEBUG(DEBUG_ERR
, ("Unable to get uptime from local node %u\n", pnn
));
3185 talloc_free(mem_ctx
);
3189 if (interfaces_have_changed(ctdb
, rec
)) {
3190 DEBUG(DEBUG_NOTICE
, ("The interfaces status has changed on "
3191 "local node %u - force takeover run\n",
3193 need_takeover_run
= true;
3196 ret
= ctdb_ctrl_uptime(ctdb
, mem_ctx
, CONTROL_TIMEOUT(),
3197 CTDB_CURRENT_NODE
, &uptime2
);
3199 DEBUG(DEBUG_ERR
, ("Unable to get uptime from local node %u\n", pnn
));
3200 talloc_free(mem_ctx
);
3204 /* skip the check if the startrecovery time has changed */
3205 if (timeval_compare(&uptime1
->last_recovery_started
,
3206 &uptime2
->last_recovery_started
) != 0) {
3207 DEBUG(DEBUG_NOTICE
, (__location__
" last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3208 talloc_free(mem_ctx
);
3212 /* skip the check if the endrecovery time has changed */
3213 if (timeval_compare(&uptime1
->last_recovery_finished
,
3214 &uptime2
->last_recovery_finished
) != 0) {
3215 DEBUG(DEBUG_NOTICE
, (__location__
" last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3216 talloc_free(mem_ctx
);
3220 /* skip the check if we have started but not finished recovery */
3221 if (timeval_compare(&uptime1
->last_recovery_finished
,
3222 &uptime1
->last_recovery_started
) != 1) {
3223 DEBUG(DEBUG_INFO
, (__location__
" in the middle of recovery or ip reallocation. skipping public ip address check\n"));
3224 talloc_free(mem_ctx
);
3229 /* verify that we have the ip addresses we should have
3230 and we dont have ones we shouldnt have.
3231 if we find an inconsistency we set recmode to
3232 active on the local node and wait for the recmaster
3233 to do a full blown recovery.
3234 also if the pnn is -1 and we are healthy and can host the ip
3235 we also request a ip reallocation.
3237 if (ctdb
->tunable
.disable_ip_failover
== 0) {
3238 struct ctdb_all_public_ips
*ips
= NULL
;
3240 /* read the *available* IPs from the local node */
3241 ret
= ctdb_ctrl_get_public_ips_flags(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, mem_ctx
, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE
, &ips
);
3243 DEBUG(DEBUG_ERR
, ("Unable to get available public IPs from local node %u\n", pnn
));
3244 talloc_free(mem_ctx
);
3248 for (j
=0; j
<ips
->num
; j
++) {
3249 if (ips
->ips
[j
].pnn
== -1 &&
3250 nodemap
->nodes
[pnn
].flags
== 0) {
3251 DEBUG(DEBUG_CRIT
,("Public IP '%s' is not assigned and we could serve it\n",
3252 ctdb_addr_to_str(&ips
->ips
[j
].addr
)));
3253 need_takeover_run
= true;
3259 /* read the *known* IPs from the local node */
3260 ret
= ctdb_ctrl_get_public_ips_flags(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, mem_ctx
, 0, &ips
);
3262 DEBUG(DEBUG_ERR
, ("Unable to get known public IPs from local node %u\n", pnn
));
3263 talloc_free(mem_ctx
);
3267 for (j
=0; j
<ips
->num
; j
++) {
3268 if (ips
->ips
[j
].pnn
== pnn
) {
3269 if (ctdb
->do_checkpublicip
&& !ctdb_sys_have_ip(&ips
->ips
[j
].addr
)) {
3270 DEBUG(DEBUG_CRIT
,("Public IP '%s' is assigned to us but not on an interface\n",
3271 ctdb_addr_to_str(&ips
->ips
[j
].addr
)));
3272 need_takeover_run
= true;
3275 if (ctdb
->do_checkpublicip
&&
3276 ctdb_sys_have_ip(&ips
->ips
[j
].addr
)) {
3278 DEBUG(DEBUG_CRIT
,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
3279 ctdb_addr_to_str(&ips
->ips
[j
].addr
)));
3281 if (ctdb_ctrl_release_ip(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, &ips
->ips
[j
]) != 0) {
3282 DEBUG(DEBUG_ERR
,("Failed to release local IP address\n"));
3289 if (need_takeover_run
) {
3290 struct srvid_request rd
;
3293 DEBUG(DEBUG_CRIT
,("Trigger takeoverrun\n"));
3297 data
.dptr
= (uint8_t *)&rd
;
3298 data
.dsize
= sizeof(rd
);
3300 ret
= ctdb_client_send_message(ctdb
, rec
->recmaster
, CTDB_SRVID_TAKEOVER_RUN
, data
);
3302 DEBUG(DEBUG_ERR
,(__location__
" Failed to send ipreallocate to recmaster :%d\n", (int)rec
->recmaster
));
3305 talloc_free(mem_ctx
);
3310 static void async_getnodemap_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
3312 struct ctdb_node_map
**remote_nodemaps
= callback_data
;
3314 if (node_pnn
>= ctdb
->num_nodes
) {
3315 DEBUG(DEBUG_ERR
,(__location__
" pnn from invalid node\n"));
3319 remote_nodemaps
[node_pnn
] = (struct ctdb_node_map
*)talloc_steal(remote_nodemaps
, outdata
.dptr
);
3323 static int get_remote_nodemaps(struct ctdb_context
*ctdb
, TALLOC_CTX
*mem_ctx
,
3324 struct ctdb_node_map
*nodemap
,
3325 struct ctdb_node_map
**remote_nodemaps
)
3329 nodes
= list_of_active_nodes(ctdb
, nodemap
, mem_ctx
, true);
3330 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_GET_NODEMAP
,
3332 CONTROL_TIMEOUT(), false, tdb_null
,
3333 async_getnodemap_callback
,
3335 remote_nodemaps
) != 0) {
3336 DEBUG(DEBUG_ERR
, (__location__
" Unable to pull all remote nodemaps\n"));
3344 static int update_recovery_lock_file(struct ctdb_context
*ctdb
)
3346 TALLOC_CTX
*tmp_ctx
= talloc_new(NULL
);
3347 const char *reclockfile
;
3349 if (ctdb_ctrl_getreclock(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, tmp_ctx
, &reclockfile
) != 0) {
3350 DEBUG(DEBUG_ERR
,("Failed to read reclock file from daemon\n"));
3351 talloc_free(tmp_ctx
);
3355 if (reclockfile
== NULL
) {
3356 if (ctdb
->recovery_lock_file
!= NULL
) {
3357 DEBUG(DEBUG_NOTICE
,("Recovery lock file disabled\n"));
3358 talloc_free(ctdb
->recovery_lock_file
);
3359 ctdb
->recovery_lock_file
= NULL
;
3360 ctdb_recovery_unlock(ctdb
);
3362 talloc_free(tmp_ctx
);
3366 if (ctdb
->recovery_lock_file
== NULL
) {
3368 ("Recovery lock file enabled (%s)\n", reclockfile
));
3369 ctdb
->recovery_lock_file
= talloc_strdup(ctdb
, reclockfile
);
3370 ctdb_recovery_unlock(ctdb
);
3371 talloc_free(tmp_ctx
);
3376 if (!strcmp(reclockfile
, ctdb
->recovery_lock_file
)) {
3377 talloc_free(tmp_ctx
);
3382 ("Recovery lock file changed (now %s)\n", reclockfile
));
3383 talloc_free(ctdb
->recovery_lock_file
);
3384 ctdb
->recovery_lock_file
= talloc_strdup(ctdb
, reclockfile
);
3385 ctdb_recovery_unlock(ctdb
);
3387 talloc_free(tmp_ctx
);
3391 static void main_loop(struct ctdb_context
*ctdb
, struct ctdb_recoverd
*rec
,
3392 TALLOC_CTX
*mem_ctx
)
3395 struct ctdb_node_map
*nodemap
=NULL
;
3396 struct ctdb_node_map
*recmaster_nodemap
=NULL
;
3397 struct ctdb_node_map
**remote_nodemaps
=NULL
;
3398 struct ctdb_vnn_map
*vnnmap
=NULL
;
3399 struct ctdb_vnn_map
*remote_vnnmap
=NULL
;
3400 uint32_t num_lmasters
;
3401 int32_t debug_level
;
3406 /* verify that the main daemon is still running */
3407 if (ctdb_kill(ctdb
, ctdb
->ctdbd_pid
, 0) != 0) {
3408 DEBUG(DEBUG_CRIT
,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3412 /* ping the local daemon to tell it we are alive */
3413 ctdb_ctrl_recd_ping(ctdb
);
3415 if (rec
->election_timeout
) {
3416 /* an election is in progress */
3420 /* read the debug level from the parent and update locally */
3421 ret
= ctdb_ctrl_get_debuglevel(ctdb
, CTDB_CURRENT_NODE
, &debug_level
);
3423 DEBUG(DEBUG_ERR
, (__location__
" Failed to read debuglevel from parent\n"));
3426 DEBUGLEVEL
= debug_level
;
3428 /* get relevant tunables */
3429 ret
= ctdb_ctrl_get_all_tunables(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, &ctdb
->tunable
);
3431 DEBUG(DEBUG_ERR
,("Failed to get tunables - retrying\n"));
3436 ret
= ctdb_ctrl_get_runstate(ctdb
, CONTROL_TIMEOUT(),
3437 CTDB_CURRENT_NODE
, &ctdb
->runstate
);
3439 DEBUG(DEBUG_ERR
, ("Failed to get runstate - retrying\n"));
3443 /* get the current recovery lock file from the server */
3444 if (update_recovery_lock_file(ctdb
) != 0) {
3445 DEBUG(DEBUG_ERR
,("Failed to update the recovery lock file\n"));
3449 /* Make sure that if recovery lock verification becomes disabled when
3452 if (ctdb
->recovery_lock_file
== NULL
) {
3453 ctdb_recovery_unlock(ctdb
);
3456 pnn
= ctdb_get_pnn(ctdb
);
3458 /* get the vnnmap */
3459 ret
= ctdb_ctrl_getvnnmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, &vnnmap
);
3461 DEBUG(DEBUG_ERR
, (__location__
" Unable to get vnnmap from node %u\n", pnn
));
3466 /* get number of nodes */
3468 talloc_free(rec
->nodemap
);
3469 rec
->nodemap
= NULL
;
3472 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), pnn
, rec
, &rec
->nodemap
);
3474 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from node %u\n", pnn
));
3477 nodemap
= rec
->nodemap
;
3479 /* remember our own node flags */
3480 rec
->node_flags
= nodemap
->nodes
[pnn
].flags
;
3482 ban_misbehaving_nodes(rec
, &self_ban
);
3484 DEBUG(DEBUG_NOTICE
, ("This node was banned, restart main_loop\n"));
3488 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
3489 also frozen and that the recmode is set to active.
3491 if (rec
->node_flags
& (NODE_FLAGS_STOPPED
| NODE_FLAGS_BANNED
)) {
3492 /* If this node has become inactive then we want to
3493 * reduce the chances of it taking over the recovery
3494 * master role when it becomes active again. This
3495 * helps to stabilise the recovery master role so that
3496 * it stays on the most stable node.
3498 rec
->priority_time
= timeval_current();
3500 ret
= ctdb_ctrl_getrecmode(ctdb
, mem_ctx
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, &ctdb
->recovery_mode
);
3502 DEBUG(DEBUG_ERR
,(__location__
" Failed to read recmode from local node\n"));
3504 if (ctdb
->recovery_mode
== CTDB_RECOVERY_NORMAL
) {
3505 DEBUG(DEBUG_ERR
,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
3507 ret
= ctdb_ctrl_setrecmode(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, CTDB_RECOVERY_ACTIVE
);
3509 DEBUG(DEBUG_ERR
,(__location__
" Failed to activate recovery mode in STOPPED or BANNED state\n"));
3513 ret
= ctdb_ctrl_freeze(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
);
3515 DEBUG(DEBUG_ERR
,(__location__
" Failed to freeze node in STOPPED or BANNED state\n"));
3520 /* If this node is stopped or banned then it is not the recovery
3521 * master, so don't do anything. This prevents stopped or banned
3522 * node from starting election and sending unnecessary controls.
3527 /* check which node is the recovery master */
3528 ret
= ctdb_ctrl_getrecmaster(ctdb
, mem_ctx
, CONTROL_TIMEOUT(), pnn
, &rec
->recmaster
);
3530 DEBUG(DEBUG_ERR
, (__location__
" Unable to get recmaster from node %u\n", pnn
));
3534 /* If we are not the recmaster then do some housekeeping */
3535 if (rec
->recmaster
!= pnn
) {
3536 /* Ignore any IP reallocate requests - only recmaster
3539 TALLOC_FREE(rec
->reallocate_requests
);
3540 /* Clear any nodes that should be force rebalanced in
3541 * the next takeover run. If the recovery master role
3542 * has moved then we don't want to process these some
3543 * time in the future.
3545 TALLOC_FREE(rec
->force_rebalance_nodes
);
3548 /* This is a special case. When recovery daemon is started, recmaster
3549 * is set to -1. If a node is not started in stopped state, then
3550 * start election to decide recovery master
3552 if (rec
->recmaster
== (uint32_t)-1) {
3553 DEBUG(DEBUG_NOTICE
,(__location__
" Initial recovery master set - forcing election\n"));
3554 force_election(rec
, pnn
, nodemap
);
3558 /* update the capabilities for all nodes */
3559 ret
= update_capabilities(rec
, nodemap
);
3561 DEBUG(DEBUG_ERR
, (__location__
" Unable to update node capabilities.\n"));
3566 * If the current recmaster does not have CTDB_CAP_RECMASTER,
3567 * but we have, then force an election and try to become the new
3570 if (!ctdb_node_has_capabilities(rec
->caps
,
3572 CTDB_CAP_RECMASTER
) &&
3573 (rec
->ctdb
->capabilities
& CTDB_CAP_RECMASTER
) &&
3574 !(nodemap
->nodes
[pnn
].flags
& NODE_FLAGS_INACTIVE
)) {
3575 DEBUG(DEBUG_ERR
, (__location__
" Current recmaster node %u does not have CAP_RECMASTER,"
3576 " but we (node %u) have - force an election\n",
3577 rec
->recmaster
, pnn
));
3578 force_election(rec
, pnn
, nodemap
);
3582 /* verify that the recmaster node is still active */
3583 for (j
=0; j
<nodemap
->num
; j
++) {
3584 if (nodemap
->nodes
[j
].pnn
==rec
->recmaster
) {
3589 if (j
== nodemap
->num
) {
3590 DEBUG(DEBUG_ERR
, ("Recmaster node %u not in list. Force reelection\n", rec
->recmaster
));
3591 force_election(rec
, pnn
, nodemap
);
3595 /* if recovery master is disconnected we must elect a new recmaster */
3596 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_DISCONNECTED
) {
3597 DEBUG(DEBUG_NOTICE
, ("Recmaster node %u is disconnected. Force reelection\n", nodemap
->nodes
[j
].pnn
));
3598 force_election(rec
, pnn
, nodemap
);
3602 /* get nodemap from the recovery master to check if it is inactive */
3603 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
3604 mem_ctx
, &recmaster_nodemap
);
3606 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from recovery master %u\n",
3607 nodemap
->nodes
[j
].pnn
));
3612 if ((recmaster_nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) &&
3613 (rec
->node_flags
& NODE_FLAGS_INACTIVE
) == 0) {
3614 DEBUG(DEBUG_NOTICE
, ("Recmaster node %u no longer available. Force reelection\n", nodemap
->nodes
[j
].pnn
));
3616 * update our nodemap to carry the recmaster's notion of
3617 * its own flags, so that we don't keep freezing the
3618 * inactive recmaster node...
3620 nodemap
->nodes
[j
].flags
= recmaster_nodemap
->nodes
[j
].flags
;
3621 force_election(rec
, pnn
, nodemap
);
3625 /* verify that we have all ip addresses we should have and we dont
3626 * have addresses we shouldnt have.
3628 if (ctdb
->tunable
.disable_ip_failover
== 0 &&
3629 !ctdb_op_is_disabled(rec
->takeover_run
)) {
3630 if (verify_local_ip_allocation(ctdb
, rec
, pnn
, nodemap
) != 0) {
3631 DEBUG(DEBUG_ERR
, (__location__
" Public IPs were inconsistent.\n"));
3636 /* if we are not the recmaster then we do not need to check
3637 if recovery is needed
3639 if (pnn
!= rec
->recmaster
) {
3644 /* ensure our local copies of flags are right */
3645 ret
= update_local_flags(rec
, nodemap
);
3646 if (ret
== MONITOR_ELECTION_NEEDED
) {
3647 DEBUG(DEBUG_NOTICE
,("update_local_flags() called for a re-election.\n"));
3648 force_election(rec
, pnn
, nodemap
);
3651 if (ret
!= MONITOR_OK
) {
3652 DEBUG(DEBUG_ERR
,("Unable to update local flags\n"));
3656 if (ctdb
->num_nodes
!= nodemap
->num
) {
3657 DEBUG(DEBUG_ERR
, (__location__
" ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb
->num_nodes
, nodemap
->num
));
3658 ctdb_load_nodes_file(ctdb
);
3662 /* verify that all active nodes agree that we are the recmaster */
3663 switch (verify_recmaster(rec
, nodemap
, pnn
)) {
3664 case MONITOR_RECOVERY_NEEDED
:
3665 /* can not happen */
3667 case MONITOR_ELECTION_NEEDED
:
3668 force_election(rec
, pnn
, nodemap
);
3672 case MONITOR_FAILED
:
3677 if (rec
->need_recovery
) {
3678 /* a previous recovery didn't finish */
3679 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3683 /* verify that all active nodes are in normal mode
3684 and not in recovery mode
3686 switch (verify_recmode(ctdb
, nodemap
)) {
3687 case MONITOR_RECOVERY_NEEDED
:
3688 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3690 case MONITOR_FAILED
:
3692 case MONITOR_ELECTION_NEEDED
:
3693 /* can not happen */
3699 if (ctdb
->recovery_lock_file
!= NULL
) {
3700 /* We must already hold the recovery lock */
3701 if (!ctdb_recovery_have_lock(ctdb
)) {
3702 DEBUG(DEBUG_ERR
,("Failed recovery lock sanity check. Force a recovery\n"));
3703 ctdb_set_culprit(rec
, ctdb
->pnn
);
3704 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3710 /* if there are takeovers requested, perform it and notify the waiters */
3711 if (!ctdb_op_is_disabled(rec
->takeover_run
) &&
3712 rec
->reallocate_requests
) {
3713 process_ipreallocate_requests(ctdb
, rec
);
3716 /* If recoveries are disabled then there is no use doing any
3717 * nodemap or flags checks. Recoveries might be disabled due
3718 * to "reloadnodes", so doing these checks might cause an
3719 * unnecessary recovery. */
3720 if (ctdb_op_is_disabled(rec
->recovery
)) {
3724 /* get the nodemap for all active remote nodes
3726 remote_nodemaps
= talloc_array(mem_ctx
, struct ctdb_node_map
*, nodemap
->num
);
3727 if (remote_nodemaps
== NULL
) {
3728 DEBUG(DEBUG_ERR
, (__location__
" failed to allocate remote nodemap array\n"));
3731 for(i
=0; i
<nodemap
->num
; i
++) {
3732 remote_nodemaps
[i
] = NULL
;
3734 if (get_remote_nodemaps(ctdb
, mem_ctx
, nodemap
, remote_nodemaps
) != 0) {
3735 DEBUG(DEBUG_ERR
,(__location__
" Failed to read remote nodemaps\n"));
3739 /* verify that all other nodes have the same nodemap as we have
3741 for (j
=0; j
<nodemap
->num
; j
++) {
3742 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3746 if (remote_nodemaps
[j
] == NULL
) {
3747 DEBUG(DEBUG_ERR
,(__location__
" Did not get a remote nodemap for node %d, restarting monitoring\n", j
));
3748 ctdb_set_culprit(rec
, j
);
3753 /* if the nodes disagree on how many nodes there are
3754 then this is a good reason to try recovery
3756 if (remote_nodemaps
[j
]->num
!= nodemap
->num
) {
3757 DEBUG(DEBUG_ERR
, (__location__
" Remote node:%u has different node count. %u vs %u of the local node\n",
3758 nodemap
->nodes
[j
].pnn
, remote_nodemaps
[j
]->num
, nodemap
->num
));
3759 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3760 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3764 /* if the nodes disagree on which nodes exist and are
3765 active, then that is also a good reason to do recovery
3767 for (i
=0;i
<nodemap
->num
;i
++) {
3768 if (remote_nodemaps
[j
]->nodes
[i
].pnn
!= nodemap
->nodes
[i
].pnn
) {
3769 DEBUG(DEBUG_ERR
, (__location__
" Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3770 nodemap
->nodes
[j
].pnn
, i
,
3771 remote_nodemaps
[j
]->nodes
[i
].pnn
, nodemap
->nodes
[i
].pnn
));
3772 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3773 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
3781 * Update node flags obtained from each active node. This ensure we have
3782 * up-to-date information for all the nodes.
3784 for (j
=0; j
<nodemap
->num
; j
++) {
3785 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3788 nodemap
->nodes
[j
].flags
= remote_nodemaps
[j
]->nodes
[j
].flags
;
3791 for (j
=0; j
<nodemap
->num
; j
++) {
3792 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3796 /* verify the flags are consistent
3798 for (i
=0; i
<nodemap
->num
; i
++) {
3799 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
) {
3803 if (nodemap
->nodes
[i
].flags
!= remote_nodemaps
[j
]->nodes
[i
].flags
) {
3804 DEBUG(DEBUG_ERR
, (__location__
" Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3805 nodemap
->nodes
[j
].pnn
,
3806 nodemap
->nodes
[i
].pnn
,
3807 remote_nodemaps
[j
]->nodes
[i
].flags
,
3808 nodemap
->nodes
[i
].flags
));
3810 DEBUG(DEBUG_ERR
,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps
[j
]->nodes
[i
].flags
, j
));
3811 update_flags_on_all_nodes(ctdb
, nodemap
, nodemap
->nodes
[i
].pnn
, remote_nodemaps
[j
]->nodes
[i
].flags
);
3812 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3813 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
3817 DEBUG(DEBUG_ERR
,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap
->nodes
[i
].flags
, i
));
3818 update_flags_on_all_nodes(ctdb
, nodemap
, nodemap
->nodes
[i
].pnn
, nodemap
->nodes
[i
].flags
);
3819 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3820 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
3829 /* count how many active nodes there are */
3831 for (i
=0; i
<nodemap
->num
; i
++) {
3832 if (!(nodemap
->nodes
[i
].flags
& NODE_FLAGS_INACTIVE
)) {
3833 if (ctdb_node_has_capabilities(rec
->caps
,
3834 ctdb
->nodes
[i
]->pnn
,
3835 CTDB_CAP_LMASTER
)) {
3842 /* There must be the same number of lmasters in the vnn map as
3843 * there are active nodes with the lmaster capability... or
3846 if (vnnmap
->size
!= num_lmasters
) {
3847 DEBUG(DEBUG_ERR
, (__location__
" The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
3848 vnnmap
->size
, num_lmasters
));
3849 ctdb_set_culprit(rec
, ctdb
->pnn
);
3850 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3854 /* verify that all active nodes in the nodemap also exist in
3857 for (j
=0; j
<nodemap
->num
; j
++) {
3858 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3861 if (nodemap
->nodes
[j
].pnn
== pnn
) {
3865 for (i
=0; i
<vnnmap
->size
; i
++) {
3866 if (vnnmap
->map
[i
] == nodemap
->nodes
[j
].pnn
) {
3870 if (i
== vnnmap
->size
) {
3871 DEBUG(DEBUG_ERR
, (__location__
" Node %u is active in the nodemap but did not exist in the vnnmap\n",
3872 nodemap
->nodes
[j
].pnn
));
3873 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3874 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3880 /* verify that all other nodes have the same vnnmap
3881 and are from the same generation
3883 for (j
=0; j
<nodemap
->num
; j
++) {
3884 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3887 if (nodemap
->nodes
[j
].pnn
== pnn
) {
3891 ret
= ctdb_ctrl_getvnnmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
3892 mem_ctx
, &remote_vnnmap
);
3894 DEBUG(DEBUG_ERR
, (__location__
" Unable to get vnnmap from remote node %u\n",
3895 nodemap
->nodes
[j
].pnn
));
3899 /* verify the vnnmap generation is the same */
3900 if (vnnmap
->generation
!= remote_vnnmap
->generation
) {
3901 DEBUG(DEBUG_ERR
, (__location__
" Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3902 nodemap
->nodes
[j
].pnn
, remote_vnnmap
->generation
, vnnmap
->generation
));
3903 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3904 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3908 /* verify the vnnmap size is the same */
3909 if (vnnmap
->size
!= remote_vnnmap
->size
) {
3910 DEBUG(DEBUG_ERR
, (__location__
" Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3911 nodemap
->nodes
[j
].pnn
, remote_vnnmap
->size
, vnnmap
->size
));
3912 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3913 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3917 /* verify the vnnmap is the same */
3918 for (i
=0;i
<vnnmap
->size
;i
++) {
3919 if (remote_vnnmap
->map
[i
] != vnnmap
->map
[i
]) {
3920 DEBUG(DEBUG_ERR
, (__location__
" Remote node %u has different vnnmap.\n",
3921 nodemap
->nodes
[j
].pnn
));
3922 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3923 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
3930 /* we might need to change who has what IP assigned */
3931 if (rec
->need_takeover_run
) {
3932 uint32_t culprit
= (uint32_t)-1;
3934 rec
->need_takeover_run
= false;
3936 /* update the list of public ips that a node can handle for
3939 ret
= ctdb_reload_remote_public_ips(ctdb
, rec
, nodemap
, &culprit
);
3941 DEBUG(DEBUG_ERR
,("Failed to read public ips from remote node %d\n",
3943 rec
->need_takeover_run
= true;
3947 /* execute the "startrecovery" event script on all nodes */
3948 ret
= run_startrecovery_eventscript(rec
, nodemap
);
3950 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'startrecovery' event on cluster\n"));
3951 ctdb_set_culprit(rec
, ctdb
->pnn
);
3952 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3956 /* If takeover run fails, then the offending nodes are
3957 * assigned ban culprit counts. And we re-try takeover.
3958 * If takeover run fails repeatedly, the node would get
3961 * If rec->need_takeover_run is not set to true at this
3962 * failure, monitoring is disabled cluster-wide (via
3963 * startrecovery eventscript) and will not get enabled.
3965 if (!do_takeover_run(rec
, nodemap
, true)) {
3969 /* execute the "recovered" event script on all nodes */
3970 ret
= run_recovered_eventscript(rec
, nodemap
, "monitor_cluster");
3972 // we cant check whether the event completed successfully
3973 // since this script WILL fail if the node is in recovery mode
3974 // and if that race happens, the code here would just cause a second
3975 // cascading recovery.
3977 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3978 ctdb_set_culprit(rec
, ctdb
->pnn
);
3979 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3986 the main monitoring loop
3988 static void monitor_cluster(struct ctdb_context
*ctdb
)
3990 struct ctdb_recoverd
*rec
;
3992 DEBUG(DEBUG_NOTICE
,("monitor_cluster starting\n"));
3994 rec
= talloc_zero(ctdb
, struct ctdb_recoverd
);
3995 CTDB_NO_MEMORY_FATAL(ctdb
, rec
);
3999 rec
->takeover_run
= ctdb_op_init(rec
, "takeover runs");
4000 CTDB_NO_MEMORY_FATAL(ctdb
, rec
->takeover_run
);
4002 rec
->recovery
= ctdb_op_init(rec
, "recoveries");
4003 CTDB_NO_MEMORY_FATAL(ctdb
, rec
->recovery
);
4005 rec
->priority_time
= timeval_current();
4007 /* register a message port for sending memory dumps */
4008 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_MEM_DUMP
, mem_dump_handler
, rec
);
4010 /* register a message port for recovery elections */
4011 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_RECOVERY
, election_handler
, rec
);
4013 /* when nodes are disabled/enabled */
4014 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_SET_NODE_FLAGS
, monitor_handler
, rec
);
4016 /* when we are asked to puch out a flag change */
4017 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_PUSH_NODE_FLAGS
, push_flags_handler
, rec
);
4019 /* register a message port for vacuum fetch */
4020 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_VACUUM_FETCH
, vacuum_fetch_handler
, rec
);
4022 /* register a message port for reloadnodes */
4023 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_RELOAD_NODES
, reload_nodes_handler
, rec
);
4025 /* register a message port for performing a takeover run */
4026 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_TAKEOVER_RUN
, ip_reallocate_handler
, rec
);
4028 /* register a message port for disabling the ip check for a short while */
4029 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_DISABLE_IP_CHECK
, disable_ip_check_handler
, rec
);
4031 /* register a message port for updating the recovery daemons node assignment for an ip */
4032 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_RECD_UPDATE_IP
, recd_update_ip_handler
, rec
);
4034 /* register a message port for forcing a rebalance of a node next
4036 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_REBALANCE_NODE
, recd_node_rebalance_handler
, rec
);
4038 /* Register a message port for disabling takeover runs */
4039 ctdb_client_set_message_handler(ctdb
,
4040 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
4041 disable_takeover_runs_handler
, rec
);
4043 /* Register a message port for disabling recoveries */
4044 ctdb_client_set_message_handler(ctdb
,
4045 CTDB_SRVID_DISABLE_RECOVERIES
,
4046 disable_recoveries_handler
, rec
);
4048 /* register a message port for detaching database */
4049 ctdb_client_set_message_handler(ctdb
,
4050 CTDB_SRVID_DETACH_DATABASE
,
4051 detach_database_handler
, rec
);
4054 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
4055 struct timeval start
;
4059 DEBUG(DEBUG_CRIT
,(__location__
4060 " Failed to create temp context\n"));
4064 start
= timeval_current();
4065 main_loop(ctdb
, rec
, mem_ctx
);
4066 talloc_free(mem_ctx
);
4068 /* we only check for recovery once every second */
4069 elapsed
= timeval_elapsed(&start
);
4070 if (elapsed
< ctdb
->tunable
.recover_interval
) {
4071 ctdb_wait_timeout(ctdb
, ctdb
->tunable
.recover_interval
4078 event handler for when the main ctdbd dies
4080 static void ctdb_recoverd_parent(struct event_context
*ev
, struct fd_event
*fde
,
4081 uint16_t flags
, void *private_data
)
4083 DEBUG(DEBUG_ALERT
,("recovery daemon parent died - exiting\n"));
4088 called regularly to verify that the recovery daemon is still running
4090 static void ctdb_check_recd(struct event_context
*ev
, struct timed_event
*te
,
4091 struct timeval yt
, void *p
)
4093 struct ctdb_context
*ctdb
= talloc_get_type(p
, struct ctdb_context
);
4095 if (ctdb_kill(ctdb
, ctdb
->recoverd_pid
, 0) != 0) {
4096 DEBUG(DEBUG_ERR
,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb
->recoverd_pid
));
4098 event_add_timed(ctdb
->ev
, ctdb
, timeval_zero(),
4099 ctdb_restart_recd
, ctdb
);
4104 event_add_timed(ctdb
->ev
, ctdb
->recd_ctx
,
4105 timeval_current_ofs(30, 0),
4106 ctdb_check_recd
, ctdb
);
4109 static void recd_sig_child_handler(struct event_context
*ev
,
4110 struct signal_event
*se
, int signum
, int count
,
4114 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4119 pid
= waitpid(-1, &status
, WNOHANG
);
4121 if (errno
!= ECHILD
) {
4122 DEBUG(DEBUG_ERR
, (__location__
" waitpid() returned error. errno:%s(%d)\n", strerror(errno
),errno
));
4127 DEBUG(DEBUG_DEBUG
, ("RECD SIGCHLD from %d\n", (int)pid
));
4133 startup the recovery daemon as a child of the main ctdb daemon
4135 int ctdb_start_recoverd(struct ctdb_context
*ctdb
)
4138 struct signal_event
*se
;
4139 struct tevent_fd
*fde
;
4141 if (pipe(fd
) != 0) {
4145 ctdb
->recoverd_pid
= ctdb_fork(ctdb
);
4146 if (ctdb
->recoverd_pid
== -1) {
4150 if (ctdb
->recoverd_pid
!= 0) {
4151 talloc_free(ctdb
->recd_ctx
);
4152 ctdb
->recd_ctx
= talloc_new(ctdb
);
4153 CTDB_NO_MEMORY(ctdb
, ctdb
->recd_ctx
);
4156 event_add_timed(ctdb
->ev
, ctdb
->recd_ctx
,
4157 timeval_current_ofs(30, 0),
4158 ctdb_check_recd
, ctdb
);
4164 srandom(getpid() ^ time(NULL
));
4166 ctdb_set_process_name("ctdb_recovered");
4167 if (switch_from_server_to_client(ctdb
, "recoverd") != 0) {
4168 DEBUG(DEBUG_CRIT
, (__location__
"ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
4172 DEBUG(DEBUG_DEBUG
, (__location__
" Created PIPE FD:%d to recovery daemon\n", fd
[0]));
4174 fde
= event_add_fd(ctdb
->ev
, ctdb
, fd
[0], EVENT_FD_READ
,
4175 ctdb_recoverd_parent
, &fd
[0]);
4176 tevent_fd_set_auto_close(fde
);
4178 /* set up a handler to pick up sigchld */
4179 se
= event_add_signal(ctdb
->ev
, ctdb
,
4181 recd_sig_child_handler
,
4184 DEBUG(DEBUG_CRIT
,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
4188 monitor_cluster(ctdb
);
4190 DEBUG(DEBUG_ALERT
,("ERROR: ctdb_recoverd finished!?\n"));
4195 shutdown the recovery daemon
4197 void ctdb_stop_recoverd(struct ctdb_context
*ctdb
)
4199 if (ctdb
->recoverd_pid
== 0) {
4203 DEBUG(DEBUG_NOTICE
,("Shutting down recovery daemon\n"));
4204 ctdb_kill(ctdb
, ctdb
->recoverd_pid
, SIGTERM
);
4206 TALLOC_FREE(ctdb
->recd_ctx
);
4207 TALLOC_FREE(ctdb
->recd_ping_count
);
4210 static void ctdb_restart_recd(struct event_context
*ev
, struct timed_event
*te
,
4211 struct timeval t
, void *private_data
)
4213 struct ctdb_context
*ctdb
= talloc_get_type(private_data
, struct ctdb_context
);
4215 DEBUG(DEBUG_ERR
,("Restarting recovery daemon\n"));
4216 ctdb_stop_recoverd(ctdb
);
4217 ctdb_start_recoverd(ctdb
);