4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
27 #include "../include/ctdb_client.h"
28 #include "../include/ctdb_private.h"
29 #include "lib/tdb_wrap/tdb_wrap.h"
30 #include "lib/util/dlinklist.h"
33 /* List of SRVID requests that need to be processed */
35 struct srvid_list
*next
, *prev
;
36 struct srvid_request
*request
;
39 struct srvid_requests
{
40 struct srvid_list
*requests
;
43 static void srvid_request_reply(struct ctdb_context
*ctdb
,
44 struct srvid_request
*request
,
47 /* Someone that sent srvid==0 does not want a reply */
48 if (request
->srvid
== 0) {
53 if (ctdb_client_send_message(ctdb
, request
->pnn
, request
->srvid
,
55 DEBUG(DEBUG_INFO
,("Sent SRVID reply to %u:%llu\n",
56 (unsigned)request
->pnn
,
57 (unsigned long long)request
->srvid
));
59 DEBUG(DEBUG_ERR
,("Failed to send SRVID reply to %u:%llu\n",
60 (unsigned)request
->pnn
,
61 (unsigned long long)request
->srvid
));
67 static void srvid_requests_reply(struct ctdb_context
*ctdb
,
68 struct srvid_requests
**requests
,
73 for (r
= (*requests
)->requests
; r
!= NULL
; r
= r
->next
) {
74 srvid_request_reply(ctdb
, r
->request
, result
);
77 /* Free the list structure... */
78 TALLOC_FREE(*requests
);
81 static void srvid_request_add(struct ctdb_context
*ctdb
,
82 struct srvid_requests
**requests
,
83 struct srvid_request
*request
)
89 if (*requests
== NULL
) {
90 *requests
= talloc_zero(ctdb
, struct srvid_requests
);
91 if (*requests
== NULL
) {
96 t
= talloc_zero(*requests
, struct srvid_list
);
98 /* If *requests was just allocated above then free it */
99 if ((*requests
)->requests
== NULL
) {
100 TALLOC_FREE(*requests
);
105 t
->request
= (struct srvid_request
*)talloc_steal(t
, request
);
106 DLIST_ADD((*requests
)->requests
, t
);
111 /* Failed to add the request to the list. Send a fail. */
112 DEBUG(DEBUG_ERR
, (__location__
113 " Out of memory, failed to queue SRVID request\n"));
115 result
.dsize
= sizeof(ret
);
116 result
.dptr
= (uint8_t *)&ret
;
117 srvid_request_reply(ctdb
, request
, result
);
120 struct ctdb_banning_state
{
122 struct timeval last_reported_time
;
126 private state of recovery daemon
128 struct ctdb_recoverd
{
129 struct ctdb_context
*ctdb
;
132 uint32_t num_lmasters
;
133 uint32_t num_connected
;
134 uint32_t last_culprit_node
;
135 struct ctdb_node_map
*nodemap
;
136 struct timeval priority_time
;
137 bool need_takeover_run
;
140 struct timed_event
*send_election_te
;
141 struct timed_event
*election_timeout
;
142 struct vacuum_info
*vacuum_info
;
143 struct srvid_requests
*reallocate_requests
;
144 bool takeover_run_in_progress
;
145 TALLOC_CTX
*takeover_runs_disable_ctx
;
146 struct ctdb_control_get_ifaces
*ifaces
;
147 uint32_t *force_rebalance_nodes
;
/* Timeout value handed to the ctdb_ctrl_*() and ctdb_client_async_control()
   calls in this file; built from the recover_timeout tunable.
   Note: the expansion refers to a variable named "ctdb", so one must be in
   scope wherever the macro is used.
   (presumably timeval_current_ofs() yields "now + N seconds" - confirm) */
150 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
/* Same construction as CONTROL_TIMEOUT(), but based on the recover_interval
   tunable. Also needs a "ctdb" variable in scope at the point of use. */
151 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
153 static void ctdb_restart_recd(struct event_context
*ev
, struct timed_event
*te
, struct timeval t
, void *private_data
);
156 ban a node for a period of time
158 static void ctdb_ban_node(struct ctdb_recoverd
*rec
, uint32_t pnn
, uint32_t ban_time
)
161 struct ctdb_context
*ctdb
= rec
->ctdb
;
162 struct ctdb_ban_time bantime
;
164 if (!ctdb_validate_pnn(ctdb
, pnn
)) {
165 DEBUG(DEBUG_ERR
,("Bad pnn %u in ctdb_ban_node\n", pnn
));
169 DEBUG(DEBUG_NOTICE
,("Banning node %u for %u seconds\n", pnn
, ban_time
));
172 bantime
.time
= ban_time
;
174 ret
= ctdb_ctrl_set_ban(ctdb
, CONTROL_TIMEOUT(), pnn
, &bantime
);
176 DEBUG(DEBUG_ERR
,(__location__
" Failed to ban node %d\n", pnn
));
/* Result codes; going by the names these are the outcomes of the recovery
   daemon's monitoring checks (confirm against the callers).
   In this file update_local_flags() returns MONITOR_FAILED when it cannot
   fetch the nodemap from a remote node. */
182 enum monitor_result
{ MONITOR_OK
, MONITOR_RECOVERY_NEEDED
, MONITOR_ELECTION_NEEDED
, MONITOR_FAILED
};
186 remember the trouble maker
188 static void ctdb_set_culprit_count(struct ctdb_recoverd
*rec
, uint32_t culprit
, uint32_t count
)
190 struct ctdb_context
*ctdb
= talloc_get_type(rec
->ctdb
, struct ctdb_context
);
191 struct ctdb_banning_state
*ban_state
;
193 if (culprit
> ctdb
->num_nodes
) {
194 DEBUG(DEBUG_ERR
,("Trying to set culprit %d but num_nodes is %d\n", culprit
, ctdb
->num_nodes
));
198 /* If we are banned or stopped, do not set other nodes as culprits */
199 if (rec
->node_flags
& NODE_FLAGS_INACTIVE
) {
200 DEBUG(DEBUG_NOTICE
, ("This node is INACTIVE, cannot set culprit node %d\n", culprit
));
204 if (ctdb
->nodes
[culprit
]->ban_state
== NULL
) {
205 ctdb
->nodes
[culprit
]->ban_state
= talloc_zero(ctdb
->nodes
[culprit
], struct ctdb_banning_state
);
206 CTDB_NO_MEMORY_VOID(ctdb
, ctdb
->nodes
[culprit
]->ban_state
);
210 ban_state
= ctdb
->nodes
[culprit
]->ban_state
;
211 if (timeval_elapsed(&ban_state
->last_reported_time
) > ctdb
->tunable
.recovery_grace_period
) {
212 /* this was the first time in a long while this node
213 misbehaved so we will forgive any old transgressions.
215 ban_state
->count
= 0;
218 ban_state
->count
+= count
;
219 ban_state
->last_reported_time
= timeval_current();
220 rec
->last_culprit_node
= culprit
;
224 remember the trouble maker
226 static void ctdb_set_culprit(struct ctdb_recoverd
*rec
, uint32_t culprit
)
228 ctdb_set_culprit_count(rec
, culprit
, 1);
232 /* this callback is called for every node that failed to execute the
235 static void recovered_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
237 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
239 DEBUG(DEBUG_ERR
, (__location__
" Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn
));
241 ctdb_set_culprit(rec
, node_pnn
);
245 run the "recovered" eventscript on all nodes
247 static int run_recovered_eventscript(struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
, const char *caller
)
251 struct ctdb_context
*ctdb
= rec
->ctdb
;
253 tmp_ctx
= talloc_new(ctdb
);
254 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
256 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
257 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_END_RECOVERY
,
259 CONTROL_TIMEOUT(), false, tdb_null
,
260 NULL
, recovered_fail_callback
,
262 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'recovered' event when called from %s\n", caller
));
264 talloc_free(tmp_ctx
);
268 talloc_free(tmp_ctx
);
272 /* this callback is called for every node that failed to execute the
275 static void startrecovery_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
277 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
279 DEBUG(DEBUG_ERR
, (__location__
" Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn
));
281 ctdb_set_culprit(rec
, node_pnn
);
285 run the "startrecovery" eventscript on all nodes
287 static int run_startrecovery_eventscript(struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
)
291 struct ctdb_context
*ctdb
= rec
->ctdb
;
293 tmp_ctx
= talloc_new(ctdb
);
294 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
296 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
297 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_START_RECOVERY
,
299 CONTROL_TIMEOUT(), false, tdb_null
,
301 startrecovery_fail_callback
,
303 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'startrecovery' event. Recovery failed.\n"));
304 talloc_free(tmp_ctx
);
308 talloc_free(tmp_ctx
);
312 static void async_getcap_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
314 if ( (outdata
.dsize
!= sizeof(uint32_t)) || (outdata
.dptr
== NULL
) ) {
315 DEBUG(DEBUG_ERR
, (__location__
" Invalid length/pointer for getcap callback : %u %p\n", (unsigned)outdata
.dsize
, outdata
.dptr
));
318 if (node_pnn
< ctdb
->num_nodes
) {
319 ctdb
->nodes
[node_pnn
]->capabilities
= *((uint32_t *)outdata
.dptr
);
322 if (node_pnn
== ctdb
->pnn
) {
323 ctdb
->capabilities
= ctdb
->nodes
[node_pnn
]->capabilities
;
328 update the node capabilities for all connected nodes
330 static int update_capabilities(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
)
335 tmp_ctx
= talloc_new(ctdb
);
336 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
338 nodes
= list_of_connected_nodes(ctdb
, nodemap
, tmp_ctx
, true);
339 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_GET_CAPABILITIES
,
343 async_getcap_callback
, NULL
,
345 DEBUG(DEBUG_ERR
, (__location__
" Failed to read node capabilities.\n"));
346 talloc_free(tmp_ctx
);
350 talloc_free(tmp_ctx
);
354 static void set_recmode_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
356 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
358 DEBUG(DEBUG_ERR
,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn
, rec
->nodemap
->num
));
359 ctdb_set_culprit_count(rec
, node_pnn
, rec
->nodemap
->num
);
362 static void transaction_start_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
364 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
366 DEBUG(DEBUG_ERR
,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn
, rec
->nodemap
->num
));
367 ctdb_set_culprit_count(rec
, node_pnn
, rec
->nodemap
->num
);
371 change recovery mode on all nodes
373 static int set_recovery_mode(struct ctdb_context
*ctdb
, struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
, uint32_t rec_mode
)
379 tmp_ctx
= talloc_new(ctdb
);
380 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
382 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
384 data
.dsize
= sizeof(uint32_t);
385 data
.dptr
= (unsigned char *)&rec_mode
;
387 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_SET_RECMODE
,
393 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode. Recovery failed.\n"));
394 talloc_free(tmp_ctx
);
398 /* freeze all nodes */
399 if (rec_mode
== CTDB_RECOVERY_ACTIVE
) {
402 for (i
=1; i
<=NUM_DB_PRIORITIES
; i
++) {
403 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_FREEZE
,
408 set_recmode_fail_callback
,
410 DEBUG(DEBUG_ERR
, (__location__
" Unable to freeze nodes. Recovery failed.\n"));
411 talloc_free(tmp_ctx
);
417 talloc_free(tmp_ctx
);
422 change recovery master on all nodes
424 static int set_recovery_master(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
, uint32_t pnn
)
430 tmp_ctx
= talloc_new(ctdb
);
431 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
433 data
.dsize
= sizeof(uint32_t);
434 data
.dptr
= (unsigned char *)&pnn
;
436 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
437 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_SET_RECMASTER
,
439 CONTROL_TIMEOUT(), false, data
,
442 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recmaster. Recovery failed.\n"));
443 talloc_free(tmp_ctx
);
447 talloc_free(tmp_ctx
);
451 /* update all remote nodes to use the same db priority that we have
452 this can fail if the remote node has not yet been upgraded to
453 support this function, so we always return success and never fail
454 a recovery if this call fails.
456 static int update_db_priority_on_remote_nodes(struct ctdb_context
*ctdb
,
457 struct ctdb_node_map
*nodemap
,
458 uint32_t pnn
, struct ctdb_dbid_map
*dbmap
, TALLOC_CTX
*mem_ctx
)
462 /* step through all local databases */
463 for (db
=0; db
<dbmap
->num
;db
++) {
464 struct ctdb_db_priority db_prio
;
467 db_prio
.db_id
= dbmap
->dbs
[db
].dbid
;
468 ret
= ctdb_ctrl_get_db_priority(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, dbmap
->dbs
[db
].dbid
, &db_prio
.priority
);
470 DEBUG(DEBUG_ERR
,(__location__
" Failed to read database priority from local node for db 0x%08x\n", dbmap
->dbs
[db
].dbid
));
474 DEBUG(DEBUG_INFO
,("Update DB priority for db 0x%08x to %u\n", dbmap
->dbs
[db
].dbid
, db_prio
.priority
));
476 ret
= ctdb_ctrl_set_db_priority(ctdb
, CONTROL_TIMEOUT(),
477 CTDB_CURRENT_NODE
, &db_prio
);
479 DEBUG(DEBUG_ERR
,(__location__
" Failed to set DB priority for 0x%08x\n",
488 ensure all other nodes have attached to any databases that we have
490 static int create_missing_remote_databases(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
,
491 uint32_t pnn
, struct ctdb_dbid_map
*dbmap
, TALLOC_CTX
*mem_ctx
)
494 struct ctdb_dbid_map
*remote_dbmap
;
496 /* verify that all other nodes have all our databases */
497 for (j
=0; j
<nodemap
->num
; j
++) {
498 /* we don't need to check ourselves */
499 if (nodemap
->nodes
[j
].pnn
== pnn
) {
502 /* dont check nodes that are unavailable */
503 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
507 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
508 mem_ctx
, &remote_dbmap
);
510 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbids from node %u\n", pnn
));
514 /* step through all local databases */
515 for (db
=0; db
<dbmap
->num
;db
++) {
519 for (i
=0;i
<remote_dbmap
->num
;i
++) {
520 if (dbmap
->dbs
[db
].dbid
== remote_dbmap
->dbs
[i
].dbid
) {
524 /* the remote node already have this database */
525 if (i
!=remote_dbmap
->num
) {
528 /* ok so we need to create this database */
529 ret
= ctdb_ctrl_getdbname(ctdb
, CONTROL_TIMEOUT(), pnn
,
530 dbmap
->dbs
[db
].dbid
, mem_ctx
,
533 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbname from node %u\n", pnn
));
536 ret
= ctdb_ctrl_createdb(ctdb
, CONTROL_TIMEOUT(),
537 nodemap
->nodes
[j
].pnn
,
539 dbmap
->dbs
[db
].flags
& CTDB_DB_FLAGS_PERSISTENT
);
541 DEBUG(DEBUG_ERR
, (__location__
" Unable to create remote db:%s\n", name
));
552 ensure we are attached to any databases that anyone else is attached to
554 static int create_missing_local_databases(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
,
555 uint32_t pnn
, struct ctdb_dbid_map
**dbmap
, TALLOC_CTX
*mem_ctx
)
558 struct ctdb_dbid_map
*remote_dbmap
;
560 /* verify that we have all database any other node has */
561 for (j
=0; j
<nodemap
->num
; j
++) {
562 /* we don't need to check ourselves */
563 if (nodemap
->nodes
[j
].pnn
== pnn
) {
566 /* dont check nodes that are unavailable */
567 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
571 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
572 mem_ctx
, &remote_dbmap
);
574 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbids from node %u\n", pnn
));
578 /* step through all databases on the remote node */
579 for (db
=0; db
<remote_dbmap
->num
;db
++) {
582 for (i
=0;i
<(*dbmap
)->num
;i
++) {
583 if (remote_dbmap
->dbs
[db
].dbid
== (*dbmap
)->dbs
[i
].dbid
) {
587 /* we already have this db locally */
588 if (i
!=(*dbmap
)->num
) {
591 /* ok so we need to create this database and
594 ctdb_ctrl_getdbname(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
595 remote_dbmap
->dbs
[db
].dbid
, mem_ctx
, &name
);
597 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbname from node %u\n",
598 nodemap
->nodes
[j
].pnn
));
601 ctdb_ctrl_createdb(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, name
,
602 remote_dbmap
->dbs
[db
].flags
& CTDB_DB_FLAGS_PERSISTENT
);
604 DEBUG(DEBUG_ERR
, (__location__
" Unable to create local db:%s\n", name
));
607 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, dbmap
);
609 DEBUG(DEBUG_ERR
, (__location__
" Unable to reread dbmap on node %u\n", pnn
));
620 pull the remote database contents from one node into the recdb
622 static int pull_one_remote_database(struct ctdb_context
*ctdb
, uint32_t srcnode
,
623 struct tdb_wrap
*recdb
, uint32_t dbid
)
627 struct ctdb_marshall_buffer
*reply
;
628 struct ctdb_rec_data
*rec
;
630 TALLOC_CTX
*tmp_ctx
= talloc_new(recdb
);
632 ret
= ctdb_ctrl_pulldb(ctdb
, srcnode
, dbid
, CTDB_LMASTER_ANY
, tmp_ctx
,
633 CONTROL_TIMEOUT(), &outdata
);
635 DEBUG(DEBUG_ERR
,(__location__
" Unable to copy db from node %u\n", srcnode
));
636 talloc_free(tmp_ctx
);
640 reply
= (struct ctdb_marshall_buffer
*)outdata
.dptr
;
642 if (outdata
.dsize
< offsetof(struct ctdb_marshall_buffer
, data
)) {
643 DEBUG(DEBUG_ERR
,(__location__
" invalid data in pulldb reply\n"));
644 talloc_free(tmp_ctx
);
648 rec
= (struct ctdb_rec_data
*)&reply
->data
[0];
652 rec
= (struct ctdb_rec_data
*)(rec
->length
+ (uint8_t *)rec
), i
++) {
654 struct ctdb_ltdb_header
*hdr
;
657 key
.dptr
= &rec
->data
[0];
658 key
.dsize
= rec
->keylen
;
659 data
.dptr
= &rec
->data
[key
.dsize
];
660 data
.dsize
= rec
->datalen
;
662 hdr
= (struct ctdb_ltdb_header
*)data
.dptr
;
664 if (data
.dsize
< sizeof(struct ctdb_ltdb_header
)) {
665 DEBUG(DEBUG_CRIT
,(__location__
" bad ltdb record\n"));
666 talloc_free(tmp_ctx
);
670 /* fetch the existing record, if any */
671 existing
= tdb_fetch(recdb
->tdb
, key
);
673 if (existing
.dptr
!= NULL
) {
674 struct ctdb_ltdb_header header
;
675 if (existing
.dsize
< sizeof(struct ctdb_ltdb_header
)) {
676 DEBUG(DEBUG_CRIT
,(__location__
" Bad record size %u from node %u\n",
677 (unsigned)existing
.dsize
, srcnode
));
679 talloc_free(tmp_ctx
);
682 header
= *(struct ctdb_ltdb_header
*)existing
.dptr
;
684 if (!(header
.rsn
< hdr
->rsn
||
685 (header
.dmaster
!= ctdb
->recovery_master
&& header
.rsn
== hdr
->rsn
))) {
690 if (tdb_store(recdb
->tdb
, key
, data
, TDB_REPLACE
) != 0) {
691 DEBUG(DEBUG_CRIT
,(__location__
" Failed to store record\n"));
692 talloc_free(tmp_ctx
);
697 talloc_free(tmp_ctx
);
703 struct pull_seqnum_cbdata
{
709 static void pull_seqnum_cb(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
711 struct pull_seqnum_cbdata
*cb_data
= talloc_get_type(callback_data
, struct pull_seqnum_cbdata
);
714 if (cb_data
->failed
!= 0) {
715 DEBUG(DEBUG_ERR
, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn
));
720 DEBUG(DEBUG_ERR
, ("Error when pulling seqnum from node %d\n", node_pnn
));
725 if (outdata
.dsize
!= sizeof(uint64_t)) {
726 DEBUG(DEBUG_ERR
, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn
, (int)outdata
.dsize
, (int)sizeof(uint64_t)));
727 cb_data
->failed
= -1;
731 seqnum
= *((uint64_t *)outdata
.dptr
);
733 if (seqnum
> cb_data
->seqnum
||
734 (cb_data
->pnn
== -1 && seqnum
== 0)) {
735 cb_data
->seqnum
= seqnum
;
736 cb_data
->pnn
= node_pnn
;
740 static void pull_seqnum_fail_cb(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
742 struct pull_seqnum_cbdata
*cb_data
= talloc_get_type(callback_data
, struct pull_seqnum_cbdata
);
744 DEBUG(DEBUG_ERR
, ("Failed to pull db seqnum from node %d\n", node_pnn
));
748 static int pull_highest_seqnum_pdb(struct ctdb_context
*ctdb
,
749 struct ctdb_recoverd
*rec
,
750 struct ctdb_node_map
*nodemap
,
751 struct tdb_wrap
*recdb
, uint32_t dbid
)
753 TALLOC_CTX
*tmp_ctx
= talloc_new(NULL
);
757 struct pull_seqnum_cbdata
*cb_data
;
759 DEBUG(DEBUG_NOTICE
, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid
));
764 data
.dsize
= sizeof(outdata
);
765 data
.dptr
= (uint8_t *)&outdata
[0];
767 cb_data
= talloc(tmp_ctx
, struct pull_seqnum_cbdata
);
768 if (cb_data
== NULL
) {
769 DEBUG(DEBUG_ERR
, ("Failed to allocate pull highest seqnum cb_data structure\n"));
770 talloc_free(tmp_ctx
);
778 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
779 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_GET_DB_SEQNUM
,
781 CONTROL_TIMEOUT(), false, data
,
785 DEBUG(DEBUG_ERR
, (__location__
" Failed to run async GET_DB_SEQNUM\n"));
787 talloc_free(tmp_ctx
);
791 if (cb_data
->failed
!= 0) {
792 DEBUG(DEBUG_NOTICE
, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid
));
793 talloc_free(tmp_ctx
);
797 if (cb_data
->pnn
== -1) {
798 DEBUG(DEBUG_NOTICE
, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid
));
799 talloc_free(tmp_ctx
);
803 DEBUG(DEBUG_NOTICE
, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid
, cb_data
->pnn
, (long long)cb_data
->seqnum
));
805 if (pull_one_remote_database(ctdb
, cb_data
->pnn
, recdb
, dbid
) != 0) {
806 DEBUG(DEBUG_ERR
, ("Failed to pull higest seqnum database 0x%08x from node %d\n", dbid
, cb_data
->pnn
));
807 talloc_free(tmp_ctx
);
811 talloc_free(tmp_ctx
);
817 pull all the remote database contents into the recdb
819 static int pull_remote_database(struct ctdb_context
*ctdb
,
820 struct ctdb_recoverd
*rec
,
821 struct ctdb_node_map
*nodemap
,
822 struct tdb_wrap
*recdb
, uint32_t dbid
,
827 if (persistent
&& ctdb
->tunable
.recover_pdb_by_seqnum
!= 0) {
829 ret
= pull_highest_seqnum_pdb(ctdb
, rec
, nodemap
, recdb
, dbid
);
835 /* pull all records from all other nodes across onto this node
836 (this merges based on rsn)
838 for (j
=0; j
<nodemap
->num
; j
++) {
839 /* dont merge from nodes that are unavailable */
840 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
843 if (pull_one_remote_database(ctdb
, nodemap
->nodes
[j
].pnn
, recdb
, dbid
) != 0) {
844 DEBUG(DEBUG_ERR
,(__location__
" Failed to pull remote database from node %u\n",
845 nodemap
->nodes
[j
].pnn
));
846 ctdb_set_culprit_count(rec
, nodemap
->nodes
[j
].pnn
, nodemap
->num
);
856 update flags on all active nodes
858 static int update_flags_on_all_nodes(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
, uint32_t pnn
, uint32_t flags
)
862 ret
= ctdb_ctrl_modflags(ctdb
, CONTROL_TIMEOUT(), pnn
, flags
, ~flags
);
864 DEBUG(DEBUG_ERR
, (__location__
" Unable to update nodeflags on remote nodes\n"));
872 ensure all nodes have the same vnnmap we do
874 static int update_vnnmap_on_all_nodes(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
,
875 uint32_t pnn
, struct ctdb_vnn_map
*vnnmap
, TALLOC_CTX
*mem_ctx
)
879 /* push the new vnn map out to all the nodes */
880 for (j
=0; j
<nodemap
->num
; j
++) {
881 /* dont push to nodes that are unavailable */
882 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
886 ret
= ctdb_ctrl_setvnnmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
, mem_ctx
, vnnmap
);
888 DEBUG(DEBUG_ERR
, (__location__
" Unable to set vnnmap for node %u\n", pnn
));
898 struct vacuum_info
*next
, *prev
;
899 struct ctdb_recoverd
*rec
;
901 struct ctdb_db_context
*ctdb_db
;
902 struct ctdb_marshall_buffer
*recs
;
903 struct ctdb_rec_data
*r
;
906 static void vacuum_fetch_next(struct vacuum_info
*v
);
909 called when a vacuum fetch has completed - just free it and do the next one
911 static void vacuum_fetch_callback(struct ctdb_client_call_state
*state
)
918 process the next element from the vacuum list
920 static void vacuum_fetch_next(struct vacuum_info
*v
)
922 struct ctdb_call call
;
923 struct ctdb_rec_data
*r
;
925 while (v
->recs
->count
) {
926 struct ctdb_client_call_state
*state
;
928 struct ctdb_ltdb_header
*hdr
;
931 call
.call_id
= CTDB_NULL_FUNC
;
932 call
.flags
= CTDB_IMMEDIATE_MIGRATION
;
933 call
.flags
|= CTDB_CALL_FLAG_VACUUM_MIGRATION
;
936 v
->r
= (struct ctdb_rec_data
*)(r
->length
+ (uint8_t *)r
);
939 call
.key
.dptr
= &r
->data
[0];
940 call
.key
.dsize
= r
->keylen
;
942 /* ensure we don't block this daemon - just skip a record if we can't get
944 if (tdb_chainlock_nonblock(v
->ctdb_db
->ltdb
->tdb
, call
.key
) != 0) {
948 data
= tdb_fetch(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
949 if (data
.dptr
== NULL
) {
950 tdb_chainunlock(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
954 if (data
.dsize
< sizeof(struct ctdb_ltdb_header
)) {
956 tdb_chainunlock(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
960 hdr
= (struct ctdb_ltdb_header
*)data
.dptr
;
961 if (hdr
->dmaster
== v
->rec
->ctdb
->pnn
) {
962 /* its already local */
964 tdb_chainunlock(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
970 state
= ctdb_call_send(v
->ctdb_db
, &call
);
971 tdb_chainunlock(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
973 DEBUG(DEBUG_ERR
,(__location__
" Failed to setup vacuum fetch call\n"));
977 state
->async
.fn
= vacuum_fetch_callback
;
978 state
->async
.private_data
= NULL
;
986 destroy a vacuum info structure
988 static int vacuum_info_destructor(struct vacuum_info
*v
)
990 DLIST_REMOVE(v
->rec
->vacuum_info
, v
);
996 handler for vacuum fetch
998 static void vacuum_fetch_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
999 TDB_DATA data
, void *private_data
)
1001 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
1002 struct ctdb_marshall_buffer
*recs
;
1004 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
1006 struct ctdb_dbid_map
*dbmap
=NULL
;
1007 bool persistent
= false;
1008 struct ctdb_db_context
*ctdb_db
;
1009 struct ctdb_rec_data
*r
;
1011 struct vacuum_info
*v
;
1013 recs
= (struct ctdb_marshall_buffer
*)data
.dptr
;
1014 r
= (struct ctdb_rec_data
*)&recs
->data
[0];
1016 if (recs
->count
== 0) {
1017 talloc_free(tmp_ctx
);
1023 for (v
=rec
->vacuum_info
;v
;v
=v
->next
) {
1024 if (srcnode
== v
->srcnode
&& recs
->db_id
== v
->ctdb_db
->db_id
) {
1025 /* we're already working on records from this node */
1026 talloc_free(tmp_ctx
);
1031 /* work out if the database is persistent */
1032 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, tmp_ctx
, &dbmap
);
1034 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbids from local node\n"));
1035 talloc_free(tmp_ctx
);
1039 for (i
=0;i
<dbmap
->num
;i
++) {
1040 if (dbmap
->dbs
[i
].dbid
== recs
->db_id
) {
1041 persistent
= dbmap
->dbs
[i
].flags
& CTDB_DB_FLAGS_PERSISTENT
;
1045 if (i
== dbmap
->num
) {
1046 DEBUG(DEBUG_ERR
, (__location__
" Unable to find db_id 0x%x on local node\n", recs
->db_id
));
1047 talloc_free(tmp_ctx
);
1051 /* find the name of this database */
1052 if (ctdb_ctrl_getdbname(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, recs
->db_id
, tmp_ctx
, &name
) != 0) {
1053 DEBUG(DEBUG_ERR
,(__location__
" Failed to get name of db 0x%x\n", recs
->db_id
));
1054 talloc_free(tmp_ctx
);
1059 ctdb_db
= ctdb_attach(ctdb
, CONTROL_TIMEOUT(), name
, persistent
, 0);
1060 if (ctdb_db
== NULL
) {
1061 DEBUG(DEBUG_ERR
,(__location__
" Failed to attach to database '%s'\n", name
));
1062 talloc_free(tmp_ctx
);
1066 v
= talloc_zero(rec
, struct vacuum_info
);
1068 DEBUG(DEBUG_CRIT
,(__location__
" Out of memory\n"));
1069 talloc_free(tmp_ctx
);
1074 v
->srcnode
= srcnode
;
1075 v
->ctdb_db
= ctdb_db
;
1076 v
->recs
= talloc_memdup(v
, recs
, data
.dsize
);
1077 if (v
->recs
== NULL
) {
1078 DEBUG(DEBUG_CRIT
,(__location__
" Out of memory\n"));
1080 talloc_free(tmp_ctx
);
1083 v
->r
= (struct ctdb_rec_data
*)&v
->recs
->data
[0];
1085 DLIST_ADD(rec
->vacuum_info
, v
);
1087 talloc_set_destructor(v
, vacuum_info_destructor
);
1089 vacuum_fetch_next(v
);
1090 talloc_free(tmp_ctx
);
1095 * handler for database detach
1097 static void detach_database_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
1098 TDB_DATA data
, void *private_data
)
1100 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
,
1101 struct ctdb_recoverd
);
1103 struct vacuum_info
*v
, *vnext
;
1104 struct ctdb_db_context
*ctdb_db
;
1106 if (data
.dsize
!= sizeof(db_id
)) {
1109 db_id
= *(uint32_t *)data
.dptr
;
1111 ctdb_db
= find_ctdb_db(ctdb
, db_id
);
1112 if (ctdb_db
== NULL
) {
1113 /* database is not attached */
1117 /* Stop any active vacuum fetch */
1118 v
= rec
->vacuum_info
;
1122 if (v
->ctdb_db
->db_id
== db_id
) {
1128 DLIST_REMOVE(ctdb
->db_list
, ctdb_db
);
1130 DEBUG(DEBUG_NOTICE
, ("Detached from database '%s'\n",
1132 talloc_free(ctdb_db
);
1136 called when ctdb_wait_timeout should finish
1138 static void ctdb_wait_handler(struct event_context
*ev
, struct timed_event
*te
,
1139 struct timeval yt
, void *p
)
1141 uint32_t *timed_out
= (uint32_t *)p
;
1146 wait for a given number of seconds
1148 static void ctdb_wait_timeout(struct ctdb_context
*ctdb
, double secs
)
1150 uint32_t timed_out
= 0;
1151 time_t usecs
= (secs
- (time_t)secs
) * 1000000;
1152 event_add_timed(ctdb
->ev
, ctdb
, timeval_current_ofs(secs
, usecs
), ctdb_wait_handler
, &timed_out
);
1153 while (!timed_out
) {
1154 event_loop_once(ctdb
->ev
);
1159 called when an election times out (ends)
1161 static void ctdb_election_timeout(struct event_context
*ev
, struct timed_event
*te
,
1162 struct timeval t
, void *p
)
1164 struct ctdb_recoverd
*rec
= talloc_get_type(p
, struct ctdb_recoverd
);
1165 rec
->election_timeout
= NULL
;
1168 DEBUG(DEBUG_WARNING
,("Election period ended\n"));
1173 wait for an election to finish. It finished election_timeout seconds after
1174 the last election packet is received
1176 static void ctdb_wait_election(struct ctdb_recoverd
*rec
)
1178 struct ctdb_context
*ctdb
= rec
->ctdb
;
1179 while (rec
->election_timeout
) {
1180 event_loop_once(ctdb
->ev
);
1185 Update our local flags from all remote connected nodes.
1186 This is only run when we are or we believe we are the recovery master
1188 static int update_local_flags(struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
)
1191 struct ctdb_context
*ctdb
= rec
->ctdb
;
1192 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
1194 /* get the nodemap for all active remote nodes and verify
1195 they are the same as for this node
1197 for (j
=0; j
<nodemap
->num
; j
++) {
1198 struct ctdb_node_map
*remote_nodemap
=NULL
;
1201 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_DISCONNECTED
) {
1204 if (nodemap
->nodes
[j
].pnn
== ctdb
->pnn
) {
1208 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
1209 mem_ctx
, &remote_nodemap
);
1211 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from remote node %u\n",
1212 nodemap
->nodes
[j
].pnn
));
1213 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
1214 talloc_free(mem_ctx
);
1215 return MONITOR_FAILED
;
1217 if (nodemap
->nodes
[j
].flags
!= remote_nodemap
->nodes
[j
].flags
) {
1218 /* We should tell our daemon about this so it
1219 updates its flags or else we will log the same
1220 message again in the next iteration of recovery.
1221 Since we are the recovery master we can just as
1222 well update the flags on all nodes.
1224 ret
= ctdb_ctrl_modflags(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
, remote_nodemap
->nodes
[j
].flags
, ~remote_nodemap
->nodes
[j
].flags
);
1226 DEBUG(DEBUG_ERR
, (__location__
" Unable to update nodeflags on remote nodes\n"));
1230 /* Update our local copy of the flags in the recovery
1233 DEBUG(DEBUG_NOTICE
,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1234 nodemap
->nodes
[j
].pnn
, remote_nodemap
->nodes
[j
].flags
,
1235 nodemap
->nodes
[j
].flags
));
1236 nodemap
->nodes
[j
].flags
= remote_nodemap
->nodes
[j
].flags
;
1238 talloc_free(remote_nodemap
);
1240 talloc_free(mem_ctx
);
1245 /* Create a new random generation id.
1246 The generation id can not be the INVALID_GENERATION id
1248 static uint32_t new_generation(void)
1250 uint32_t generation
;
1253 generation
= random();
1255 if (generation
!= INVALID_GENERATION
) {
1265 create a temporary working database
1267 static struct tdb_wrap
*create_recdb(struct ctdb_context
*ctdb
, TALLOC_CTX
*mem_ctx
)
1270 struct tdb_wrap
*recdb
;
1273 /* open up the temporary recovery database */
1274 name
= talloc_asprintf(mem_ctx
, "%s/recdb.tdb.%u",
1275 ctdb
->db_directory_state
,
1282 tdb_flags
= TDB_NOLOCK
;
1283 if (ctdb
->valgrinding
) {
1284 tdb_flags
|= TDB_NOMMAP
;
1286 tdb_flags
|= (TDB_INCOMPATIBLE_HASH
| TDB_DISALLOW_NESTING
);
1288 recdb
= tdb_wrap_open(mem_ctx
, name
, ctdb
->tunable
.database_hash_size
,
1289 tdb_flags
, O_RDWR
|O_CREAT
|O_EXCL
, 0600);
1290 if (recdb
== NULL
) {
1291 DEBUG(DEBUG_CRIT
,(__location__
" Failed to create temp recovery database '%s'\n", name
));
/*
  State shared between push_recdb_database() and the traverse_recdb()
  callback while all relevant records are pulled out of recdb.
 */
struct recdb_data {
	/* daemon context; supplies the local pnn and the tunables */
	struct ctdb_context *ctdb;
	/* marshall buffer the records are appended to */
	struct ctdb_marshall_buffer *recdata;
	/* number of bytes of recdata in use */
	uint32_t len;
	/* number of bytes currently allocated for recdata */
	uint32_t allocated_len;
	/* set by the callback when marshalling or growing the buffer fails */
	bool failed;
	/* true when the database being pushed is persistent */
	bool persistent;
};
1312 static int traverse_recdb(struct tdb_context
*tdb
, TDB_DATA key
, TDB_DATA data
, void *p
)
1314 struct recdb_data
*params
= (struct recdb_data
*)p
;
1315 struct ctdb_rec_data
*rec
;
1316 struct ctdb_ltdb_header
*hdr
;
1319 * skip empty records - but NOT for persistent databases:
1321 * The record-by-record mode of recovery deletes empty records.
1322 * For persistent databases, this can lead to data corruption
1323 * by deleting records that should be there:
1325 * - Assume the cluster has been running for a while.
1327 * - A record R in a persistent database has been created and
1328 * deleted a couple of times, the last operation being deletion,
1329 * leaving an empty record with a high RSN, say 10.
1331 * - Now a node N is turned off.
1333 * - This leaves the local database copy of D on N with the empty
1334 * copy of R and RSN 10. On all other nodes, the recovery has deleted
1335 * the copy of record R.
1337 * - Now the record is created again while node N is turned off.
1338 * This creates R with RSN = 1 on all nodes except for N.
1340 * - Now node N is turned on again. The following recovery will chose
1341 * the older empty copy of R due to RSN 10 > RSN 1.
1343 * ==> Hence the record is gone after the recovery.
1345 * On databases like Samba's registry, this can damage the higher-level
1346 * data structures built from the various tdb-level records.
1348 if (!params
->persistent
&& data
.dsize
<= sizeof(struct ctdb_ltdb_header
)) {
1352 /* update the dmaster field to point to us */
1353 hdr
= (struct ctdb_ltdb_header
*)data
.dptr
;
1354 if (!params
->persistent
) {
1355 hdr
->dmaster
= params
->ctdb
->pnn
;
1356 hdr
->flags
|= CTDB_REC_FLAG_MIGRATED_WITH_DATA
;
1359 /* add the record to the blob ready to send to the nodes */
1360 rec
= ctdb_marshall_record(params
->recdata
, 0, key
, NULL
, data
);
1362 params
->failed
= true;
1365 if (params
->len
+ rec
->length
>= params
->allocated_len
) {
1366 params
->allocated_len
= rec
->length
+ params
->len
+ params
->ctdb
->tunable
.pulldb_preallocation_size
;
1367 params
->recdata
= talloc_realloc_size(NULL
, params
->recdata
, params
->allocated_len
);
1369 if (params
->recdata
== NULL
) {
1370 DEBUG(DEBUG_CRIT
,(__location__
" Failed to expand recdata to %u\n",
1371 rec
->length
+ params
->len
));
1372 params
->failed
= true;
1375 params
->recdata
->count
++;
1376 memcpy(params
->len
+(uint8_t *)params
->recdata
, rec
, rec
->length
);
1377 params
->len
+= rec
->length
;
1384 push the recdb database out to all nodes
1386 static int push_recdb_database(struct ctdb_context
*ctdb
, uint32_t dbid
,
1388 struct tdb_wrap
*recdb
, struct ctdb_node_map
*nodemap
)
1390 struct recdb_data params
;
1391 struct ctdb_marshall_buffer
*recdata
;
1393 TALLOC_CTX
*tmp_ctx
;
1396 tmp_ctx
= talloc_new(ctdb
);
1397 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
1399 recdata
= talloc_zero(recdb
, struct ctdb_marshall_buffer
);
1400 CTDB_NO_MEMORY(ctdb
, recdata
);
1402 recdata
->db_id
= dbid
;
1405 params
.recdata
= recdata
;
1406 params
.len
= offsetof(struct ctdb_marshall_buffer
, data
);
1407 params
.allocated_len
= params
.len
;
1408 params
.failed
= false;
1409 params
.persistent
= persistent
;
1411 if (tdb_traverse_read(recdb
->tdb
, traverse_recdb
, ¶ms
) == -1) {
1412 DEBUG(DEBUG_ERR
,(__location__
" Failed to traverse recdb database\n"));
1413 talloc_free(params
.recdata
);
1414 talloc_free(tmp_ctx
);
1418 if (params
.failed
) {
1419 DEBUG(DEBUG_ERR
,(__location__
" Failed to traverse recdb database\n"));
1420 talloc_free(params
.recdata
);
1421 talloc_free(tmp_ctx
);
1425 recdata
= params
.recdata
;
1427 outdata
.dptr
= (void *)recdata
;
1428 outdata
.dsize
= params
.len
;
1430 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
1431 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_PUSH_DB
,
1433 CONTROL_TIMEOUT(), false, outdata
,
1436 DEBUG(DEBUG_ERR
,(__location__
" Failed to push recdb records to nodes for db 0x%x\n", dbid
));
1437 talloc_free(recdata
);
1438 talloc_free(tmp_ctx
);
1442 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - pushed remote database 0x%x of size %u\n",
1443 dbid
, recdata
->count
));
1445 talloc_free(recdata
);
1446 talloc_free(tmp_ctx
);
1453 go through a full recovery on one database
1455 static int recover_database(struct ctdb_recoverd
*rec
,
1456 TALLOC_CTX
*mem_ctx
,
1460 struct ctdb_node_map
*nodemap
,
1461 uint32_t transaction_id
)
1463 struct tdb_wrap
*recdb
;
1465 struct ctdb_context
*ctdb
= rec
->ctdb
;
1467 struct ctdb_control_wipe_database w
;
1470 recdb
= create_recdb(ctdb
, mem_ctx
);
1471 if (recdb
== NULL
) {
1475 /* pull all remote databases onto the recdb */
1476 ret
= pull_remote_database(ctdb
, rec
, nodemap
, recdb
, dbid
, persistent
);
1478 DEBUG(DEBUG_ERR
, (__location__
" Unable to pull remote database 0x%x\n", dbid
));
1482 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - pulled remote database 0x%x\n", dbid
));
1484 /* wipe all the remote databases. This is safe as we are in a transaction */
1486 w
.transaction_id
= transaction_id
;
1488 data
.dptr
= (void *)&w
;
1489 data
.dsize
= sizeof(w
);
1491 nodes
= list_of_active_nodes(ctdb
, nodemap
, recdb
, true);
1492 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_WIPE_DATABASE
,
1494 CONTROL_TIMEOUT(), false, data
,
1497 DEBUG(DEBUG_ERR
, (__location__
" Unable to wipe database. Recovery failed.\n"));
1502 /* push out the correct database. This sets the dmaster and skips
1503 the empty records */
1504 ret
= push_recdb_database(ctdb
, dbid
, persistent
, recdb
, nodemap
);
1510 /* all done with this database */
1516 static int ctdb_reload_remote_public_ips(struct ctdb_context
*ctdb
,
1517 struct ctdb_recoverd
*rec
,
1518 struct ctdb_node_map
*nodemap
,
1524 if (ctdb
->num_nodes
!= nodemap
->num
) {
1525 DEBUG(DEBUG_ERR
, (__location__
" ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1526 ctdb
->num_nodes
, nodemap
->num
));
1528 *culprit
= ctdb
->pnn
;
1533 for (j
=0; j
<nodemap
->num
; j
++) {
1534 /* For readability */
1535 struct ctdb_node
*node
= ctdb
->nodes
[j
];
1537 /* release any existing data */
1538 if (node
->known_public_ips
) {
1539 talloc_free(node
->known_public_ips
);
1540 node
->known_public_ips
= NULL
;
1542 if (node
->available_public_ips
) {
1543 talloc_free(node
->available_public_ips
);
1544 node
->available_public_ips
= NULL
;
1547 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
1551 /* Retrieve the list of known public IPs from the node */
1552 ret
= ctdb_ctrl_get_public_ips_flags(ctdb
,
1557 &node
->known_public_ips
);
1560 ("Failed to read known public IPs from node: %u\n",
1563 *culprit
= node
->pnn
;
1568 if (ctdb
->do_checkpublicip
&&
1569 rec
->takeover_runs_disable_ctx
== NULL
&&
1570 verify_remote_ip_allocation(ctdb
,
1571 node
->known_public_ips
,
1573 DEBUG(DEBUG_ERR
,("Trigger IP reallocation\n"));
1574 rec
->need_takeover_run
= true;
1577 /* Retrieve the list of available public IPs from the node */
1578 ret
= ctdb_ctrl_get_public_ips_flags(ctdb
,
1582 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE
,
1583 &node
->available_public_ips
);
1586 ("Failed to read available public IPs from node: %u\n",
1589 *culprit
= node
->pnn
;
1598 /* when we start a recovery, make sure all nodes use the same reclock file
1601 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd
*rec
)
1603 struct ctdb_context
*ctdb
= rec
->ctdb
;
1604 TALLOC_CTX
*tmp_ctx
= talloc_new(NULL
);
1608 if (ctdb
->recovery_lock_file
== NULL
) {
1612 data
.dsize
= strlen(ctdb
->recovery_lock_file
) + 1;
1613 data
.dptr
= (uint8_t *)ctdb
->recovery_lock_file
;
1616 nodes
= list_of_active_nodes(ctdb
, rec
->nodemap
, tmp_ctx
, true);
1617 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_SET_RECLOCK_FILE
,
1623 DEBUG(DEBUG_ERR
, (__location__
" Failed to sync reclock file settings\n"));
1624 talloc_free(tmp_ctx
);
1628 talloc_free(tmp_ctx
);
1634 * this callback is called for every node that failed to execute ctdb_takeover_run()
1635 * and set flag to re-run takeover run.
1637 static void takeover_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
1639 DEBUG(DEBUG_ERR
, ("Node %u failed the takeover run\n", node_pnn
));
1641 if (callback_data
!= NULL
) {
1642 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
1644 DEBUG(DEBUG_ERR
, ("Setting node %u as recovery fail culprit\n", node_pnn
));
1646 ctdb_set_culprit(rec
, node_pnn
);
1651 static void ban_misbehaving_nodes(struct ctdb_recoverd
*rec
, bool *self_ban
)
1653 struct ctdb_context
*ctdb
= rec
->ctdb
;
1655 struct ctdb_banning_state
*ban_state
;
1658 for (i
=0; i
<ctdb
->num_nodes
; i
++) {
1659 if (ctdb
->nodes
[i
]->ban_state
== NULL
) {
1662 ban_state
= (struct ctdb_banning_state
*)ctdb
->nodes
[i
]->ban_state
;
1663 if (ban_state
->count
< 2*ctdb
->num_nodes
) {
1667 DEBUG(DEBUG_NOTICE
,("Node %u reached %u banning credits - banning it for %u seconds\n",
1668 ctdb
->nodes
[i
]->pnn
, ban_state
->count
,
1669 ctdb
->tunable
.recovery_ban_period
));
1670 ctdb_ban_node(rec
, ctdb
->nodes
[i
]->pnn
, ctdb
->tunable
.recovery_ban_period
);
1671 ban_state
->count
= 0;
1673 /* Banning ourself? */
1674 if (ctdb
->nodes
[i
]->pnn
== rec
->ctdb
->pnn
) {
1680 static bool do_takeover_run(struct ctdb_recoverd
*rec
,
1681 struct ctdb_node_map
*nodemap
,
1682 bool banning_credits_on_fail
)
1684 uint32_t *nodes
= NULL
;
1685 struct srvid_request_data dtr
;
1688 uint32_t *rebalance_nodes
= rec
->force_rebalance_nodes
;
1692 DEBUG(DEBUG_NOTICE
, ("Takeover run starting\n"));
1694 if (rec
->takeover_run_in_progress
) {
1695 DEBUG(DEBUG_ERR
, (__location__
1696 " takeover run already in progress \n"));
1701 rec
->takeover_run_in_progress
= true;
1703 /* If takeover runs are in disabled then fail... */
1704 if (rec
->takeover_runs_disable_ctx
!= NULL
) {
1706 ("Takeover runs are disabled so refusing to run one\n"));
1711 /* Disable IP checks (takeover runs, really) on other nodes
1712 * while doing this takeover run. This will stop those other
1713 * nodes from triggering takeover runs when think they should
1714 * be hosting an IP but it isn't yet on an interface. Don't
1715 * wait for replies since a failure here might cause some
1716 * noise in the logs but will not actually cause a problem.
1718 dtr
.srvid
= 0; /* No reply */
1721 data
.dptr
= (uint8_t*)&dtr
;
1722 data
.dsize
= sizeof(dtr
);
1724 nodes
= list_of_connected_nodes(rec
->ctdb
, nodemap
, rec
, false);
1726 /* Disable for 60 seconds. This can be a tunable later if
1730 for (i
= 0; i
< talloc_array_length(nodes
); i
++) {
1731 if (ctdb_client_send_message(rec
->ctdb
, nodes
[i
],
1732 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
1734 DEBUG(DEBUG_INFO
,("Failed to disable takeover runs\n"));
1738 ret
= ctdb_takeover_run(rec
->ctdb
, nodemap
,
1739 rec
->force_rebalance_nodes
,
1740 takeover_fail_callback
,
1741 banning_credits_on_fail
? rec
: NULL
);
1743 /* Reenable takeover runs and IP checks on other nodes */
1745 for (i
= 0; i
< talloc_array_length(nodes
); i
++) {
1746 if (ctdb_client_send_message(rec
->ctdb
, nodes
[i
],
1747 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
1749 DEBUG(DEBUG_INFO
,("Failed to reenable takeover runs\n"));
1754 DEBUG(DEBUG_ERR
, ("ctdb_takeover_run() failed\n"));
1760 /* Takeover run was successful so clear force rebalance targets */
1761 if (rebalance_nodes
== rec
->force_rebalance_nodes
) {
1762 TALLOC_FREE(rec
->force_rebalance_nodes
);
1764 DEBUG(DEBUG_WARNING
,
1765 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1768 rec
->need_takeover_run
= !ok
;
1770 rec
->takeover_run_in_progress
= false;
1772 DEBUG(DEBUG_NOTICE
, ("Takeover run %s\n", ok
? "completed successfully" : "unsuccessful"));
1778 we are the recmaster, and recovery is needed - start a recovery run
1780 static int do_recovery(struct ctdb_recoverd
*rec
,
1781 TALLOC_CTX
*mem_ctx
, uint32_t pnn
,
1782 struct ctdb_node_map
*nodemap
, struct ctdb_vnn_map
*vnnmap
)
1784 struct ctdb_context
*ctdb
= rec
->ctdb
;
1786 uint32_t generation
;
1787 struct ctdb_dbid_map
*dbmap
;
1790 struct timeval start_time
;
1791 uint32_t culprit
= (uint32_t)-1;
1794 DEBUG(DEBUG_NOTICE
, (__location__
" Starting do_recovery\n"));
1796 /* if recovery fails, force it again */
1797 rec
->need_recovery
= true;
1799 if (rec
->election_timeout
) {
1800 /* an election is in progress */
1801 DEBUG(DEBUG_ERR
, ("do_recovery called while election in progress - try again later\n"));
1805 ban_misbehaving_nodes(rec
, &self_ban
);
1807 DEBUG(DEBUG_NOTICE
, ("This node was banned, aborting recovery\n"));
1811 if (ctdb
->recovery_lock_file
!= NULL
) {
1812 if (ctdb_recovery_have_lock(ctdb
)) {
1813 DEBUG(DEBUG_NOTICE
, ("Already holding recovery lock\n"));
1815 start_time
= timeval_current();
1816 DEBUG(DEBUG_NOTICE
, ("Attempting to take recovery lock (%s)\n",
1817 ctdb
->recovery_lock_file
));
1818 if (!ctdb_recovery_lock(ctdb
)) {
1819 if (ctdb
->runstate
== CTDB_RUNSTATE_FIRST_RECOVERY
) {
1820 /* If ctdb is trying first recovery, it's
1821 * possible that current node does not know
1822 * yet who the recmaster is.
1824 DEBUG(DEBUG_ERR
, ("Unable to get recovery lock"
1825 " - retrying recovery\n"));
1829 DEBUG(DEBUG_ERR
,("Unable to get recovery lock - aborting recovery "
1830 "and ban ourself for %u seconds\n",
1831 ctdb
->tunable
.recovery_ban_period
));
1832 ctdb_ban_node(rec
, pnn
, ctdb
->tunable
.recovery_ban_period
);
1835 ctdb_ctrl_report_recd_lock_latency(ctdb
,
1837 timeval_elapsed(&start_time
));
1839 ("Recovery lock taken successfully by recovery daemon\n"));
1843 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery initiated due to problem with node %u\n", rec
->last_culprit_node
));
1845 /* get a list of all databases */
1846 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, &dbmap
);
1848 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbids from node :%u\n", pnn
));
1852 /* we do the db creation before we set the recovery mode, so the freeze happens
1853 on all databases we will be dealing with. */
1855 /* verify that we have all the databases any other node has */
1856 ret
= create_missing_local_databases(ctdb
, nodemap
, pnn
, &dbmap
, mem_ctx
);
1858 DEBUG(DEBUG_ERR
, (__location__
" Unable to create missing local databases\n"));
1862 /* verify that all other nodes have all our databases */
1863 ret
= create_missing_remote_databases(ctdb
, nodemap
, pnn
, dbmap
, mem_ctx
);
1865 DEBUG(DEBUG_ERR
, (__location__
" Unable to create missing remote databases\n"));
1868 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - created remote databases\n"));
1870 /* update the database priority for all remote databases */
1871 ret
= update_db_priority_on_remote_nodes(ctdb
, nodemap
, pnn
, dbmap
, mem_ctx
);
1873 DEBUG(DEBUG_ERR
, (__location__
" Unable to set db priority on remote nodes\n"));
1875 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - updated db priority for all databases\n"));
1878 /* update all other nodes to use the same setting for reclock files
1879 as the local recovery master.
1881 sync_recovery_lock_file_across_cluster(rec
);
1883 /* set recovery mode to active on all nodes */
1884 ret
= set_recovery_mode(ctdb
, rec
, nodemap
, CTDB_RECOVERY_ACTIVE
);
1886 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode to active on cluster\n"));
1890 /* execute the "startrecovery" event script on all nodes */
1891 ret
= run_startrecovery_eventscript(rec
, nodemap
);
1893 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'startrecovery' event on cluster\n"));
1898 update all nodes to have the same flags that we have
1900 for (i
=0;i
<nodemap
->num
;i
++) {
1901 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
) {
1905 ret
= update_flags_on_all_nodes(ctdb
, nodemap
, i
, nodemap
->nodes
[i
].flags
);
1907 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_INACTIVE
) {
1908 DEBUG(DEBUG_WARNING
, (__location__
"Unable to update flags on inactive node %d\n", i
));
1910 DEBUG(DEBUG_ERR
, (__location__
" Unable to update flags on all nodes for node %d\n", i
));
1916 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - updated flags\n"));
1918 /* pick a new generation number */
1919 generation
= new_generation();
1921 /* change the vnnmap on this node to use the new generation
1922 number but not on any other nodes.
1923 this guarantees that if we abort the recovery prematurely
1924 for some reason (a node stops responding?)
1925 that we can just return immediately and we will reenter
1926 recovery shortly again.
1927 I.e. we deliberately leave the cluster with an inconsistent
1928 generation id to allow us to abort recovery at any stage and
1929 just restart it from scratch.
1931 vnnmap
->generation
= generation
;
1932 ret
= ctdb_ctrl_setvnnmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, vnnmap
);
1934 DEBUG(DEBUG_ERR
, (__location__
" Unable to set vnnmap for node %u\n", pnn
));
1938 data
.dptr
= (void *)&generation
;
1939 data
.dsize
= sizeof(uint32_t);
1941 nodes
= list_of_active_nodes(ctdb
, nodemap
, mem_ctx
, true);
1942 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_TRANSACTION_START
,
1944 CONTROL_TIMEOUT(), false, data
,
1946 transaction_start_fail_callback
,
1948 DEBUG(DEBUG_ERR
, (__location__
" Unable to start transactions. Recovery failed.\n"));
1949 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_TRANSACTION_CANCEL
,
1951 CONTROL_TIMEOUT(), false, tdb_null
,
1955 DEBUG(DEBUG_ERR
,("Failed to cancel recovery transaction\n"));
1960 DEBUG(DEBUG_NOTICE
,(__location__
" started transactions on all nodes\n"));
1962 for (i
=0;i
<dbmap
->num
;i
++) {
1963 ret
= recover_database(rec
, mem_ctx
,
1965 dbmap
->dbs
[i
].flags
& CTDB_DB_FLAGS_PERSISTENT
,
1966 pnn
, nodemap
, generation
);
1968 DEBUG(DEBUG_ERR
, (__location__
" Failed to recover database 0x%x\n", dbmap
->dbs
[i
].dbid
));
1973 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - starting database commits\n"));
1975 /* commit all the changes */
1976 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_TRANSACTION_COMMIT
,
1978 CONTROL_TIMEOUT(), false, data
,
1981 DEBUG(DEBUG_ERR
, (__location__
" Unable to commit recovery changes. Recovery failed.\n"));
1985 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - committed databases\n"));
1988 /* update the capabilities for all nodes */
1989 ret
= update_capabilities(ctdb
, nodemap
);
1991 DEBUG(DEBUG_ERR
, (__location__
" Unable to update node capabilities.\n"));
1995 /* build a new vnn map with all the currently active and
1997 generation
= new_generation();
1998 vnnmap
= talloc(mem_ctx
, struct ctdb_vnn_map
);
1999 CTDB_NO_MEMORY(ctdb
, vnnmap
);
2000 vnnmap
->generation
= generation
;
2002 vnnmap
->map
= talloc_zero_array(vnnmap
, uint32_t, vnnmap
->size
);
2003 CTDB_NO_MEMORY(ctdb
, vnnmap
->map
);
2004 for (i
=j
=0;i
<nodemap
->num
;i
++) {
2005 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_INACTIVE
) {
2008 if (!(ctdb
->nodes
[i
]->capabilities
& CTDB_CAP_LMASTER
)) {
2009 /* this node can not be an lmaster */
2010 DEBUG(DEBUG_DEBUG
, ("Node %d cant be a LMASTER, skipping it\n", i
));
2015 vnnmap
->map
= talloc_realloc(vnnmap
, vnnmap
->map
, uint32_t, vnnmap
->size
);
2016 CTDB_NO_MEMORY(ctdb
, vnnmap
->map
);
2017 vnnmap
->map
[j
++] = nodemap
->nodes
[i
].pnn
;
2020 if (vnnmap
->size
== 0) {
2021 DEBUG(DEBUG_NOTICE
, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
2023 vnnmap
->map
= talloc_realloc(vnnmap
, vnnmap
->map
, uint32_t, vnnmap
->size
);
2024 CTDB_NO_MEMORY(ctdb
, vnnmap
->map
);
2025 vnnmap
->map
[0] = pnn
;
2028 /* update to the new vnnmap on all nodes */
2029 ret
= update_vnnmap_on_all_nodes(ctdb
, nodemap
, pnn
, vnnmap
, mem_ctx
);
2031 DEBUG(DEBUG_ERR
, (__location__
" Unable to update vnnmap on all nodes\n"));
2035 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - updated vnnmap\n"));
2037 /* update recmaster to point to us for all nodes */
2038 ret
= set_recovery_master(ctdb
, nodemap
, pnn
);
2040 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery master\n"));
2044 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - updated recmaster\n"));
2046 /* disable recovery mode */
2047 ret
= set_recovery_mode(ctdb
, rec
, nodemap
, CTDB_RECOVERY_NORMAL
);
2049 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode to normal on cluster\n"));
2053 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - disabled recovery mode\n"));
2055 /* Fetch known/available public IPs from each active node */
2056 ret
= ctdb_reload_remote_public_ips(ctdb
, rec
, nodemap
, &culprit
);
2058 DEBUG(DEBUG_ERR
,("Failed to read public ips from remote node %d\n",
2060 rec
->need_takeover_run
= true;
2064 do_takeover_run(rec
, nodemap
, false);
2066 /* execute the "recovered" event script on all nodes */
2067 ret
= run_recovered_eventscript(rec
, nodemap
, "do_recovery");
2069 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
2073 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - finished the recovered event\n"));
2075 /* send a message to all clients telling them that the cluster
2076 has been reconfigured */
2077 ret
= ctdb_client_send_message(ctdb
, CTDB_BROADCAST_CONNECTED
,
2078 CTDB_SRVID_RECONFIGURE
, tdb_null
);
2080 DEBUG(DEBUG_ERR
, (__location__
" Failed to send reconfigure message\n"));
2084 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery complete\n"));
2086 rec
->need_recovery
= false;
2088 /* we managed to complete a full recovery, make sure to forgive
2089 any past sins by the nodes that could now participate in the
2092 DEBUG(DEBUG_ERR
,("Resetting ban count to 0 for all nodes\n"));
2093 for (i
=0;i
<nodemap
->num
;i
++) {
2094 struct ctdb_banning_state
*ban_state
;
2096 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
) {
2100 ban_state
= (struct ctdb_banning_state
*)ctdb
->nodes
[nodemap
->nodes
[i
].pnn
]->ban_state
;
2101 if (ban_state
== NULL
) {
2105 ban_state
->count
= 0;
2109 /* We just finished a recovery successfully.
2110 We now wait for rerecovery_timeout before we allow
2111 another recovery to take place.
2113 DEBUG(DEBUG_NOTICE
, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb
->tunable
.rerecovery_timeout
));
2114 ctdb_wait_timeout(ctdb
, ctdb
->tunable
.rerecovery_timeout
);
2115 DEBUG(DEBUG_NOTICE
, ("The rerecovery timeout has elapsed. We now allow recoveries to trigger again.\n"));
/*
  elections are won by first checking the number of connected nodes, then
  the priority time, then the pnn

  This structure is sent as the raw payload of an election message, so
  the field order and types must not change.
 */
struct election_message {
	/* number of nodes the sender sees as connected */
	uint32_t num_connected;
	/* priority time of the sender; used as the second tie-breaker */
	struct timeval priority_time;
	/* PNN of the sender; used as the final tie-breaker */
	uint32_t pnn;
	/* node flags of the sender (banned/stopped senders cannot win) */
	uint32_t node_flags;
};
2133 form this nodes election data
2135 static void ctdb_election_data(struct ctdb_recoverd
*rec
, struct election_message
*em
)
2138 struct ctdb_node_map
*nodemap
;
2139 struct ctdb_context
*ctdb
= rec
->ctdb
;
2143 em
->pnn
= rec
->ctdb
->pnn
;
2144 em
->priority_time
= rec
->priority_time
;
2146 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, rec
, &nodemap
);
2148 DEBUG(DEBUG_ERR
,(__location__
" unable to get node map\n"));
2152 rec
->node_flags
= nodemap
->nodes
[ctdb
->pnn
].flags
;
2153 em
->node_flags
= rec
->node_flags
;
2155 for (i
=0;i
<nodemap
->num
;i
++) {
2156 if (!(nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
)) {
2157 em
->num_connected
++;
2161 /* we shouldnt try to win this election if we cant be a recmaster */
2162 if ((ctdb
->capabilities
& CTDB_CAP_RECMASTER
) == 0) {
2163 em
->num_connected
= 0;
2164 em
->priority_time
= timeval_current();
2167 talloc_free(nodemap
);
2171 see if the given election data wins
2173 static bool ctdb_election_win(struct ctdb_recoverd
*rec
, struct election_message
*em
)
2175 struct election_message myem
;
2178 ctdb_election_data(rec
, &myem
);
2180 /* we cant win if we dont have the recmaster capability */
2181 if ((rec
->ctdb
->capabilities
& CTDB_CAP_RECMASTER
) == 0) {
2185 /* we cant win if we are banned */
2186 if (rec
->node_flags
& NODE_FLAGS_BANNED
) {
2190 /* we cant win if we are stopped */
2191 if (rec
->node_flags
& NODE_FLAGS_STOPPED
) {
2195 /* we will automatically win if the other node is banned */
2196 if (em
->node_flags
& NODE_FLAGS_BANNED
) {
2200 /* we will automatically win if the other node is banned */
2201 if (em
->node_flags
& NODE_FLAGS_STOPPED
) {
2205 /* try to use the most connected node */
2207 cmp
= (int)myem
.num_connected
- (int)em
->num_connected
;
2210 /* then the longest running node */
2212 cmp
= timeval_compare(&em
->priority_time
, &myem
.priority_time
);
2216 cmp
= (int)myem
.pnn
- (int)em
->pnn
;
2223 send out an election request
2225 static int send_election_request(struct ctdb_recoverd
*rec
, uint32_t pnn
)
2228 TDB_DATA election_data
;
2229 struct election_message emsg
;
2231 struct ctdb_context
*ctdb
= rec
->ctdb
;
2233 srvid
= CTDB_SRVID_RECOVERY
;
2235 ctdb_election_data(rec
, &emsg
);
2237 election_data
.dsize
= sizeof(struct election_message
);
2238 election_data
.dptr
= (unsigned char *)&emsg
;
2241 /* first we assume we will win the election and set
2242 recoverymaster to be ourself on the current node
2244 ret
= ctdb_ctrl_setrecmaster(ctdb
, CONTROL_TIMEOUT(), pnn
, pnn
);
2246 DEBUG(DEBUG_ERR
, (__location__
" failed to send recmaster election request\n"));
2251 /* send an election message to all active nodes */
2252 DEBUG(DEBUG_INFO
,(__location__
" Send election request to all active nodes\n"));
2253 return ctdb_client_send_message(ctdb
, CTDB_BROADCAST_ALL
, srvid
, election_data
);
2257 this function will unban all nodes in the cluster
2259 static void unban_all_nodes(struct ctdb_context
*ctdb
)
2262 struct ctdb_node_map
*nodemap
;
2263 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
2265 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, tmp_ctx
, &nodemap
);
2267 DEBUG(DEBUG_ERR
,(__location__
" failed to get nodemap to unban all nodes\n"));
2271 for (i
=0;i
<nodemap
->num
;i
++) {
2272 if ( (!(nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
))
2273 && (nodemap
->nodes
[i
].flags
& NODE_FLAGS_BANNED
) ) {
2274 ret
= ctdb_ctrl_modflags(ctdb
, CONTROL_TIMEOUT(),
2275 nodemap
->nodes
[i
].pnn
, 0,
2278 DEBUG(DEBUG_ERR
, (__location__
" failed to reset ban state\n"));
2283 talloc_free(tmp_ctx
);
2288 we think we are winning the election - send a broadcast election request
2290 static void election_send_request(struct event_context
*ev
, struct timed_event
*te
, struct timeval t
, void *p
)
2292 struct ctdb_recoverd
*rec
= talloc_get_type(p
, struct ctdb_recoverd
);
2295 ret
= send_election_request(rec
, ctdb_get_pnn(rec
->ctdb
));
2297 DEBUG(DEBUG_ERR
,("Failed to send election request!\n"));
2300 talloc_free(rec
->send_election_te
);
2301 rec
->send_election_te
= NULL
;
2305 handler for memory dumps
2307 static void mem_dump_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2308 TDB_DATA data
, void *private_data
)
2310 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
2313 struct srvid_request
*rd
;
2315 if (data
.dsize
!= sizeof(struct srvid_request
)) {
2316 DEBUG(DEBUG_ERR
, (__location__
" Wrong size of return address.\n"));
2317 talloc_free(tmp_ctx
);
2320 rd
= (struct srvid_request
*)data
.dptr
;
2322 dump
= talloc_zero(tmp_ctx
, TDB_DATA
);
2324 DEBUG(DEBUG_ERR
, (__location__
" Failed to allocate memory for memdump\n"));
2325 talloc_free(tmp_ctx
);
2328 ret
= ctdb_dump_memory(ctdb
, dump
);
2330 DEBUG(DEBUG_ERR
, (__location__
" ctdb_dump_memory() failed\n"));
2331 talloc_free(tmp_ctx
);
2335 DEBUG(DEBUG_ERR
, ("recovery master memory dump\n"));
2337 ret
= ctdb_client_send_message(ctdb
, rd
->pnn
, rd
->srvid
, *dump
);
2339 DEBUG(DEBUG_ERR
,("Failed to send rd memdump reply message\n"));
2340 talloc_free(tmp_ctx
);
2344 talloc_free(tmp_ctx
);
2348 handler for reload_nodes
2350 static void reload_nodes_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2351 TDB_DATA data
, void *private_data
)
2353 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2355 DEBUG(DEBUG_ERR
, (__location__
" Reload nodes file from recovery daemon\n"));
2357 ctdb_load_nodes_file(rec
->ctdb
);
2361 static void ctdb_rebalance_timeout(struct event_context
*ev
,
2362 struct timed_event
*te
,
2363 struct timeval t
, void *p
)
2365 struct ctdb_recoverd
*rec
= talloc_get_type(p
, struct ctdb_recoverd
);
2367 if (rec
->force_rebalance_nodes
== NULL
) {
2369 ("Rebalance timeout occurred - no nodes to rebalance\n"));
2374 ("Rebalance timeout occurred - do takeover run\n"));
2375 do_takeover_run(rec
, rec
->nodemap
, false);
2379 static void recd_node_rebalance_handler(struct ctdb_context
*ctdb
,
2381 TDB_DATA data
, void *private_data
)
2386 uint32_t deferred_rebalance
;
2387 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2389 if (rec
->recmaster
!= ctdb_get_pnn(ctdb
)) {
2393 if (data
.dsize
!= sizeof(uint32_t)) {
2394 DEBUG(DEBUG_ERR
,(__location__
" Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data
.dsize
, sizeof(uint32_t)));
2398 pnn
= *(uint32_t *)&data
.dptr
[0];
2400 DEBUG(DEBUG_NOTICE
,("Setting up rebalance of IPs to node %u\n", pnn
));
2402 /* Copy any existing list of nodes. There's probably some
2403 * sort of realloc variant that will do this but we need to
2404 * make sure that freeing the old array also cancels the timer
2405 * event for the timeout... not sure if realloc will do that.
2407 len
= (rec
->force_rebalance_nodes
!= NULL
) ?
2408 talloc_array_length(rec
->force_rebalance_nodes
) :
2411 /* This allows duplicates to be added but they don't cause
2412 * harm. A call to add a duplicate PNN arguably means that
2413 * the timeout should be reset, so this is the simplest
2416 t
= talloc_zero_array(rec
, uint32_t, len
+1);
2417 CTDB_NO_MEMORY_VOID(ctdb
, t
);
2419 memcpy(t
, rec
->force_rebalance_nodes
, sizeof(uint32_t) * len
);
2423 talloc_free(rec
->force_rebalance_nodes
);
2425 rec
->force_rebalance_nodes
= t
;
2427 /* If configured, setup a deferred takeover run to make sure
2428 * that certain nodes get IPs rebalanced to them. This will
2429 * be cancelled if a successful takeover run happens before
2430 * the timeout. Assign tunable value to variable for
2433 deferred_rebalance
= ctdb
->tunable
.deferred_rebalance_on_node_add
;
2434 if (deferred_rebalance
!= 0) {
2435 event_add_timed(ctdb
->ev
, rec
->force_rebalance_nodes
,
2436 timeval_current_ofs(deferred_rebalance
, 0),
2437 ctdb_rebalance_timeout
, rec
);
2443 static void recd_update_ip_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2444 TDB_DATA data
, void *private_data
)
2446 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2447 struct ctdb_public_ip
*ip
;
2449 if (rec
->recmaster
!= rec
->ctdb
->pnn
) {
2450 DEBUG(DEBUG_INFO
,("Not recmaster, ignore update ip message\n"));
2454 if (data
.dsize
!= sizeof(struct ctdb_public_ip
)) {
2455 DEBUG(DEBUG_ERR
,(__location__
" Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data
.dsize
, sizeof(struct ctdb_public_ip
)));
2459 ip
= (struct ctdb_public_ip
*)data
.dptr
;
2461 update_ip_assignment_tree(rec
->ctdb
, ip
);
2465 static void clear_takeover_runs_disable(struct ctdb_recoverd
*rec
)
2467 TALLOC_FREE(rec
->takeover_runs_disable_ctx
);
2470 static void reenable_takeover_runs(struct event_context
*ev
,
2471 struct timed_event
*te
,
2472 struct timeval yt
, void *p
)
2474 struct ctdb_recoverd
*rec
= talloc_get_type(p
, struct ctdb_recoverd
);
2476 DEBUG(DEBUG_NOTICE
,("Reenabling takeover runs after timeout\n"));
2477 clear_takeover_runs_disable(rec
);
2480 static void disable_takeover_runs_handler(struct ctdb_context
*ctdb
,
2481 uint64_t srvid
, TDB_DATA data
,
2484 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
,
2485 struct ctdb_recoverd
);
2486 struct srvid_request_data
*r
;
2491 /* Validate input data */
2492 if (data
.dsize
!= sizeof(struct srvid_request_data
)) {
2493 DEBUG(DEBUG_ERR
,(__location__
" Wrong size for data :%lu "
2494 "expecting %lu\n", (long unsigned)data
.dsize
,
2495 (long unsigned)sizeof(struct srvid_request
)));
2498 if (data
.dptr
== NULL
) {
2499 DEBUG(DEBUG_ERR
,(__location__
" No data received\n"));
2503 r
= (struct srvid_request_data
*)data
.dptr
;
2507 DEBUG(DEBUG_NOTICE
,("Reenabling takeover runs\n"));
2508 clear_takeover_runs_disable(rec
);
2509 ret
= ctdb_get_pnn(ctdb
);
2513 if (rec
->takeover_run_in_progress
) {
2515 ("Unable to disable takeover runs - in progress\n"));
2520 DEBUG(DEBUG_NOTICE
,("Disabling takeover runs for %u seconds\n", timeout
));
2522 /* Clear any old timers */
2523 clear_takeover_runs_disable(rec
);
2525 /* When this is non-NULL it indicates that takeover runs are
2526 * disabled. This context also holds the timeout timer.
2528 rec
->takeover_runs_disable_ctx
= talloc_new(rec
);
2529 if (rec
->takeover_runs_disable_ctx
== NULL
) {
2530 DEBUG(DEBUG_ERR
,(__location__
" Unable to allocate memory\n"));
2535 /* Arrange for the timeout to occur */
2536 event_add_timed(ctdb
->ev
, rec
->takeover_runs_disable_ctx
,
2537 timeval_current_ofs(timeout
, 0),
2538 reenable_takeover_runs
,
2541 /* Returning our PNN tells the caller that we succeeded */
2542 ret
= ctdb_get_pnn(ctdb
);
2544 result
.dsize
= sizeof(int32_t);
2545 result
.dptr
= (uint8_t *)&ret
;
2546 srvid_request_reply(ctdb
, (struct srvid_request
*)r
, result
);
2549 /* Backward compatibility for this SRVID - call
2550 * disable_takeover_runs_handler() instead
2552 static void disable_ip_check_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2553 TDB_DATA data
, void *private_data
)
2555 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
,
2556 struct ctdb_recoverd
);
2558 struct srvid_request_data
*req
;
2560 if (data
.dsize
!= sizeof(uint32_t)) {
2561 DEBUG(DEBUG_ERR
,(__location__
" Wrong size for data :%lu "
2562 "expecting %lu\n", (long unsigned)data
.dsize
,
2563 (long unsigned)sizeof(uint32_t)));
2566 if (data
.dptr
== NULL
) {
2567 DEBUG(DEBUG_ERR
,(__location__
" No data received\n"));
2571 req
= talloc(ctdb
, struct srvid_request_data
);
2572 CTDB_NO_MEMORY_VOID(ctdb
, req
);
2574 req
->srvid
= 0; /* No reply */
2576 req
->data
= *((uint32_t *)data
.dptr
); /* Timeout */
2578 data2
.dsize
= sizeof(*req
);
2579 data2
.dptr
= (uint8_t *)req
;
2581 disable_takeover_runs_handler(rec
->ctdb
,
2582 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
2587 handler for ip reallocate, just add it to the list of requests and
2588 handle this later in the monitor_cluster loop so we do not recurse
2589 with other requests to takeover_run()
2591 static void ip_reallocate_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2592 TDB_DATA data
, void *private_data
)
2594 struct srvid_request
*request
;
2595 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
,
2596 struct ctdb_recoverd
);
2598 if (data
.dsize
!= sizeof(struct srvid_request
)) {
2599 DEBUG(DEBUG_ERR
, (__location__
" Wrong size of return address.\n"));
2603 request
= (struct srvid_request
*)data
.dptr
;
2605 srvid_request_add(ctdb
, &rec
->reallocate_requests
, request
);
2608 static void process_ipreallocate_requests(struct ctdb_context
*ctdb
,
2609 struct ctdb_recoverd
*rec
)
2614 struct srvid_requests
*current
;
2616 DEBUG(DEBUG_INFO
, ("recovery master forced ip reallocation\n"));
2618 /* Only process requests that are currently pending. More
2619 * might come in while the takeover run is in progress and
2620 * they will need to be processed later since they might
2621 * be in response flag changes.
2623 current
= rec
->reallocate_requests
;
2624 rec
->reallocate_requests
= NULL
;
2626 /* update the list of public ips that a node can handle for
2629 ret
= ctdb_reload_remote_public_ips(ctdb
, rec
, rec
->nodemap
, &culprit
);
2631 DEBUG(DEBUG_ERR
,("Failed to read public ips from remote node %d\n",
2633 rec
->need_takeover_run
= true;
2636 if (do_takeover_run(rec
, rec
->nodemap
, false)) {
2637 ret
= ctdb_get_pnn(ctdb
);
2643 result
.dsize
= sizeof(int32_t);
2644 result
.dptr
= (uint8_t *)&ret
;
2646 srvid_requests_reply(ctdb
, ¤t
, result
);
2651 handler for recovery master elections
2653 static void election_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2654 TDB_DATA data
, void *private_data
)
2656 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2658 struct election_message
*em
= (struct election_message
*)data
.dptr
;
2659 TALLOC_CTX
*mem_ctx
;
2661 /* Ignore election packets from ourself */
2662 if (ctdb
->pnn
== em
->pnn
) {
2666 /* we got an election packet - update the timeout for the election */
2667 talloc_free(rec
->election_timeout
);
2668 rec
->election_timeout
= event_add_timed(ctdb
->ev
, ctdb
,
2670 timeval_current_ofs(0, 500000) :
2671 timeval_current_ofs(ctdb
->tunable
.election_timeout
, 0),
2672 ctdb_election_timeout
, rec
);
2674 mem_ctx
= talloc_new(ctdb
);
2676 /* someone called an election. check their election data
2677 and if we disagree and we would rather be the elected node,
2678 send a new election message to all other nodes
2680 if (ctdb_election_win(rec
, em
)) {
2681 if (!rec
->send_election_te
) {
2682 rec
->send_election_te
= event_add_timed(ctdb
->ev
, rec
,
2683 timeval_current_ofs(0, 500000),
2684 election_send_request
, rec
);
2686 talloc_free(mem_ctx
);
2687 /*unban_all_nodes(ctdb);*/
2692 talloc_free(rec
->send_election_te
);
2693 rec
->send_election_te
= NULL
;
2695 if (ctdb
->recovery_lock_file
!= NULL
) {
2696 /* Release the recovery lock file */
2697 if (em
->pnn
!= ctdb
->pnn
&&
2698 ctdb_recovery_have_lock(ctdb
)) {
2699 ctdb_recovery_unlock(ctdb
);
2700 unban_all_nodes(ctdb
);
2704 /* ok, let that guy become recmaster then */
2705 ret
= ctdb_ctrl_setrecmaster(ctdb
, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb
), em
->pnn
);
2707 DEBUG(DEBUG_ERR
, (__location__
" failed to send recmaster election request"));
2708 talloc_free(mem_ctx
);
2712 talloc_free(mem_ctx
);
2718 force the start of the election process
2720 static void force_election(struct ctdb_recoverd
*rec
, uint32_t pnn
,
2721 struct ctdb_node_map
*nodemap
)
2724 struct ctdb_context
*ctdb
= rec
->ctdb
;
2726 DEBUG(DEBUG_INFO
,(__location__
" Force an election\n"));
2728 /* set all nodes to recovery mode to stop all internode traffic */
2729 ret
= set_recovery_mode(ctdb
, rec
, nodemap
, CTDB_RECOVERY_ACTIVE
);
2731 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode to active on cluster\n"));
2735 talloc_free(rec
->election_timeout
);
2736 rec
->election_timeout
= event_add_timed(ctdb
->ev
, ctdb
,
2738 timeval_current_ofs(0, 500000) :
2739 timeval_current_ofs(ctdb
->tunable
.election_timeout
, 0),
2740 ctdb_election_timeout
, rec
);
2742 ret
= send_election_request(rec
, pnn
);
2744 DEBUG(DEBUG_ERR
, (__location__
" failed to initiate recmaster election"));
2748 /* wait for a few seconds to collect all responses */
2749 ctdb_wait_election(rec
);
2755 handler for when a node changes its flags
2757 static void monitor_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2758 TDB_DATA data
, void *private_data
)
2761 struct ctdb_node_flag_change
*c
= (struct ctdb_node_flag_change
*)data
.dptr
;
2762 struct ctdb_node_map
*nodemap
=NULL
;
2763 TALLOC_CTX
*tmp_ctx
;
2765 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2766 int disabled_flag_changed
;
2768 if (data
.dsize
!= sizeof(*c
)) {
2769 DEBUG(DEBUG_ERR
,(__location__
"Invalid data in ctdb_node_flag_change\n"));
2773 tmp_ctx
= talloc_new(ctdb
);
2774 CTDB_NO_MEMORY_VOID(ctdb
, tmp_ctx
);
2776 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, tmp_ctx
, &nodemap
);
2778 DEBUG(DEBUG_ERR
,(__location__
"ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2779 talloc_free(tmp_ctx
);
2784 for (i
=0;i
<nodemap
->num
;i
++) {
2785 if (nodemap
->nodes
[i
].pnn
== c
->pnn
) break;
2788 if (i
== nodemap
->num
) {
2789 DEBUG(DEBUG_CRIT
,(__location__
"Flag change for non-existant node %u\n", c
->pnn
));
2790 talloc_free(tmp_ctx
);
2794 if (c
->old_flags
!= c
->new_flags
) {
2795 DEBUG(DEBUG_NOTICE
,("Node %u has changed flags - now 0x%x was 0x%x\n", c
->pnn
, c
->new_flags
, c
->old_flags
));
2798 disabled_flag_changed
= (nodemap
->nodes
[i
].flags
^ c
->new_flags
) & NODE_FLAGS_DISABLED
;
2800 nodemap
->nodes
[i
].flags
= c
->new_flags
;
2802 ret
= ctdb_ctrl_getrecmaster(ctdb
, tmp_ctx
, CONTROL_TIMEOUT(),
2803 CTDB_CURRENT_NODE
, &ctdb
->recovery_master
);
2806 ret
= ctdb_ctrl_getrecmode(ctdb
, tmp_ctx
, CONTROL_TIMEOUT(),
2807 CTDB_CURRENT_NODE
, &ctdb
->recovery_mode
);
2811 ctdb
->recovery_master
== ctdb
->pnn
&&
2812 ctdb
->recovery_mode
== CTDB_RECOVERY_NORMAL
) {
2813 /* Only do the takeover run if the perm disabled or unhealthy
2814 flags changed since these will cause an ip failover but not
2816 If the node became disconnected or banned this will also
2817 lead to an ip address failover but that is handled
2820 if (disabled_flag_changed
) {
2821 rec
->need_takeover_run
= true;
2825 talloc_free(tmp_ctx
);
2829 handler for when we need to push out flag changes ot all other nodes
2831 static void push_flags_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2832 TDB_DATA data
, void *private_data
)
2835 struct ctdb_node_flag_change
*c
= (struct ctdb_node_flag_change
*)data
.dptr
;
2836 struct ctdb_node_map
*nodemap
=NULL
;
2837 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
2841 /* find the recovery master */
2842 ret
= ctdb_ctrl_getrecmaster(ctdb
, tmp_ctx
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, &recmaster
);
2844 DEBUG(DEBUG_ERR
, (__location__
" Unable to get recmaster from local node\n"));
2845 talloc_free(tmp_ctx
);
2849 /* read the node flags from the recmaster */
2850 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), recmaster
, tmp_ctx
, &nodemap
);
2852 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from node %u\n", c
->pnn
));
2853 talloc_free(tmp_ctx
);
2856 if (c
->pnn
>= nodemap
->num
) {
2857 DEBUG(DEBUG_ERR
,(__location__
" Nodemap from recmaster does not contain node %d\n", c
->pnn
));
2858 talloc_free(tmp_ctx
);
2862 /* send the flags update to all connected nodes */
2863 nodes
= list_of_connected_nodes(ctdb
, nodemap
, tmp_ctx
, true);
2865 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_MODIFY_FLAGS
,
2866 nodes
, 0, CONTROL_TIMEOUT(),
2870 DEBUG(DEBUG_ERR
, (__location__
" ctdb_control to modify node flags failed\n"));
2872 talloc_free(tmp_ctx
);
2876 talloc_free(tmp_ctx
);
2880 struct verify_recmode_normal_data
{
2882 enum monitor_result status
;
2885 static void verify_recmode_normal_callback(struct ctdb_client_control_state
*state
)
2887 struct verify_recmode_normal_data
*rmdata
= talloc_get_type(state
->async
.private_data
, struct verify_recmode_normal_data
);
2890 /* one more node has responded with recmode data*/
2893 /* if we failed to get the recmode, then return an error and let
2894 the main loop try again.
2896 if (state
->state
!= CTDB_CONTROL_DONE
) {
2897 if (rmdata
->status
== MONITOR_OK
) {
2898 rmdata
->status
= MONITOR_FAILED
;
2903 /* if we got a response, then the recmode will be stored in the
2906 if (state
->status
!= CTDB_RECOVERY_NORMAL
) {
2907 DEBUG(DEBUG_NOTICE
, ("Node:%u was in recovery mode. Start recovery process\n", state
->c
->hdr
.destnode
));
2908 rmdata
->status
= MONITOR_RECOVERY_NEEDED
;
2915 /* verify that all nodes are in normal recovery mode */
2916 static enum monitor_result
verify_recmode(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
)
2918 struct verify_recmode_normal_data
*rmdata
;
2919 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
2920 struct ctdb_client_control_state
*state
;
2921 enum monitor_result status
;
2924 rmdata
= talloc(mem_ctx
, struct verify_recmode_normal_data
);
2925 CTDB_NO_MEMORY_FATAL(ctdb
, rmdata
);
2927 rmdata
->status
= MONITOR_OK
;
2929 /* loop over all active nodes and send an async getrecmode call to
2931 for (j
=0; j
<nodemap
->num
; j
++) {
2932 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
2935 state
= ctdb_ctrl_getrecmode_send(ctdb
, mem_ctx
,
2937 nodemap
->nodes
[j
].pnn
);
2938 if (state
== NULL
) {
2939 /* we failed to send the control, treat this as
2940 an error and try again next iteration
2942 DEBUG(DEBUG_ERR
,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2943 talloc_free(mem_ctx
);
2944 return MONITOR_FAILED
;
2947 /* set up the callback functions */
2948 state
->async
.fn
= verify_recmode_normal_callback
;
2949 state
->async
.private_data
= rmdata
;
2951 /* one more control to wait for to complete */
2956 /* now wait for up to the maximum number of seconds allowed
2957 or until all nodes we expect a response from has replied
2959 while (rmdata
->count
> 0) {
2960 event_loop_once(ctdb
->ev
);
2963 status
= rmdata
->status
;
2964 talloc_free(mem_ctx
);
2969 struct verify_recmaster_data
{
2970 struct ctdb_recoverd
*rec
;
2973 enum monitor_result status
;
2976 static void verify_recmaster_callback(struct ctdb_client_control_state
*state
)
2978 struct verify_recmaster_data
*rmdata
= talloc_get_type(state
->async
.private_data
, struct verify_recmaster_data
);
2981 /* one more node has responded with recmaster data*/
2984 /* if we failed to get the recmaster, then return an error and let
2985 the main loop try again.
2987 if (state
->state
!= CTDB_CONTROL_DONE
) {
2988 if (rmdata
->status
== MONITOR_OK
) {
2989 rmdata
->status
= MONITOR_FAILED
;
2994 /* if we got a response, then the recmaster will be stored in the
2997 if (state
->status
!= rmdata
->pnn
) {
2998 DEBUG(DEBUG_ERR
,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state
->c
->hdr
.destnode
, state
->status
));
2999 ctdb_set_culprit(rmdata
->rec
, state
->c
->hdr
.destnode
);
3000 rmdata
->status
= MONITOR_ELECTION_NEEDED
;
3007 /* verify that all nodes agree that we are the recmaster */
3008 static enum monitor_result
verify_recmaster(struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
, uint32_t pnn
)
3010 struct ctdb_context
*ctdb
= rec
->ctdb
;
3011 struct verify_recmaster_data
*rmdata
;
3012 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
3013 struct ctdb_client_control_state
*state
;
3014 enum monitor_result status
;
3017 rmdata
= talloc(mem_ctx
, struct verify_recmaster_data
);
3018 CTDB_NO_MEMORY_FATAL(ctdb
, rmdata
);
3022 rmdata
->status
= MONITOR_OK
;
3024 /* loop over all active nodes and send an async getrecmaster call to
3026 for (j
=0; j
<nodemap
->num
; j
++) {
3027 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3030 state
= ctdb_ctrl_getrecmaster_send(ctdb
, mem_ctx
,
3032 nodemap
->nodes
[j
].pnn
);
3033 if (state
== NULL
) {
3034 /* we failed to send the control, treat this as
3035 an error and try again next iteration
3037 DEBUG(DEBUG_ERR
,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
3038 talloc_free(mem_ctx
);
3039 return MONITOR_FAILED
;
3042 /* set up the callback functions */
3043 state
->async
.fn
= verify_recmaster_callback
;
3044 state
->async
.private_data
= rmdata
;
3046 /* one more control to wait for to complete */
3051 /* now wait for up to the maximum number of seconds allowed
3052 or until all nodes we expect a response from has replied
3054 while (rmdata
->count
> 0) {
3055 event_loop_once(ctdb
->ev
);
3058 status
= rmdata
->status
;
3059 talloc_free(mem_ctx
);
3063 static bool interfaces_have_changed(struct ctdb_context
*ctdb
,
3064 struct ctdb_recoverd
*rec
)
3066 struct ctdb_control_get_ifaces
*ifaces
= NULL
;
3067 TALLOC_CTX
*mem_ctx
;
3070 mem_ctx
= talloc_new(NULL
);
3072 /* Read the interfaces from the local node */
3073 if (ctdb_ctrl_get_ifaces(ctdb
, CONTROL_TIMEOUT(),
3074 CTDB_CURRENT_NODE
, mem_ctx
, &ifaces
) != 0) {
3075 DEBUG(DEBUG_ERR
, ("Unable to get interfaces from local node %u\n", ctdb
->pnn
));
3076 /* We could return an error. However, this will be
3077 * rare so we'll decide that the interfaces have
3078 * actually changed, just in case.
3080 talloc_free(mem_ctx
);
3085 /* We haven't been here before so things have changed */
3086 DEBUG(DEBUG_NOTICE
, ("Initial interface fetched\n"));
3088 } else if (rec
->ifaces
->num
!= ifaces
->num
) {
3089 /* Number of interfaces has changed */
3090 DEBUG(DEBUG_NOTICE
, ("Interface count changed from %d to %d\n",
3091 rec
->ifaces
->num
, ifaces
->num
));
3094 /* See if interface names or link states have changed */
3096 for (i
= 0; i
< rec
->ifaces
->num
; i
++) {
3097 struct ctdb_control_iface_info
* iface
= &rec
->ifaces
->ifaces
[i
];
3098 if (strcmp(iface
->name
, ifaces
->ifaces
[i
].name
) != 0) {
3100 ("Interface in slot %d changed: %s => %s\n",
3101 i
, iface
->name
, ifaces
->ifaces
[i
].name
));
3105 if (iface
->link_state
!= ifaces
->ifaces
[i
].link_state
) {
3107 ("Interface %s changed state: %d => %d\n",
3108 iface
->name
, iface
->link_state
,
3109 ifaces
->ifaces
[i
].link_state
));
3116 talloc_free(rec
->ifaces
);
3117 rec
->ifaces
= talloc_steal(rec
, ifaces
);
3119 talloc_free(mem_ctx
);
3123 /* called to check that the local allocation of public ip addresses is ok.
3125 static int verify_local_ip_allocation(struct ctdb_context
*ctdb
, struct ctdb_recoverd
*rec
, uint32_t pnn
, struct ctdb_node_map
*nodemap
)
3127 TALLOC_CTX
*mem_ctx
= talloc_new(NULL
);
3128 struct ctdb_uptime
*uptime1
= NULL
;
3129 struct ctdb_uptime
*uptime2
= NULL
;
3131 bool need_takeover_run
= false;
3133 ret
= ctdb_ctrl_uptime(ctdb
, mem_ctx
, CONTROL_TIMEOUT(),
3134 CTDB_CURRENT_NODE
, &uptime1
);
3136 DEBUG(DEBUG_ERR
, ("Unable to get uptime from local node %u\n", pnn
));
3137 talloc_free(mem_ctx
);
3141 if (interfaces_have_changed(ctdb
, rec
)) {
3142 DEBUG(DEBUG_NOTICE
, ("The interfaces status has changed on "
3143 "local node %u - force takeover run\n",
3145 need_takeover_run
= true;
3148 ret
= ctdb_ctrl_uptime(ctdb
, mem_ctx
, CONTROL_TIMEOUT(),
3149 CTDB_CURRENT_NODE
, &uptime2
);
3151 DEBUG(DEBUG_ERR
, ("Unable to get uptime from local node %u\n", pnn
));
3152 talloc_free(mem_ctx
);
3156 /* skip the check if the startrecovery time has changed */
3157 if (timeval_compare(&uptime1
->last_recovery_started
,
3158 &uptime2
->last_recovery_started
) != 0) {
3159 DEBUG(DEBUG_NOTICE
, (__location__
" last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3160 talloc_free(mem_ctx
);
3164 /* skip the check if the endrecovery time has changed */
3165 if (timeval_compare(&uptime1
->last_recovery_finished
,
3166 &uptime2
->last_recovery_finished
) != 0) {
3167 DEBUG(DEBUG_NOTICE
, (__location__
" last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3168 talloc_free(mem_ctx
);
3172 /* skip the check if we have started but not finished recovery */
3173 if (timeval_compare(&uptime1
->last_recovery_finished
,
3174 &uptime1
->last_recovery_started
) != 1) {
3175 DEBUG(DEBUG_INFO
, (__location__
" in the middle of recovery or ip reallocation. skipping public ip address check\n"));
3176 talloc_free(mem_ctx
);
3181 /* verify that we have the ip addresses we should have
3182 and we dont have ones we shouldnt have.
3183 if we find an inconsistency we set recmode to
3184 active on the local node and wait for the recmaster
3185 to do a full blown recovery.
3186 also if the pnn is -1 and we are healthy and can host the ip
3187 we also request a ip reallocation.
3189 if (ctdb
->tunable
.disable_ip_failover
== 0) {
3190 struct ctdb_all_public_ips
*ips
= NULL
;
3192 /* read the *available* IPs from the local node */
3193 ret
= ctdb_ctrl_get_public_ips_flags(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, mem_ctx
, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE
, &ips
);
3195 DEBUG(DEBUG_ERR
, ("Unable to get available public IPs from local node %u\n", pnn
));
3196 talloc_free(mem_ctx
);
3200 for (j
=0; j
<ips
->num
; j
++) {
3201 if (ips
->ips
[j
].pnn
== -1 &&
3202 nodemap
->nodes
[pnn
].flags
== 0) {
3203 DEBUG(DEBUG_CRIT
,("Public IP '%s' is not assigned and we could serve it\n",
3204 ctdb_addr_to_str(&ips
->ips
[j
].addr
)));
3205 need_takeover_run
= true;
3211 /* read the *known* IPs from the local node */
3212 ret
= ctdb_ctrl_get_public_ips_flags(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, mem_ctx
, 0, &ips
);
3214 DEBUG(DEBUG_ERR
, ("Unable to get known public IPs from local node %u\n", pnn
));
3215 talloc_free(mem_ctx
);
3219 for (j
=0; j
<ips
->num
; j
++) {
3220 if (ips
->ips
[j
].pnn
== pnn
) {
3221 if (ctdb
->do_checkpublicip
&& !ctdb_sys_have_ip(&ips
->ips
[j
].addr
)) {
3222 DEBUG(DEBUG_CRIT
,("Public IP '%s' is assigned to us but not on an interface\n",
3223 ctdb_addr_to_str(&ips
->ips
[j
].addr
)));
3224 need_takeover_run
= true;
3227 if (ctdb
->do_checkpublicip
&&
3228 ctdb_sys_have_ip(&ips
->ips
[j
].addr
)) {
3230 DEBUG(DEBUG_CRIT
,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
3231 ctdb_addr_to_str(&ips
->ips
[j
].addr
)));
3233 if (ctdb_ctrl_release_ip(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, &ips
->ips
[j
]) != 0) {
3234 DEBUG(DEBUG_ERR
,("Failed to release local IP address\n"));
3241 if (need_takeover_run
) {
3242 struct srvid_request rd
;
3245 DEBUG(DEBUG_CRIT
,("Trigger takeoverrun\n"));
3249 data
.dptr
= (uint8_t *)&rd
;
3250 data
.dsize
= sizeof(rd
);
3252 ret
= ctdb_client_send_message(ctdb
, rec
->recmaster
, CTDB_SRVID_TAKEOVER_RUN
, data
);
3254 DEBUG(DEBUG_ERR
,(__location__
" Failed to send ipreallocate to recmaster :%d\n", (int)rec
->recmaster
));
3257 talloc_free(mem_ctx
);
3262 static void async_getnodemap_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
3264 struct ctdb_node_map
**remote_nodemaps
= callback_data
;
3266 if (node_pnn
>= ctdb
->num_nodes
) {
3267 DEBUG(DEBUG_ERR
,(__location__
" pnn from invalid node\n"));
3271 remote_nodemaps
[node_pnn
] = (struct ctdb_node_map
*)talloc_steal(remote_nodemaps
, outdata
.dptr
);
3275 static int get_remote_nodemaps(struct ctdb_context
*ctdb
, TALLOC_CTX
*mem_ctx
,
3276 struct ctdb_node_map
*nodemap
,
3277 struct ctdb_node_map
**remote_nodemaps
)
3281 nodes
= list_of_active_nodes(ctdb
, nodemap
, mem_ctx
, true);
3282 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_GET_NODEMAP
,
3284 CONTROL_TIMEOUT(), false, tdb_null
,
3285 async_getnodemap_callback
,
3287 remote_nodemaps
) != 0) {
3288 DEBUG(DEBUG_ERR
, (__location__
" Unable to pull all remote nodemaps\n"));
3296 static int update_recovery_lock_file(struct ctdb_context
*ctdb
)
3298 TALLOC_CTX
*tmp_ctx
= talloc_new(NULL
);
3299 const char *reclockfile
;
3301 if (ctdb_ctrl_getreclock(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, tmp_ctx
, &reclockfile
) != 0) {
3302 DEBUG(DEBUG_ERR
,("Failed to read reclock file from daemon\n"));
3303 talloc_free(tmp_ctx
);
3307 if (reclockfile
== NULL
) {
3308 if (ctdb
->recovery_lock_file
!= NULL
) {
3309 DEBUG(DEBUG_NOTICE
,("Recovery lock file disabled\n"));
3310 talloc_free(ctdb
->recovery_lock_file
);
3311 ctdb
->recovery_lock_file
= NULL
;
3312 ctdb_recovery_unlock(ctdb
);
3314 talloc_free(tmp_ctx
);
3318 if (ctdb
->recovery_lock_file
== NULL
) {
3320 ("Recovery lock file enabled (%s)\n", reclockfile
));
3321 ctdb
->recovery_lock_file
= talloc_strdup(ctdb
, reclockfile
);
3322 ctdb_recovery_unlock(ctdb
);
3323 talloc_free(tmp_ctx
);
3328 if (!strcmp(reclockfile
, ctdb
->recovery_lock_file
)) {
3329 talloc_free(tmp_ctx
);
3334 ("Recovery lock file changed (now %s)\n", reclockfile
));
3335 talloc_free(ctdb
->recovery_lock_file
);
3336 ctdb
->recovery_lock_file
= talloc_strdup(ctdb
, reclockfile
);
3337 ctdb_recovery_unlock(ctdb
);
3339 talloc_free(tmp_ctx
);
3343 static void main_loop(struct ctdb_context
*ctdb
, struct ctdb_recoverd
*rec
,
3344 TALLOC_CTX
*mem_ctx
)
3347 struct ctdb_node_map
*nodemap
=NULL
;
3348 struct ctdb_node_map
*recmaster_nodemap
=NULL
;
3349 struct ctdb_node_map
**remote_nodemaps
=NULL
;
3350 struct ctdb_vnn_map
*vnnmap
=NULL
;
3351 struct ctdb_vnn_map
*remote_vnnmap
=NULL
;
3352 int32_t debug_level
;
3357 /* verify that the main daemon is still running */
3358 if (ctdb_kill(ctdb
, ctdb
->ctdbd_pid
, 0) != 0) {
3359 DEBUG(DEBUG_CRIT
,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3363 /* ping the local daemon to tell it we are alive */
3364 ctdb_ctrl_recd_ping(ctdb
);
3366 if (rec
->election_timeout
) {
3367 /* an election is in progress */
3371 /* read the debug level from the parent and update locally */
3372 ret
= ctdb_ctrl_get_debuglevel(ctdb
, CTDB_CURRENT_NODE
, &debug_level
);
3374 DEBUG(DEBUG_ERR
, (__location__
" Failed to read debuglevel from parent\n"));
3377 DEBUGLEVEL
= debug_level
;
3379 /* get relevant tunables */
3380 ret
= ctdb_ctrl_get_all_tunables(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, &ctdb
->tunable
);
3382 DEBUG(DEBUG_ERR
,("Failed to get tunables - retrying\n"));
3387 ret
= ctdb_ctrl_get_runstate(ctdb
, CONTROL_TIMEOUT(),
3388 CTDB_CURRENT_NODE
, &ctdb
->runstate
);
3390 DEBUG(DEBUG_ERR
, ("Failed to get runstate - retrying\n"));
3394 /* get the current recovery lock file from the server */
3395 if (update_recovery_lock_file(ctdb
) != 0) {
3396 DEBUG(DEBUG_ERR
,("Failed to update the recovery lock file\n"));
3400 /* Make sure that if recovery lock verification becomes disabled when
3403 if (ctdb
->recovery_lock_file
== NULL
) {
3404 ctdb_recovery_unlock(ctdb
);
3407 pnn
= ctdb_get_pnn(ctdb
);
3409 /* get the vnnmap */
3410 ret
= ctdb_ctrl_getvnnmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, &vnnmap
);
3412 DEBUG(DEBUG_ERR
, (__location__
" Unable to get vnnmap from node %u\n", pnn
));
3417 /* get number of nodes */
3419 talloc_free(rec
->nodemap
);
3420 rec
->nodemap
= NULL
;
3423 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), pnn
, rec
, &rec
->nodemap
);
3425 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from node %u\n", pnn
));
3428 nodemap
= rec
->nodemap
;
3430 /* remember our own node flags */
3431 rec
->node_flags
= nodemap
->nodes
[pnn
].flags
;
3433 ban_misbehaving_nodes(rec
, &self_ban
);
3435 DEBUG(DEBUG_NOTICE
, ("This node was banned, restart main_loop\n"));
3439 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
3440 also frozen and that the recmode is set to active.
3442 if (rec
->node_flags
& (NODE_FLAGS_STOPPED
| NODE_FLAGS_BANNED
)) {
3443 /* If this node has become inactive then we want to
3444 * reduce the chances of it taking over the recovery
3445 * master role when it becomes active again. This
3446 * helps to stabilise the recovery master role so that
3447 * it stays on the most stable node.
3449 rec
->priority_time
= timeval_current();
3451 ret
= ctdb_ctrl_getrecmode(ctdb
, mem_ctx
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, &ctdb
->recovery_mode
);
3453 DEBUG(DEBUG_ERR
,(__location__
" Failed to read recmode from local node\n"));
3455 if (ctdb
->recovery_mode
== CTDB_RECOVERY_NORMAL
) {
3456 DEBUG(DEBUG_ERR
,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
3458 ret
= ctdb_ctrl_setrecmode(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, CTDB_RECOVERY_ACTIVE
);
3460 DEBUG(DEBUG_ERR
,(__location__
" Failed to activate recovery mode in STOPPED or BANNED state\n"));
3464 ret
= ctdb_ctrl_freeze(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
);
3466 DEBUG(DEBUG_ERR
,(__location__
" Failed to freeze node in STOPPED or BANNED state\n"));
3471 /* If this node is stopped or banned then it is not the recovery
3472 * master, so don't do anything. This prevents stopped or banned
3473 * node from starting election and sending unnecessary controls.
3478 /* check which node is the recovery master */
3479 ret
= ctdb_ctrl_getrecmaster(ctdb
, mem_ctx
, CONTROL_TIMEOUT(), pnn
, &rec
->recmaster
);
3481 DEBUG(DEBUG_ERR
, (__location__
" Unable to get recmaster from node %u\n", pnn
));
3485 /* If we are not the recmaster then do some housekeeping */
3486 if (rec
->recmaster
!= pnn
) {
3487 /* Ignore any IP reallocate requests - only recmaster
3490 TALLOC_FREE(rec
->reallocate_requests
);
3491 /* Clear any nodes that should be force rebalanced in
3492 * the next takeover run. If the recovery master role
3493 * has moved then we don't want to process these some
3494 * time in the future.
3496 TALLOC_FREE(rec
->force_rebalance_nodes
);
3499 /* This is a special case. When recovery daemon is started, recmaster
3500 * is set to -1. If a node is not started in stopped state, then
3501 * start election to decide recovery master
3503 if (rec
->recmaster
== (uint32_t)-1) {
3504 DEBUG(DEBUG_NOTICE
,(__location__
" Initial recovery master set - forcing election\n"));
3505 force_election(rec
, pnn
, nodemap
);
3509 /* update the capabilities for all nodes */
3510 ret
= update_capabilities(ctdb
, nodemap
);
3512 DEBUG(DEBUG_ERR
, (__location__
" Unable to update node capabilities.\n"));
3517 * If the current recmaster does not have CTDB_CAP_RECMASTER,
3518 * but we have, then force an election and try to become the new
3521 if ((rec
->ctdb
->nodes
[rec
->recmaster
]->capabilities
& CTDB_CAP_RECMASTER
) == 0 &&
3522 (rec
->ctdb
->capabilities
& CTDB_CAP_RECMASTER
) &&
3523 !(nodemap
->nodes
[pnn
].flags
& NODE_FLAGS_INACTIVE
)) {
3524 DEBUG(DEBUG_ERR
, (__location__
" Current recmaster node %u does not have CAP_RECMASTER,"
3525 " but we (node %u) have - force an election\n",
3526 rec
->recmaster
, pnn
));
3527 force_election(rec
, pnn
, nodemap
);
3531 /* count how many active nodes there are */
3532 rec
->num_active
= 0;
3533 rec
->num_lmasters
= 0;
3534 rec
->num_connected
= 0;
3535 for (i
=0; i
<nodemap
->num
; i
++) {
3536 if (!(nodemap
->nodes
[i
].flags
& NODE_FLAGS_INACTIVE
)) {
3538 if (rec
->ctdb
->nodes
[i
]->capabilities
& CTDB_CAP_LMASTER
) {
3539 rec
->num_lmasters
++;
3542 if (!(nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
)) {
3543 rec
->num_connected
++;
3548 /* verify that the recmaster node is still active */
3549 for (j
=0; j
<nodemap
->num
; j
++) {
3550 if (nodemap
->nodes
[j
].pnn
==rec
->recmaster
) {
3555 if (j
== nodemap
->num
) {
3556 DEBUG(DEBUG_ERR
, ("Recmaster node %u not in list. Force reelection\n", rec
->recmaster
));
3557 force_election(rec
, pnn
, nodemap
);
3561 /* if recovery master is disconnected we must elect a new recmaster */
3562 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_DISCONNECTED
) {
3563 DEBUG(DEBUG_NOTICE
, ("Recmaster node %u is disconnected. Force reelection\n", nodemap
->nodes
[j
].pnn
));
3564 force_election(rec
, pnn
, nodemap
);
3568 /* get nodemap from the recovery master to check if it is inactive */
3569 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
3570 mem_ctx
, &recmaster_nodemap
);
3572 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from recovery master %u\n",
3573 nodemap
->nodes
[j
].pnn
));
3578 if ((recmaster_nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) &&
3579 (rec
->node_flags
& NODE_FLAGS_INACTIVE
) == 0) {
3580 DEBUG(DEBUG_NOTICE
, ("Recmaster node %u no longer available. Force reelection\n", nodemap
->nodes
[j
].pnn
));
3582 * update our nodemap to carry the recmaster's notion of
3583 * its own flags, so that we don't keep freezing the
3584 * inactive recmaster node...
3586 nodemap
->nodes
[j
].flags
= recmaster_nodemap
->nodes
[j
].flags
;
3587 force_election(rec
, pnn
, nodemap
);
3591 /* verify that we have all ip addresses we should have and we dont
3592 * have addresses we shouldnt have.
3594 if (ctdb
->tunable
.disable_ip_failover
== 0 &&
3595 rec
->takeover_runs_disable_ctx
== NULL
) {
3596 if (verify_local_ip_allocation(ctdb
, rec
, pnn
, nodemap
) != 0) {
3597 DEBUG(DEBUG_ERR
, (__location__
" Public IPs were inconsistent.\n"));
3602 /* if we are not the recmaster then we do not need to check
3603 if recovery is needed
3605 if (pnn
!= rec
->recmaster
) {
3610 /* ensure our local copies of flags are right */
3611 ret
= update_local_flags(rec
, nodemap
);
3612 if (ret
== MONITOR_ELECTION_NEEDED
) {
3613 DEBUG(DEBUG_NOTICE
,("update_local_flags() called for a re-election.\n"));
3614 force_election(rec
, pnn
, nodemap
);
3617 if (ret
!= MONITOR_OK
) {
3618 DEBUG(DEBUG_ERR
,("Unable to update local flags\n"));
3622 if (ctdb
->num_nodes
!= nodemap
->num
) {
3623 DEBUG(DEBUG_ERR
, (__location__
" ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb
->num_nodes
, nodemap
->num
));
3624 ctdb_load_nodes_file(ctdb
);
3628 /* verify that all active nodes agree that we are the recmaster */
3629 switch (verify_recmaster(rec
, nodemap
, pnn
)) {
3630 case MONITOR_RECOVERY_NEEDED
:
3631 /* can not happen */
3633 case MONITOR_ELECTION_NEEDED
:
3634 force_election(rec
, pnn
, nodemap
);
3638 case MONITOR_FAILED
:
3643 if (rec
->need_recovery
) {
3644 /* a previous recovery didn't finish */
3645 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3649 /* verify that all active nodes are in normal mode
3650 and not in recovery mode
3652 switch (verify_recmode(ctdb
, nodemap
)) {
3653 case MONITOR_RECOVERY_NEEDED
:
3654 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3656 case MONITOR_FAILED
:
3658 case MONITOR_ELECTION_NEEDED
:
3659 /* can not happen */
3665 if (ctdb
->recovery_lock_file
!= NULL
) {
3666 /* We must already hold the recovery lock */
3667 if (!ctdb_recovery_have_lock(ctdb
)) {
3668 DEBUG(DEBUG_ERR
,("Failed recovery lock sanity check. Force a recovery\n"));
3669 ctdb_set_culprit(rec
, ctdb
->pnn
);
3670 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3676 /* if there are takeovers requested, perform it and notify the waiters */
3677 if (rec
->takeover_runs_disable_ctx
== NULL
&&
3678 rec
->reallocate_requests
) {
3679 process_ipreallocate_requests(ctdb
, rec
);
3682 /* get the nodemap for all active remote nodes
3684 remote_nodemaps
= talloc_array(mem_ctx
, struct ctdb_node_map
*, nodemap
->num
);
3685 if (remote_nodemaps
== NULL
) {
3686 DEBUG(DEBUG_ERR
, (__location__
" failed to allocate remote nodemap array\n"));
3689 for(i
=0; i
<nodemap
->num
; i
++) {
3690 remote_nodemaps
[i
] = NULL
;
3692 if (get_remote_nodemaps(ctdb
, mem_ctx
, nodemap
, remote_nodemaps
) != 0) {
3693 DEBUG(DEBUG_ERR
,(__location__
" Failed to read remote nodemaps\n"));
3697 /* verify that all other nodes have the same nodemap as we have
3699 for (j
=0; j
<nodemap
->num
; j
++) {
3700 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3704 if (remote_nodemaps
[j
] == NULL
) {
3705 DEBUG(DEBUG_ERR
,(__location__
" Did not get a remote nodemap for node %d, restarting monitoring\n", j
));
3706 ctdb_set_culprit(rec
, j
);
3711 /* if the nodes disagree on how many nodes there are
3712 then this is a good reason to try recovery
3714 if (remote_nodemaps
[j
]->num
!= nodemap
->num
) {
3715 DEBUG(DEBUG_ERR
, (__location__
" Remote node:%u has different node count. %u vs %u of the local node\n",
3716 nodemap
->nodes
[j
].pnn
, remote_nodemaps
[j
]->num
, nodemap
->num
));
3717 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3718 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3722 /* if the nodes disagree on which nodes exist and are
3723 active, then that is also a good reason to do recovery
3725 for (i
=0;i
<nodemap
->num
;i
++) {
3726 if (remote_nodemaps
[j
]->nodes
[i
].pnn
!= nodemap
->nodes
[i
].pnn
) {
3727 DEBUG(DEBUG_ERR
, (__location__
" Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3728 nodemap
->nodes
[j
].pnn
, i
,
3729 remote_nodemaps
[j
]->nodes
[i
].pnn
, nodemap
->nodes
[i
].pnn
));
3730 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3731 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
3739 * Update node flags obtained from each active node. This ensure we have
3740 * up-to-date information for all the nodes.
3742 for (j
=0; j
<nodemap
->num
; j
++) {
3743 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3746 nodemap
->nodes
[j
].flags
= remote_nodemaps
[j
]->nodes
[j
].flags
;
3749 for (j
=0; j
<nodemap
->num
; j
++) {
3750 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3754 /* verify the flags are consistent
3756 for (i
=0; i
<nodemap
->num
; i
++) {
3757 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
) {
3761 if (nodemap
->nodes
[i
].flags
!= remote_nodemaps
[j
]->nodes
[i
].flags
) {
3762 DEBUG(DEBUG_ERR
, (__location__
" Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3763 nodemap
->nodes
[j
].pnn
,
3764 nodemap
->nodes
[i
].pnn
,
3765 remote_nodemaps
[j
]->nodes
[i
].flags
,
3766 nodemap
->nodes
[i
].flags
));
3768 DEBUG(DEBUG_ERR
,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps
[j
]->nodes
[i
].flags
, j
));
3769 update_flags_on_all_nodes(ctdb
, nodemap
, nodemap
->nodes
[i
].pnn
, remote_nodemaps
[j
]->nodes
[i
].flags
);
3770 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3771 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
3775 DEBUG(DEBUG_ERR
,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap
->nodes
[i
].flags
, i
));
3776 update_flags_on_all_nodes(ctdb
, nodemap
, nodemap
->nodes
[i
].pnn
, nodemap
->nodes
[i
].flags
);
3777 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3778 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
3787 /* There must be the same number of lmasters in the vnn map as
3788 * there are active nodes with the lmaster capability... or
3791 if (vnnmap
->size
!= rec
->num_lmasters
) {
3792 DEBUG(DEBUG_ERR
, (__location__
" The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
3793 vnnmap
->size
, rec
->num_lmasters
));
3794 ctdb_set_culprit(rec
, ctdb
->pnn
);
3795 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3799 /* verify that all active nodes in the nodemap also exist in
3802 for (j
=0; j
<nodemap
->num
; j
++) {
3803 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3806 if (nodemap
->nodes
[j
].pnn
== pnn
) {
3810 for (i
=0; i
<vnnmap
->size
; i
++) {
3811 if (vnnmap
->map
[i
] == nodemap
->nodes
[j
].pnn
) {
3815 if (i
== vnnmap
->size
) {
3816 DEBUG(DEBUG_ERR
, (__location__
" Node %u is active in the nodemap but did not exist in the vnnmap\n",
3817 nodemap
->nodes
[j
].pnn
));
3818 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3819 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3825 /* verify that all other nodes have the same vnnmap
3826 and are from the same generation
3828 for (j
=0; j
<nodemap
->num
; j
++) {
3829 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3832 if (nodemap
->nodes
[j
].pnn
== pnn
) {
3836 ret
= ctdb_ctrl_getvnnmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
3837 mem_ctx
, &remote_vnnmap
);
3839 DEBUG(DEBUG_ERR
, (__location__
" Unable to get vnnmap from remote node %u\n",
3840 nodemap
->nodes
[j
].pnn
));
3844 /* verify the vnnmap generation is the same */
3845 if (vnnmap
->generation
!= remote_vnnmap
->generation
) {
3846 DEBUG(DEBUG_ERR
, (__location__
" Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3847 nodemap
->nodes
[j
].pnn
, remote_vnnmap
->generation
, vnnmap
->generation
));
3848 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3849 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3853 /* verify the vnnmap size is the same */
3854 if (vnnmap
->size
!= remote_vnnmap
->size
) {
3855 DEBUG(DEBUG_ERR
, (__location__
" Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3856 nodemap
->nodes
[j
].pnn
, remote_vnnmap
->size
, vnnmap
->size
));
3857 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3858 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3862 /* verify the vnnmap is the same */
3863 for (i
=0;i
<vnnmap
->size
;i
++) {
3864 if (remote_vnnmap
->map
[i
] != vnnmap
->map
[i
]) {
3865 DEBUG(DEBUG_ERR
, (__location__
" Remote node %u has different vnnmap.\n",
3866 nodemap
->nodes
[j
].pnn
));
3867 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3868 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
3875 /* we might need to change who has what IP assigned */
3876 if (rec
->need_takeover_run
) {
3877 uint32_t culprit
= (uint32_t)-1;
3879 rec
->need_takeover_run
= false;
3881 /* update the list of public ips that a node can handle for
3884 ret
= ctdb_reload_remote_public_ips(ctdb
, rec
, nodemap
, &culprit
);
3886 DEBUG(DEBUG_ERR
,("Failed to read public ips from remote node %d\n",
3888 rec
->need_takeover_run
= true;
3892 /* execute the "startrecovery" event script on all nodes */
3893 ret
= run_startrecovery_eventscript(rec
, nodemap
);
3895 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'startrecovery' event on cluster\n"));
3896 ctdb_set_culprit(rec
, ctdb
->pnn
);
3897 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3901 /* If takeover run fails, then the offending nodes are
3902 * assigned ban culprit counts. And we re-try takeover.
3903 * If takeover run fails repeatedly, the node would get
3906 * If rec->need_takeover_run is not set to true at this
3907 * failure, monitoring is disabled cluster-wide (via
3908 * startrecovery eventscript) and will not get enabled.
3910 if (!do_takeover_run(rec
, nodemap
, true)) {
3914 /* execute the "recovered" event script on all nodes */
3915 ret
= run_recovered_eventscript(rec
, nodemap
, "monitor_cluster");
3917 // we cant check whether the event completed successfully
3918 // since this script WILL fail if the node is in recovery mode
3919 // and if that race happens, the code here would just cause a second
3920 // cascading recovery.
3922 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3923 ctdb_set_culprit(rec
, ctdb
->pnn
);
3924 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3931 the main monitoring loop
3933 static void monitor_cluster(struct ctdb_context
*ctdb
)
3935 struct ctdb_recoverd
*rec
;
3937 DEBUG(DEBUG_NOTICE
,("monitor_cluster starting\n"));
3939 rec
= talloc_zero(ctdb
, struct ctdb_recoverd
);
3940 CTDB_NO_MEMORY_FATAL(ctdb
, rec
);
3944 rec
->takeover_run_in_progress
= false;
3946 rec
->priority_time
= timeval_current();
3948 /* register a message port for sending memory dumps */
3949 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_MEM_DUMP
, mem_dump_handler
, rec
);
3951 /* register a message port for recovery elections */
3952 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_RECOVERY
, election_handler
, rec
);
3954 /* when nodes are disabled/enabled */
3955 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_SET_NODE_FLAGS
, monitor_handler
, rec
);
3957 /* when we are asked to puch out a flag change */
3958 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_PUSH_NODE_FLAGS
, push_flags_handler
, rec
);
3960 /* register a message port for vacuum fetch */
3961 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_VACUUM_FETCH
, vacuum_fetch_handler
, rec
);
3963 /* register a message port for reloadnodes */
3964 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_RELOAD_NODES
, reload_nodes_handler
, rec
);
3966 /* register a message port for performing a takeover run */
3967 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_TAKEOVER_RUN
, ip_reallocate_handler
, rec
);
3969 /* register a message port for disabling the ip check for a short while */
3970 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_DISABLE_IP_CHECK
, disable_ip_check_handler
, rec
);
3972 /* register a message port for updating the recovery daemons node assignment for an ip */
3973 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_RECD_UPDATE_IP
, recd_update_ip_handler
, rec
);
3975 /* register a message port for forcing a rebalance of a node next
3977 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_REBALANCE_NODE
, recd_node_rebalance_handler
, rec
);
3979 /* Register a message port for disabling takeover runs */
3980 ctdb_client_set_message_handler(ctdb
,
3981 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
3982 disable_takeover_runs_handler
, rec
);
3984 /* register a message port for detaching database */
3985 ctdb_client_set_message_handler(ctdb
,
3986 CTDB_SRVID_DETACH_DATABASE
,
3987 detach_database_handler
, rec
);
3990 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
3991 struct timeval start
;
3995 DEBUG(DEBUG_CRIT
,(__location__
3996 " Failed to create temp context\n"));
4000 start
= timeval_current();
4001 main_loop(ctdb
, rec
, mem_ctx
);
4002 talloc_free(mem_ctx
);
4004 /* we only check for recovery once every second */
4005 elapsed
= timeval_elapsed(&start
);
4006 if (elapsed
< ctdb
->tunable
.recover_interval
) {
4007 ctdb_wait_timeout(ctdb
, ctdb
->tunable
.recover_interval
4014 event handler for when the main ctdbd dies
4016 static void ctdb_recoverd_parent(struct event_context
*ev
, struct fd_event
*fde
,
4017 uint16_t flags
, void *private_data
)
4019 DEBUG(DEBUG_ALERT
,("recovery daemon parent died - exiting\n"));
4024 called regularly to verify that the recovery daemon is still running
4026 static void ctdb_check_recd(struct event_context
*ev
, struct timed_event
*te
,
4027 struct timeval yt
, void *p
)
4029 struct ctdb_context
*ctdb
= talloc_get_type(p
, struct ctdb_context
);
4031 if (ctdb_kill(ctdb
, ctdb
->recoverd_pid
, 0) != 0) {
4032 DEBUG(DEBUG_ERR
,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb
->recoverd_pid
));
4034 event_add_timed(ctdb
->ev
, ctdb
, timeval_zero(),
4035 ctdb_restart_recd
, ctdb
);
4040 event_add_timed(ctdb
->ev
, ctdb
->recd_ctx
,
4041 timeval_current_ofs(30, 0),
4042 ctdb_check_recd
, ctdb
);
4045 static void recd_sig_child_handler(struct event_context
*ev
,
4046 struct signal_event
*se
, int signum
, int count
,
4050 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4055 pid
= waitpid(-1, &status
, WNOHANG
);
4057 if (errno
!= ECHILD
) {
4058 DEBUG(DEBUG_ERR
, (__location__
" waitpid() returned error. errno:%s(%d)\n", strerror(errno
),errno
));
4063 DEBUG(DEBUG_DEBUG
, ("RECD SIGCHLD from %d\n", (int)pid
));
4069 startup the recovery daemon as a child of the main ctdb daemon
4071 int ctdb_start_recoverd(struct ctdb_context
*ctdb
)
4074 struct signal_event
*se
;
4075 struct tevent_fd
*fde
;
4077 if (pipe(fd
) != 0) {
4081 ctdb
->recoverd_pid
= ctdb_fork(ctdb
);
4082 if (ctdb
->recoverd_pid
== -1) {
4086 if (ctdb
->recoverd_pid
!= 0) {
4087 talloc_free(ctdb
->recd_ctx
);
4088 ctdb
->recd_ctx
= talloc_new(ctdb
);
4089 CTDB_NO_MEMORY(ctdb
, ctdb
->recd_ctx
);
4092 event_add_timed(ctdb
->ev
, ctdb
->recd_ctx
,
4093 timeval_current_ofs(30, 0),
4094 ctdb_check_recd
, ctdb
);
4100 srandom(getpid() ^ time(NULL
));
4102 ctdb_set_process_name("ctdb_recovered");
4103 if (switch_from_server_to_client(ctdb
, "recoverd") != 0) {
4104 DEBUG(DEBUG_CRIT
, (__location__
"ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
4108 DEBUG(DEBUG_DEBUG
, (__location__
" Created PIPE FD:%d to recovery daemon\n", fd
[0]));
4110 fde
= event_add_fd(ctdb
->ev
, ctdb
, fd
[0], EVENT_FD_READ
,
4111 ctdb_recoverd_parent
, &fd
[0]);
4112 tevent_fd_set_auto_close(fde
);
4114 /* set up a handler to pick up sigchld */
4115 se
= event_add_signal(ctdb
->ev
, ctdb
,
4117 recd_sig_child_handler
,
4120 DEBUG(DEBUG_CRIT
,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
4124 monitor_cluster(ctdb
);
4126 DEBUG(DEBUG_ALERT
,("ERROR: ctdb_recoverd finished!?\n"));
4131 shutdown the recovery daemon
4133 void ctdb_stop_recoverd(struct ctdb_context
*ctdb
)
4135 if (ctdb
->recoverd_pid
== 0) {
4139 DEBUG(DEBUG_NOTICE
,("Shutting down recovery daemon\n"));
4140 ctdb_kill(ctdb
, ctdb
->recoverd_pid
, SIGTERM
);
4142 TALLOC_FREE(ctdb
->recd_ctx
);
4143 TALLOC_FREE(ctdb
->recd_ping_count
);
4146 static void ctdb_restart_recd(struct event_context
*ev
, struct timed_event
*te
,
4147 struct timeval t
, void *private_data
)
4149 struct ctdb_context
*ctdb
= talloc_get_type(private_data
, struct ctdb_context
);
4151 DEBUG(DEBUG_ERR
,("Restarting recovery daemon\n"));
4152 ctdb_stop_recoverd(ctdb
);
4153 ctdb_start_recoverd(ctdb
);