4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
31 #include "lib/tdb_wrap/tdb_wrap.h"
32 #include "lib/util/dlinklist.h"
33 #include "lib/util/debug.h"
34 #include "lib/util/samba_util.h"
35 #include "lib/util/util_process.h"
37 #include "ctdb_private.h"
38 #include "ctdb_client.h"
40 #include "common/system.h"
41 #include "common/cmdline.h"
42 #include "common/common.h"
43 #include "common/logging.h"
45 #include "ctdb_cluster_mutex.h"
/* List of SRVID requests that need to be processed */
struct srvid_list {
	struct srvid_list *next, *prev;
	struct ctdb_srvid_message *request;
};

/* Head of a list of queued SRVID requests; allocated lazily and freed
 * wholesale once all queued requests have been answered. */
struct srvid_requests {
	struct srvid_list *requests;
};
57 static void srvid_request_reply(struct ctdb_context
*ctdb
,
58 struct ctdb_srvid_message
*request
,
61 /* Someone that sent srvid==0 does not want a reply */
62 if (request
->srvid
== 0) {
67 if (ctdb_client_send_message(ctdb
, request
->pnn
, request
->srvid
,
69 DEBUG(DEBUG_INFO
,("Sent SRVID reply to %u:%llu\n",
70 (unsigned)request
->pnn
,
71 (unsigned long long)request
->srvid
));
73 DEBUG(DEBUG_ERR
,("Failed to send SRVID reply to %u:%llu\n",
74 (unsigned)request
->pnn
,
75 (unsigned long long)request
->srvid
));
81 static void srvid_requests_reply(struct ctdb_context
*ctdb
,
82 struct srvid_requests
**requests
,
87 if (*requests
== NULL
) {
91 for (r
= (*requests
)->requests
; r
!= NULL
; r
= r
->next
) {
92 srvid_request_reply(ctdb
, r
->request
, result
);
95 /* Free the list structure... */
96 TALLOC_FREE(*requests
);
99 static void srvid_request_add(struct ctdb_context
*ctdb
,
100 struct srvid_requests
**requests
,
101 struct ctdb_srvid_message
*request
)
103 struct srvid_list
*t
;
107 if (*requests
== NULL
) {
108 *requests
= talloc_zero(ctdb
, struct srvid_requests
);
109 if (*requests
== NULL
) {
114 t
= talloc_zero(*requests
, struct srvid_list
);
116 /* If *requests was just allocated above then free it */
117 if ((*requests
)->requests
== NULL
) {
118 TALLOC_FREE(*requests
);
123 t
->request
= (struct ctdb_srvid_message
*)talloc_steal(t
, request
);
124 DLIST_ADD((*requests
)->requests
, t
);
129 /* Failed to add the request to the list. Send a fail. */
130 DEBUG(DEBUG_ERR
, (__location__
131 " Out of memory, failed to queue SRVID request\n"));
133 result
.dsize
= sizeof(ret
);
134 result
.dptr
= (uint8_t *)&ret
;
135 srvid_request_reply(ctdb
, request
, result
);
/* An abstraction to allow an operation (takeover runs, recoveries,
 * ...) to be disabled for a given timeout */
struct ctdb_op_state {
	struct tevent_timer *timer;	/* non-NULL while disabled */
	bool in_progress;		/* operation currently running */
	const char *name;		/* human-readable name for logging */
};
146 static struct ctdb_op_state
*ctdb_op_init(TALLOC_CTX
*mem_ctx
, const char *name
)
148 struct ctdb_op_state
*state
= talloc_zero(mem_ctx
, struct ctdb_op_state
);
151 state
->in_progress
= false;
158 static bool ctdb_op_is_disabled(struct ctdb_op_state
*state
)
160 return state
->timer
!= NULL
;
163 static bool ctdb_op_begin(struct ctdb_op_state
*state
)
165 if (ctdb_op_is_disabled(state
)) {
167 ("Unable to begin - %s are disabled\n", state
->name
));
171 state
->in_progress
= true;
175 static bool ctdb_op_end(struct ctdb_op_state
*state
)
177 return state
->in_progress
= false;
180 static bool ctdb_op_is_in_progress(struct ctdb_op_state
*state
)
182 return state
->in_progress
;
185 static void ctdb_op_enable(struct ctdb_op_state
*state
)
187 TALLOC_FREE(state
->timer
);
190 static void ctdb_op_timeout_handler(struct tevent_context
*ev
,
191 struct tevent_timer
*te
,
192 struct timeval yt
, void *p
)
194 struct ctdb_op_state
*state
=
195 talloc_get_type(p
, struct ctdb_op_state
);
197 DEBUG(DEBUG_NOTICE
,("Reenabling %s after timeout\n", state
->name
));
198 ctdb_op_enable(state
);
201 static int ctdb_op_disable(struct ctdb_op_state
*state
,
202 struct tevent_context
*ev
,
206 DEBUG(DEBUG_NOTICE
,("Reenabling %s\n", state
->name
));
207 ctdb_op_enable(state
);
211 if (state
->in_progress
) {
213 ("Unable to disable %s - in progress\n", state
->name
));
217 DEBUG(DEBUG_NOTICE
,("Disabling %s for %u seconds\n",
218 state
->name
, timeout
));
220 /* Clear any old timers */
221 talloc_free(state
->timer
);
223 /* Arrange for the timeout to occur */
224 state
->timer
= tevent_add_timer(ev
, state
,
225 timeval_current_ofs(timeout
, 0),
226 ctdb_op_timeout_handler
, state
);
227 if (state
->timer
== NULL
) {
228 DEBUG(DEBUG_ERR
,(__location__
" Unable to setup timer\n"));
/* Per-node record of misbehaviour used to decide when a node should
 * be banned. */
struct ctdb_banning_state {
	uint32_t count;				/* accumulated culprit credits */
	struct timeval last_reported_time;	/* when credits were last added */
};
/*
  private state of recovery daemon
 */
struct ctdb_recoverd {
	struct ctdb_context *ctdb;
	/* NOTE(review): the next three members (recmaster, need_recovery,
	 * node_flags) were reconstructed from missing source lines;
	 * node_flags is read by ctdb_set_culprit_count() below — verify
	 * against upstream. */
	uint32_t recmaster;
	uint32_t last_culprit_node;
	struct ctdb_node_map_old *nodemap;
	struct timeval priority_time;
	bool need_takeover_run;
	bool need_recovery;
	uint32_t node_flags;
	struct tevent_timer *send_election_te;
	struct tevent_timer *election_timeout;
	struct srvid_requests *reallocate_requests;
	struct ctdb_op_state *takeover_run;
	struct ctdb_op_state *recovery;
	struct ctdb_iface_list_old *ifaces;
	uint32_t *force_rebalance_nodes;
	struct ctdb_node_capabilities *caps;
	bool frozen_on_inactive;
	struct ctdb_cluster_mutex_handle *recovery_lock_handle;
};
264 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
265 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
/* Forward declaration: timer callback that restarts the recovery
 * daemon (defined later in this file). */
static void ctdb_restart_recd(struct tevent_context *ev,
			      struct tevent_timer *te, struct timeval t,
			      void *private_data);
272 ban a node for a period of time
274 static void ctdb_ban_node(struct ctdb_recoverd
*rec
, uint32_t pnn
, uint32_t ban_time
)
277 struct ctdb_context
*ctdb
= rec
->ctdb
;
278 struct ctdb_ban_state bantime
;
280 if (!ctdb_validate_pnn(ctdb
, pnn
)) {
281 DEBUG(DEBUG_ERR
,("Bad pnn %u in ctdb_ban_node\n", pnn
));
285 DEBUG(DEBUG_NOTICE
,("Banning node %u for %u seconds\n", pnn
, ban_time
));
288 bantime
.time
= ban_time
;
290 ret
= ctdb_ctrl_set_ban(ctdb
, CONTROL_TIMEOUT(), pnn
, &bantime
);
292 DEBUG(DEBUG_ERR
,(__location__
" Failed to ban node %d\n", pnn
));
/* Outcome of one monitoring pass over the cluster. */
enum monitor_result {
	MONITOR_OK,			/* cluster state is consistent */
	MONITOR_RECOVERY_NEEDED,	/* a recovery must be started */
	MONITOR_ELECTION_NEEDED,	/* a recmaster election must be held */
	MONITOR_FAILED			/* monitoring itself failed */
};
302 remember the trouble maker
304 static void ctdb_set_culprit_count(struct ctdb_recoverd
*rec
, uint32_t culprit
, uint32_t count
)
306 struct ctdb_context
*ctdb
= talloc_get_type(rec
->ctdb
, struct ctdb_context
);
307 struct ctdb_banning_state
*ban_state
;
309 if (culprit
> ctdb
->num_nodes
) {
310 DEBUG(DEBUG_ERR
,("Trying to set culprit %d but num_nodes is %d\n", culprit
, ctdb
->num_nodes
));
314 /* If we are banned or stopped, do not set other nodes as culprits */
315 if (rec
->node_flags
& NODE_FLAGS_INACTIVE
) {
316 DEBUG(DEBUG_NOTICE
, ("This node is INACTIVE, cannot set culprit node %d\n", culprit
));
320 if (ctdb
->nodes
[culprit
]->ban_state
== NULL
) {
321 ctdb
->nodes
[culprit
]->ban_state
= talloc_zero(ctdb
->nodes
[culprit
], struct ctdb_banning_state
);
322 CTDB_NO_MEMORY_VOID(ctdb
, ctdb
->nodes
[culprit
]->ban_state
);
326 ban_state
= ctdb
->nodes
[culprit
]->ban_state
;
327 if (timeval_elapsed(&ban_state
->last_reported_time
) > ctdb
->tunable
.recovery_grace_period
) {
328 /* this was the first time in a long while this node
329 misbehaved so we will forgive any old transgressions.
331 ban_state
->count
= 0;
334 ban_state
->count
+= count
;
335 ban_state
->last_reported_time
= timeval_current();
336 rec
->last_culprit_node
= culprit
;
/*
  remember the trouble maker — convenience wrapper adding one credit
 */
static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
{
	ctdb_set_culprit_count(rec, culprit, 1);
}
348 /* this callback is called for every node that failed to execute the
351 static void recovered_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
353 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
355 DEBUG(DEBUG_ERR
, (__location__
" Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn
));
357 ctdb_set_culprit(rec
, node_pnn
);
361 run the "recovered" eventscript on all nodes
363 static int run_recovered_eventscript(struct ctdb_recoverd
*rec
, struct ctdb_node_map_old
*nodemap
, const char *caller
)
367 struct ctdb_context
*ctdb
= rec
->ctdb
;
369 tmp_ctx
= talloc_new(ctdb
);
370 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
372 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
373 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_END_RECOVERY
,
375 CONTROL_TIMEOUT(), false, tdb_null
,
376 NULL
, recovered_fail_callback
,
378 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'recovered' event when called from %s\n", caller
));
380 talloc_free(tmp_ctx
);
384 talloc_free(tmp_ctx
);
388 /* this callback is called for every node that failed to execute the
391 static void startrecovery_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
393 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
395 DEBUG(DEBUG_ERR
, (__location__
" Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn
));
397 ctdb_set_culprit(rec
, node_pnn
);
401 run the "startrecovery" eventscript on all nodes
403 static int run_startrecovery_eventscript(struct ctdb_recoverd
*rec
, struct ctdb_node_map_old
*nodemap
)
407 struct ctdb_context
*ctdb
= rec
->ctdb
;
409 tmp_ctx
= talloc_new(ctdb
);
410 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
412 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
413 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_START_RECOVERY
,
415 CONTROL_TIMEOUT(), false, tdb_null
,
417 startrecovery_fail_callback
,
419 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'startrecovery' event. Recovery failed.\n"));
420 talloc_free(tmp_ctx
);
424 talloc_free(tmp_ctx
);
429 Retrieve capabilities from all connected nodes
431 static int update_capabilities(struct ctdb_recoverd
*rec
,
432 struct ctdb_node_map_old
*nodemap
)
436 struct ctdb_node_capabilities
*caps
;
437 struct ctdb_context
*ctdb
= rec
->ctdb
;
439 tmp_ctx
= talloc_new(rec
);
440 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
442 caps
= ctdb_get_capabilities(ctdb
, tmp_ctx
,
443 CONTROL_TIMEOUT(), nodemap
);
447 (__location__
" Failed to get node capabilities\n"));
448 talloc_free(tmp_ctx
);
452 capp
= ctdb_get_node_capabilities(caps
, ctdb_get_pnn(ctdb
));
456 " Capabilities don't include current node.\n"));
457 talloc_free(tmp_ctx
);
460 ctdb
->capabilities
= *capp
;
462 TALLOC_FREE(rec
->caps
);
463 rec
->caps
= talloc_steal(rec
, caps
);
465 talloc_free(tmp_ctx
);
469 static void set_recmode_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
471 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
473 DEBUG(DEBUG_ERR
,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn
, rec
->nodemap
->num
));
474 ctdb_set_culprit_count(rec
, node_pnn
, rec
->nodemap
->num
);
477 static void transaction_start_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
479 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
481 DEBUG(DEBUG_ERR
,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn
, rec
->nodemap
->num
));
482 ctdb_set_culprit_count(rec
, node_pnn
, rec
->nodemap
->num
);
486 change recovery mode on all nodes
488 static int set_recovery_mode(struct ctdb_context
*ctdb
,
489 struct ctdb_recoverd
*rec
,
490 struct ctdb_node_map_old
*nodemap
,
491 uint32_t rec_mode
, bool freeze
)
497 tmp_ctx
= talloc_new(ctdb
);
498 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
500 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
502 data
.dsize
= sizeof(uint32_t);
503 data
.dptr
= (unsigned char *)&rec_mode
;
505 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_SET_RECMODE
,
511 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode. Recovery failed.\n"));
512 talloc_free(tmp_ctx
);
516 /* freeze all nodes */
517 if (freeze
&& rec_mode
== CTDB_RECOVERY_ACTIVE
) {
520 for (i
=1; i
<=NUM_DB_PRIORITIES
; i
++) {
521 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_FREEZE
,
526 set_recmode_fail_callback
,
528 DEBUG(DEBUG_ERR
, (__location__
" Unable to freeze nodes. Recovery failed.\n"));
529 talloc_free(tmp_ctx
);
535 talloc_free(tmp_ctx
);
539 /* update all remote nodes to use the same db priority that we have
540 this can fail if the remove node has not yet been upgraded to
541 support this function, so we always return success and never fail
542 a recovery if this call fails.
544 static int update_db_priority_on_remote_nodes(struct ctdb_context
*ctdb
,
545 struct ctdb_node_map_old
*nodemap
,
546 uint32_t pnn
, struct ctdb_dbid_map_old
*dbmap
, TALLOC_CTX
*mem_ctx
)
550 /* step through all local databases */
551 for (db
=0; db
<dbmap
->num
;db
++) {
552 struct ctdb_db_priority db_prio
;
555 db_prio
.db_id
= dbmap
->dbs
[db
].db_id
;
556 ret
= ctdb_ctrl_get_db_priority(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, dbmap
->dbs
[db
].db_id
, &db_prio
.priority
);
558 DEBUG(DEBUG_ERR
,(__location__
" Failed to read database priority from local node for db 0x%08x\n", dbmap
->dbs
[db
].db_id
));
562 DEBUG(DEBUG_INFO
,("Update DB priority for db 0x%08x to %u\n", dbmap
->dbs
[db
].db_id
, db_prio
.priority
));
564 ret
= ctdb_ctrl_set_db_priority(ctdb
, CONTROL_TIMEOUT(),
565 CTDB_CURRENT_NODE
, &db_prio
);
567 DEBUG(DEBUG_ERR
,(__location__
" Failed to set DB priority for 0x%08x\n",
576 ensure all other nodes have attached to any databases that we have
578 static int create_missing_remote_databases(struct ctdb_context
*ctdb
, struct ctdb_node_map_old
*nodemap
,
579 uint32_t pnn
, struct ctdb_dbid_map_old
*dbmap
, TALLOC_CTX
*mem_ctx
)
582 struct ctdb_dbid_map_old
*remote_dbmap
;
584 /* verify that all other nodes have all our databases */
585 for (j
=0; j
<nodemap
->num
; j
++) {
586 /* we don't need to ourself ourselves */
587 if (nodemap
->nodes
[j
].pnn
== pnn
) {
590 /* don't check nodes that are unavailable */
591 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
595 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
596 mem_ctx
, &remote_dbmap
);
598 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbids from node %u\n", pnn
));
602 /* step through all local databases */
603 for (db
=0; db
<dbmap
->num
;db
++) {
607 for (i
=0;i
<remote_dbmap
->num
;i
++) {
608 if (dbmap
->dbs
[db
].db_id
== remote_dbmap
->dbs
[i
].db_id
) {
612 /* the remote node already have this database */
613 if (i
!=remote_dbmap
->num
) {
616 /* ok so we need to create this database */
617 ret
= ctdb_ctrl_getdbname(ctdb
, CONTROL_TIMEOUT(), pnn
,
618 dbmap
->dbs
[db
].db_id
, mem_ctx
,
621 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbname from node %u\n", pnn
));
624 ret
= ctdb_ctrl_createdb(ctdb
, CONTROL_TIMEOUT(),
625 nodemap
->nodes
[j
].pnn
,
627 dbmap
->dbs
[db
].flags
& CTDB_DB_FLAGS_PERSISTENT
);
629 DEBUG(DEBUG_ERR
, (__location__
" Unable to create remote db:%s\n", name
));
640 ensure we are attached to any databases that anyone else is attached to
642 static int create_missing_local_databases(struct ctdb_context
*ctdb
, struct ctdb_node_map_old
*nodemap
,
643 uint32_t pnn
, struct ctdb_dbid_map_old
**dbmap
, TALLOC_CTX
*mem_ctx
)
646 struct ctdb_dbid_map_old
*remote_dbmap
;
648 /* verify that we have all database any other node has */
649 for (j
=0; j
<nodemap
->num
; j
++) {
650 /* we don't need to ourself ourselves */
651 if (nodemap
->nodes
[j
].pnn
== pnn
) {
654 /* don't check nodes that are unavailable */
655 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
659 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
660 mem_ctx
, &remote_dbmap
);
662 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbids from node %u\n", pnn
));
666 /* step through all databases on the remote node */
667 for (db
=0; db
<remote_dbmap
->num
;db
++) {
670 for (i
=0;i
<(*dbmap
)->num
;i
++) {
671 if (remote_dbmap
->dbs
[db
].db_id
== (*dbmap
)->dbs
[i
].db_id
) {
675 /* we already have this db locally */
676 if (i
!=(*dbmap
)->num
) {
679 /* ok so we need to create this database and
682 ctdb_ctrl_getdbname(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
683 remote_dbmap
->dbs
[db
].db_id
, mem_ctx
, &name
);
685 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbname from node %u\n",
686 nodemap
->nodes
[j
].pnn
));
689 ctdb_ctrl_createdb(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, name
,
690 remote_dbmap
->dbs
[db
].flags
& CTDB_DB_FLAGS_PERSISTENT
);
692 DEBUG(DEBUG_ERR
, (__location__
" Unable to create local db:%s\n", name
));
695 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, dbmap
);
697 DEBUG(DEBUG_ERR
, (__location__
" Unable to reread dbmap on node %u\n", pnn
));
708 pull the remote database contents from one node into the recdb
710 static int pull_one_remote_database(struct ctdb_context
*ctdb
, uint32_t srcnode
,
711 struct tdb_wrap
*recdb
, uint32_t dbid
)
715 struct ctdb_marshall_buffer
*reply
;
716 struct ctdb_rec_data_old
*recdata
;
718 TALLOC_CTX
*tmp_ctx
= talloc_new(recdb
);
720 ret
= ctdb_ctrl_pulldb(ctdb
, srcnode
, dbid
, CTDB_LMASTER_ANY
, tmp_ctx
,
721 CONTROL_TIMEOUT(), &outdata
);
723 DEBUG(DEBUG_ERR
,(__location__
" Unable to copy db from node %u\n", srcnode
));
724 talloc_free(tmp_ctx
);
728 reply
= (struct ctdb_marshall_buffer
*)outdata
.dptr
;
730 if (outdata
.dsize
< offsetof(struct ctdb_marshall_buffer
, data
)) {
731 DEBUG(DEBUG_ERR
,(__location__
" invalid data in pulldb reply\n"));
732 talloc_free(tmp_ctx
);
736 recdata
= (struct ctdb_rec_data_old
*)&reply
->data
[0];
740 recdata
= (struct ctdb_rec_data_old
*)(recdata
->length
+ (uint8_t *)recdata
), i
++) {
742 struct ctdb_ltdb_header
*hdr
;
745 key
.dptr
= &recdata
->data
[0];
746 key
.dsize
= recdata
->keylen
;
747 data
.dptr
= &recdata
->data
[key
.dsize
];
748 data
.dsize
= recdata
->datalen
;
750 hdr
= (struct ctdb_ltdb_header
*)data
.dptr
;
752 if (data
.dsize
< sizeof(struct ctdb_ltdb_header
)) {
753 DEBUG(DEBUG_CRIT
,(__location__
" bad ltdb record\n"));
754 talloc_free(tmp_ctx
);
758 /* fetch the existing record, if any */
759 existing
= tdb_fetch(recdb
->tdb
, key
);
761 if (existing
.dptr
!= NULL
) {
762 struct ctdb_ltdb_header header
;
763 if (existing
.dsize
< sizeof(struct ctdb_ltdb_header
)) {
764 DEBUG(DEBUG_CRIT
,(__location__
" Bad record size %u from node %u\n",
765 (unsigned)existing
.dsize
, srcnode
));
767 talloc_free(tmp_ctx
);
770 header
= *(struct ctdb_ltdb_header
*)existing
.dptr
;
772 if (!(header
.rsn
< hdr
->rsn
||
773 (header
.dmaster
!= ctdb_get_pnn(ctdb
) &&
774 header
.rsn
== hdr
->rsn
))) {
779 if (tdb_store(recdb
->tdb
, key
, data
, TDB_REPLACE
) != 0) {
780 DEBUG(DEBUG_CRIT
,(__location__
" Failed to store record\n"));
781 talloc_free(tmp_ctx
);
786 talloc_free(tmp_ctx
);
792 struct pull_seqnum_cbdata
{
798 static void pull_seqnum_cb(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
800 struct pull_seqnum_cbdata
*cb_data
= talloc_get_type(callback_data
, struct pull_seqnum_cbdata
);
803 if (cb_data
->failed
!= 0) {
804 DEBUG(DEBUG_ERR
, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn
));
809 DEBUG(DEBUG_ERR
, ("Error when pulling seqnum from node %d\n", node_pnn
));
814 if (outdata
.dsize
!= sizeof(uint64_t)) {
815 DEBUG(DEBUG_ERR
, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn
, (int)outdata
.dsize
, (int)sizeof(uint64_t)));
816 cb_data
->failed
= -1;
820 seqnum
= *((uint64_t *)outdata
.dptr
);
822 if (seqnum
> cb_data
->seqnum
||
823 (cb_data
->pnn
== -1 && seqnum
== 0)) {
824 cb_data
->seqnum
= seqnum
;
825 cb_data
->pnn
= node_pnn
;
829 static void pull_seqnum_fail_cb(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
831 struct pull_seqnum_cbdata
*cb_data
= talloc_get_type(callback_data
, struct pull_seqnum_cbdata
);
833 DEBUG(DEBUG_ERR
, ("Failed to pull db seqnum from node %d\n", node_pnn
));
837 static int pull_highest_seqnum_pdb(struct ctdb_context
*ctdb
,
838 struct ctdb_recoverd
*rec
,
839 struct ctdb_node_map_old
*nodemap
,
840 struct tdb_wrap
*recdb
, uint32_t dbid
)
842 TALLOC_CTX
*tmp_ctx
= talloc_new(NULL
);
846 struct pull_seqnum_cbdata
*cb_data
;
848 DEBUG(DEBUG_NOTICE
, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid
));
853 data
.dsize
= sizeof(outdata
);
854 data
.dptr
= (uint8_t *)&outdata
[0];
856 cb_data
= talloc(tmp_ctx
, struct pull_seqnum_cbdata
);
857 if (cb_data
== NULL
) {
858 DEBUG(DEBUG_ERR
, ("Failed to allocate pull highest seqnum cb_data structure\n"));
859 talloc_free(tmp_ctx
);
867 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
868 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_GET_DB_SEQNUM
,
870 CONTROL_TIMEOUT(), false, data
,
874 DEBUG(DEBUG_ERR
, (__location__
" Failed to run async GET_DB_SEQNUM\n"));
876 talloc_free(tmp_ctx
);
880 if (cb_data
->failed
!= 0) {
881 DEBUG(DEBUG_NOTICE
, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid
));
882 talloc_free(tmp_ctx
);
886 if (cb_data
->pnn
== -1) {
887 DEBUG(DEBUG_NOTICE
, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid
));
888 talloc_free(tmp_ctx
);
892 DEBUG(DEBUG_NOTICE
, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid
, cb_data
->pnn
, (long long)cb_data
->seqnum
));
894 if (pull_one_remote_database(ctdb
, cb_data
->pnn
, recdb
, dbid
) != 0) {
895 DEBUG(DEBUG_ERR
, ("Failed to pull higest seqnum database 0x%08x from node %d\n", dbid
, cb_data
->pnn
));
896 talloc_free(tmp_ctx
);
900 talloc_free(tmp_ctx
);
906 pull all the remote database contents into the recdb
908 static int pull_remote_database(struct ctdb_context
*ctdb
,
909 struct ctdb_recoverd
*rec
,
910 struct ctdb_node_map_old
*nodemap
,
911 struct tdb_wrap
*recdb
, uint32_t dbid
,
916 if (persistent
&& ctdb
->tunable
.recover_pdb_by_seqnum
!= 0) {
918 ret
= pull_highest_seqnum_pdb(ctdb
, rec
, nodemap
, recdb
, dbid
);
924 /* pull all records from all other nodes across onto this node
925 (this merges based on rsn)
927 for (j
=0; j
<nodemap
->num
; j
++) {
928 /* don't merge from nodes that are unavailable */
929 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
932 if (pull_one_remote_database(ctdb
, nodemap
->nodes
[j
].pnn
, recdb
, dbid
) != 0) {
933 DEBUG(DEBUG_ERR
,(__location__
" Failed to pull remote database from node %u\n",
934 nodemap
->nodes
[j
].pnn
));
935 ctdb_set_culprit_count(rec
, nodemap
->nodes
[j
].pnn
, nodemap
->num
);
945 update flags on all active nodes
947 static int update_flags_on_all_nodes(struct ctdb_context
*ctdb
, struct ctdb_node_map_old
*nodemap
, uint32_t pnn
, uint32_t flags
)
951 ret
= ctdb_ctrl_modflags(ctdb
, CONTROL_TIMEOUT(), pnn
, flags
, ~flags
);
953 DEBUG(DEBUG_ERR
, (__location__
" Unable to update nodeflags on remote nodes\n"));
961 ensure all nodes have the same vnnmap we do
963 static int update_vnnmap_on_all_nodes(struct ctdb_context
*ctdb
, struct ctdb_node_map_old
*nodemap
,
964 uint32_t pnn
, struct ctdb_vnn_map
*vnnmap
, TALLOC_CTX
*mem_ctx
)
968 /* push the new vnn map out to all the nodes */
969 for (j
=0; j
<nodemap
->num
; j
++) {
970 /* don't push to nodes that are unavailable */
971 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
975 ret
= ctdb_ctrl_setvnnmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
, mem_ctx
, vnnmap
);
977 DEBUG(DEBUG_ERR
, (__location__
" Unable to set vnnmap for node %u\n", pnn
));
987 called when a vacuum fetch has completed - just free it and do the next one
989 static void vacuum_fetch_callback(struct ctdb_client_call_state
*state
)
996 * Process one elements of the vacuum fetch list:
997 * Migrate it over to us with the special flag
998 * CTDB_CALL_FLAG_VACUUM_MIGRATION.
1000 static bool vacuum_fetch_process_one(struct ctdb_db_context
*ctdb_db
,
1002 struct ctdb_rec_data_old
*r
)
1004 struct ctdb_client_call_state
*state
;
1006 struct ctdb_ltdb_header
*hdr
;
1007 struct ctdb_call call
;
1010 call
.call_id
= CTDB_NULL_FUNC
;
1011 call
.flags
= CTDB_IMMEDIATE_MIGRATION
;
1012 call
.flags
|= CTDB_CALL_FLAG_VACUUM_MIGRATION
;
1014 call
.key
.dptr
= &r
->data
[0];
1015 call
.key
.dsize
= r
->keylen
;
1017 /* ensure we don't block this daemon - just skip a record if we can't get
1019 if (tdb_chainlock_nonblock(ctdb_db
->ltdb
->tdb
, call
.key
) != 0) {
1023 data
= tdb_fetch(ctdb_db
->ltdb
->tdb
, call
.key
);
1024 if (data
.dptr
== NULL
) {
1025 tdb_chainunlock(ctdb_db
->ltdb
->tdb
, call
.key
);
1029 if (data
.dsize
< sizeof(struct ctdb_ltdb_header
)) {
1031 tdb_chainunlock(ctdb_db
->ltdb
->tdb
, call
.key
);
1035 hdr
= (struct ctdb_ltdb_header
*)data
.dptr
;
1036 if (hdr
->dmaster
== pnn
) {
1037 /* its already local */
1039 tdb_chainunlock(ctdb_db
->ltdb
->tdb
, call
.key
);
1045 state
= ctdb_call_send(ctdb_db
, &call
);
1046 tdb_chainunlock(ctdb_db
->ltdb
->tdb
, call
.key
);
1047 if (state
== NULL
) {
1048 DEBUG(DEBUG_ERR
,(__location__
" Failed to setup vacuum fetch call\n"));
1051 state
->async
.fn
= vacuum_fetch_callback
;
1052 state
->async
.private_data
= NULL
;
1059 handler for vacuum fetch
1061 static void vacuum_fetch_handler(uint64_t srvid
, TDB_DATA data
,
1064 struct ctdb_recoverd
*rec
= talloc_get_type(
1065 private_data
, struct ctdb_recoverd
);
1066 struct ctdb_context
*ctdb
= rec
->ctdb
;
1067 struct ctdb_marshall_buffer
*recs
;
1069 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
1071 struct ctdb_dbid_map_old
*dbmap
=NULL
;
1072 bool persistent
= false;
1073 struct ctdb_db_context
*ctdb_db
;
1074 struct ctdb_rec_data_old
*r
;
1076 recs
= (struct ctdb_marshall_buffer
*)data
.dptr
;
1078 if (recs
->count
== 0) {
1082 /* work out if the database is persistent */
1083 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, tmp_ctx
, &dbmap
);
1085 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbids from local node\n"));
1089 for (i
=0;i
<dbmap
->num
;i
++) {
1090 if (dbmap
->dbs
[i
].db_id
== recs
->db_id
) {
1091 persistent
= dbmap
->dbs
[i
].flags
& CTDB_DB_FLAGS_PERSISTENT
;
1095 if (i
== dbmap
->num
) {
1096 DEBUG(DEBUG_ERR
, (__location__
" Unable to find db_id 0x%x on local node\n", recs
->db_id
));
1100 /* find the name of this database */
1101 if (ctdb_ctrl_getdbname(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, recs
->db_id
, tmp_ctx
, &name
) != 0) {
1102 DEBUG(DEBUG_ERR
,(__location__
" Failed to get name of db 0x%x\n", recs
->db_id
));
1107 ctdb_db
= ctdb_attach(ctdb
, CONTROL_TIMEOUT(), name
, persistent
, 0);
1108 if (ctdb_db
== NULL
) {
1109 DEBUG(DEBUG_ERR
,(__location__
" Failed to attach to database '%s'\n", name
));
1113 r
= (struct ctdb_rec_data_old
*)&recs
->data
[0];
1114 while (recs
->count
) {
1117 ok
= vacuum_fetch_process_one(ctdb_db
, rec
->ctdb
->pnn
, r
);
1122 r
= (struct ctdb_rec_data_old
*)(r
->length
+ (uint8_t *)r
);
1127 talloc_free(tmp_ctx
);
1132 * handler for database detach
1134 static void detach_database_handler(uint64_t srvid
, TDB_DATA data
,
1137 struct ctdb_recoverd
*rec
= talloc_get_type(
1138 private_data
, struct ctdb_recoverd
);
1139 struct ctdb_context
*ctdb
= rec
->ctdb
;
1141 struct ctdb_db_context
*ctdb_db
;
1143 if (data
.dsize
!= sizeof(db_id
)) {
1146 db_id
= *(uint32_t *)data
.dptr
;
1148 ctdb_db
= find_ctdb_db(ctdb
, db_id
);
1149 if (ctdb_db
== NULL
) {
1150 /* database is not attached */
1154 DLIST_REMOVE(ctdb
->db_list
, ctdb_db
);
1156 DEBUG(DEBUG_NOTICE
, ("Detached from database '%s'\n",
1158 talloc_free(ctdb_db
);
/*
  called when ctdb_wait_timeout should finish: set the caller's flag so
  its event loop exits.
 */
static void ctdb_wait_handler(struct tevent_context *ev,
			      struct tevent_timer *te,
			      struct timeval yt, void *p)
{
	uint32_t *timed_out = (uint32_t *)p;
	(*timed_out) = 1;
}
1173 wait for a given number of seconds
1175 static void ctdb_wait_timeout(struct ctdb_context
*ctdb
, double secs
)
1177 uint32_t timed_out
= 0;
1178 time_t usecs
= (secs
- (time_t)secs
) * 1000000;
1179 tevent_add_timer(ctdb
->ev
, ctdb
, timeval_current_ofs(secs
, usecs
),
1180 ctdb_wait_handler
, &timed_out
);
1181 while (!timed_out
) {
1182 tevent_loop_once(ctdb
->ev
);
1187 called when an election times out (ends)
1189 static void ctdb_election_timeout(struct tevent_context
*ev
,
1190 struct tevent_timer
*te
,
1191 struct timeval t
, void *p
)
1193 struct ctdb_recoverd
*rec
= talloc_get_type(p
, struct ctdb_recoverd
);
1194 rec
->election_timeout
= NULL
;
1197 DEBUG(DEBUG_WARNING
,("Election period ended\n"));
1202 wait for an election to finish. It finished election_timeout seconds after
1203 the last election packet is received
1205 static void ctdb_wait_election(struct ctdb_recoverd
*rec
)
1207 struct ctdb_context
*ctdb
= rec
->ctdb
;
1208 while (rec
->election_timeout
) {
1209 tevent_loop_once(ctdb
->ev
);
1214 Update our local flags from all remote connected nodes.
1215 This is only run when we are, or we believe we are, the recovery master
1217 static int update_local_flags(struct ctdb_recoverd
*rec
, struct ctdb_node_map_old
*nodemap
)
1220 struct ctdb_context
*ctdb
= rec
->ctdb
;
1221 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
1223 /* get the nodemap for all active remote nodes and verify
1224 they are the same as for this node
1226 for (j
=0; j
<nodemap
->num
; j
++) {
1227 struct ctdb_node_map_old
*remote_nodemap
=NULL
;
1230 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_DISCONNECTED
) {
1233 if (nodemap
->nodes
[j
].pnn
== ctdb
->pnn
) {
1237 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
1238 mem_ctx
, &remote_nodemap
);
1240 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from remote node %u\n",
1241 nodemap
->nodes
[j
].pnn
));
1242 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
1243 talloc_free(mem_ctx
);
1246 if (nodemap
->nodes
[j
].flags
!= remote_nodemap
->nodes
[j
].flags
) {
1247 /* We should tell our daemon about this so it
1248 updates its flags or else we will log the same
1249 message again in the next iteration of recovery.
1250 Since we are the recovery master we can just as
1251 well update the flags on all nodes.
1253 ret
= ctdb_ctrl_modflags(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
, remote_nodemap
->nodes
[j
].flags
, ~remote_nodemap
->nodes
[j
].flags
);
1255 DEBUG(DEBUG_ERR
, (__location__
" Unable to update nodeflags on remote nodes\n"));
1259 /* Update our local copy of the flags in the recovery
1262 DEBUG(DEBUG_NOTICE
,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1263 nodemap
->nodes
[j
].pnn
, remote_nodemap
->nodes
[j
].flags
,
1264 nodemap
->nodes
[j
].flags
));
1265 nodemap
->nodes
[j
].flags
= remote_nodemap
->nodes
[j
].flags
;
1267 talloc_free(remote_nodemap
);
1269 talloc_free(mem_ctx
);
1274 /* Create a new random generation id.
1275 The generation id can not be the INVALID_GENERATION id
1277 static uint32_t new_generation(void)
1279 uint32_t generation
;
1282 generation
= random();
1284 if (generation
!= INVALID_GENERATION
) {
1294 create a temporary working database
1296 static struct tdb_wrap
*create_recdb(struct ctdb_context
*ctdb
, TALLOC_CTX
*mem_ctx
)
1299 struct tdb_wrap
*recdb
;
1302 /* open up the temporary recovery database */
1303 name
= talloc_asprintf(mem_ctx
, "%s/recdb.tdb.%u",
1304 ctdb
->db_directory_state
,
1311 tdb_flags
= TDB_NOLOCK
;
1312 if (ctdb
->valgrinding
) {
1313 tdb_flags
|= TDB_NOMMAP
;
1315 tdb_flags
|= (TDB_INCOMPATIBLE_HASH
| TDB_DISALLOW_NESTING
);
1317 recdb
= tdb_wrap_open(mem_ctx
, name
, ctdb
->tunable
.database_hash_size
,
1318 tdb_flags
, O_RDWR
|O_CREAT
|O_EXCL
, 0600);
1319 if (recdb
== NULL
) {
1320 DEBUG(DEBUG_CRIT
,(__location__
" Failed to create temp recovery database '%s'\n", name
));
/*
  a traverse function for pulling all relevant records from recdb
 */
struct recdb_data {
	struct ctdb_context *ctdb;
	struct ctdb_marshall_buffer *recdata; /* marshalled blob being built */
	uint32_t len;                         /* bytes of recdata currently used */
	uint32_t allocated_len;               /* bytes of recdata allocated */
	bool failed;                          /* set when the traverse hits an error */
	bool persistent;                      /* database is persistent */
};
1341 static int traverse_recdb(struct tdb_context
*tdb
, TDB_DATA key
, TDB_DATA data
, void *p
)
1343 struct recdb_data
*params
= (struct recdb_data
*)p
;
1344 struct ctdb_rec_data_old
*recdata
;
1345 struct ctdb_ltdb_header
*hdr
;
1348 * skip empty records - but NOT for persistent databases:
1350 * The record-by-record mode of recovery deletes empty records.
1351 * For persistent databases, this can lead to data corruption
1352 * by deleting records that should be there:
1354 * - Assume the cluster has been running for a while.
1356 * - A record R in a persistent database has been created and
1357 * deleted a couple of times, the last operation being deletion,
1358 * leaving an empty record with a high RSN, say 10.
1360 * - Now a node N is turned off.
1362 * - This leaves the local database copy of D on N with the empty
1363 * copy of R and RSN 10. On all other nodes, the recovery has deleted
1364 * the copy of record R.
1366 * - Now the record is created again while node N is turned off.
1367 * This creates R with RSN = 1 on all nodes except for N.
1369 * - Now node N is turned on again. The following recovery will chose
1370 * the older empty copy of R due to RSN 10 > RSN 1.
1372 * ==> Hence the record is gone after the recovery.
1374 * On databases like Samba's registry, this can damage the higher-level
1375 * data structures built from the various tdb-level records.
1377 if (!params
->persistent
&& data
.dsize
<= sizeof(struct ctdb_ltdb_header
)) {
1381 /* update the dmaster field to point to us */
1382 hdr
= (struct ctdb_ltdb_header
*)data
.dptr
;
1383 if (!params
->persistent
) {
1384 hdr
->dmaster
= params
->ctdb
->pnn
;
1385 hdr
->flags
|= CTDB_REC_FLAG_MIGRATED_WITH_DATA
;
1388 /* add the record to the blob ready to send to the nodes */
1389 recdata
= ctdb_marshall_record(params
->recdata
, 0, key
, NULL
, data
);
1390 if (recdata
== NULL
) {
1391 params
->failed
= true;
1394 if (params
->len
+ recdata
->length
>= params
->allocated_len
) {
1395 params
->allocated_len
= recdata
->length
+ params
->len
+ params
->ctdb
->tunable
.pulldb_preallocation_size
;
1396 params
->recdata
= talloc_realloc_size(NULL
, params
->recdata
, params
->allocated_len
);
1398 if (params
->recdata
== NULL
) {
1399 DEBUG(DEBUG_CRIT
,(__location__
" Failed to expand recdata to %u\n",
1400 recdata
->length
+ params
->len
));
1401 params
->failed
= true;
1404 params
->recdata
->count
++;
1405 memcpy(params
->len
+(uint8_t *)params
->recdata
, recdata
, recdata
->length
);
1406 params
->len
+= recdata
->length
;
1407 talloc_free(recdata
);
1413 push the recdb database out to all nodes
1415 static int push_recdb_database(struct ctdb_context
*ctdb
, uint32_t dbid
,
1417 struct tdb_wrap
*recdb
, struct ctdb_node_map_old
*nodemap
)
1419 struct recdb_data params
;
1420 struct ctdb_marshall_buffer
*recdata
;
1422 TALLOC_CTX
*tmp_ctx
;
1425 tmp_ctx
= talloc_new(ctdb
);
1426 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
1428 recdata
= talloc_zero(recdb
, struct ctdb_marshall_buffer
);
1429 CTDB_NO_MEMORY(ctdb
, recdata
);
1431 recdata
->db_id
= dbid
;
1434 params
.recdata
= recdata
;
1435 params
.len
= offsetof(struct ctdb_marshall_buffer
, data
);
1436 params
.allocated_len
= params
.len
;
1437 params
.failed
= false;
1438 params
.persistent
= persistent
;
1440 if (tdb_traverse_read(recdb
->tdb
, traverse_recdb
, ¶ms
) == -1) {
1441 DEBUG(DEBUG_ERR
,(__location__
" Failed to traverse recdb database\n"));
1442 talloc_free(params
.recdata
);
1443 talloc_free(tmp_ctx
);
1447 if (params
.failed
) {
1448 DEBUG(DEBUG_ERR
,(__location__
" Failed to traverse recdb database\n"));
1449 talloc_free(params
.recdata
);
1450 talloc_free(tmp_ctx
);
1454 recdata
= params
.recdata
;
1456 outdata
.dptr
= (void *)recdata
;
1457 outdata
.dsize
= params
.len
;
1459 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
1460 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_PUSH_DB
,
1462 CONTROL_TIMEOUT(), false, outdata
,
1465 DEBUG(DEBUG_ERR
,(__location__
" Failed to push recdb records to nodes for db 0x%x\n", dbid
));
1466 talloc_free(recdata
);
1467 talloc_free(tmp_ctx
);
1471 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - pushed remote database 0x%x of size %u\n",
1472 dbid
, recdata
->count
));
1474 talloc_free(recdata
);
1475 talloc_free(tmp_ctx
);
1482 go through a full recovery on one database
1484 static int recover_database(struct ctdb_recoverd
*rec
,
1485 TALLOC_CTX
*mem_ctx
,
1489 struct ctdb_node_map_old
*nodemap
,
1490 uint32_t transaction_id
)
1492 struct tdb_wrap
*recdb
;
1494 struct ctdb_context
*ctdb
= rec
->ctdb
;
1496 struct ctdb_transdb w
;
1499 recdb
= create_recdb(ctdb
, mem_ctx
);
1500 if (recdb
== NULL
) {
1504 /* pull all remote databases onto the recdb */
1505 ret
= pull_remote_database(ctdb
, rec
, nodemap
, recdb
, dbid
, persistent
);
1507 DEBUG(DEBUG_ERR
, (__location__
" Unable to pull remote database 0x%x\n", dbid
));
1511 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - pulled remote database 0x%x\n", dbid
));
1513 /* wipe all the remote databases. This is safe as we are in a transaction */
1515 w
.tid
= transaction_id
;
1517 data
.dptr
= (void *)&w
;
1518 data
.dsize
= sizeof(w
);
1520 nodes
= list_of_active_nodes(ctdb
, nodemap
, recdb
, true);
1521 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_WIPE_DATABASE
,
1523 CONTROL_TIMEOUT(), false, data
,
1526 DEBUG(DEBUG_ERR
, (__location__
" Unable to wipe database. Recovery failed.\n"));
1531 /* push out the correct database. This sets the dmaster and skips
1532 the empty records */
1533 ret
= push_recdb_database(ctdb
, dbid
, persistent
, recdb
, nodemap
);
1539 /* all done with this database */
1545 static bool ctdb_recovery_have_lock(struct ctdb_recoverd
*rec
)
1547 return (rec
->recovery_lock_handle
!= NULL
);
1550 struct hold_reclock_state
{
1556 static void take_reclock_handler(char status
,
1560 struct hold_reclock_state
*s
=
1561 (struct hold_reclock_state
*) private_data
;
1565 s
->latency
= latency
;
1570 ("Unable to take recovery lock - contention\n"));
1574 DEBUG(DEBUG_ERR
, ("ERROR: when taking recovery lock\n"));
1578 s
->locked
= (status
== '0') ;
1581 static bool ctdb_recovery_lock(struct ctdb_recoverd
*rec
);
1583 static void lost_reclock_handler(void *private_data
)
1585 struct ctdb_recoverd
*rec
= talloc_get_type_abort(
1586 private_data
, struct ctdb_recoverd
);
1589 ("Recovery lock helper terminated unexpectedly - "
1590 "trying to retake recovery lock\n"));
1591 TALLOC_FREE(rec
->recovery_lock_handle
);
1592 if (! ctdb_recovery_lock(rec
)) {
1593 DEBUG(DEBUG_ERR
, ("Failed to take recovery lock\n"));
1597 static bool ctdb_recovery_lock(struct ctdb_recoverd
*rec
)
1599 struct ctdb_context
*ctdb
= rec
->ctdb
;
1600 struct ctdb_cluster_mutex_handle
*h
;
1601 struct hold_reclock_state s
= {
1607 h
= ctdb_cluster_mutex(rec
, ctdb
, ctdb
->recovery_lock
, 0,
1608 take_reclock_handler
, &s
,
1609 lost_reclock_handler
, rec
);
1615 tevent_loop_once(ctdb
->ev
);
1623 rec
->recovery_lock_handle
= h
;
1624 ctdb_ctrl_report_recd_lock_latency(ctdb
, CONTROL_TIMEOUT(),
1630 static void ctdb_recovery_unlock(struct ctdb_recoverd
*rec
)
1632 if (rec
->recovery_lock_handle
!= NULL
) {
1633 DEBUG(DEBUG_NOTICE
, ("Releasing recovery lock\n"));
1634 TALLOC_FREE(rec
->recovery_lock_handle
);
1638 static void ban_misbehaving_nodes(struct ctdb_recoverd
*rec
, bool *self_ban
)
1640 struct ctdb_context
*ctdb
= rec
->ctdb
;
1642 struct ctdb_banning_state
*ban_state
;
1645 for (i
=0; i
<ctdb
->num_nodes
; i
++) {
1646 if (ctdb
->nodes
[i
]->ban_state
== NULL
) {
1649 ban_state
= (struct ctdb_banning_state
*)ctdb
->nodes
[i
]->ban_state
;
1650 if (ban_state
->count
< 2*ctdb
->num_nodes
) {
1654 DEBUG(DEBUG_NOTICE
,("Node %u reached %u banning credits - banning it for %u seconds\n",
1655 ctdb
->nodes
[i
]->pnn
, ban_state
->count
,
1656 ctdb
->tunable
.recovery_ban_period
));
1657 ctdb_ban_node(rec
, ctdb
->nodes
[i
]->pnn
, ctdb
->tunable
.recovery_ban_period
);
1658 ban_state
->count
= 0;
1660 /* Banning ourself? */
1661 if (ctdb
->nodes
[i
]->pnn
== rec
->ctdb
->pnn
) {
1667 static bool do_takeover_run(struct ctdb_recoverd
*rec
,
1668 struct ctdb_node_map_old
*nodemap
)
1670 uint32_t *nodes
= NULL
;
1671 struct ctdb_disable_message dtr
;
1674 uint32_t *rebalance_nodes
= rec
->force_rebalance_nodes
;
1678 DEBUG(DEBUG_NOTICE
, ("Takeover run starting\n"));
1680 if (ctdb_op_is_in_progress(rec
->takeover_run
)) {
1681 DEBUG(DEBUG_ERR
, (__location__
1682 " takeover run already in progress \n"));
1687 if (!ctdb_op_begin(rec
->takeover_run
)) {
1692 /* Disable IP checks (takeover runs, really) on other nodes
1693 * while doing this takeover run. This will stop those other
1694 * nodes from triggering takeover runs when think they should
1695 * be hosting an IP but it isn't yet on an interface. Don't
1696 * wait for replies since a failure here might cause some
1697 * noise in the logs but will not actually cause a problem.
1700 dtr
.srvid
= 0; /* No reply */
1703 data
.dptr
= (uint8_t*)&dtr
;
1704 data
.dsize
= sizeof(dtr
);
1706 nodes
= list_of_connected_nodes(rec
->ctdb
, nodemap
, rec
, false);
1708 /* Disable for 60 seconds. This can be a tunable later if
1712 for (i
= 0; i
< talloc_array_length(nodes
); i
++) {
1713 if (ctdb_client_send_message(rec
->ctdb
, nodes
[i
],
1714 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
1716 DEBUG(DEBUG_INFO
,("Failed to disable takeover runs\n"));
1720 ret
= ctdb_takeover_run(rec
->ctdb
, nodemap
,
1721 rec
->force_rebalance_nodes
);
1723 /* Reenable takeover runs and IP checks on other nodes */
1725 for (i
= 0; i
< talloc_array_length(nodes
); i
++) {
1726 if (ctdb_client_send_message(rec
->ctdb
, nodes
[i
],
1727 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
1729 DEBUG(DEBUG_INFO
,("Failed to re-enable takeover runs\n"));
1734 DEBUG(DEBUG_ERR
, ("ctdb_takeover_run() failed\n"));
1740 /* Takeover run was successful so clear force rebalance targets */
1741 if (rebalance_nodes
== rec
->force_rebalance_nodes
) {
1742 TALLOC_FREE(rec
->force_rebalance_nodes
);
1744 DEBUG(DEBUG_WARNING
,
1745 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1748 rec
->need_takeover_run
= !ok
;
1750 ctdb_op_end(rec
->takeover_run
);
1752 DEBUG(DEBUG_NOTICE
, ("Takeover run %s\n", ok
? "completed successfully" : "unsuccessful"));
1756 struct recovery_helper_state
{
1763 static void ctdb_recovery_handler(struct tevent_context
*ev
,
1764 struct tevent_fd
*fde
,
1765 uint16_t flags
, void *private_data
)
1767 struct recovery_helper_state
*state
= talloc_get_type_abort(
1768 private_data
, struct recovery_helper_state
);
1771 ret
= sys_read(state
->fd
[0], &state
->result
, sizeof(state
->result
));
1772 if (ret
!= sizeof(state
->result
)) {
1773 state
->result
= EPIPE
;
1780 static int db_recovery_parallel(struct ctdb_recoverd
*rec
, TALLOC_CTX
*mem_ctx
)
1782 static char prog
[PATH_MAX
+1] = "";
1784 struct recovery_helper_state
*state
;
1785 struct tevent_fd
*fde
;
1788 if (!ctdb_set_helper("recovery_helper", prog
, sizeof(prog
),
1789 "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR
,
1790 "ctdb_recovery_helper")) {
1791 ctdb_die(rec
->ctdb
, "Unable to set recovery helper\n");
1794 state
= talloc_zero(mem_ctx
, struct recovery_helper_state
);
1795 if (state
== NULL
) {
1796 DEBUG(DEBUG_ERR
, (__location__
" memory error\n"));
1802 ret
= pipe(state
->fd
);
1805 ("Failed to create pipe for recovery helper\n"));
1809 set_close_on_exec(state
->fd
[0]);
1812 args
= talloc_array(state
, const char *, nargs
);
1814 DEBUG(DEBUG_ERR
, (__location__
" memory error\n"));
1818 args
[0] = talloc_asprintf(args
, "%d", state
->fd
[1]);
1819 args
[1] = rec
->ctdb
->daemon
.name
;
1820 args
[2] = talloc_asprintf(args
, "%u", new_generation());
1823 if (args
[0] == NULL
|| args
[2] == NULL
) {
1824 DEBUG(DEBUG_ERR
, (__location__
" memory error\n"));
1828 setenv("CTDB_DBDIR_STATE", rec
->ctdb
->db_directory_state
, 1);
1830 if (!ctdb_vfork_with_logging(state
, rec
->ctdb
, "recovery", prog
, nargs
,
1831 args
, NULL
, NULL
, &state
->pid
)) {
1833 ("Failed to create child for recovery helper\n"));
1837 close(state
->fd
[1]);
1840 state
->done
= false;
1842 fde
= tevent_add_fd(rec
->ctdb
->ev
, rec
->ctdb
, state
->fd
[0],
1843 TEVENT_FD_READ
, ctdb_recovery_handler
, state
);
1847 tevent_fd_set_auto_close(fde
);
1849 while (!state
->done
) {
1850 tevent_loop_once(rec
->ctdb
->ev
);
1853 close(state
->fd
[0]);
1856 if (state
->result
!= 0) {
1860 ctdb_kill(rec
->ctdb
, state
->pid
, SIGKILL
);
1865 if (state
->fd
[0] != -1) {
1866 close(state
->fd
[0]);
1868 if (state
->fd
[1] != -1) {
1869 close(state
->fd
[1]);
1871 if (state
->pid
!= -1) {
1872 ctdb_kill(rec
->ctdb
, state
->pid
, SIGKILL
);
1878 static int db_recovery_serial(struct ctdb_recoverd
*rec
, TALLOC_CTX
*mem_ctx
,
1879 uint32_t pnn
, struct ctdb_node_map_old
*nodemap
,
1880 struct ctdb_vnn_map
*vnnmap
,
1881 struct ctdb_dbid_map_old
*dbmap
)
1883 struct ctdb_context
*ctdb
= rec
->ctdb
;
1884 uint32_t generation
;
1889 /* set recovery mode to active on all nodes */
1890 ret
= set_recovery_mode(ctdb
, rec
, nodemap
, CTDB_RECOVERY_ACTIVE
, true);
1892 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode to active on cluster\n"));
1896 /* execute the "startrecovery" event script on all nodes */
1897 ret
= run_startrecovery_eventscript(rec
, nodemap
);
1899 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'startrecovery' event on cluster\n"));
1903 /* pick a new generation number */
1904 generation
= new_generation();
1906 /* change the vnnmap on this node to use the new generation
1907 number but not on any other nodes.
1908 this guarantees that if we abort the recovery prematurely
1909 for some reason (a node stops responding?)
1910 that we can just return immediately and we will reenter
1911 recovery shortly again.
1912 I.e. we deliberately leave the cluster with an inconsistent
1913 generation id to allow us to abort recovery at any stage and
1914 just restart it from scratch.
1916 vnnmap
->generation
= generation
;
1917 ret
= ctdb_ctrl_setvnnmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, vnnmap
);
1919 DEBUG(DEBUG_ERR
, (__location__
" Unable to set vnnmap for node %u\n", pnn
));
1923 /* Database generations are updated when the transaction is commited to
1924 * the databases. So make sure to use the final generation as the
1927 generation
= new_generation();
1929 data
.dptr
= (void *)&generation
;
1930 data
.dsize
= sizeof(uint32_t);
1932 nodes
= list_of_active_nodes(ctdb
, nodemap
, mem_ctx
, true);
1933 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_TRANSACTION_START
,
1935 CONTROL_TIMEOUT(), false, data
,
1937 transaction_start_fail_callback
,
1939 DEBUG(DEBUG_ERR
, (__location__
" Unable to start transactions. Recovery failed.\n"));
1940 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_TRANSACTION_CANCEL
,
1942 CONTROL_TIMEOUT(), false, tdb_null
,
1946 DEBUG(DEBUG_ERR
,("Failed to cancel recovery transaction\n"));
1951 DEBUG(DEBUG_NOTICE
,(__location__
" started transactions on all nodes\n"));
1953 for (i
=0;i
<dbmap
->num
;i
++) {
1954 ret
= recover_database(rec
, mem_ctx
,
1955 dbmap
->dbs
[i
].db_id
,
1956 dbmap
->dbs
[i
].flags
& CTDB_DB_FLAGS_PERSISTENT
,
1957 pnn
, nodemap
, generation
);
1959 DEBUG(DEBUG_ERR
, (__location__
" Failed to recover database 0x%x\n", dbmap
->dbs
[i
].db_id
));
1964 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - starting database commits\n"));
1966 /* commit all the changes */
1967 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_TRANSACTION_COMMIT
,
1969 CONTROL_TIMEOUT(), false, data
,
1972 DEBUG(DEBUG_ERR
, (__location__
" Unable to commit recovery changes. Recovery failed.\n"));
1976 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - committed databases\n"));
1978 /* build a new vnn map with all the currently active and
1980 vnnmap
= talloc(mem_ctx
, struct ctdb_vnn_map
);
1981 CTDB_NO_MEMORY(ctdb
, vnnmap
);
1982 vnnmap
->generation
= generation
;
1984 vnnmap
->map
= talloc_zero_array(vnnmap
, uint32_t, vnnmap
->size
);
1985 CTDB_NO_MEMORY(ctdb
, vnnmap
->map
);
1986 for (i
=j
=0;i
<nodemap
->num
;i
++) {
1987 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_INACTIVE
) {
1990 if (!ctdb_node_has_capabilities(rec
->caps
,
1991 ctdb
->nodes
[i
]->pnn
,
1992 CTDB_CAP_LMASTER
)) {
1993 /* this node can not be an lmaster */
1994 DEBUG(DEBUG_DEBUG
, ("Node %d cant be a LMASTER, skipping it\n", i
));
1999 vnnmap
->map
= talloc_realloc(vnnmap
, vnnmap
->map
, uint32_t, vnnmap
->size
);
2000 CTDB_NO_MEMORY(ctdb
, vnnmap
->map
);
2001 vnnmap
->map
[j
++] = nodemap
->nodes
[i
].pnn
;
2004 if (vnnmap
->size
== 0) {
2005 DEBUG(DEBUG_NOTICE
, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
2007 vnnmap
->map
= talloc_realloc(vnnmap
, vnnmap
->map
, uint32_t, vnnmap
->size
);
2008 CTDB_NO_MEMORY(ctdb
, vnnmap
->map
);
2009 vnnmap
->map
[0] = pnn
;
2012 /* update to the new vnnmap on all nodes */
2013 ret
= update_vnnmap_on_all_nodes(ctdb
, nodemap
, pnn
, vnnmap
, mem_ctx
);
2015 DEBUG(DEBUG_ERR
, (__location__
" Unable to update vnnmap on all nodes\n"));
2019 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - updated vnnmap\n"));
2021 /* disable recovery mode */
2022 ret
= set_recovery_mode(ctdb
, rec
, nodemap
, CTDB_RECOVERY_NORMAL
, false);
2024 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode to normal on cluster\n"));
2028 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - disabled recovery mode\n"));
2030 /* execute the "recovered" event script on all nodes */
2031 ret
= run_recovered_eventscript(rec
, nodemap
, "do_recovery");
2033 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
2037 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - finished the recovered event\n"));
2043 we are the recmaster, and recovery is needed - start a recovery run
2045 static int do_recovery(struct ctdb_recoverd
*rec
,
2046 TALLOC_CTX
*mem_ctx
, uint32_t pnn
,
2047 struct ctdb_node_map_old
*nodemap
, struct ctdb_vnn_map
*vnnmap
)
2049 struct ctdb_context
*ctdb
= rec
->ctdb
;
2051 struct ctdb_dbid_map_old
*dbmap
;
2055 DEBUG(DEBUG_NOTICE
, (__location__
" Starting do_recovery\n"));
2057 /* Check if the current node is still the recmaster. It's possible that
2058 * re-election has changed the recmaster.
2060 if (pnn
!= rec
->recmaster
) {
2062 ("Recovery master changed to %u, aborting recovery\n",
2067 /* if recovery fails, force it again */
2068 rec
->need_recovery
= true;
2070 if (!ctdb_op_begin(rec
->recovery
)) {
2074 if (rec
->election_timeout
) {
2075 /* an election is in progress */
2076 DEBUG(DEBUG_ERR
, ("do_recovery called while election in progress - try again later\n"));
2080 ban_misbehaving_nodes(rec
, &self_ban
);
2082 DEBUG(DEBUG_NOTICE
, ("This node was banned, aborting recovery\n"));
2086 if (ctdb
->recovery_lock
!= NULL
) {
2087 if (ctdb_recovery_have_lock(rec
)) {
2088 DEBUG(DEBUG_NOTICE
, ("Already holding recovery lock\n"));
2090 DEBUG(DEBUG_NOTICE
, ("Attempting to take recovery lock (%s)\n",
2091 ctdb
->recovery_lock
));
2092 if (!ctdb_recovery_lock(rec
)) {
2093 if (ctdb
->runstate
== CTDB_RUNSTATE_FIRST_RECOVERY
) {
2094 /* If ctdb is trying first recovery, it's
2095 * possible that current node does not know
2096 * yet who the recmaster is.
2098 DEBUG(DEBUG_ERR
, ("Unable to get recovery lock"
2099 " - retrying recovery\n"));
2103 DEBUG(DEBUG_ERR
,("Unable to get recovery lock - aborting recovery "
2104 "and ban ourself for %u seconds\n",
2105 ctdb
->tunable
.recovery_ban_period
));
2106 ctdb_ban_node(rec
, pnn
, ctdb
->tunable
.recovery_ban_period
);
2110 ("Recovery lock taken successfully by recovery daemon\n"));
2114 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery initiated due to problem with node %u\n", rec
->last_culprit_node
));
2116 /* get a list of all databases */
2117 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, &dbmap
);
2119 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbids from node :%u\n", pnn
));
2123 /* we do the db creation before we set the recovery mode, so the freeze happens
2124 on all databases we will be dealing with. */
2126 /* verify that we have all the databases any other node has */
2127 ret
= create_missing_local_databases(ctdb
, nodemap
, pnn
, &dbmap
, mem_ctx
);
2129 DEBUG(DEBUG_ERR
, (__location__
" Unable to create missing local databases\n"));
2133 /* verify that all other nodes have all our databases */
2134 ret
= create_missing_remote_databases(ctdb
, nodemap
, pnn
, dbmap
, mem_ctx
);
2136 DEBUG(DEBUG_ERR
, (__location__
" Unable to create missing remote databases\n"));
2139 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - created remote databases\n"));
2141 /* update the database priority for all remote databases */
2142 ret
= update_db_priority_on_remote_nodes(ctdb
, nodemap
, pnn
, dbmap
, mem_ctx
);
2144 DEBUG(DEBUG_ERR
, (__location__
" Unable to set db priority on remote nodes\n"));
2146 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - updated db priority for all databases\n"));
2149 /* Retrieve capabilities from all connected nodes */
2150 ret
= update_capabilities(rec
, nodemap
);
2152 DEBUG(DEBUG_ERR
, (__location__
" Unable to update node capabilities.\n"));
2157 update all nodes to have the same flags that we have
2159 for (i
=0;i
<nodemap
->num
;i
++) {
2160 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
) {
2164 ret
= update_flags_on_all_nodes(ctdb
, nodemap
, i
, nodemap
->nodes
[i
].flags
);
2166 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_INACTIVE
) {
2167 DEBUG(DEBUG_WARNING
, (__location__
"Unable to update flags on inactive node %d\n", i
));
2169 DEBUG(DEBUG_ERR
, (__location__
" Unable to update flags on all nodes for node %d\n", i
));
2175 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - updated flags\n"));
2177 /* Check if all participating nodes have parallel recovery capability */
2178 par_recovery
= true;
2179 for (i
=0; i
<nodemap
->num
; i
++) {
2180 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_INACTIVE
) {
2184 if (!(rec
->caps
[i
].capabilities
&
2185 CTDB_CAP_PARALLEL_RECOVERY
)) {
2186 par_recovery
= false;
2192 ret
= db_recovery_parallel(rec
, mem_ctx
);
2194 ret
= db_recovery_serial(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
,
2202 do_takeover_run(rec
, nodemap
);
2204 /* send a message to all clients telling them that the cluster
2205 has been reconfigured */
2206 ret
= ctdb_client_send_message(ctdb
, CTDB_BROADCAST_CONNECTED
,
2207 CTDB_SRVID_RECONFIGURE
, tdb_null
);
2209 DEBUG(DEBUG_ERR
, (__location__
" Failed to send reconfigure message\n"));
2213 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery complete\n"));
2215 rec
->need_recovery
= false;
2216 ctdb_op_end(rec
->recovery
);
2218 /* we managed to complete a full recovery, make sure to forgive
2219 any past sins by the nodes that could now participate in the
2222 DEBUG(DEBUG_ERR
,("Resetting ban count to 0 for all nodes\n"));
2223 for (i
=0;i
<nodemap
->num
;i
++) {
2224 struct ctdb_banning_state
*ban_state
;
2226 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
) {
2230 ban_state
= (struct ctdb_banning_state
*)ctdb
->nodes
[nodemap
->nodes
[i
].pnn
]->ban_state
;
2231 if (ban_state
== NULL
) {
2235 ban_state
->count
= 0;
2238 /* We just finished a recovery successfully.
2239 We now wait for rerecovery_timeout before we allow
2240 another recovery to take place.
2242 DEBUG(DEBUG_NOTICE
, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb
->tunable
.rerecovery_timeout
));
2243 ctdb_op_disable(rec
->recovery
, ctdb
->ev
,
2244 ctdb
->tunable
.rerecovery_timeout
);
2248 ctdb_op_end(rec
->recovery
);
/*
  elections are won by first checking the number of connected nodes, then
  the priority time, then the pnn
 */
struct election_message {
	uint32_t num_connected;
	struct timeval priority_time;
	uint32_t pnn;
	uint32_t node_flags;
};
2265 form this nodes election data
2267 static void ctdb_election_data(struct ctdb_recoverd
*rec
, struct election_message
*em
)
2270 struct ctdb_node_map_old
*nodemap
;
2271 struct ctdb_context
*ctdb
= rec
->ctdb
;
2275 em
->pnn
= rec
->ctdb
->pnn
;
2276 em
->priority_time
= rec
->priority_time
;
2278 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, rec
, &nodemap
);
2280 DEBUG(DEBUG_ERR
,(__location__
" unable to get node map\n"));
2284 rec
->node_flags
= nodemap
->nodes
[ctdb
->pnn
].flags
;
2285 em
->node_flags
= rec
->node_flags
;
2287 for (i
=0;i
<nodemap
->num
;i
++) {
2288 if (!(nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
)) {
2289 em
->num_connected
++;
2293 /* we shouldnt try to win this election if we cant be a recmaster */
2294 if ((ctdb
->capabilities
& CTDB_CAP_RECMASTER
) == 0) {
2295 em
->num_connected
= 0;
2296 em
->priority_time
= timeval_current();
2299 talloc_free(nodemap
);
2303 see if the given election data wins
2305 static bool ctdb_election_win(struct ctdb_recoverd
*rec
, struct election_message
*em
)
2307 struct election_message myem
;
2310 ctdb_election_data(rec
, &myem
);
2312 /* we cant win if we don't have the recmaster capability */
2313 if ((rec
->ctdb
->capabilities
& CTDB_CAP_RECMASTER
) == 0) {
2317 /* we cant win if we are banned */
2318 if (rec
->node_flags
& NODE_FLAGS_BANNED
) {
2322 /* we cant win if we are stopped */
2323 if (rec
->node_flags
& NODE_FLAGS_STOPPED
) {
2327 /* we will automatically win if the other node is banned */
2328 if (em
->node_flags
& NODE_FLAGS_BANNED
) {
2332 /* we will automatically win if the other node is banned */
2333 if (em
->node_flags
& NODE_FLAGS_STOPPED
) {
2337 /* then the longest running node */
2339 cmp
= timeval_compare(&em
->priority_time
, &myem
.priority_time
);
2343 cmp
= (int)myem
.pnn
- (int)em
->pnn
;
2350 send out an election request
2352 static int send_election_request(struct ctdb_recoverd
*rec
, uint32_t pnn
)
2355 TDB_DATA election_data
;
2356 struct election_message emsg
;
2358 struct ctdb_context
*ctdb
= rec
->ctdb
;
2360 srvid
= CTDB_SRVID_ELECTION
;
2362 ctdb_election_data(rec
, &emsg
);
2364 election_data
.dsize
= sizeof(struct election_message
);
2365 election_data
.dptr
= (unsigned char *)&emsg
;
2368 /* first we assume we will win the election and set
2369 recoverymaster to be ourself on the current node
2371 ret
= ctdb_ctrl_setrecmaster(ctdb
, CONTROL_TIMEOUT(),
2372 CTDB_CURRENT_NODE
, pnn
);
2374 DEBUG(DEBUG_ERR
, (__location__
" failed to set recmaster\n"));
2377 rec
->recmaster
= pnn
;
2379 /* send an election message to all active nodes */
2380 DEBUG(DEBUG_INFO
,(__location__
" Send election request to all active nodes\n"));
2381 return ctdb_client_send_message(ctdb
, CTDB_BROADCAST_ALL
, srvid
, election_data
);
2385 we think we are winning the election - send a broadcast election request
2387 static void election_send_request(struct tevent_context
*ev
,
2388 struct tevent_timer
*te
,
2389 struct timeval t
, void *p
)
2391 struct ctdb_recoverd
*rec
= talloc_get_type(p
, struct ctdb_recoverd
);
2394 ret
= send_election_request(rec
, ctdb_get_pnn(rec
->ctdb
));
2396 DEBUG(DEBUG_ERR
,("Failed to send election request!\n"));
2399 TALLOC_FREE(rec
->send_election_te
);
2403 handler for memory dumps
2405 static void mem_dump_handler(uint64_t srvid
, TDB_DATA data
, void *private_data
)
2407 struct ctdb_recoverd
*rec
= talloc_get_type(
2408 private_data
, struct ctdb_recoverd
);
2409 struct ctdb_context
*ctdb
= rec
->ctdb
;
2410 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
2413 struct ctdb_srvid_message
*rd
;
2415 if (data
.dsize
!= sizeof(struct ctdb_srvid_message
)) {
2416 DEBUG(DEBUG_ERR
, (__location__
" Wrong size of return address.\n"));
2417 talloc_free(tmp_ctx
);
2420 rd
= (struct ctdb_srvid_message
*)data
.dptr
;
2422 dump
= talloc_zero(tmp_ctx
, TDB_DATA
);
2424 DEBUG(DEBUG_ERR
, (__location__
" Failed to allocate memory for memdump\n"));
2425 talloc_free(tmp_ctx
);
2428 ret
= ctdb_dump_memory(ctdb
, dump
);
2430 DEBUG(DEBUG_ERR
, (__location__
" ctdb_dump_memory() failed\n"));
2431 talloc_free(tmp_ctx
);
2435 DEBUG(DEBUG_ERR
, ("recovery master memory dump\n"));
2437 ret
= ctdb_client_send_message(ctdb
, rd
->pnn
, rd
->srvid
, *dump
);
2439 DEBUG(DEBUG_ERR
,("Failed to send rd memdump reply message\n"));
2440 talloc_free(tmp_ctx
);
2444 talloc_free(tmp_ctx
);
2448 handler for reload_nodes
2450 static void reload_nodes_handler(uint64_t srvid
, TDB_DATA data
,
2453 struct ctdb_recoverd
*rec
= talloc_get_type(
2454 private_data
, struct ctdb_recoverd
);
2456 DEBUG(DEBUG_ERR
, (__location__
" Reload nodes file from recovery daemon\n"));
2458 ctdb_load_nodes_file(rec
->ctdb
);
2462 static void recd_node_rebalance_handler(uint64_t srvid
, TDB_DATA data
,
2465 struct ctdb_recoverd
*rec
= talloc_get_type(
2466 private_data
, struct ctdb_recoverd
);
2467 struct ctdb_context
*ctdb
= rec
->ctdb
;
2472 if (rec
->recmaster
!= ctdb_get_pnn(ctdb
)) {
2476 if (data
.dsize
!= sizeof(uint32_t)) {
2477 DEBUG(DEBUG_ERR
,(__location__
" Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data
.dsize
, sizeof(uint32_t)));
2481 pnn
= *(uint32_t *)&data
.dptr
[0];
2483 DEBUG(DEBUG_NOTICE
,("Setting up rebalance of IPs to node %u\n", pnn
));
2485 /* Copy any existing list of nodes. There's probably some
2486 * sort of realloc variant that will do this but we need to
2487 * make sure that freeing the old array also cancels the timer
2488 * event for the timeout... not sure if realloc will do that.
2490 len
= (rec
->force_rebalance_nodes
!= NULL
) ?
2491 talloc_array_length(rec
->force_rebalance_nodes
) :
2494 /* This allows duplicates to be added but they don't cause
2495 * harm. A call to add a duplicate PNN arguably means that
2496 * the timeout should be reset, so this is the simplest
2499 t
= talloc_zero_array(rec
, uint32_t, len
+1);
2500 CTDB_NO_MEMORY_VOID(ctdb
, t
);
2502 memcpy(t
, rec
->force_rebalance_nodes
, sizeof(uint32_t) * len
);
2506 talloc_free(rec
->force_rebalance_nodes
);
2508 rec
->force_rebalance_nodes
= t
;
2513 static void srvid_disable_and_reply(struct ctdb_context
*ctdb
,
2515 struct ctdb_op_state
*op_state
)
2517 struct ctdb_disable_message
*r
;
2522 /* Validate input data */
2523 if (data
.dsize
!= sizeof(struct ctdb_disable_message
)) {
2524 DEBUG(DEBUG_ERR
,(__location__
" Wrong size for data :%lu "
2525 "expecting %lu\n", (long unsigned)data
.dsize
,
2526 (long unsigned)sizeof(struct ctdb_srvid_message
)));
2529 if (data
.dptr
== NULL
) {
2530 DEBUG(DEBUG_ERR
,(__location__
" No data received\n"));
2534 r
= (struct ctdb_disable_message
*)data
.dptr
;
2535 timeout
= r
->timeout
;
2537 ret
= ctdb_op_disable(op_state
, ctdb
->ev
, timeout
);
2542 /* Returning our PNN tells the caller that we succeeded */
2543 ret
= ctdb_get_pnn(ctdb
);
2545 result
.dsize
= sizeof(int32_t);
2546 result
.dptr
= (uint8_t *)&ret
;
2547 srvid_request_reply(ctdb
, (struct ctdb_srvid_message
*)r
, result
);
2550 static void disable_takeover_runs_handler(uint64_t srvid
, TDB_DATA data
,
2553 struct ctdb_recoverd
*rec
= talloc_get_type(
2554 private_data
, struct ctdb_recoverd
);
2556 srvid_disable_and_reply(rec
->ctdb
, data
, rec
->takeover_run
);
2559 /* Backward compatibility for this SRVID */
2560 static void disable_ip_check_handler(uint64_t srvid
, TDB_DATA data
,
2563 struct ctdb_recoverd
*rec
= talloc_get_type(
2564 private_data
, struct ctdb_recoverd
);
2567 if (data
.dsize
!= sizeof(uint32_t)) {
2568 DEBUG(DEBUG_ERR
,(__location__
" Wrong size for data :%lu "
2569 "expecting %lu\n", (long unsigned)data
.dsize
,
2570 (long unsigned)sizeof(uint32_t)));
2573 if (data
.dptr
== NULL
) {
2574 DEBUG(DEBUG_ERR
,(__location__
" No data received\n"));
2578 timeout
= *((uint32_t *)data
.dptr
);
2580 ctdb_op_disable(rec
->takeover_run
, rec
->ctdb
->ev
, timeout
);
2583 static void disable_recoveries_handler(uint64_t srvid
, TDB_DATA data
,
2586 struct ctdb_recoverd
*rec
= talloc_get_type(
2587 private_data
, struct ctdb_recoverd
);
2589 srvid_disable_and_reply(rec
->ctdb
, data
, rec
->recovery
);
2593 handler for ip reallocate, just add it to the list of requests and
2594 handle this later in the monitor_cluster loop so we do not recurse
2595 with other requests to takeover_run()
2597 static void ip_reallocate_handler(uint64_t srvid
, TDB_DATA data
,
2600 struct ctdb_srvid_message
*request
;
2601 struct ctdb_recoverd
*rec
= talloc_get_type(
2602 private_data
, struct ctdb_recoverd
);
2604 if (data
.dsize
!= sizeof(struct ctdb_srvid_message
)) {
2605 DEBUG(DEBUG_ERR
, (__location__
" Wrong size of return address.\n"));
2609 request
= (struct ctdb_srvid_message
*)data
.dptr
;
2611 srvid_request_add(rec
->ctdb
, &rec
->reallocate_requests
, request
);
2614 static void process_ipreallocate_requests(struct ctdb_context
*ctdb
,
2615 struct ctdb_recoverd
*rec
)
2619 struct srvid_requests
*current
;
2621 /* Only process requests that are currently pending. More
2622 * might come in while the takeover run is in progress and
2623 * they will need to be processed later since they might
2624 * be in response flag changes.
2626 current
= rec
->reallocate_requests
;
2627 rec
->reallocate_requests
= NULL
;
2629 if (do_takeover_run(rec
, rec
->nodemap
)) {
2630 ret
= ctdb_get_pnn(ctdb
);
2635 result
.dsize
= sizeof(int32_t);
2636 result
.dptr
= (uint8_t *)&ret
;
2638 srvid_requests_reply(ctdb
, ¤t
, result
);
2642 * handler for assigning banning credits
2644 static void banning_handler(uint64_t srvid
, TDB_DATA data
, void *private_data
)
2646 struct ctdb_recoverd
*rec
= talloc_get_type(
2647 private_data
, struct ctdb_recoverd
);
2650 /* Ignore if we are not recmaster */
2651 if (rec
->ctdb
->pnn
!= rec
->recmaster
) {
2655 if (data
.dsize
!= sizeof(uint32_t)) {
2656 DEBUG(DEBUG_ERR
, (__location__
"invalid data size %zu\n",
2661 ban_pnn
= *(uint32_t *)data
.dptr
;
2663 ctdb_set_culprit_count(rec
, ban_pnn
, rec
->nodemap
->num
);
2667 handler for recovery master elections
2669 static void election_handler(uint64_t srvid
, TDB_DATA data
, void *private_data
)
2671 struct ctdb_recoverd
*rec
= talloc_get_type(
2672 private_data
, struct ctdb_recoverd
);
2673 struct ctdb_context
*ctdb
= rec
->ctdb
;
2675 struct election_message
*em
= (struct election_message
*)data
.dptr
;
2677 /* Ignore election packets from ourself */
2678 if (ctdb
->pnn
== em
->pnn
) {
2682 /* we got an election packet - update the timeout for the election */
2683 talloc_free(rec
->election_timeout
);
2684 rec
->election_timeout
= tevent_add_timer(
2687 timeval_current_ofs(0, 500000) :
2688 timeval_current_ofs(ctdb
->tunable
.election_timeout
, 0),
2689 ctdb_election_timeout
, rec
);
2691 /* someone called an election. check their election data
2692 and if we disagree and we would rather be the elected node,
2693 send a new election message to all other nodes
2695 if (ctdb_election_win(rec
, em
)) {
2696 if (!rec
->send_election_te
) {
2697 rec
->send_election_te
= tevent_add_timer(
2699 timeval_current_ofs(0, 500000),
2700 election_send_request
, rec
);
2706 TALLOC_FREE(rec
->send_election_te
);
2708 /* Release the recovery lock file */
2709 if (ctdb_recovery_have_lock(rec
)) {
2710 ctdb_recovery_unlock(rec
);
2713 /* ok, let that guy become recmaster then */
2714 ret
= ctdb_ctrl_setrecmaster(ctdb
, CONTROL_TIMEOUT(),
2715 CTDB_CURRENT_NODE
, em
->pnn
);
2717 DEBUG(DEBUG_ERR
, (__location__
" failed to set recmaster"));
2720 rec
->recmaster
= em
->pnn
;
2727 force the start of the election process
2729 static void force_election(struct ctdb_recoverd
*rec
, uint32_t pnn
,
2730 struct ctdb_node_map_old
*nodemap
)
2733 struct ctdb_context
*ctdb
= rec
->ctdb
;
2735 DEBUG(DEBUG_INFO
,(__location__
" Force an election\n"));
2737 /* set all nodes to recovery mode to stop all internode traffic */
2738 ret
= set_recovery_mode(ctdb
, rec
, nodemap
, CTDB_RECOVERY_ACTIVE
, false);
2740 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode to active on cluster\n"));
2744 talloc_free(rec
->election_timeout
);
2745 rec
->election_timeout
= tevent_add_timer(
2748 timeval_current_ofs(0, 500000) :
2749 timeval_current_ofs(ctdb
->tunable
.election_timeout
, 0),
2750 ctdb_election_timeout
, rec
);
2752 ret
= send_election_request(rec
, pnn
);
2754 DEBUG(DEBUG_ERR
, (__location__
" failed to initiate recmaster election"));
2758 /* wait for a few seconds to collect all responses */
2759 ctdb_wait_election(rec
);
2765 handler for when a node changes its flags
2767 static void monitor_handler(uint64_t srvid
, TDB_DATA data
, void *private_data
)
2769 struct ctdb_recoverd
*rec
= talloc_get_type(
2770 private_data
, struct ctdb_recoverd
);
2771 struct ctdb_context
*ctdb
= rec
->ctdb
;
2773 struct ctdb_node_flag_change
*c
= (struct ctdb_node_flag_change
*)data
.dptr
;
2774 struct ctdb_node_map_old
*nodemap
=NULL
;
2775 TALLOC_CTX
*tmp_ctx
;
2778 if (data
.dsize
!= sizeof(*c
)) {
2779 DEBUG(DEBUG_ERR
,(__location__
"Invalid data in ctdb_node_flag_change\n"));
2783 tmp_ctx
= talloc_new(ctdb
);
2784 CTDB_NO_MEMORY_VOID(ctdb
, tmp_ctx
);
2786 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, tmp_ctx
, &nodemap
);
2788 DEBUG(DEBUG_ERR
,(__location__
"ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2789 talloc_free(tmp_ctx
);
2794 for (i
=0;i
<nodemap
->num
;i
++) {
2795 if (nodemap
->nodes
[i
].pnn
== c
->pnn
) break;
2798 if (i
== nodemap
->num
) {
2799 DEBUG(DEBUG_CRIT
,(__location__
"Flag change for non-existant node %u\n", c
->pnn
));
2800 talloc_free(tmp_ctx
);
2804 if (c
->old_flags
!= c
->new_flags
) {
2805 DEBUG(DEBUG_NOTICE
,("Node %u has changed flags - now 0x%x was 0x%x\n", c
->pnn
, c
->new_flags
, c
->old_flags
));
2808 nodemap
->nodes
[i
].flags
= c
->new_flags
;
2810 talloc_free(tmp_ctx
);
2814 handler for when we need to push out flag changes ot all other nodes
2816 static void push_flags_handler(uint64_t srvid
, TDB_DATA data
,
2819 struct ctdb_recoverd
*rec
= talloc_get_type(
2820 private_data
, struct ctdb_recoverd
);
2821 struct ctdb_context
*ctdb
= rec
->ctdb
;
2823 struct ctdb_node_flag_change
*c
= (struct ctdb_node_flag_change
*)data
.dptr
;
2824 struct ctdb_node_map_old
*nodemap
=NULL
;
2825 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
2828 /* read the node flags from the recmaster */
2829 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), rec
->recmaster
,
2832 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from node %u\n", c
->pnn
));
2833 talloc_free(tmp_ctx
);
2836 if (c
->pnn
>= nodemap
->num
) {
2837 DEBUG(DEBUG_ERR
,(__location__
" Nodemap from recmaster does not contain node %d\n", c
->pnn
));
2838 talloc_free(tmp_ctx
);
2842 /* send the flags update to all connected nodes */
2843 nodes
= list_of_connected_nodes(ctdb
, nodemap
, tmp_ctx
, true);
2845 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_MODIFY_FLAGS
,
2846 nodes
, 0, CONTROL_TIMEOUT(),
2850 DEBUG(DEBUG_ERR
, (__location__
" ctdb_control to modify node flags failed\n"));
2852 talloc_free(tmp_ctx
);
2856 talloc_free(tmp_ctx
);
2860 struct verify_recmode_normal_data
{
2862 enum monitor_result status
;
2865 static void verify_recmode_normal_callback(struct ctdb_client_control_state
*state
)
2867 struct verify_recmode_normal_data
*rmdata
= talloc_get_type(state
->async
.private_data
, struct verify_recmode_normal_data
);
2870 /* one more node has responded with recmode data*/
2873 /* if we failed to get the recmode, then return an error and let
2874 the main loop try again.
2876 if (state
->state
!= CTDB_CONTROL_DONE
) {
2877 if (rmdata
->status
== MONITOR_OK
) {
2878 rmdata
->status
= MONITOR_FAILED
;
2883 /* if we got a response, then the recmode will be stored in the
2886 if (state
->status
!= CTDB_RECOVERY_NORMAL
) {
2887 DEBUG(DEBUG_NOTICE
, ("Node:%u was in recovery mode. Start recovery process\n", state
->c
->hdr
.destnode
));
2888 rmdata
->status
= MONITOR_RECOVERY_NEEDED
;
2895 /* verify that all nodes are in normal recovery mode */
2896 static enum monitor_result
verify_recmode(struct ctdb_context
*ctdb
, struct ctdb_node_map_old
*nodemap
)
2898 struct verify_recmode_normal_data
*rmdata
;
2899 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
2900 struct ctdb_client_control_state
*state
;
2901 enum monitor_result status
;
2904 rmdata
= talloc(mem_ctx
, struct verify_recmode_normal_data
);
2905 CTDB_NO_MEMORY_FATAL(ctdb
, rmdata
);
2907 rmdata
->status
= MONITOR_OK
;
2909 /* loop over all active nodes and send an async getrecmode call to
2911 for (j
=0; j
<nodemap
->num
; j
++) {
2912 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
2915 state
= ctdb_ctrl_getrecmode_send(ctdb
, mem_ctx
,
2917 nodemap
->nodes
[j
].pnn
);
2918 if (state
== NULL
) {
2919 /* we failed to send the control, treat this as
2920 an error and try again next iteration
2922 DEBUG(DEBUG_ERR
,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2923 talloc_free(mem_ctx
);
2924 return MONITOR_FAILED
;
2927 /* set up the callback functions */
2928 state
->async
.fn
= verify_recmode_normal_callback
;
2929 state
->async
.private_data
= rmdata
;
2931 /* one more control to wait for to complete */
2936 /* now wait for up to the maximum number of seconds allowed
2937 or until all nodes we expect a response from has replied
2939 while (rmdata
->count
> 0) {
2940 tevent_loop_once(ctdb
->ev
);
2943 status
= rmdata
->status
;
2944 talloc_free(mem_ctx
);
2949 struct verify_recmaster_data
{
2950 struct ctdb_recoverd
*rec
;
2953 enum monitor_result status
;
2956 static void verify_recmaster_callback(struct ctdb_client_control_state
*state
)
2958 struct verify_recmaster_data
*rmdata
= talloc_get_type(state
->async
.private_data
, struct verify_recmaster_data
);
2961 /* one more node has responded with recmaster data*/
2964 /* if we failed to get the recmaster, then return an error and let
2965 the main loop try again.
2967 if (state
->state
!= CTDB_CONTROL_DONE
) {
2968 if (rmdata
->status
== MONITOR_OK
) {
2969 rmdata
->status
= MONITOR_FAILED
;
2974 /* if we got a response, then the recmaster will be stored in the
2977 if (state
->status
!= rmdata
->pnn
) {
2978 DEBUG(DEBUG_ERR
,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state
->c
->hdr
.destnode
, state
->status
));
2979 ctdb_set_culprit(rmdata
->rec
, state
->c
->hdr
.destnode
);
2980 rmdata
->status
= MONITOR_ELECTION_NEEDED
;
2987 /* verify that all nodes agree that we are the recmaster */
2988 static enum monitor_result
verify_recmaster(struct ctdb_recoverd
*rec
, struct ctdb_node_map_old
*nodemap
, uint32_t pnn
)
2990 struct ctdb_context
*ctdb
= rec
->ctdb
;
2991 struct verify_recmaster_data
*rmdata
;
2992 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
2993 struct ctdb_client_control_state
*state
;
2994 enum monitor_result status
;
2997 rmdata
= talloc(mem_ctx
, struct verify_recmaster_data
);
2998 CTDB_NO_MEMORY_FATAL(ctdb
, rmdata
);
3002 rmdata
->status
= MONITOR_OK
;
3004 /* loop over all active nodes and send an async getrecmaster call to
3006 for (j
=0; j
<nodemap
->num
; j
++) {
3007 if (nodemap
->nodes
[j
].pnn
== rec
->recmaster
) {
3010 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3013 state
= ctdb_ctrl_getrecmaster_send(ctdb
, mem_ctx
,
3015 nodemap
->nodes
[j
].pnn
);
3016 if (state
== NULL
) {
3017 /* we failed to send the control, treat this as
3018 an error and try again next iteration
3020 DEBUG(DEBUG_ERR
,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
3021 talloc_free(mem_ctx
);
3022 return MONITOR_FAILED
;
3025 /* set up the callback functions */
3026 state
->async
.fn
= verify_recmaster_callback
;
3027 state
->async
.private_data
= rmdata
;
3029 /* one more control to wait for to complete */
3034 /* now wait for up to the maximum number of seconds allowed
3035 or until all nodes we expect a response from has replied
3037 while (rmdata
->count
> 0) {
3038 tevent_loop_once(ctdb
->ev
);
3041 status
= rmdata
->status
;
3042 talloc_free(mem_ctx
);
3046 static bool interfaces_have_changed(struct ctdb_context
*ctdb
,
3047 struct ctdb_recoverd
*rec
)
3049 struct ctdb_iface_list_old
*ifaces
= NULL
;
3050 TALLOC_CTX
*mem_ctx
;
3053 mem_ctx
= talloc_new(NULL
);
3055 /* Read the interfaces from the local node */
3056 if (ctdb_ctrl_get_ifaces(ctdb
, CONTROL_TIMEOUT(),
3057 CTDB_CURRENT_NODE
, mem_ctx
, &ifaces
) != 0) {
3058 DEBUG(DEBUG_ERR
, ("Unable to get interfaces from local node %u\n", ctdb
->pnn
));
3059 /* We could return an error. However, this will be
3060 * rare so we'll decide that the interfaces have
3061 * actually changed, just in case.
3063 talloc_free(mem_ctx
);
3068 /* We haven't been here before so things have changed */
3069 DEBUG(DEBUG_NOTICE
, ("Initial interface fetched\n"));
3071 } else if (rec
->ifaces
->num
!= ifaces
->num
) {
3072 /* Number of interfaces has changed */
3073 DEBUG(DEBUG_NOTICE
, ("Interface count changed from %d to %d\n",
3074 rec
->ifaces
->num
, ifaces
->num
));
3077 /* See if interface names or link states have changed */
3079 for (i
= 0; i
< rec
->ifaces
->num
; i
++) {
3080 struct ctdb_iface
* iface
= &rec
->ifaces
->ifaces
[i
];
3081 if (strcmp(iface
->name
, ifaces
->ifaces
[i
].name
) != 0) {
3083 ("Interface in slot %d changed: %s => %s\n",
3084 i
, iface
->name
, ifaces
->ifaces
[i
].name
));
3088 if (iface
->link_state
!= ifaces
->ifaces
[i
].link_state
) {
3090 ("Interface %s changed state: %d => %d\n",
3091 iface
->name
, iface
->link_state
,
3092 ifaces
->ifaces
[i
].link_state
));
3099 talloc_free(rec
->ifaces
);
3100 rec
->ifaces
= talloc_steal(rec
, ifaces
);
3102 talloc_free(mem_ctx
);
3106 /* Check that the local allocation of public IP addresses is correct
3107 * and do some house-keeping */
3108 static int verify_local_ip_allocation(struct ctdb_context
*ctdb
,
3109 struct ctdb_recoverd
*rec
,
3111 struct ctdb_node_map_old
*nodemap
)
3113 TALLOC_CTX
*mem_ctx
= talloc_new(NULL
);
3115 bool need_takeover_run
= false;
3116 struct ctdb_public_ip_list_old
*ips
= NULL
;
3118 /* If we are not the recmaster then do some housekeeping */
3119 if (rec
->recmaster
!= pnn
) {
3120 /* Ignore any IP reallocate requests - only recmaster
3123 TALLOC_FREE(rec
->reallocate_requests
);
3124 /* Clear any nodes that should be force rebalanced in
3125 * the next takeover run. If the recovery master role
3126 * has moved then we don't want to process these some
3127 * time in the future.
3129 TALLOC_FREE(rec
->force_rebalance_nodes
);
3132 /* Return early if disabled... */
3133 if (ctdb
->tunable
.disable_ip_failover
!= 0 ||
3134 ctdb_op_is_disabled(rec
->takeover_run
)) {
3138 if (interfaces_have_changed(ctdb
, rec
)) {
3139 need_takeover_run
= true;
3142 /* If there are unhosted IPs but this node can host them then
3143 * trigger an IP reallocation */
3145 /* Read *available* IPs from local node */
3146 ret
= ctdb_ctrl_get_public_ips_flags(
3147 ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, mem_ctx
,
3148 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE
, &ips
);
3150 DEBUG(DEBUG_ERR
, ("Unable to retrieve available public IPs\n"));
3151 talloc_free(mem_ctx
);
3155 for (j
=0; j
<ips
->num
; j
++) {
3156 if (ips
->ips
[j
].pnn
== -1 &&
3157 nodemap
->nodes
[pnn
].flags
== 0) {
3158 DEBUG(DEBUG_WARNING
,
3159 ("Unassigned IP %s can be served by this node\n",
3160 ctdb_addr_to_str(&ips
->ips
[j
].addr
)));
3161 need_takeover_run
= true;
3167 if (!ctdb
->do_checkpublicip
) {
3171 /* Validate the IP addresses that this node has on network
3172 * interfaces. If there is an inconsistency between reality
3173 * and the state expected by CTDB then try to fix it by
3174 * triggering an IP reallocation or releasing extraneous IP
3177 /* Read *known* IPs from local node */
3178 ret
= ctdb_ctrl_get_public_ips_flags(
3179 ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, mem_ctx
, 0, &ips
);
3181 DEBUG(DEBUG_ERR
, ("Unable to retrieve known public IPs\n"));
3182 talloc_free(mem_ctx
);
3186 for (j
=0; j
<ips
->num
; j
++) {
3187 if (ips
->ips
[j
].pnn
== pnn
) {
3188 if (!ctdb_sys_have_ip(&ips
->ips
[j
].addr
)) {
3190 ("Assigned IP %s not on an interface\n",
3191 ctdb_addr_to_str(&ips
->ips
[j
].addr
)));
3192 need_takeover_run
= true;
3195 if (ctdb_sys_have_ip(&ips
->ips
[j
].addr
)) {
3197 ("IP %s incorrectly on an interface - releasing\n",
3198 ctdb_addr_to_str(&ips
->ips
[j
].addr
)));
3199 ret
= ctdb_ctrl_release_ip(ctdb
,
3205 ("Failed to release IP address\n"));
3212 if (need_takeover_run
) {
3213 struct ctdb_srvid_message rd
;
3216 DEBUG(DEBUG_NOTICE
,("Trigger takeoverrun\n"));
3221 data
.dptr
= (uint8_t *)&rd
;
3222 data
.dsize
= sizeof(rd
);
3224 ret
= ctdb_client_send_message(ctdb
, rec
->recmaster
, CTDB_SRVID_TAKEOVER_RUN
, data
);
3227 ("Failed to send takeover run request\n"));
3230 talloc_free(mem_ctx
);
3235 static void async_getnodemap_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
3237 struct ctdb_node_map_old
**remote_nodemaps
= callback_data
;
3239 if (node_pnn
>= ctdb
->num_nodes
) {
3240 DEBUG(DEBUG_ERR
,(__location__
" pnn from invalid node\n"));
3244 remote_nodemaps
[node_pnn
] = (struct ctdb_node_map_old
*)talloc_steal(remote_nodemaps
, outdata
.dptr
);
3248 static int get_remote_nodemaps(struct ctdb_context
*ctdb
, TALLOC_CTX
*mem_ctx
,
3249 struct ctdb_node_map_old
*nodemap
,
3250 struct ctdb_node_map_old
**remote_nodemaps
)
3254 nodes
= list_of_active_nodes(ctdb
, nodemap
, mem_ctx
, true);
3255 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_GET_NODEMAP
,
3257 CONTROL_TIMEOUT(), false, tdb_null
,
3258 async_getnodemap_callback
,
3260 remote_nodemaps
) != 0) {
3261 DEBUG(DEBUG_ERR
, (__location__
" Unable to pull all remote nodemaps\n"));
3269 static bool validate_recovery_master(struct ctdb_recoverd
*rec
,
3270 TALLOC_CTX
*mem_ctx
)
3272 struct ctdb_context
*ctdb
= rec
->ctdb
;
3273 uint32_t pnn
= ctdb_get_pnn(ctdb
);
3274 struct ctdb_node_map_old
*nodemap
= rec
->nodemap
;
3275 struct ctdb_node_map_old
*recmaster_nodemap
= NULL
;
3278 /* When recovery daemon is started, recmaster is set to
3279 * "unknown" so it knows to start an election.
3281 if (rec
->recmaster
== CTDB_UNKNOWN_PNN
) {
3283 ("Initial recovery master set - forcing election\n"));
3284 force_election(rec
, pnn
, nodemap
);
3289 * If the current recmaster does not have CTDB_CAP_RECMASTER,
3290 * but we have, then force an election and try to become the new
3293 if (!ctdb_node_has_capabilities(rec
->caps
,
3295 CTDB_CAP_RECMASTER
) &&
3296 (rec
->ctdb
->capabilities
& CTDB_CAP_RECMASTER
) &&
3297 !(nodemap
->nodes
[pnn
].flags
& NODE_FLAGS_INACTIVE
)) {
3299 (" Current recmaster node %u does not have CAP_RECMASTER,"
3300 " but we (node %u) have - force an election\n",
3301 rec
->recmaster
, pnn
));
3302 force_election(rec
, pnn
, nodemap
);
3306 /* Verify that the master node has not been deleted. This
3307 * should not happen because a node should always be shutdown
3308 * before being deleted, causing a new master to be elected
3309 * before now. However, if something strange has happened
3310 * then checking here will ensure we don't index beyond the
3311 * end of the nodemap array. */
3312 if (rec
->recmaster
>= nodemap
->num
) {
3314 ("Recmaster node %u has been deleted. Force election\n",
3316 force_election(rec
, pnn
, nodemap
);
3320 /* if recovery master is disconnected/deleted we must elect a new recmaster */
3321 if (nodemap
->nodes
[rec
->recmaster
].flags
&
3322 (NODE_FLAGS_DISCONNECTED
|NODE_FLAGS_DELETED
)) {
3324 ("Recmaster node %u is disconnected/deleted. Force election\n",
3326 force_election(rec
, pnn
, nodemap
);
3330 /* get nodemap from the recovery master to check if it is inactive */
3331 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), rec
->recmaster
,
3332 mem_ctx
, &recmaster_nodemap
);
3336 " Unable to get nodemap from recovery master %u\n",
3338 /* No election, just error */
3343 if ((recmaster_nodemap
->nodes
[rec
->recmaster
].flags
& NODE_FLAGS_INACTIVE
) &&
3344 (rec
->node_flags
& NODE_FLAGS_INACTIVE
) == 0) {
3346 ("Recmaster node %u is inactive. Force election\n",
3349 * update our nodemap to carry the recmaster's notion of
3350 * its own flags, so that we don't keep freezing the
3351 * inactive recmaster node...
3353 nodemap
->nodes
[rec
->recmaster
].flags
=
3354 recmaster_nodemap
->nodes
[rec
->recmaster
].flags
;
3355 force_election(rec
, pnn
, nodemap
);
3362 static void main_loop(struct ctdb_context
*ctdb
, struct ctdb_recoverd
*rec
,
3363 TALLOC_CTX
*mem_ctx
)
3366 struct ctdb_node_map_old
*nodemap
=NULL
;
3367 struct ctdb_node_map_old
**remote_nodemaps
=NULL
;
3368 struct ctdb_vnn_map
*vnnmap
=NULL
;
3369 struct ctdb_vnn_map
*remote_vnnmap
=NULL
;
3370 uint32_t num_lmasters
;
3371 int32_t debug_level
;
3376 /* verify that the main daemon is still running */
3377 if (ctdb_kill(ctdb
, ctdb
->ctdbd_pid
, 0) != 0) {
3378 DEBUG(DEBUG_CRIT
,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3382 /* ping the local daemon to tell it we are alive */
3383 ctdb_ctrl_recd_ping(ctdb
);
3385 if (rec
->election_timeout
) {
3386 /* an election is in progress */
3390 /* read the debug level from the parent and update locally */
3391 ret
= ctdb_ctrl_get_debuglevel(ctdb
, CTDB_CURRENT_NODE
, &debug_level
);
3393 DEBUG(DEBUG_ERR
, (__location__
" Failed to read debuglevel from parent\n"));
3396 DEBUGLEVEL
= debug_level
;
3398 /* get relevant tunables */
3399 ret
= ctdb_ctrl_get_all_tunables(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, &ctdb
->tunable
);
3401 DEBUG(DEBUG_ERR
,("Failed to get tunables - retrying\n"));
3406 ret
= ctdb_ctrl_get_runstate(ctdb
, CONTROL_TIMEOUT(),
3407 CTDB_CURRENT_NODE
, &ctdb
->runstate
);
3409 DEBUG(DEBUG_ERR
, ("Failed to get runstate - retrying\n"));
3413 pnn
= ctdb_get_pnn(ctdb
);
3416 TALLOC_FREE(rec
->nodemap
);
3417 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), pnn
, rec
, &rec
->nodemap
);
3419 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from node %u\n", pnn
));
3422 nodemap
= rec
->nodemap
;
3424 /* remember our own node flags */
3425 rec
->node_flags
= nodemap
->nodes
[pnn
].flags
;
3427 ban_misbehaving_nodes(rec
, &self_ban
);
3429 DEBUG(DEBUG_NOTICE
, ("This node was banned, restart main_loop\n"));
3433 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
3434 also frozen and that the recmode is set to active.
3436 if (rec
->node_flags
& (NODE_FLAGS_STOPPED
| NODE_FLAGS_BANNED
)) {
3437 /* If this node has become inactive then we want to
3438 * reduce the chances of it taking over the recovery
3439 * master role when it becomes active again. This
3440 * helps to stabilise the recovery master role so that
3441 * it stays on the most stable node.
3443 rec
->priority_time
= timeval_current();
3445 ret
= ctdb_ctrl_getrecmode(ctdb
, mem_ctx
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, &ctdb
->recovery_mode
);
3447 DEBUG(DEBUG_ERR
,(__location__
" Failed to read recmode from local node\n"));
3449 if (ctdb
->recovery_mode
== CTDB_RECOVERY_NORMAL
) {
3450 DEBUG(DEBUG_ERR
,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
3452 ret
= ctdb_ctrl_setrecmode(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, CTDB_RECOVERY_ACTIVE
);
3454 DEBUG(DEBUG_ERR
,(__location__
" Failed to activate recovery mode in STOPPED or BANNED state\n"));
3459 if (! rec
->frozen_on_inactive
) {
3460 ret
= ctdb_ctrl_freeze(ctdb
, CONTROL_TIMEOUT(),
3464 (__location__
" Failed to freeze node "
3465 "in STOPPED or BANNED state\n"));
3469 rec
->frozen_on_inactive
= true;
3472 /* If this node is stopped or banned then it is not the recovery
3473 * master, so don't do anything. This prevents stopped or banned
3474 * node from starting election and sending unnecessary controls.
3479 rec
->frozen_on_inactive
= false;
3481 /* Retrieve capabilities from all connected nodes */
3482 ret
= update_capabilities(rec
, nodemap
);
3484 DEBUG(DEBUG_ERR
, (__location__
" Unable to update node capabilities.\n"));
3488 if (! validate_recovery_master(rec
, mem_ctx
)) {
3492 /* Check if an IP takeover run is needed and trigger one if
3494 verify_local_ip_allocation(ctdb
, rec
, pnn
, nodemap
);
3496 /* if we are not the recmaster then we do not need to check
3497 if recovery is needed
3499 if (pnn
!= rec
->recmaster
) {
3504 /* ensure our local copies of flags are right */
3505 ret
= update_local_flags(rec
, nodemap
);
3507 DEBUG(DEBUG_ERR
,("Unable to update local flags\n"));
3511 if (ctdb
->num_nodes
!= nodemap
->num
) {
3512 DEBUG(DEBUG_ERR
, (__location__
" ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb
->num_nodes
, nodemap
->num
));
3513 ctdb_load_nodes_file(ctdb
);
3517 /* verify that all active nodes agree that we are the recmaster */
3518 switch (verify_recmaster(rec
, nodemap
, pnn
)) {
3519 case MONITOR_RECOVERY_NEEDED
:
3520 /* can not happen */
3522 case MONITOR_ELECTION_NEEDED
:
3523 force_election(rec
, pnn
, nodemap
);
3527 case MONITOR_FAILED
:
3532 /* get the vnnmap */
3533 ret
= ctdb_ctrl_getvnnmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, &vnnmap
);
3535 DEBUG(DEBUG_ERR
, (__location__
" Unable to get vnnmap from node %u\n", pnn
));
3539 if (rec
->need_recovery
) {
3540 /* a previous recovery didn't finish */
3541 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3545 /* verify that all active nodes are in normal mode
3546 and not in recovery mode
3548 switch (verify_recmode(ctdb
, nodemap
)) {
3549 case MONITOR_RECOVERY_NEEDED
:
3550 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3552 case MONITOR_FAILED
:
3554 case MONITOR_ELECTION_NEEDED
:
3555 /* can not happen */
3561 if (ctdb
->recovery_lock
!= NULL
) {
3562 /* We must already hold the recovery lock */
3563 if (!ctdb_recovery_have_lock(rec
)) {
3564 DEBUG(DEBUG_ERR
,("Failed recovery lock sanity check. Force a recovery\n"));
3565 ctdb_set_culprit(rec
, ctdb
->pnn
);
3566 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3572 /* If recoveries are disabled then there is no use doing any
3573 * nodemap or flags checks. Recoveries might be disabled due
3574 * to "reloadnodes", so doing these checks might cause an
3575 * unnecessary recovery. */
3576 if (ctdb_op_is_disabled(rec
->recovery
)) {
3577 goto takeover_run_checks
;
3580 /* get the nodemap for all active remote nodes
3582 remote_nodemaps
= talloc_array(mem_ctx
, struct ctdb_node_map_old
*, nodemap
->num
);
3583 if (remote_nodemaps
== NULL
) {
3584 DEBUG(DEBUG_ERR
, (__location__
" failed to allocate remote nodemap array\n"));
3587 for(i
=0; i
<nodemap
->num
; i
++) {
3588 remote_nodemaps
[i
] = NULL
;
3590 if (get_remote_nodemaps(ctdb
, mem_ctx
, nodemap
, remote_nodemaps
) != 0) {
3591 DEBUG(DEBUG_ERR
,(__location__
" Failed to read remote nodemaps\n"));
3595 /* verify that all other nodes have the same nodemap as we have
3597 for (j
=0; j
<nodemap
->num
; j
++) {
3598 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3602 if (remote_nodemaps
[j
] == NULL
) {
3603 DEBUG(DEBUG_ERR
,(__location__
" Did not get a remote nodemap for node %d, restarting monitoring\n", j
));
3604 ctdb_set_culprit(rec
, j
);
3609 /* if the nodes disagree on how many nodes there are
3610 then this is a good reason to try recovery
3612 if (remote_nodemaps
[j
]->num
!= nodemap
->num
) {
3613 DEBUG(DEBUG_ERR
, (__location__
" Remote node:%u has different node count. %u vs %u of the local node\n",
3614 nodemap
->nodes
[j
].pnn
, remote_nodemaps
[j
]->num
, nodemap
->num
));
3615 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3616 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3620 /* if the nodes disagree on which nodes exist and are
3621 active, then that is also a good reason to do recovery
3623 for (i
=0;i
<nodemap
->num
;i
++) {
3624 if (remote_nodemaps
[j
]->nodes
[i
].pnn
!= nodemap
->nodes
[i
].pnn
) {
3625 DEBUG(DEBUG_ERR
, (__location__
" Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3626 nodemap
->nodes
[j
].pnn
, i
,
3627 remote_nodemaps
[j
]->nodes
[i
].pnn
, nodemap
->nodes
[i
].pnn
));
3628 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3629 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
3637 * Update node flags obtained from each active node. This ensure we have
3638 * up-to-date information for all the nodes.
3640 for (j
=0; j
<nodemap
->num
; j
++) {
3641 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3644 nodemap
->nodes
[j
].flags
= remote_nodemaps
[j
]->nodes
[j
].flags
;
3647 for (j
=0; j
<nodemap
->num
; j
++) {
3648 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3652 /* verify the flags are consistent
3654 for (i
=0; i
<nodemap
->num
; i
++) {
3655 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
) {
3659 if (nodemap
->nodes
[i
].flags
!= remote_nodemaps
[j
]->nodes
[i
].flags
) {
3660 DEBUG(DEBUG_ERR
, (__location__
" Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3661 nodemap
->nodes
[j
].pnn
,
3662 nodemap
->nodes
[i
].pnn
,
3663 remote_nodemaps
[j
]->nodes
[i
].flags
,
3664 nodemap
->nodes
[i
].flags
));
3666 DEBUG(DEBUG_ERR
,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps
[j
]->nodes
[i
].flags
, j
));
3667 update_flags_on_all_nodes(ctdb
, nodemap
, nodemap
->nodes
[i
].pnn
, remote_nodemaps
[j
]->nodes
[i
].flags
);
3668 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3669 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
3673 DEBUG(DEBUG_ERR
,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap
->nodes
[i
].flags
, i
));
3674 update_flags_on_all_nodes(ctdb
, nodemap
, nodemap
->nodes
[i
].pnn
, nodemap
->nodes
[i
].flags
);
3675 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3676 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
3685 /* count how many active nodes there are */
3687 for (i
=0; i
<nodemap
->num
; i
++) {
3688 if (!(nodemap
->nodes
[i
].flags
& NODE_FLAGS_INACTIVE
)) {
3689 if (ctdb_node_has_capabilities(rec
->caps
,
3690 ctdb
->nodes
[i
]->pnn
,
3691 CTDB_CAP_LMASTER
)) {
3698 /* There must be the same number of lmasters in the vnn map as
3699 * there are active nodes with the lmaster capability... or
3702 if (vnnmap
->size
!= num_lmasters
) {
3703 DEBUG(DEBUG_ERR
, (__location__
" The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
3704 vnnmap
->size
, num_lmasters
));
3705 ctdb_set_culprit(rec
, ctdb
->pnn
);
3706 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3710 /* verify that all active nodes in the nodemap also exist in
3713 for (j
=0; j
<nodemap
->num
; j
++) {
3714 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3717 if (nodemap
->nodes
[j
].pnn
== pnn
) {
3721 for (i
=0; i
<vnnmap
->size
; i
++) {
3722 if (vnnmap
->map
[i
] == nodemap
->nodes
[j
].pnn
) {
3726 if (i
== vnnmap
->size
) {
3727 DEBUG(DEBUG_ERR
, (__location__
" Node %u is active in the nodemap but did not exist in the vnnmap\n",
3728 nodemap
->nodes
[j
].pnn
));
3729 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3730 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3736 /* verify that all other nodes have the same vnnmap
3737 and are from the same generation
3739 for (j
=0; j
<nodemap
->num
; j
++) {
3740 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3743 if (nodemap
->nodes
[j
].pnn
== pnn
) {
3747 ret
= ctdb_ctrl_getvnnmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
3748 mem_ctx
, &remote_vnnmap
);
3750 DEBUG(DEBUG_ERR
, (__location__
" Unable to get vnnmap from remote node %u\n",
3751 nodemap
->nodes
[j
].pnn
));
3755 /* verify the vnnmap generation is the same */
3756 if (vnnmap
->generation
!= remote_vnnmap
->generation
) {
3757 DEBUG(DEBUG_ERR
, (__location__
" Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3758 nodemap
->nodes
[j
].pnn
, remote_vnnmap
->generation
, vnnmap
->generation
));
3759 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3760 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3764 /* verify the vnnmap size is the same */
3765 if (vnnmap
->size
!= remote_vnnmap
->size
) {
3766 DEBUG(DEBUG_ERR
, (__location__
" Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3767 nodemap
->nodes
[j
].pnn
, remote_vnnmap
->size
, vnnmap
->size
));
3768 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3769 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3773 /* verify the vnnmap is the same */
3774 for (i
=0;i
<vnnmap
->size
;i
++) {
3775 if (remote_vnnmap
->map
[i
] != vnnmap
->map
[i
]) {
3776 DEBUG(DEBUG_ERR
, (__location__
" Remote node %u has different vnnmap.\n",
3777 nodemap
->nodes
[j
].pnn
));
3778 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3779 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
3786 /* FIXME: Add remote public IP checking to ensure that nodes
3787 * have the IP addresses that are allocated to them. */
3789 takeover_run_checks
:
3791 /* If there are IP takeover runs requested or the previous one
3792 * failed then perform one and notify the waiters */
3793 if (!ctdb_op_is_disabled(rec
->takeover_run
) &&
3794 (rec
->reallocate_requests
|| rec
->need_takeover_run
)) {
3795 process_ipreallocate_requests(ctdb
, rec
);
3799 static void recd_sig_term_handler(struct tevent_context
*ev
,
3800 struct tevent_signal
*se
, int signum
,
3801 int count
, void *dont_care
,
3804 struct ctdb_recoverd
*rec
= talloc_get_type_abort(
3805 private_data
, struct ctdb_recoverd
);
3807 ctdb_recovery_unlock(rec
);
3813 the main monitoring loop
3815 static void monitor_cluster(struct ctdb_context
*ctdb
)
3817 struct tevent_signal
*se
;
3818 struct ctdb_recoverd
*rec
;
3820 DEBUG(DEBUG_NOTICE
,("monitor_cluster starting\n"));
3822 rec
= talloc_zero(ctdb
, struct ctdb_recoverd
);
3823 CTDB_NO_MEMORY_FATAL(ctdb
, rec
);
3826 rec
->recmaster
= CTDB_UNKNOWN_PNN
;
3827 rec
->recovery_lock_handle
= NULL
;
3829 rec
->takeover_run
= ctdb_op_init(rec
, "takeover runs");
3830 CTDB_NO_MEMORY_FATAL(ctdb
, rec
->takeover_run
);
3832 rec
->recovery
= ctdb_op_init(rec
, "recoveries");
3833 CTDB_NO_MEMORY_FATAL(ctdb
, rec
->recovery
);
3835 rec
->priority_time
= timeval_current();
3836 rec
->frozen_on_inactive
= false;
3838 se
= tevent_add_signal(ctdb
->ev
, ctdb
, SIGTERM
, 0,
3839 recd_sig_term_handler
, rec
);
3841 DEBUG(DEBUG_ERR
, ("Failed to install SIGTERM handler\n"));
3845 /* register a message port for sending memory dumps */
3846 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_MEM_DUMP
, mem_dump_handler
, rec
);
3848 /* when a node is assigned banning credits */
3849 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_BANNING
,
3850 banning_handler
, rec
);
3852 /* register a message port for recovery elections */
3853 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_ELECTION
, election_handler
, rec
);
3855 /* when nodes are disabled/enabled */
3856 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_SET_NODE_FLAGS
, monitor_handler
, rec
);
3858 /* when we are asked to puch out a flag change */
3859 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_PUSH_NODE_FLAGS
, push_flags_handler
, rec
);
3861 /* register a message port for vacuum fetch */
3862 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_VACUUM_FETCH
, vacuum_fetch_handler
, rec
);
3864 /* register a message port for reloadnodes */
3865 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_RELOAD_NODES
, reload_nodes_handler
, rec
);
3867 /* register a message port for performing a takeover run */
3868 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_TAKEOVER_RUN
, ip_reallocate_handler
, rec
);
3870 /* register a message port for disabling the ip check for a short while */
3871 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_DISABLE_IP_CHECK
, disable_ip_check_handler
, rec
);
3873 /* register a message port for forcing a rebalance of a node next
3875 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_REBALANCE_NODE
, recd_node_rebalance_handler
, rec
);
3877 /* Register a message port for disabling takeover runs */
3878 ctdb_client_set_message_handler(ctdb
,
3879 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
3880 disable_takeover_runs_handler
, rec
);
3882 /* Register a message port for disabling recoveries */
3883 ctdb_client_set_message_handler(ctdb
,
3884 CTDB_SRVID_DISABLE_RECOVERIES
,
3885 disable_recoveries_handler
, rec
);
3887 /* register a message port for detaching database */
3888 ctdb_client_set_message_handler(ctdb
,
3889 CTDB_SRVID_DETACH_DATABASE
,
3890 detach_database_handler
, rec
);
3893 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
3894 struct timeval start
;
3898 DEBUG(DEBUG_CRIT
,(__location__
3899 " Failed to create temp context\n"));
3903 start
= timeval_current();
3904 main_loop(ctdb
, rec
, mem_ctx
);
3905 talloc_free(mem_ctx
);
3907 /* we only check for recovery once every second */
3908 elapsed
= timeval_elapsed(&start
);
3909 if (elapsed
< ctdb
->tunable
.recover_interval
) {
3910 ctdb_wait_timeout(ctdb
, ctdb
->tunable
.recover_interval
3917 event handler for when the main ctdbd dies
3919 static void ctdb_recoverd_parent(struct tevent_context
*ev
,
3920 struct tevent_fd
*fde
,
3921 uint16_t flags
, void *private_data
)
3923 DEBUG(DEBUG_ALERT
,("recovery daemon parent died - exiting\n"));
3928 called regularly to verify that the recovery daemon is still running
3930 static void ctdb_check_recd(struct tevent_context
*ev
,
3931 struct tevent_timer
*te
,
3932 struct timeval yt
, void *p
)
3934 struct ctdb_context
*ctdb
= talloc_get_type(p
, struct ctdb_context
);
3936 if (ctdb_kill(ctdb
, ctdb
->recoverd_pid
, 0) != 0) {
3937 DEBUG(DEBUG_ERR
,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb
->recoverd_pid
));
3939 tevent_add_timer(ctdb
->ev
, ctdb
, timeval_zero(),
3940 ctdb_restart_recd
, ctdb
);
3945 tevent_add_timer(ctdb
->ev
, ctdb
->recd_ctx
,
3946 timeval_current_ofs(30, 0),
3947 ctdb_check_recd
, ctdb
);
3950 static void recd_sig_child_handler(struct tevent_context
*ev
,
3951 struct tevent_signal
*se
, int signum
,
3952 int count
, void *dont_care
,
3955 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3960 pid
= waitpid(-1, &status
, WNOHANG
);
3962 if (errno
!= ECHILD
) {
3963 DEBUG(DEBUG_ERR
, (__location__
" waitpid() returned error. errno:%s(%d)\n", strerror(errno
),errno
));
3968 DEBUG(DEBUG_DEBUG
, ("RECD SIGCHLD from %d\n", (int)pid
));
3974 startup the recovery daemon as a child of the main ctdb daemon
3976 int ctdb_start_recoverd(struct ctdb_context
*ctdb
)
3979 struct tevent_signal
*se
;
3980 struct tevent_fd
*fde
;
3982 if (pipe(fd
) != 0) {
3986 ctdb
->recoverd_pid
= ctdb_fork(ctdb
);
3987 if (ctdb
->recoverd_pid
== -1) {
3991 if (ctdb
->recoverd_pid
!= 0) {
3992 talloc_free(ctdb
->recd_ctx
);
3993 ctdb
->recd_ctx
= talloc_new(ctdb
);
3994 CTDB_NO_MEMORY(ctdb
, ctdb
->recd_ctx
);
3997 tevent_add_timer(ctdb
->ev
, ctdb
->recd_ctx
,
3998 timeval_current_ofs(30, 0),
3999 ctdb_check_recd
, ctdb
);
4005 srandom(getpid() ^ time(NULL
));
4007 prctl_set_comment("ctdb_recovered");
4008 if (switch_from_server_to_client(ctdb
, "recoverd") != 0) {
4009 DEBUG(DEBUG_CRIT
, (__location__
"ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
4013 DEBUG(DEBUG_DEBUG
, (__location__
" Created PIPE FD:%d to recovery daemon\n", fd
[0]));
4015 fde
= tevent_add_fd(ctdb
->ev
, ctdb
, fd
[0], TEVENT_FD_READ
,
4016 ctdb_recoverd_parent
, &fd
[0]);
4017 tevent_fd_set_auto_close(fde
);
4019 /* set up a handler to pick up sigchld */
4020 se
= tevent_add_signal(ctdb
->ev
, ctdb
, SIGCHLD
, 0,
4021 recd_sig_child_handler
, ctdb
);
4023 DEBUG(DEBUG_CRIT
,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
4027 monitor_cluster(ctdb
);
4029 DEBUG(DEBUG_ALERT
,("ERROR: ctdb_recoverd finished!?\n"));
4034 shutdown the recovery daemon
4036 void ctdb_stop_recoverd(struct ctdb_context
*ctdb
)
4038 if (ctdb
->recoverd_pid
== 0) {
4042 DEBUG(DEBUG_NOTICE
,("Shutting down recovery daemon\n"));
4043 ctdb_kill(ctdb
, ctdb
->recoverd_pid
, SIGTERM
);
4045 TALLOC_FREE(ctdb
->recd_ctx
);
4046 TALLOC_FREE(ctdb
->recd_ping_count
);
4049 static void ctdb_restart_recd(struct tevent_context
*ev
,
4050 struct tevent_timer
*te
,
4051 struct timeval t
, void *private_data
)
4053 struct ctdb_context
*ctdb
= talloc_get_type(private_data
, struct ctdb_context
);
4055 DEBUG(DEBUG_ERR
,("Restarting recovery daemon\n"));
4056 ctdb_stop_recoverd(ctdb
);
4057 ctdb_start_recoverd(ctdb
);