/*
   ctdb recovery daemon

   Copyright (C) Ronnie Sahlberg 2007

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "replace.h"
#include "system/filesys.h"
#include "system/time.h"
#include "system/network.h"
#include "system/wait.h"

#include <talloc.h>
#include <tevent.h>
#include <tdb.h>

#include "lib/tdb_wrap/tdb_wrap.h"
#include "lib/util/dlinklist.h"
#include "lib/util/debug.h"
#include "lib/util/samba_util.h"
#include "lib/util/sys_rw.h"
#include "lib/util/util_process.h"

#include "ctdb_private.h"
#include "ctdb_client.h"

#include "common/system.h"
#include "common/common.h"
#include "common/logging.h"

#include "ctdb_cluster_mutex.h"
/* List of SRVID requests that need to be processed */
struct srvid_list {
        struct srvid_list *next, *prev;
        struct ctdb_srvid_message *request;
};

struct srvid_requests {
        struct srvid_list *requests;
};
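/* Reply to an SRVID request and consume it.  A request sent with
 * srvid == 0 is fire-and-forget and gets no reply. */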
static void srvid_request_reply(struct ctdb_context *ctdb,
                                struct ctdb_srvid_message *request,
                                TDB_DATA result)
{
        /* Someone that sent srvid==0 does not want a reply */
        if (request->srvid == 0) {
                talloc_free(request);
                return;
        }

        if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
                                     result) == 0) {
                DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
                                  (unsigned)request->pnn,
                                  (unsigned long long)request->srvid));
        } else {
                DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
                                 (unsigned)request->pnn,
                                 (unsigned long long)request->srvid));
        }

        talloc_free(request);
}
static void srvid_requests_reply(struct ctdb_context *ctdb,
                                 struct srvid_requests **requests,
                                 TDB_DATA result)
{
        struct srvid_list *r;

        if (*requests == NULL) {
                return;
        }

        for (r = (*requests)->requests; r != NULL; r = r->next) {
                srvid_request_reply(ctdb, r->request, result);
        }

        /* Free the list structure... */
        TALLOC_FREE(*requests);
}
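/* Queue an SRVID request for later processing.  If it cannot be
 * queued then an error reply is sent back to the requester. */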
static void srvid_request_add(struct ctdb_context *ctdb,
                              struct srvid_requests **requests,
                              struct ctdb_srvid_message *request)
{
        struct srvid_list *t;
        int32_t ret;
        TDB_DATA result;

        if (*requests == NULL) {
                *requests = talloc_zero(ctdb, struct srvid_requests);
                if (*requests == NULL) {
                        goto nomem;
                }
        }

        t = talloc_zero(*requests, struct srvid_list);
        if (t == NULL) {
                /* If *requests was just allocated above then free it */
                if ((*requests)->requests == NULL) {
                        TALLOC_FREE(*requests);
                }
                goto nomem;
        }

        t->request = (struct ctdb_srvid_message *)talloc_steal(t, request);
        DLIST_ADD((*requests)->requests, t);

        return;

nomem:
        /* Failed to add the request to the list.  Send a fail. */
        DEBUG(DEBUG_ERR, (__location__
                          " Out of memory, failed to queue SRVID request\n"));
        ret = -ENOMEM;
        result.dsize = sizeof(ret);
        result.dptr = (uint8_t *)&ret;
        srvid_request_reply(ctdb, request, result);
}
/* An abstraction to allow an operation (takeover runs, recoveries,
 * ...) to be disabled for a given timeout */
struct ctdb_op_state {
        struct tevent_timer *timer;
        bool in_progress;
        const char *name;
};
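/* Allocate a new operation-state tracker on the given talloc context */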
static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
{
        struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);

        if (state != NULL) {
                state->name = name;
                state->in_progress = false;
        }

        return state;
}
static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
{
        return state->timer != NULL;
}
static bool ctdb_op_begin(struct ctdb_op_state *state)
{
        if (ctdb_op_is_disabled(state)) {
                DEBUG(DEBUG_NOTICE,
                      ("Unable to begin - %s are disabled\n", state->name));
                return false;
        }

        state->in_progress = true;
        return true;
}
static bool ctdb_op_end(struct ctdb_op_state *state)
{
        return state->in_progress = false;
}

static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
{
        return state->in_progress;
}
static void ctdb_op_enable(struct ctdb_op_state *state)
{
        TALLOC_FREE(state->timer);
}

static void ctdb_op_timeout_handler(struct tevent_context *ev,
                                    struct tevent_timer *te,
                                    struct timeval yt, void *p)
{
        struct ctdb_op_state *state =
                talloc_get_type(p, struct ctdb_op_state);

        DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
        ctdb_op_enable(state);
}
static int ctdb_op_disable(struct ctdb_op_state *state,
                           struct tevent_context *ev,
                           uint32_t timeout)
{
        if (timeout == 0) {
                DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
                ctdb_op_enable(state);
                return 0;
        }

        if (state->in_progress) {
                DEBUG(DEBUG_ERR,
                      ("Unable to disable %s - in progress\n", state->name));
                return -EAGAIN;
        }

        DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
                            state->name, timeout));

        /* Clear any old timers */
        talloc_free(state->timer);

        /* Arrange for the timeout to occur */
        state->timer = tevent_add_timer(ev, state,
                                        timeval_current_ofs(timeout, 0),
                                        ctdb_op_timeout_handler, state);
        if (state->timer == NULL) {
                DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
                return -ENOMEM;
        }

        return 0;
}
struct ctdb_banning_state {
        uint32_t count;
        struct timeval last_reported_time;
};
/*
  private state of recovery daemon
 */
struct ctdb_recoverd {
        struct ctdb_context *ctdb;
        uint32_t recmaster;
        uint32_t last_culprit_node;
        struct ctdb_node_map_old *nodemap;
        struct timeval priority_time;
        bool need_takeover_run;
        bool need_recovery;
        uint32_t node_flags;
        struct tevent_timer *send_election_te;
        struct tevent_timer *election_timeout;
        struct srvid_requests *reallocate_requests;
        struct ctdb_op_state *takeover_run;
        struct ctdb_op_state *recovery;
        struct ctdb_iface_list_old *ifaces;
        uint32_t *force_rebalance_nodes;
        struct ctdb_node_capabilities *caps;
        bool frozen_on_inactive;
        struct ctdb_cluster_mutex_handle *recovery_lock_handle;
};
#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
#define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)

static void ctdb_restart_recd(struct tevent_context *ev,
                              struct tevent_timer *te, struct timeval t,
                              void *private_data);
/*
  ban a node for a period of time
 */
static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
{
        int ret;
        struct ctdb_context *ctdb = rec->ctdb;
        struct ctdb_ban_state bantime;

        if (!ctdb_validate_pnn(ctdb, pnn)) {
                DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
                return;
        }

        DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));

        bantime.pnn  = pnn;
        bantime.time = ban_time;

        ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
        if (ret != 0) {
                DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
                return;
        }
}
enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
/*
  remember the trouble maker
 */
static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
{
        struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
        struct ctdb_banning_state *ban_state;

        if (culprit > ctdb->num_nodes) {
                DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
                return;
        }

        /* If we are banned or stopped, do not set other nodes as culprits */
        if (rec->node_flags & NODE_FLAGS_INACTIVE) {
                DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
                return;
        }

        if (ctdb->nodes[culprit]->ban_state == NULL) {
                ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
                CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
        }

        ban_state = ctdb->nodes[culprit]->ban_state;
        if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
                /* this was the first time in a long while this node
                   misbehaved so we will forgive any old transgressions.
                */
                ban_state->count = 0;
        }

        ban_state->count += count;
        ban_state->last_reported_time = timeval_current();
        rec->last_culprit_node = culprit;
}
/*
  remember the trouble maker
 */
static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
{
        ctdb_set_culprit_count(rec, culprit, 1);
}
/*
  Retrieve capabilities from all connected nodes
 */
static int update_capabilities(struct ctdb_recoverd *rec,
                               struct ctdb_node_map_old *nodemap)
{
        uint32_t *capp;
        TALLOC_CTX *tmp_ctx;
        struct ctdb_node_capabilities *caps;
        struct ctdb_context *ctdb = rec->ctdb;

        tmp_ctx = talloc_new(rec);
        CTDB_NO_MEMORY(ctdb, tmp_ctx);

        caps = ctdb_get_capabilities(ctdb, tmp_ctx,
                                     CONTROL_TIMEOUT(), nodemap);
        if (caps == NULL) {
                DEBUG(DEBUG_ERR,
                      (__location__ " Failed to get node capabilities\n"));
                talloc_free(tmp_ctx);
                return -1;
        }

        capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
        if (capp == NULL) {
                DEBUG(DEBUG_ERR,
                      (__location__
                       " Capabilities don't include current node.\n"));
                talloc_free(tmp_ctx);
                return -1;
        }
        ctdb->capabilities = *capp;

        TALLOC_FREE(rec->caps);
        rec->caps = talloc_steal(rec, caps);

        talloc_free(tmp_ctx);
        return 0;
}
/*
  change recovery mode on all nodes
 */
static int set_recovery_mode(struct ctdb_context *ctdb,
                             struct ctdb_recoverd *rec,
                             struct ctdb_node_map_old *nodemap,
                             uint32_t rec_mode)
{
        TDB_DATA data;
        uint32_t *nodes;
        TALLOC_CTX *tmp_ctx;

        tmp_ctx = talloc_new(ctdb);
        CTDB_NO_MEMORY(ctdb, tmp_ctx);

        nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);

        data.dsize = sizeof(uint32_t);
        data.dptr = (unsigned char *)&rec_mode;

        if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
                                      nodes, 0,
                                      CONTROL_TIMEOUT(),
                                      false, data,
                                      NULL, NULL,
                                      NULL) != 0) {
                DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
                talloc_free(tmp_ctx);
                return -1;
        }

        talloc_free(tmp_ctx);
        return 0;
}
/*
  ensure all other nodes have attached to any databases that we have
 */
static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
                                           uint32_t pnn, struct ctdb_dbid_map_old *dbmap, TALLOC_CTX *mem_ctx)
{
        unsigned int i, j, db;
        int ret;
        struct ctdb_dbid_map_old *remote_dbmap;

        /* verify that all other nodes have all our databases */
        for (j=0; j<nodemap->num; j++) {
                /* we don't need to check ourselves */
                if (nodemap->nodes[j].pnn == pnn) {
                        continue;
                }
                /* don't check nodes that are unavailable */
                if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
                        continue;
                }

                ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
                                         mem_ctx, &remote_dbmap);
                if (ret != 0) {
                        DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
                        return -1;
                }

                /* step through all local databases */
                for (db=0; db<dbmap->num;db++) {
                        const char *name;

                        for (i=0;i<remote_dbmap->num;i++) {
                                if (dbmap->dbs[db].db_id == remote_dbmap->dbs[i].db_id) {
                                        break;
                                }
                        }
                        /* the remote node already has this database */
                        if (i!=remote_dbmap->num) {
                                continue;
                        }
                        /* ok so we need to create this database */
                        ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
                                                  dbmap->dbs[db].db_id, mem_ctx,
                                                  &name);
                        if (ret != 0) {
                                DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
                                return -1;
                        }
                        ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
                                                 nodemap->nodes[j].pnn,
                                                 mem_ctx, name,
                                                 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
                        if (ret != 0) {
                                DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
                                return -1;
                        }
                }
        }

        return 0;
}
/*
  ensure we are attached to any databases that anyone else is attached to
 */
static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
                                          uint32_t pnn, struct ctdb_dbid_map_old **dbmap, TALLOC_CTX *mem_ctx)
{
        unsigned int i, j, db;
        int ret;
        struct ctdb_dbid_map_old *remote_dbmap;

        /* verify that we have all databases any other node has */
        for (j=0; j<nodemap->num; j++) {
                /* we don't need to check ourselves */
                if (nodemap->nodes[j].pnn == pnn) {
                        continue;
                }
                /* don't check nodes that are unavailable */
                if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
                        continue;
                }

                ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
                                         mem_ctx, &remote_dbmap);
                if (ret != 0) {
                        DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
                        return -1;
                }

                /* step through all databases on the remote node */
                for (db=0; db<remote_dbmap->num;db++) {
                        const char *name;

                        for (i=0;i<(*dbmap)->num;i++) {
                                if (remote_dbmap->dbs[db].db_id == (*dbmap)->dbs[i].db_id) {
                                        break;
                                }
                        }
                        /* we already have this db locally */
                        if (i!=(*dbmap)->num) {
                                continue;
                        }
                        /* ok so we need to create this database and
                           rebuild dbmap
                         */
                        ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
                                                  remote_dbmap->dbs[db].db_id, mem_ctx, &name);
                        if (ret != 0) {
                                DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
                                          nodemap->nodes[j].pnn));
                                return -1;
                        }
                        ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
                                                 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
                        if (ret != 0) {
                                DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
                                return -1;
                        }
                        ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
                        if (ret != 0) {
                                DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
                                return -1;
                        }
                }
        }

        return 0;
}
/*
  update flags on all active nodes
 */
static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap, uint32_t pnn, uint32_t flags)
{
        int ret;

        ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
        if (ret != 0) {
                DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
                return -1;
        }

        return 0;
}
/*
  called when a vacuum fetch has completed - just free it and do the next one
 */
static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
{
        talloc_free(state);
}
/**
 * Process one element of the vacuum fetch list:
 * Migrate it over to us with the special flag
 * CTDB_CALL_FLAG_VACUUM_MIGRATION.
 */
static bool vacuum_fetch_process_one(struct ctdb_db_context *ctdb_db,
                                     uint32_t pnn,
                                     struct ctdb_rec_data_old *r)
{
        struct ctdb_client_call_state *state;
        TDB_DATA data;
        struct ctdb_ltdb_header *hdr;
        struct ctdb_call call;

        ZERO_STRUCT(call);
        call.call_id = CTDB_NULL_FUNC;
        call.flags = CTDB_IMMEDIATE_MIGRATION;
        call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;

        call.key.dptr = &r->data[0];
        call.key.dsize = r->keylen;

        /* ensure we don't block this daemon - just skip a record if we can't get
           the chainlock */
        if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, call.key) != 0) {
                return true;
        }

        data = tdb_fetch(ctdb_db->ltdb->tdb, call.key);
        if (data.dptr == NULL) {
                tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
                return true;
        }

        if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
                free(data.dptr);
                tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
                return true;
        }

        hdr = (struct ctdb_ltdb_header *)data.dptr;
        if (hdr->dmaster == pnn) {
                /* its already local */
                free(data.dptr);
                tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
                return true;
        }

        free(data.dptr);

        state = ctdb_call_send(ctdb_db, &call);
        tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
        if (state == NULL) {
                DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
                return false;
        }
        state->async.fn = vacuum_fetch_callback;
        state->async.private_data = NULL;

        return true;
}
/*
  handler for vacuum fetch
*/
static void vacuum_fetch_handler(uint64_t srvid, TDB_DATA data,
                                 void *private_data)
{
        struct ctdb_recoverd *rec = talloc_get_type(
                private_data, struct ctdb_recoverd);
        struct ctdb_context *ctdb = rec->ctdb;
        struct ctdb_marshall_buffer *recs;
        unsigned int i;
        int ret;
        TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
        const char *name;
        struct ctdb_dbid_map_old *dbmap=NULL;
        bool persistent = false;
        struct ctdb_db_context *ctdb_db;
        struct ctdb_rec_data_old *r;

        recs = (struct ctdb_marshall_buffer *)data.dptr;

        if (recs->count == 0) {
                goto done;
        }

        /* work out if the database is persistent */
        ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
        if (ret != 0) {
                DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
                goto done;
        }

        for (i=0;i<dbmap->num;i++) {
                if (dbmap->dbs[i].db_id == recs->db_id) {
                        persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
                        break;
                }
        }
        if (i == dbmap->num) {
                DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
                goto done;
        }

        /* find the name of this database */
        if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
                DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
                goto done;
        }

        /* attach to it */
        ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
        if (ctdb_db == NULL) {
                DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
                goto done;
        }

        r = (struct ctdb_rec_data_old *)&recs->data[0];
        while (recs->count) {
                bool ok;

                ok = vacuum_fetch_process_one(ctdb_db, rec->ctdb->pnn, r);
                if (!ok) {
                        break;
                }

                r = (struct ctdb_rec_data_old *)(r->length + (uint8_t *)r);
                recs->count--;
        }

done:
        talloc_free(tmp_ctx);
}
/*
 * handler for database detach
 */
static void detach_database_handler(uint64_t srvid, TDB_DATA data,
                                    void *private_data)
{
        struct ctdb_recoverd *rec = talloc_get_type(
                private_data, struct ctdb_recoverd);
        struct ctdb_context *ctdb = rec->ctdb;
        uint32_t db_id;
        struct ctdb_db_context *ctdb_db;

        if (data.dsize != sizeof(db_id)) {
                return;
        }
        db_id = *(uint32_t *)data.dptr;

        ctdb_db = find_ctdb_db(ctdb, db_id);
        if (ctdb_db == NULL) {
                /* database is not attached */
                return;
        }

        DLIST_REMOVE(ctdb->db_list, ctdb_db);

        DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
                             ctdb_db->db_name));
        talloc_free(ctdb_db);
}
/*
  called when ctdb_wait_timeout should finish
 */
static void ctdb_wait_handler(struct tevent_context *ev,
                              struct tevent_timer *te,
                              struct timeval yt, void *p)
{
        uint32_t *timed_out = (uint32_t *)p;
        (*timed_out) = 1;
}
/*
  wait for a given number of seconds
 */
static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
{
        uint32_t timed_out = 0;
        time_t usecs = (secs - (time_t)secs) * 1000000;
        tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
                         ctdb_wait_handler, &timed_out);
        while (!timed_out) {
                tevent_loop_once(ctdb->ev);
        }
}
static bool fast_start;

/*
  called when an election times out (ends)
 */
static void ctdb_election_timeout(struct tevent_context *ev,
                                  struct tevent_timer *te,
                                  struct timeval t, void *p)
{
        struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
        rec->election_timeout = NULL;
        fast_start = false;

        DEBUG(DEBUG_WARNING,("Election period ended\n"));
}
/*
  wait for an election to finish. It finished election_timeout seconds after
  the last election packet is received
 */
static void ctdb_wait_election(struct ctdb_recoverd *rec)
{
        struct ctdb_context *ctdb = rec->ctdb;
        while (rec->election_timeout) {
                tevent_loop_once(ctdb->ev);
        }
}
/*
  Update our local flags from all remote connected nodes.
  This is only run when we are or we believe we are the recovery master
 */
static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
{
        unsigned int j;
        struct ctdb_context *ctdb = rec->ctdb;
        TALLOC_CTX *mem_ctx = talloc_new(ctdb);

        /* get the nodemap for all active remote nodes and verify
           they are the same as for this node
         */
        for (j=0; j<nodemap->num; j++) {
                struct ctdb_node_map_old *remote_nodemap=NULL;
                int ret;

                if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
                        continue;
                }
                if (nodemap->nodes[j].pnn == ctdb->pnn) {
                        continue;
                }

                ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
                                           mem_ctx, &remote_nodemap);
                if (ret != 0) {
                        DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
                                  nodemap->nodes[j].pnn));
                        ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
                        talloc_free(mem_ctx);
                        return -1;
                }
                if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
                        /* We should tell our daemon about this so it
                           updates its flags or else we will log the same
                           message again in the next iteration of recovery.
                           Since we are the recovery master we can just as
                           well update the flags on all nodes.
                        */
                        ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
                        if (ret != 0) {
                                DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
                                talloc_free(mem_ctx);
                                return -1;
                        }

                        /* Update our local copy of the flags in the recovery
                           daemon.
                        */
                        DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
                                 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
                                 nodemap->nodes[j].flags));
                        nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
                }
                talloc_free(remote_nodemap);
        }
        talloc_free(mem_ctx);
        return 0;
}
/* Create a new random generation id.
   The generation id can not be the INVALID_GENERATION id
*/
static uint32_t new_generation(void)
{
        uint32_t generation;

        while (1) {
                generation = random();

                if (generation != INVALID_GENERATION) {
                        break;
                }
        }

        return generation;
}
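/* True while this node holds the recovery lock */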
static bool ctdb_recovery_have_lock(struct ctdb_recoverd *rec)
{
        return (rec->recovery_lock_handle != NULL);
}
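/* Callback state for taking the recovery lock via the cluster mutex
 * helper; status '0' means the lock was taken, '1' means contention. */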
struct hold_reclock_state {
        bool done;
        bool locked;
        double latency;
};

static void take_reclock_handler(char status,
                                 double latency,
                                 void *private_data)
{
        struct hold_reclock_state *s =
                (struct hold_reclock_state *) private_data;

        switch (status) {
        case '0':
                s->latency = latency;
                break;

        case '1':
                DEBUG(DEBUG_ERR,
                      ("Unable to take recovery lock - contention\n"));
                break;

        default:
                DEBUG(DEBUG_ERR, ("ERROR: when taking recovery lock\n"));
        }

        s->done = true;
        s->locked = (status == '0') ;
}
static bool ctdb_recovery_lock(struct ctdb_recoverd *rec);

static void lost_reclock_handler(void *private_data)
{
        struct ctdb_recoverd *rec = talloc_get_type_abort(
                private_data, struct ctdb_recoverd);

        DEBUG(DEBUG_ERR,
              ("Recovery lock helper terminated unexpectedly - "
               "trying to retake recovery lock\n"));
        TALLOC_FREE(rec->recovery_lock_handle);
        if (! ctdb_recovery_lock(rec)) {
                DEBUG(DEBUG_ERR, ("Failed to take recovery lock\n"));
        }
}

static bool ctdb_recovery_lock(struct ctdb_recoverd *rec)
{
        struct ctdb_context *ctdb = rec->ctdb;
        struct ctdb_cluster_mutex_handle *h;
        struct hold_reclock_state s = {
                .done = false,
                .locked = false,
                .latency = 0,
        };

        h = ctdb_cluster_mutex(rec, ctdb, ctdb->recovery_lock, 0,
                               take_reclock_handler, &s,
                               lost_reclock_handler, rec);
        if (h == NULL) {
                return false;
        }

        while (!s.done) {
                tevent_loop_once(ctdb->ev);
        }

        if (! s.locked) {
                talloc_free(h);
                return false;
        }

        rec->recovery_lock_handle = h;
        ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(),
                                           s.latency);

        return true;
}
static void ctdb_recovery_unlock(struct ctdb_recoverd *rec)
{
        if (rec->recovery_lock_handle != NULL) {
                DEBUG(DEBUG_NOTICE, ("Releasing recovery lock\n"));
                TALLOC_FREE(rec->recovery_lock_handle);
        }
}
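/* Ban any node that has collected too many banning credits (2 credits
 * per node in the cluster).  Sets *self_ban if this node banned itself. */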
static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
{
        struct ctdb_context *ctdb = rec->ctdb;
        unsigned int i;
        struct ctdb_banning_state *ban_state;

        *self_ban = false;
        for (i=0; i<ctdb->num_nodes; i++) {
                if (ctdb->nodes[i]->ban_state == NULL) {
                        continue;
                }
                ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
                if (ban_state->count < 2*ctdb->num_nodes) {
                        continue;
                }

                DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
                        ctdb->nodes[i]->pnn, ban_state->count,
                        ctdb->tunable.recovery_ban_period));
                ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
                ban_state->count = 0;

                /* Banning ourself? */
                if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
                        *self_ban = true;
                }
        }
}
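/* State for running an external helper process; the helper reports
 * its result as an int written to the pipe end held in fd[1]. */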
struct helper_state {
        int fd[2];
        pid_t pid;
        int result;
        bool done;
};

static void helper_handler(struct tevent_context *ev,
                           struct tevent_fd *fde,
                           uint16_t flags, void *private_data)
{
        struct helper_state *state = talloc_get_type_abort(
                private_data, struct helper_state);
        int ret;

        ret = sys_read(state->fd[0], &state->result, sizeof(state->result));
        if (ret != sizeof(state->result)) {
                state->result = EPIPE;
        }

        state->done = true;
}
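/* Fork a helper (takeover or recovery), pump the event loop until it
 * reports a result over the pipe, and return 0 on success. */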
static int helper_run(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
                      const char *prog, const char *arg, const char *type)
{
        struct helper_state *state;
        struct tevent_fd *fde;
        const char **args;
        int nargs, ret;

        state = talloc_zero(mem_ctx, struct helper_state);
        if (state == NULL) {
                DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
                return -1;
        }

        state->pid = -1;

        ret = pipe(state->fd);
        if (ret != 0) {
                DEBUG(DEBUG_ERR,
                      ("Failed to create pipe for %s helper\n", type));
                goto fail;
        }

        set_close_on_exec(state->fd[0]);

        nargs = 4;
        args = talloc_array(state, const char *, nargs);
        if (args == NULL) {
                DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
                goto fail;
        }

        args[0] = talloc_asprintf(args, "%d", state->fd[1]);
        if (args[0] == NULL) {
                DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
                goto fail;
        }
        args[1] = rec->ctdb->daemon.name;
        args[2] = arg;
        args[3] = NULL;

        if (args[2] == NULL) {
                nargs = 3;
        }

        state->pid = ctdb_vfork_exec(state, rec->ctdb, prog, nargs, args);
        if (state->pid == -1) {
                DEBUG(DEBUG_ERR,
                      ("Failed to create child for %s helper\n", type));
                goto fail;
        }

        close(state->fd[1]);
        state->fd[1] = -1;

        state->done = false;

        fde = tevent_add_fd(rec->ctdb->ev, rec->ctdb, state->fd[0],
                            TEVENT_FD_READ, helper_handler, state);
        if (fde == NULL) {
                goto fail;
        }
        tevent_fd_set_auto_close(fde);

        while (!state->done) {
                tevent_loop_once(rec->ctdb->ev);
        }

        close(state->fd[0]);
        state->fd[0] = -1;

        if (state->result != 0) {
                goto fail;
        }

        ctdb_kill(rec->ctdb, state->pid, SIGKILL);
        talloc_free(state);
        return 0;

fail:
        if (state->fd[0] != -1) {
                close(state->fd[0]);
        }
        if (state->fd[1] != -1) {
                close(state->fd[1]);
        }
        if (state->pid != -1) {
                ctdb_kill(rec->ctdb, state->pid, SIGKILL);
        }
        talloc_free(state);
        return -1;
}
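/* Run the takeover helper; arg is a comma-separated list of PNNs that
 * should be force-rebalanced, or NULL. */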
static int ctdb_takeover(struct ctdb_recoverd *rec,
                         uint32_t *force_rebalance_nodes)
{
        static char prog[PATH_MAX+1] = "";
        char *arg;
        unsigned int i;

        if (!ctdb_set_helper("takeover_helper", prog, sizeof(prog),
                             "CTDB_TAKEOVER_HELPER", CTDB_HELPER_BINDIR,
                             "ctdb_takeover_helper")) {
                ctdb_die(rec->ctdb, "Unable to set takeover helper\n");
        }

        arg = NULL;
        for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
                uint32_t pnn = force_rebalance_nodes[i];
                if (arg == NULL) {
                        arg = talloc_asprintf(rec, "%u", pnn);
                } else {
                        arg = talloc_asprintf_append(arg, ",%u", pnn);
                }
                if (arg == NULL) {
                        DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
                        return -1;
                }
        }

        return helper_run(rec, rec, prog, arg, "takeover");
}
static bool do_takeover_run(struct ctdb_recoverd *rec,
                            struct ctdb_node_map_old *nodemap)
{
        uint32_t *nodes = NULL;
        struct ctdb_disable_message dtr;
        TDB_DATA data;
        size_t i;
        uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
        int ret;
        bool ok;

        DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));

        if (ctdb_op_is_in_progress(rec->takeover_run)) {
                DEBUG(DEBUG_ERR, (__location__
                                  " takeover run already in progress \n"));
                ok = false;
                goto done;
        }

        if (!ctdb_op_begin(rec->takeover_run)) {
                ok = false;
                goto done;
        }

        /* Disable IP checks (takeover runs, really) on other nodes
         * while doing this takeover run.  This will stop those other
         * nodes from triggering takeover runs when they think they
         * should be hosting an IP but it isn't yet on an interface.
         * Don't wait for replies since a failure here might cause some
         * noise in the logs but will not actually cause a problem.
         */
        ZERO_STRUCT(dtr);
        dtr.srvid = 0; /* No reply */
        dtr.pnn = -1;

        data.dptr  = (uint8_t*)&dtr;
        data.dsize = sizeof(dtr);

        nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);

        /* Disable for 60 seconds.  This can be a tunable later if
         * necessary.
         */
        dtr.timeout = 60;
        for (i = 0; i < talloc_array_length(nodes); i++) {
                if (ctdb_client_send_message(rec->ctdb, nodes[i],
                                             CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
                                             data) != 0) {
                        DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
                }
        }

        ret = ctdb_takeover(rec, rec->force_rebalance_nodes);

        /* Reenable takeover runs and IP checks on other nodes */
        dtr.timeout = 0;
        for (i = 0; i < talloc_array_length(nodes); i++) {
                if (ctdb_client_send_message(rec->ctdb, nodes[i],
                                             CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
                                             data) != 0) {
                        DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n"));
                }
        }

        if (ret != 0) {
                DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
                ok = false;
                goto done;
        }

        ok = true;
        /* Takeover run was successful so clear force rebalance targets */
        if (rebalance_nodes == rec->force_rebalance_nodes) {
                TALLOC_FREE(rec->force_rebalance_nodes);
        } else {
                DEBUG(DEBUG_WARNING,
                      ("Rebalance target nodes changed during takeover run - not clearing\n"));
        }
done:
        rec->need_takeover_run = !ok;
        talloc_free(nodes);
        ctdb_op_end(rec->takeover_run);

        DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
        return ok;
}
static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
{
        static char prog[PATH_MAX+1] = "";
        const char *arg;

        if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog),
                             "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
                             "ctdb_recovery_helper")) {
                ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
        }

        arg = talloc_asprintf(mem_ctx, "%u", new_generation());
        if (arg == NULL) {
                DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
                return -1;
        }

        setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1);

        return helper_run(rec, mem_ctx, prog, arg, "recovery");
}
/*
  we are the recmaster, and recovery is needed - start a recovery run
 */
static int do_recovery(struct ctdb_recoverd *rec,
                       TALLOC_CTX *mem_ctx, uint32_t pnn,
                       struct ctdb_node_map_old *nodemap, struct ctdb_vnn_map *vnnmap)
{
        struct ctdb_context *ctdb = rec->ctdb;
        unsigned int i;
        int ret;
        struct ctdb_dbid_map_old *dbmap;
        bool self_ban;

        DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));

        /* Check if the current node is still the recmaster.  It's possible that
         * re-election has changed the recmaster.
         */
        if (pnn != rec->recmaster) {
                DEBUG(DEBUG_NOTICE,
                      ("Recovery master changed to %u, aborting recovery\n",
                       rec->recmaster));
                return -1;
        }

        /* if recovery fails, force it again */
        rec->need_recovery = true;

        if (!ctdb_op_begin(rec->recovery)) {
                return -1;
        }

        if (rec->election_timeout) {
                /* an election is in progress */
                DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
                goto fail;
        }

        ban_misbehaving_nodes(rec, &self_ban);
        if (self_ban) {
                DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
                goto fail;
        }

        if (ctdb->recovery_lock != NULL) {
                if (ctdb_recovery_have_lock(rec)) {
                        DEBUG(DEBUG_NOTICE, ("Already holding recovery lock\n"));
                } else {
                        DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock (%s)\n",
                                             ctdb->recovery_lock));
                        if (!ctdb_recovery_lock(rec)) {
                                if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
                                        /* If ctdb is trying first recovery, it's
                                         * possible that current node does not know
                                         * yet who the recmaster is.
                                         */
                                        DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
                                                          " - retrying recovery\n"));
                                        goto fail;
                                }

                                DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
                                                 "and ban ourself for %u seconds\n",
                                                 ctdb->tunable.recovery_ban_period));
                                ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
                                goto fail;
                        }
                        DEBUG(DEBUG_NOTICE,
                              ("Recovery lock taken successfully by recovery daemon\n"));
                }
        }

        DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));

        /* get a list of all databases */
        ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
        if (ret != 0) {
                DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
                goto fail;
        }

        /* we do the db creation before we set the recovery mode, so the freeze happens
           on all databases we will be dealing with. */

        /* verify that we have all the databases any other node has */
        ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
        if (ret != 0) {
                DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
                goto fail;
        }

        /* verify that all other nodes have all our databases */
        ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
        if (ret != 0) {
                DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
                goto fail;
        }
        DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));

        /* Retrieve capabilities from all connected nodes */
        ret = update_capabilities(rec, nodemap);
        if (ret!=0) {
                DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
                return -1;
        }

        /*
          update all nodes to have the same flags that we have
         */
        for (i=0;i<nodemap->num;i++) {
                if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
                        continue;
                }

                ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
                if (ret != 0) {
                        if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
                                DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
                        } else {
                                DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
                                return -1;
                        }
                }
        }

        DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));

        ret = db_recovery_parallel(rec, mem_ctx);
        if (ret != 0) {
                goto fail;
        }

        do_takeover_run(rec, nodemap);

        /* send a message to all clients telling them that the cluster
           has been reconfigured */
        ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
                                       CTDB_SRVID_RECONFIGURE, tdb_null);
        if (ret != 0) {
                DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
                goto fail;
        }

        DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));

        rec->need_recovery = false;
        ctdb_op_end(rec->recovery);

        /* we managed to complete a full recovery, make sure to forgive
           any past sins by the nodes that could now participate in the
           recovery.
        */
        DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
        for (i=0;i<nodemap->num;i++) {
                struct ctdb_banning_state *ban_state;

                if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
                        continue;
                }

                ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
                if (ban_state == NULL) {
                        continue;
                }

                ban_state->count = 0;
        }

        /* We just finished a recovery successfully.
           We now wait for rerecovery_timeout before we allow
           another recovery to take place.
        */
        DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
        ctdb_op_disable(rec->recovery, ctdb->ev,
                        ctdb->tunable.rerecovery_timeout);
        return 0;

fail:
        ctdb_op_end(rec->recovery);
        return -1;
}
/*
  elections are won by first checking the number of connected nodes, then
  the priority time, then the pnn
 */
struct election_message {
        uint32_t num_connected;
        struct timeval priority_time;
        uint32_t pnn;
        uint32_t node_flags;
};
/*
  form this node's election data
 */
static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
{
        unsigned int i;
        int ret;
        struct ctdb_node_map_old *nodemap;
        struct ctdb_context *ctdb = rec->ctdb;

        ZERO_STRUCTP(em);

        em->pnn = rec->ctdb->pnn;
        em->priority_time = rec->priority_time;

        ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
        if (ret != 0) {
                DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
                return;
        }

        rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
        em->node_flags = rec->node_flags;

        for (i=0;i<nodemap->num;i++) {
                if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
                        em->num_connected++;
                }
        }

        /* we shouldn't try to win this election if we can't be a recmaster */
        if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
                em->num_connected = 0;
                em->priority_time = timeval_current();
        }

        talloc_free(nodemap);
}
/*
  see if the given election data wins
 */
static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
{
        struct election_message myem;
        int cmp = 0;

        ctdb_election_data(rec, &myem);

        /* we can't win if we don't have the recmaster capability */
        if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
                return false;
        }

        /* we can't win if we are banned */
        if (rec->node_flags & NODE_FLAGS_BANNED) {
                return false;
        }

        /* we can't win if we are stopped */
        if (rec->node_flags & NODE_FLAGS_STOPPED) {
                return false;
        }

        /* we will automatically win if the other node is banned */
        if (em->node_flags & NODE_FLAGS_BANNED) {
                return true;
        }

        /* we will automatically win if the other node is stopped */
        if (em->node_flags & NODE_FLAGS_STOPPED) {
                return true;
        }

        /* try to use the most connected node */
        if (cmp == 0) {
                cmp = (int)myem.num_connected - (int)em->num_connected;
        }

        /* then the longest running node */
        if (cmp == 0) {
                cmp = timeval_compare(&em->priority_time, &myem.priority_time);
        }

        /* then the lowest pnn */
        if (cmp == 0) {
                cmp = (int)myem.pnn - (int)em->pnn;
        }

        return cmp > 0;
}
/*
  send out an election request
 */
static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
{
        int ret;
        TDB_DATA election_data;
        struct election_message emsg;
        uint64_t srvid;
        struct ctdb_context *ctdb = rec->ctdb;

        srvid = CTDB_SRVID_ELECTION;

        ctdb_election_data(rec, &emsg);

        election_data.dsize = sizeof(struct election_message);
        election_data.dptr  = (unsigned char *)&emsg;

        /* first we assume we will win the election and set
           recoverymaster to be ourself on the current node
         */
        ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
                                     CTDB_CURRENT_NODE, pnn);
        if (ret != 0) {
                DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster\n"));
                return -1;
        }
        rec->recmaster = pnn;

        /* send an election message to all active nodes */
        DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
        return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
}
/*
  we think we are winning the election - send a broadcast election request
 */
static void election_send_request(struct tevent_context *ev,
                                  struct tevent_timer *te,
                                  struct timeval t, void *p)
{
        struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
        int ret;

        ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
        if (ret != 0) {
                DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
        }

        TALLOC_FREE(rec->send_election_te);
}
/*
  handler for memory dumps
*/
static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
{
        struct ctdb_recoverd *rec = talloc_get_type(
                private_data, struct ctdb_recoverd);
        struct ctdb_context *ctdb = rec->ctdb;
        TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
        TDB_DATA *dump;
        int ret;
        struct ctdb_srvid_message *rd;

        if (data.dsize != sizeof(struct ctdb_srvid_message)) {
                DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
                talloc_free(tmp_ctx);
                return;
        }
        rd = (struct ctdb_srvid_message *)data.dptr;

        dump = talloc_zero(tmp_ctx, TDB_DATA);
        if (dump == NULL) {
                DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
                talloc_free(tmp_ctx);
                return;
        }
        ret = ctdb_dump_memory(ctdb, dump);
        if (ret != 0) {
                DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
                talloc_free(tmp_ctx);
                return;
        }

        DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));

        ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
        if (ret != 0) {
                DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
                talloc_free(tmp_ctx);
                return;
        }

        talloc_free(tmp_ctx);
}
/*
  handler for reload_nodes
*/
static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
                                 void *private_data)
{
        struct ctdb_recoverd *rec = talloc_get_type(
                private_data, struct ctdb_recoverd);

        DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));

        ctdb_load_nodes_file(rec->ctdb);
}
static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
                                        void *private_data)
{
        struct ctdb_recoverd *rec = talloc_get_type(
                private_data, struct ctdb_recoverd);
        struct ctdb_context *ctdb = rec->ctdb;
        uint32_t pnn;
        uint32_t *t;
        int len;

        if (rec->recmaster != ctdb_get_pnn(ctdb)) {
                return;
        }

        if (data.dsize != sizeof(uint32_t)) {
                DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
                return;
        }

        pnn = *(uint32_t *)&data.dptr[0];

        DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));

        /* Copy any existing list of nodes.  There's probably some
         * sort of realloc variant that will do this but we need to
         * make sure that freeing the old array also cancels the timer
         * event for the timeout... not sure if realloc will do that.
         */
        len = (rec->force_rebalance_nodes != NULL) ?
                talloc_array_length(rec->force_rebalance_nodes) :
                0;

        /* This allows duplicates to be added but they don't cause
         * harm.  A call to add a duplicate PNN arguably means that
         * the timeout should be reset, so this is the simplest
         * solution.
         */
        t = talloc_zero_array(rec, uint32_t, len+1);
        CTDB_NO_MEMORY_VOID(ctdb, t);
        memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
        t[len] = pnn;

        talloc_free(rec->force_rebalance_nodes);

        rec->force_rebalance_nodes = t;
}
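/* Common handler body for SRVID "disable" requests: validate the
 * message, disable the given operation for the requested timeout and
 * reply with our PNN on success. */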
static void srvid_disable_and_reply(struct ctdb_context *ctdb,
                                    TDB_DATA data,
                                    struct ctdb_op_state *op_state)
{
        struct ctdb_disable_message *r;
        uint32_t timeout;
        TDB_DATA result;
        int32_t ret = 0;

        /* Validate input data */
        if (data.dsize != sizeof(struct ctdb_disable_message)) {
                DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
                                 "expecting %lu\n", (long unsigned)data.dsize,
                                 (long unsigned)sizeof(struct ctdb_srvid_message)));
                return;
        }
        if (data.dptr == NULL) {
                DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
                return;
        }

        r = (struct ctdb_disable_message *)data.dptr;
        timeout = r->timeout;

        ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
        if (ret != 0) {
                goto done;
        }

        /* Returning our PNN tells the caller that we succeeded */
        ret = ctdb_get_pnn(ctdb);
done:
        result.dsize = sizeof(int32_t);
        result.dptr  = (uint8_t *)&ret;
        srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
}
static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
                                          void *private_data)
{
        struct ctdb_recoverd *rec = talloc_get_type(
                private_data, struct ctdb_recoverd);

        srvid_disable_and_reply(rec->ctdb, data, rec->takeover_run);
}
/* Backward compatibility for this SRVID */
static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
                                     void *private_data)
{
        struct ctdb_recoverd *rec = talloc_get_type(
                private_data, struct ctdb_recoverd);
        uint32_t timeout;

        if (data.dsize != sizeof(uint32_t)) {
                DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
                                 "expecting %lu\n", (long unsigned)data.dsize,
                                 (long unsigned)sizeof(uint32_t)));
                return;
        }
        if (data.dptr == NULL) {
                DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
                return;
        }

        timeout = *((uint32_t *)data.dptr);

        ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout);
}
static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
                                       void *private_data)
{
        struct ctdb_recoverd *rec = talloc_get_type(
                private_data, struct ctdb_recoverd);

        srvid_disable_and_reply(rec->ctdb, data, rec->recovery);
}
/*
  handler for ip reallocate, just add it to the list of requests and
  handle this later in the monitor_cluster loop so we do not recurse
  with other requests to takeover_run()
*/
static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
                                  void *private_data)
{
        struct ctdb_srvid_message *request;
        struct ctdb_recoverd *rec = talloc_get_type(
                private_data, struct ctdb_recoverd);

        if (data.dsize != sizeof(struct ctdb_srvid_message)) {
                DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
                return;
        }

        request = (struct ctdb_srvid_message *)data.dptr;

        srvid_request_add(rec->ctdb, &rec->reallocate_requests, request);
}
static void process_ipreallocate_requests(struct ctdb_context *ctdb,
                                          struct ctdb_recoverd *rec)
{
        TDB_DATA result;
        int32_t ret;
        struct srvid_requests *current;

        /* Only process requests that are currently pending.  More
         * might come in while the takeover run is in progress and
         * they will need to be processed later since they might
         * be in response to flag changes.
         */
        current = rec->reallocate_requests;
        rec->reallocate_requests = NULL;

        if (do_takeover_run(rec, rec->nodemap)) {
                ret = ctdb_get_pnn(ctdb);
        } else {
                ret = -1;
        }

        result.dsize = sizeof(int32_t);
        result.dptr  = (uint8_t *)&ret;

        srvid_requests_reply(ctdb, &current, result);
}
/*
 * handler for assigning banning credits
 */
static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
{
        struct ctdb_recoverd *rec = talloc_get_type(
                private_data, struct ctdb_recoverd);
        uint32_t ban_pnn;

        /* Ignore if we are not recmaster */
        if (rec->ctdb->pnn != rec->recmaster) {
                return;
        }

        if (data.dsize != sizeof(uint32_t)) {
                DEBUG(DEBUG_ERR, (__location__ "invalid data size %zu\n",
                                  data.dsize));
                return;
        }

        ban_pnn = *(uint32_t *)data.dptr;

        ctdb_set_culprit_count(rec, ban_pnn, rec->nodemap->num);
}
/*
  handler for recovery master elections
*/
static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
{
        struct ctdb_recoverd *rec = talloc_get_type(
                private_data, struct ctdb_recoverd);
        struct ctdb_context *ctdb = rec->ctdb;
        int ret;
        struct election_message *em = (struct election_message *)data.dptr;

        /* Ignore election packets from ourself */
        if (ctdb->pnn == em->pnn) {
                return;
        }

        /* we got an election packet - update the timeout for the election */
        talloc_free(rec->election_timeout);
        rec->election_timeout = tevent_add_timer(
                        ctdb->ev, ctdb,
                        fast_start ?
                        timeval_current_ofs(0, 500000) :
                        timeval_current_ofs(ctdb->tunable.election_timeout, 0),
                        ctdb_election_timeout, rec);

        /* someone called an election. check their election data
           and if we disagree and we would rather be the elected node,
           send a new election message to all other nodes
         */
        if (ctdb_election_win(rec, em)) {
                if (!rec->send_election_te) {
                        rec->send_election_te = tevent_add_timer(
                                        ctdb->ev, rec,
                                        timeval_current_ofs(0, 500000),
                                        election_send_request, rec);
                }
                return;
        }

        /* we didn't win */
        TALLOC_FREE(rec->send_election_te);

        /* Release the recovery lock file */
        if (ctdb_recovery_have_lock(rec)) {
                ctdb_recovery_unlock(rec);
        }

        /* ok, let that guy become recmaster then */
        ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
                                     CTDB_CURRENT_NODE, em->pnn);
        if (ret != 0) {
                DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster"));
                return;
        }
        rec->recmaster = em->pnn;

        return;
}
/*
  force the start of the election process
 */
static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
                           struct ctdb_node_map_old *nodemap)
{
        int ret;
        struct ctdb_context *ctdb = rec->ctdb;

        DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));

        /* set all nodes to recovery mode to stop all internode traffic */
        ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
        if (ret != 0) {
                DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
                return;
        }

        talloc_free(rec->election_timeout);
        rec->election_timeout = tevent_add_timer(
                        ctdb->ev, ctdb,
                        fast_start ?
                        timeval_current_ofs(0, 500000) :
                        timeval_current_ofs(ctdb->tunable.election_timeout, 0),
                        ctdb_election_timeout, rec);

        ret = send_election_request(rec, pnn);
        if (ret!=0) {
                DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
                return;
        }

        /* wait for a few seconds to collect all responses */
        ctdb_wait_election(rec);
}
/*
  handler for when a node changes its flags
*/
static void monitor_handler(uint64_t srvid, TDB_DATA data, void *private_data)
{
        struct ctdb_recoverd *rec = talloc_get_type(
                private_data, struct ctdb_recoverd);
        struct ctdb_context *ctdb = rec->ctdb;
        int ret;
        struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
        struct ctdb_node_map_old *nodemap=NULL;
        TALLOC_CTX *tmp_ctx;
        unsigned int i;

        if (data.dsize != sizeof(*c)) {
                DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
                return;
        }

        tmp_ctx = talloc_new(ctdb);
        CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);

        ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
        if (ret != 0) {
                DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
                talloc_free(tmp_ctx);
                return;
        }

        for (i=0;i<nodemap->num;i++) {
                if (nodemap->nodes[i].pnn == c->pnn) break;
        }

        if (i == nodemap->num) {
                DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existent node %u\n", c->pnn));
                talloc_free(tmp_ctx);
                return;
        }

        if (c->old_flags != c->new_flags) {
                DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x  was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
        }

        nodemap->nodes[i].flags = c->new_flags;

        talloc_free(tmp_ctx);
}
/*
  handler for when we need to push out flag changes to all other nodes
*/
static void push_flags_handler(uint64_t srvid, TDB_DATA data,
                               void *private_data)
{
        struct ctdb_recoverd *rec = talloc_get_type(
                private_data, struct ctdb_recoverd);
        struct ctdb_context *ctdb = rec->ctdb;
        int ret;
        struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
        struct ctdb_node_map_old *nodemap=NULL;
        TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
        uint32_t *nodes;

        /* read the node flags from the recmaster */
        ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
                                   tmp_ctx, &nodemap);
        if (ret != 0) {
                DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
                talloc_free(tmp_ctx);
                return;
        }
        if (c->pnn >= nodemap->num) {
                DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
                talloc_free(tmp_ctx);
                return;
        }

        /* send the flags update to all connected nodes */
        nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);

        if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
                                      nodes, 0, CONTROL_TIMEOUT(),
                                      false, data,
                                      NULL, NULL,
                                      NULL) != 0) {
                DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));

                talloc_free(tmp_ctx);
                return;
        }

        talloc_free(tmp_ctx);
}
struct verify_recmode_normal_data {
        uint32_t count;
        enum monitor_result status;
};

static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
{
        struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);

        /* one more node has responded with recmode data*/
        rmdata->count--;

        /* if we failed to get the recmode, then return an error and let
           the main loop try again.
        */
        if (state->state != CTDB_CONTROL_DONE) {
                if (rmdata->status == MONITOR_OK) {
                        rmdata->status = MONITOR_FAILED;
                }
                return;
        }

        /* if we got a response, then the recmode will be stored in the
           status field
        */
        if (state->status != CTDB_RECOVERY_NORMAL) {
                DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
                rmdata->status = MONITOR_RECOVERY_NEEDED;
        }

        return;
}

/* verify that all nodes are in normal recovery mode */
static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
{
        struct verify_recmode_normal_data *rmdata;
        TALLOC_CTX *mem_ctx = talloc_new(ctdb);
        struct ctdb_client_control_state *state;
        enum monitor_result status;
        unsigned int j;

        rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
        CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
        rmdata->count  = 0;
        rmdata->status = MONITOR_OK;

        /* loop over all active nodes and send an async getrecmode call to
           them*/
        for (j=0; j<nodemap->num; j++) {
                if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
                        continue;
                }
                state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
                                        CONTROL_TIMEOUT(),
                                        nodemap->nodes[j].pnn);
                if (state == NULL) {
                        /* we failed to send the control, treat this as
                           an error and try again next iteration
                        */
                        DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
                        talloc_free(mem_ctx);
                        return MONITOR_FAILED;
                }

                /* set up the callback functions */
                state->async.fn = verify_recmode_normal_callback;
                state->async.private_data = rmdata;

                /* one more control to wait for to complete */
                rmdata->count++;
        }

        /* now wait for up to the maximum number of seconds allowed
           or until all nodes we expect a response from have replied
        */
        while (rmdata->count > 0) {
                tevent_loop_once(ctdb->ev);
        }

        status = rmdata->status;
        talloc_free(mem_ctx);
        return status;
}
struct verify_recmaster_data {
        struct ctdb_recoverd *rec;
        uint32_t count;
        uint32_t pnn;
        enum monitor_result status;
};

static void verify_recmaster_callback(struct ctdb_client_control_state *state)
{
        struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);

        /* one more node has responded with recmaster data*/
        rmdata->count--;

        /* if we failed to get the recmaster, then return an error and let
           the main loop try again.
        */
        if (state->state != CTDB_CONTROL_DONE) {
                if (rmdata->status == MONITOR_OK) {
                        rmdata->status = MONITOR_FAILED;
                }
                return;
        }

        /* if we got a response, then the recmaster will be stored in the
           status field
        */
        if (state->status != rmdata->pnn) {
                DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
                ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
                rmdata->status = MONITOR_ELECTION_NEEDED;
        }

        return;
}

/* verify that all nodes agree that we are the recmaster */
static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, uint32_t pnn)
{
        struct ctdb_context *ctdb = rec->ctdb;
        struct verify_recmaster_data *rmdata;
        TALLOC_CTX *mem_ctx = talloc_new(ctdb);
        struct ctdb_client_control_state *state;
        enum monitor_result status;
        unsigned int j;

        rmdata = talloc(mem_ctx, struct verify_recmaster_data);
        CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
        rmdata->rec    = rec;
        rmdata->count  = 0;
        rmdata->pnn    = pnn;
        rmdata->status = MONITOR_OK;

        /* loop over all active nodes and send an async getrecmaster call to
           them*/
        for (j=0; j<nodemap->num; j++) {
                if (nodemap->nodes[j].pnn == rec->recmaster) {
                        continue;
                }
                if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
                        continue;
                }
                state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
                                        CONTROL_TIMEOUT(),
                                        nodemap->nodes[j].pnn);
                if (state == NULL) {
                        /* we failed to send the control, treat this as
                           an error and try again next iteration
                        */
                        DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
                        talloc_free(mem_ctx);
                        return MONITOR_FAILED;
                }

                /* set up the callback functions */
                state->async.fn = verify_recmaster_callback;
                state->async.private_data = rmdata;

                /* one more control to wait for to complete */
                rmdata->count++;
        }

        /* now wait for up to the maximum number of seconds allowed
           or until all nodes we expect a response from have replied
        */
        while (rmdata->count > 0) {
                tevent_loop_once(ctdb->ev);
        }

        status = rmdata->status;
        talloc_free(mem_ctx);
        return status;
}
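/* Compare the local interface list (names and link states) with the
 * copy cached in the recovery daemon and remember the new list. */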
static bool interfaces_have_changed(struct ctdb_context *ctdb,
                                    struct ctdb_recoverd *rec)
{
        struct ctdb_iface_list_old *ifaces = NULL;
        TALLOC_CTX *mem_ctx;
        bool ret = false;

        mem_ctx = talloc_new(NULL);

        /* Read the interfaces from the local node */
        if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
                                 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
                DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
                /* We could return an error.  However, this will be
                 * rare so we'll decide that the interfaces have
                 * actually changed, just in case.
                 */
                talloc_free(mem_ctx);
                return true;
        }

        if (!rec->ifaces) {
                /* We haven't been here before so things have changed */
                DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
                ret = true;
        } else if (rec->ifaces->num != ifaces->num) {
                /* Number of interfaces has changed */
                DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
                                     rec->ifaces->num, ifaces->num));
                ret = true;
        } else {
                /* See if interface names or link states have changed */
                unsigned int i;
                for (i = 0; i < rec->ifaces->num; i++) {
                        struct ctdb_iface * iface = &rec->ifaces->ifaces[i];
                        if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
                                DEBUG(DEBUG_NOTICE,
                                      ("Interface in slot %d changed: %s => %s\n",
                                       i, iface->name, ifaces->ifaces[i].name));
                                ret = true;
                                break;
                        }
                        if (iface->link_state != ifaces->ifaces[i].link_state) {
                                DEBUG(DEBUG_NOTICE,
                                      ("Interface %s changed state: %d => %d\n",
                                       iface->name, iface->link_state,
                                       ifaces->ifaces[i].link_state));
                                ret = true;
                                break;
                        }
                }
        }

        talloc_free(rec->ifaces);
        rec->ifaces = talloc_steal(rec, ifaces);

        talloc_free(mem_ctx);
        return ret;
}
/* Check that the local allocation of public IP addresses is correct
 * and do some house-keeping */
static int verify_local_ip_allocation(struct ctdb_context *ctdb,
				      struct ctdb_recoverd *rec,
				      uint32_t pnn,
				      struct ctdb_node_map_old *nodemap)
{
	TALLOC_CTX *mem_ctx = talloc_new(NULL);
	int ret, j;
	bool need_takeover_run = false;
	struct ctdb_public_ip_list_old *ips = NULL;

	/* If we are not the recmaster then do some housekeeping */
	if (rec->recmaster != pnn) {
		/* Ignore any IP reallocate requests - only the recmaster
		 * processes them
		 */
		TALLOC_FREE(rec->reallocate_requests);
		/* Clear any nodes that should be force rebalanced in
		 * the next takeover run.  If the recovery master role
		 * has moved then we don't want to process these some
		 * time in the future.
		 */
		TALLOC_FREE(rec->force_rebalance_nodes);
	}

	/* Return early if disabled... */
	if (ctdb->tunable.disable_ip_failover != 0 ||
	    ctdb_op_is_disabled(rec->takeover_run)) {
		talloc_free(mem_ctx);
		return 0;
	}

	if (interfaces_have_changed(ctdb, rec)) {
		need_takeover_run = true;
	}

	/* If there are unhosted IPs but this node can host them then
	 * trigger an IP reallocation */

	/* Read *available* IPs from local node */
	ret = ctdb_ctrl_get_public_ips_flags(
		ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx,
		CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, ("Unable to retrieve available public IPs\n"));
		talloc_free(mem_ctx);
		return -1;
	}

	for (j = 0; j < ips->num; j++) {
		if (ips->ips[j].pnn == -1 &&
		    nodemap->nodes[pnn].flags == 0) {
			DEBUG(DEBUG_WARNING,
			      ("Unassigned IP %s can be served by this node\n",
			       ctdb_addr_to_str(&ips->ips[j].addr)));
			need_takeover_run = true;
		}
	}

	if (!ctdb->do_checkpublicip) {
		goto done;
	}

	/* Validate the IP addresses that this node has on network
	 * interfaces.  If there is an inconsistency between reality
	 * and the state expected by CTDB then try to fix it by
	 * triggering an IP reallocation or releasing extraneous IP
	 * addresses. */

	/* Read *known* IPs from local node */
	ret = ctdb_ctrl_get_public_ips_flags(
		ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, ("Unable to retrieve known public IPs\n"));
		talloc_free(mem_ctx);
		return -1;
	}

	for (j = 0; j < ips->num; j++) {
		if (ips->ips[j].pnn == pnn) {
			if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
				DEBUG(DEBUG_ERR,
				      ("Assigned IP %s not on an interface\n",
				       ctdb_addr_to_str(&ips->ips[j].addr)));
				need_takeover_run = true;
			}
		} else {
			if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
				DEBUG(DEBUG_ERR,
				      ("IP %s incorrectly on an interface\n",
				       ctdb_addr_to_str(&ips->ips[j].addr)));
				need_takeover_run = true;
			}
		}
	}

done:
	if (need_takeover_run) {
		struct ctdb_srvid_message rd;
		TDB_DATA data;

		DEBUG(DEBUG_NOTICE, ("Trigger takeoverrun\n"));

		ZERO_STRUCT(rd);
		rd.pnn = ctdb->pnn;
		rd.srvid = 0;
		data.dptr = (uint8_t *)&rd;
		data.dsize = sizeof(rd);

		ret = ctdb_client_send_message(ctdb, rec->recmaster,
					       CTDB_SRVID_TAKEOVER_RUN, data);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,
			      ("Failed to send takeover run request\n"));
		}
	}

	talloc_free(mem_ctx);
	return 0;
}
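
/* The takeover run above is triggered indirectly: rather than running the
 * takeover code itself, the node messages the recovery master on
 * CTDB_SRVID_TAKEOVER_RUN (handled by ip_reallocate_handler, registered
 * in monitor_cluster() below).  The request is then queued and processed
 * from the recmaster's main loop, even when this node is the recmaster
 * and is effectively messaging itself.
 */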
static void async_getnodemap_callback(struct ctdb_context *ctdb,
				      uint32_t node_pnn, int32_t res,
				      TDB_DATA outdata, void *callback_data)
{
	struct ctdb_node_map_old **remote_nodemaps = callback_data;

	if (node_pnn >= ctdb->num_nodes) {
		DEBUG(DEBUG_ERR, (__location__ " pnn from invalid node\n"));
		return;
	}

	remote_nodemaps[node_pnn] = (struct ctdb_node_map_old *)talloc_steal(
		remote_nodemaps, outdata.dptr);
}
static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
			       struct ctdb_node_map_old *nodemap,
			       struct ctdb_node_map_old **remote_nodemaps)
{
	uint32_t *nodes;

	nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
				      nodes, 0,
				      CONTROL_TIMEOUT(), false, tdb_null,
				      async_getnodemap_callback,
				      NULL,
				      remote_nodemaps) != 0) {
		DEBUG(DEBUG_ERR,
		      (__location__ " Unable to pull all remote nodemaps\n"));
		return -1;
	}

	return 0;
}
static bool validate_recovery_master(struct ctdb_recoverd *rec,
				     TALLOC_CTX *mem_ctx)
{
	struct ctdb_context *ctdb = rec->ctdb;
	uint32_t pnn = ctdb_get_pnn(ctdb);
	struct ctdb_node_map_old *nodemap = rec->nodemap;
	struct ctdb_node_map_old *recmaster_nodemap = NULL;
	int ret;

	/* When the recovery daemon is started, recmaster is set to
	 * "unknown" so it knows to start an election.
	 */
	if (rec->recmaster == CTDB_UNKNOWN_PNN) {
		DEBUG(DEBUG_NOTICE,
		      ("Initial recovery master set - forcing election\n"));
		force_election(rec, pnn, nodemap);
		return false;
	}

	/*
	 * If the current recmaster does not have CTDB_CAP_RECMASTER,
	 * but we have, then force an election and try to become the new
	 * recmaster.
	 */
	if (!ctdb_node_has_capabilities(rec->caps,
					rec->recmaster,
					CTDB_CAP_RECMASTER) &&
	    (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
	    !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
		DEBUG(DEBUG_ERR,
		      (" Current recmaster node %u does not have CAP_RECMASTER,"
		       " but we (node %u) have - force an election\n",
		       rec->recmaster, pnn));
		force_election(rec, pnn, nodemap);
		return false;
	}

	/* Verify that the master node has not been deleted.  This
	 * should not happen because a node should always be shutdown
	 * before being deleted, causing a new master to be elected
	 * before now.  However, if something strange has happened
	 * then checking here will ensure we don't index beyond the
	 * end of the nodemap array. */
	if (rec->recmaster >= nodemap->num) {
		DEBUG(DEBUG_ERR,
		      ("Recmaster node %u has been deleted. Force election\n",
		       rec->recmaster));
		force_election(rec, pnn, nodemap);
		return false;
	}

	/* if the recovery master is disconnected/deleted we must elect
	 * a new recmaster */
	if (nodemap->nodes[rec->recmaster].flags &
	    (NODE_FLAGS_DISCONNECTED|NODE_FLAGS_DELETED)) {
		DEBUG(DEBUG_NOTICE,
		      ("Recmaster node %u is disconnected/deleted. Force election\n",
		       rec->recmaster));
		force_election(rec, pnn, nodemap);
		return false;
	}

	/* get the nodemap from the recovery master to check if it is inactive */
	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
				   mem_ctx, &recmaster_nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,
		      (__location__
		       " Unable to get nodemap from recovery master %u\n",
		       rec->recmaster));
		/* No election, just error */
		return false;
	}

	if ((recmaster_nodemap->nodes[rec->recmaster].flags &
	     NODE_FLAGS_INACTIVE) &&
	    (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
		DEBUG(DEBUG_NOTICE,
		      ("Recmaster node %u is inactive. Force election\n",
		       rec->recmaster));
		/*
		 * Update our nodemap to carry the recmaster's notion of
		 * its own flags, so that we don't keep freezing the
		 * inactive recmaster node...
		 */
		nodemap->nodes[rec->recmaster].flags =
			recmaster_nodemap->nodes[rec->recmaster].flags;
		force_election(rec, pnn, nodemap);
		return false;
	}

	return true;
}
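
/* Summary of the checks above: an election is forced when the recmaster
 * is still unknown, lacks CTDB_CAP_RECMASTER while this capable node is
 * active, has been deleted from the nodemap, is disconnected, or reports
 * itself inactive.  Only the last check requires a control round-trip to
 * the recmaster; the others are answered from locally cached state.
 */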
static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
		      TALLOC_CTX *mem_ctx)
{
	uint32_t pnn;
	struct ctdb_node_map_old *nodemap = NULL;
	struct ctdb_node_map_old **remote_nodemaps = NULL;
	struct ctdb_vnn_map *vnnmap = NULL;
	struct ctdb_vnn_map *remote_vnnmap = NULL;
	uint32_t num_lmasters;
	int32_t debug_level;
	int i, j, ret;
	bool self_ban;

	/* verify that the main daemon is still running */
	if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
		DEBUG(DEBUG_CRIT,
		      ("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
		exit(-1);
	}

	/* ping the local daemon to tell it we are alive */
	ctdb_ctrl_recd_ping(ctdb);

	if (rec->election_timeout) {
		/* an election is in progress */
		return;
	}

	/* read the debug level from the parent and update locally */
	ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,
		      (__location__ " Failed to read debuglevel from parent\n"));
		return;
	}
	DEBUGLEVEL = debug_level;

	/* get relevant tunables */
	ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(),
					 CTDB_CURRENT_NODE, &ctdb->tunable);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, ("Failed to get tunables - retrying\n"));
		return;
	}

	/* get the current runstate */
	ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
				     CTDB_CURRENT_NODE, &ctdb->runstate);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
		return;
	}

	pnn = ctdb_get_pnn(ctdb);

	/* get the nodemap from the local node */
	TALLOC_FREE(rec->nodemap);
	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec,
				   &rec->nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,
		      (__location__ " Unable to get nodemap from node %u\n",
		       pnn));
		return;
	}
	nodemap = rec->nodemap;

	/* remember our own node flags */
	rec->node_flags = nodemap->nodes[pnn].flags;

	ban_misbehaving_nodes(rec, &self_ban);
	if (self_ban) {
		DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
		return;
	}

	ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(),
				   CTDB_CURRENT_NODE, &ctdb->recovery_mode);
	if (ret != 0) {
		D_ERR("Failed to read recmode from local node\n");
		return;
	}
	/* If the local daemon is STOPPED or BANNED, verify that the
	 * databases are also frozen and that the recmode is set to active.
	 */
	if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
		/* If this node has become inactive then we want to
		 * reduce the chances of it taking over the recovery
		 * master role when it becomes active again.  This
		 * helps to stabilise the recovery master role so that
		 * it stays on the most stable node.
		 */
		rec->priority_time = timeval_current();

		if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
			DEBUG(DEBUG_ERR,
			      ("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));

			ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(),
						   CTDB_CURRENT_NODE,
						   CTDB_RECOVERY_ACTIVE);
			if (ret != 0) {
				DEBUG(DEBUG_ERR,
				      (__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
				return;
			}
		}
		if (! rec->frozen_on_inactive) {
			ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(),
					       CTDB_CURRENT_NODE);
			if (ret != 0) {
				DEBUG(DEBUG_ERR,
				      (__location__ " Failed to freeze node "
				       "in STOPPED or BANNED state\n"));
				return;
			}

			rec->frozen_on_inactive = true;
		}

		/* If this node is stopped or banned then it is not the
		 * recovery master, so don't do anything.  This prevents
		 * stopped or banned nodes from starting elections and
		 * sending unnecessary controls.
		 */
		return;
	}

	rec->frozen_on_inactive = false;
	/* Retrieve capabilities from all connected nodes */
	ret = update_capabilities(rec, nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,
		      (__location__ " Unable to update node capabilities.\n"));
		return;
	}

	if (! validate_recovery_master(rec, mem_ctx)) {
		return;
	}

	if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
		/* Check if an IP takeover run is needed and trigger one if
		 * necessary */
		verify_local_ip_allocation(ctdb, rec, pnn, nodemap);
	}

	/* if we are not the recmaster then we do not need to check
	 * if recovery is needed
	 */
	if (pnn != rec->recmaster) {
		return;
	}
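
	/* Everything below this point runs only on the recovery master:
	 * the cluster-wide consistency checks on flags, nodemaps and the
	 * vnnmap, and the decisions to start a recovery or a takeover
	 * run.
	 */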
	/* ensure our local copies of flags are right */
	ret = update_local_flags(rec, nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, ("Unable to update local flags\n"));
		return;
	}

	if (ctdb->num_nodes != nodemap->num) {
		DEBUG(DEBUG_ERR,
		      (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n",
		       ctdb->num_nodes, nodemap->num));
		ctdb_load_nodes_file(ctdb);
		return;
	}

	/* verify that all active nodes agree that we are the recmaster */
	switch (verify_recmaster(rec, nodemap, pnn)) {
	case MONITOR_RECOVERY_NEEDED:
		/* can not happen */
		return;
	case MONITOR_ELECTION_NEEDED:
		force_election(rec, pnn, nodemap);
		return;
	case MONITOR_OK:
		break;
	case MONITOR_FAILED:
		return;
	}

	/* get the vnnmap */
	ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx,
				  &vnnmap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,
		      (__location__ " Unable to get vnnmap from node %u\n",
		       pnn));
		return;
	}

	if (rec->need_recovery) {
		/* a previous recovery didn't finish */
		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		return;
	}

	/* verify that all active nodes are in normal mode
	 * and not in recovery mode
	 */
	switch (verify_recmode(ctdb, nodemap)) {
	case MONITOR_RECOVERY_NEEDED:
		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		return;
	case MONITOR_FAILED:
		return;
	case MONITOR_ELECTION_NEEDED:
		/* can not happen */
	case MONITOR_OK:
		break;
	}

	if (ctdb->recovery_lock != NULL) {
		/* We must already hold the recovery lock */
		if (!ctdb_recovery_have_lock(rec)) {
			DEBUG(DEBUG_ERR,
			      ("Failed recovery lock sanity check. Force a recovery\n"));
			ctdb_set_culprit(rec, ctdb->pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}
	}

	/* If recoveries are disabled then there is no use doing any
	 * nodemap or flags checks.  Recoveries might be disabled due
	 * to "reloadnodes", so doing these checks might cause an
	 * unnecessary recovery. */
	if (ctdb_op_is_disabled(rec->recovery)) {
		goto takeover_run_checks;
	}
	/* get the nodemap for all active remote nodes */
	remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map_old *,
				       nodemap->num);
	if (remote_nodemaps == NULL) {
		DEBUG(DEBUG_ERR,
		      (__location__ " failed to allocate remote nodemap array\n"));
		return;
	}
	for (i = 0; i < nodemap->num; i++) {
		remote_nodemaps[i] = NULL;
	}
	if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to read remote nodemaps\n"));
		return;
	}

	/* verify that all other nodes have the same nodemap as we have */
	for (j = 0; j < nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		if (remote_nodemaps[j] == NULL) {
			DEBUG(DEBUG_ERR,
			      (__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n",
			       j));
			ctdb_set_culprit(rec, j);
			return;
		}

		/* if the nodes disagree on how many nodes there are
		 * then this is a good reason to try recovery
		 */
		if (remote_nodemaps[j]->num != nodemap->num) {
			DEBUG(DEBUG_ERR,
			      (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
			       nodemap->nodes[j].pnn, remote_nodemaps[j]->num,
			       nodemap->num));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* if the nodes disagree on which nodes exist and are
		 * active, then that is also a good reason to do recovery
		 */
		for (i = 0; i < nodemap->num; i++) {
			if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
				DEBUG(DEBUG_ERR,
				      (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
				       nodemap->nodes[j].pnn, i,
				       remote_nodemaps[j]->nodes[i].pnn,
				       nodemap->nodes[i].pnn));
				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
				do_recovery(rec, mem_ctx, pnn, nodemap,
					    vnnmap);
				return;
			}
		}
	}
	/*
	 * Update node flags obtained from each active node.  This ensures we
	 * have up-to-date information for all the nodes.
	 */
	for (j = 0; j < nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
	}

	for (j = 0; j < nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		/* verify the flags are consistent */
		for (i = 0; i < nodemap->num; i++) {
			if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
				continue;
			}

			if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
				DEBUG(DEBUG_ERR,
				      (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
				       nodemap->nodes[j].pnn,
				       nodemap->nodes[i].pnn,
				       remote_nodemaps[j]->nodes[i].flags,
				       nodemap->nodes[i].flags));
				if (i == j) {
					DEBUG(DEBUG_ERR,
					      ("Use flags 0x%02x from remote node %d for cluster update of its own flags\n",
					       remote_nodemaps[j]->nodes[i].flags, j));
					update_flags_on_all_nodes(ctdb, nodemap,
								  nodemap->nodes[i].pnn,
								  remote_nodemaps[j]->nodes[i].flags);
					ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
					do_recovery(rec, mem_ctx, pnn, nodemap,
						    vnnmap);
					return;
				} else {
					DEBUG(DEBUG_ERR,
					      ("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n",
					       nodemap->nodes[i].flags, i));
					update_flags_on_all_nodes(ctdb, nodemap,
								  nodemap->nodes[i].pnn,
								  nodemap->nodes[i].flags);
					ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
					do_recovery(rec, mem_ctx, pnn, nodemap,
						    vnnmap);
					return;
				}
			}
		}
	}
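
	/* The asymmetry in the flag repair above: each node is treated as
	 * authoritative for its own flags (the i == j case), so the remote
	 * node's view wins there, while the recmaster's local view wins
	 * for any third-party disagreement.  In both cases a recovery is
	 * then run to bring the cluster back to a consistent state.
	 */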
	/* count how many active nodes there are with the lmaster capability */
	num_lmasters = 0;
	for (i = 0; i < nodemap->num; i++) {
		if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
			if (ctdb_node_has_capabilities(rec->caps,
						       ctdb->nodes[i]->pnn,
						       CTDB_CAP_LMASTER)) {
				num_lmasters++;
			}
		}
	}

	/* There must be the same number of lmasters in the vnn map as
	 * there are active nodes with the lmaster capability... or
	 * do a recovery.
	 */
	if (vnnmap->size != num_lmasters) {
		DEBUG(DEBUG_ERR,
		      (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
		       vnnmap->size, num_lmasters));
		ctdb_set_culprit(rec, ctdb->pnn);
		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		return;
	}
	/* verify that all active nodes in the nodemap also exist in
	 * the vnnmap.
	 */
	for (j = 0; j < nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}

		for (i = 0; i < vnnmap->size; i++) {
			if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
				break;
			}
		}
		if (i == vnnmap->size) {
			DEBUG(DEBUG_ERR,
			      (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
			       nodemap->nodes[j].pnn));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}
	}
	/* verify that all other nodes have the same vnnmap
	 * and are from the same generation
	 */
	for (j = 0; j < nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}

		ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(),
					  nodemap->nodes[j].pnn,
					  mem_ctx, &remote_vnnmap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,
			      (__location__ " Unable to get vnnmap from remote node %u\n",
			       nodemap->nodes[j].pnn));
			return;
		}

		/* verify the vnnmap generation is the same */
		if (vnnmap->generation != remote_vnnmap->generation) {
			DEBUG(DEBUG_ERR,
			      (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
			       nodemap->nodes[j].pnn,
			       remote_vnnmap->generation, vnnmap->generation));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* verify the vnnmap size is the same */
		if (vnnmap->size != remote_vnnmap->size) {
			DEBUG(DEBUG_ERR,
			      (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
			       nodemap->nodes[j].pnn,
			       remote_vnnmap->size, vnnmap->size));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* verify the vnnmap is the same */
		for (i = 0; i < vnnmap->size; i++) {
			if (remote_vnnmap->map[i] != vnnmap->map[i]) {
				DEBUG(DEBUG_ERR,
				      (__location__ " Remote node %u has different vnnmap.\n",
				       nodemap->nodes[j].pnn));
				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
				do_recovery(rec, mem_ctx, pnn, nodemap,
					    vnnmap);
				return;
			}
		}
	}

	/* FIXME: Add remote public IP checking to ensure that nodes
	 * have the IP addresses that are allocated to them. */

takeover_run_checks:

	/* If there are IP takeover runs requested or the previous one
	 * failed then perform one and notify the waiters */
	if (!ctdb_op_is_disabled(rec->takeover_run) &&
	    (rec->reallocate_requests || rec->need_takeover_run)) {
		process_ipreallocate_requests(ctdb, rec);
	}
}
static void recd_sig_term_handler(struct tevent_context *ev,
				  struct tevent_signal *se, int signum,
				  int count, void *dont_care,
				  void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type_abort(
		private_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR, ("Received SIGTERM, exiting\n"));
	ctdb_recovery_unlock(rec);
	exit(0);
}
/*
  the main monitoring loop
 */
static void monitor_cluster(struct ctdb_context *ctdb)
{
	struct tevent_signal *se;
	struct ctdb_recoverd *rec;

	DEBUG(DEBUG_NOTICE, ("monitor_cluster starting\n"));

	rec = talloc_zero(ctdb, struct ctdb_recoverd);
	CTDB_NO_MEMORY_FATAL(ctdb, rec);

	rec->ctdb = ctdb;
	rec->recmaster = CTDB_UNKNOWN_PNN;
	rec->recovery_lock_handle = NULL;

	rec->takeover_run = ctdb_op_init(rec, "takeover runs");
	CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);

	rec->recovery = ctdb_op_init(rec, "recoveries");
	CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);

	rec->priority_time = timeval_current();
	rec->frozen_on_inactive = false;

	se = tevent_add_signal(ctdb->ev, ctdb, SIGTERM, 0,
			       recd_sig_term_handler, rec);
	if (se == NULL) {
		DEBUG(DEBUG_ERR, ("Failed to install SIGTERM handler\n"));
		exit(1);
	}

	/* register a message port for sending memory dumps */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP,
					mem_dump_handler, rec);

	/* when a node is assigned banning credits */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING,
					banning_handler, rec);

	/* register a message port for recovery elections */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION,
					election_handler, rec);

	/* when nodes are disabled/enabled */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS,
					monitor_handler, rec);

	/* when we are asked to push out a flag change */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS,
					push_flags_handler, rec);

	/* register a message port for vacuum fetch */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH,
					vacuum_fetch_handler, rec);

	/* register a message port for reloadnodes */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES,
					reload_nodes_handler, rec);

	/* register a message port for performing a takeover run */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN,
					ip_reallocate_handler, rec);

	/* register a message port for disabling the ip check for a
	 * short while */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK,
					disable_ip_check_handler, rec);

	/* register a message port for forcing a rebalance of a node next
	 * reallocation */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE,
					recd_node_rebalance_handler, rec);

	/* Register a message port for disabling takeover runs */
	ctdb_client_set_message_handler(ctdb,
					CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
					disable_takeover_runs_handler, rec);

	/* Register a message port for disabling recoveries */
	ctdb_client_set_message_handler(ctdb,
					CTDB_SRVID_DISABLE_RECOVERIES,
					disable_recoveries_handler, rec);

	/* register a message port for detaching database */
	ctdb_client_set_message_handler(ctdb,
					CTDB_SRVID_DETACH_DATABASE,
					detach_database_handler, rec);

	while (1) {
		TALLOC_CTX *mem_ctx = talloc_new(ctdb);
		struct timeval start;
		double elapsed;

		if (mem_ctx == NULL) {
			DEBUG(DEBUG_CRIT, (__location__
					   " Failed to create temp context\n"));
			exit(-1);
		}

		start = timeval_current();
		main_loop(ctdb, rec, mem_ctx);
		talloc_free(mem_ctx);

		/* we only check for recovery once every
		 * RecoverInterval seconds */
		elapsed = timeval_elapsed(&start);
		if (elapsed < ctdb->tunable.recover_interval) {
			ctdb_wait_timeout(ctdb,
					  ctdb->tunable.recover_interval -
					  elapsed);
		}
	}
}
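
/* Pacing note: main_loop() is re-entered at most once per RecoverInterval
 * seconds (a ctdb tunable, by default 1 second), so the consistency
 * checks are polling-based.  Urgent work still arrives promptly through
 * the srvid message handlers registered above, which run from the same
 * event loop.
 */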
/*
  event handler for when the main ctdbd dies
 */
static void ctdb_recoverd_parent(struct tevent_context *ev,
				 struct tevent_fd *fde,
				 uint16_t flags, void *private_data)
{
	DEBUG(DEBUG_ALERT, ("recovery daemon parent died - exiting\n"));
	_exit(1);
}
/*
  called regularly to verify that the recovery daemon is still running
 */
static void ctdb_check_recd(struct tevent_context *ev,
			    struct tevent_timer *te,
			    struct timeval yt, void *p)
{
	struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);

	if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
		DEBUG(DEBUG_ERR,
		      ("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n",
		       (int)ctdb->recoverd_pid));

		tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
				 ctdb_restart_recd, ctdb);

		return;
	}

	tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
			 timeval_current_ofs(30, 0),
			 ctdb_check_recd, ctdb);
}
static void recd_sig_child_handler(struct tevent_context *ev,
				   struct tevent_signal *se, int signum,
				   int count, void *dont_care,
				   void *private_data)
{
//	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
	int status;
	pid_t pid = -1;

	while (pid != 0) {
		pid = waitpid(-1, &status, WNOHANG);
		if (pid == -1) {
			if (errno != ECHILD) {
				DEBUG(DEBUG_ERR,
				      (__location__ " waitpid() returned error. errno:%s(%d)\n",
				       strerror(errno), errno));
			}
			return;
		}
		if (pid > 0) {
			DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
		}
	}
}
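
/* The WNOHANG loop above reaps every child that has exited without
 * blocking: waitpid() returns 0 once no more exited children are
 * pending, -1 with ECHILD when there are no children at all, and a pid
 * for each reaped child.  Draining in a loop matters because POSIX may
 * coalesce multiple SIGCHLDs into a single delivery.
 */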
/*
  startup the recovery daemon as a child of the main ctdb daemon
 */
int ctdb_start_recoverd(struct ctdb_context *ctdb)
{
	int fd[2];
	struct tevent_signal *se;
	struct tevent_fd *fde;
	int ret;

	if (pipe(fd) != 0) {
		return -1;
	}

	ctdb->recoverd_pid = ctdb_fork(ctdb);
	if (ctdb->recoverd_pid == -1) {
		return -1;
	}

	if (ctdb->recoverd_pid != 0) {
		/* parent: keep the write end of the pipe open and watch
		 * the child with a recurring timer */
		talloc_free(ctdb->recd_ctx);
		ctdb->recd_ctx = talloc_new(ctdb);
		CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);

		close(fd[0]);
		tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
				 timeval_current_ofs(30, 0),
				 ctdb_check_recd, ctdb);
		return 0;
	}

	close(fd[1]);

	srandom(getpid() ^ time(NULL));

	ret = logging_init(ctdb, NULL, NULL, "ctdb-recoverd");
	if (ret != 0) {
		return -1;
	}

	prctl_set_comment("ctdb_recoverd");
	if (switch_from_server_to_client(ctdb) != 0) {
		DEBUG(DEBUG_CRIT,
		      (__location__ " ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
		exit(1);
	}

	DEBUG(DEBUG_DEBUG,
	      (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));

	fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
			    ctdb_recoverd_parent, &fd[0]);
	tevent_fd_set_auto_close(fde);

	/* set up a handler to pick up sigchld */
	se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
			       recd_sig_child_handler, ctdb);
	if (se == NULL) {
		DEBUG(DEBUG_CRIT,
		      ("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
		exit(1);
	}

	monitor_cluster(ctdb);

	DEBUG(DEBUG_ALERT, ("ERROR: ctdb_recoverd finished!?\n"));
	return -1;
}
/*
  shutdown the recovery daemon
 */
void ctdb_stop_recoverd(struct ctdb_context *ctdb)
{
	if (ctdb->recoverd_pid == 0) {
		return;
	}

	DEBUG(DEBUG_NOTICE, ("Shutting down recovery daemon\n"));
	ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
	ctdb->recoverd_pid = 0;

	TALLOC_FREE(ctdb->recd_ctx);
	TALLOC_FREE(ctdb->recd_ping_count);
}
static void ctdb_restart_recd(struct tevent_context *ev,
			      struct tevent_timer *te,
			      struct timeval t, void *private_data)
{
	struct ctdb_context *ctdb = talloc_get_type(private_data,
						    struct ctdb_context);

	DEBUG(DEBUG_ERR, ("Restarting recovery daemon\n"));
	ctdb_stop_recoverd(ctdb);
	ctdb_start_recoverd(ctdb);
}