ctdb/server/ctdb_recoverd.c

   1 /*
   2    ctdb recovery daemon
   3
   4    Copyright (C) Ronnie Sahlberg  2007
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20 #include "replace.h"
  21 #include "system/filesys.h"
  22 #include "system/time.h"
  23 #include "system/network.h"
  24 #include "system/wait.h"
  25
  26 #include <popt.h>
  27 #include <talloc.h>
  28 #include <tevent.h>
  29 #include <tdb.h>
  30
  31 #include "lib/tdb_wrap/tdb_wrap.h"
  32 #include "lib/util/dlinklist.h"
  33 #include "lib/util/debug.h"
  34 #include "lib/util/samba_util.h"
  35 #include "lib/util/sys_rw.h"
  36 #include "lib/util/util_process.h"
  37
  38 #include "ctdb_private.h"
  39 #include "ctdb_client.h"
  40
  41 #include "protocol/protocol_basic.h"
  42
  43 #include "common/system_socket.h"
  44 #include "common/common.h"
  45 #include "common/logging.h"
  46
  47 #include "server/ctdb_config.h"
  48
  49 #include "ctdb_cluster_mutex.h"
  50
  51 /* List of SRVID requests that need to be processed */
  52 struct srvid_list {
  53         struct srvid_list *next, *prev;
  54         struct ctdb_srvid_message *request;
  55 };
  56
  57 struct srvid_requests {
  58         struct srvid_list *requests;
  59 };
  60
  61 static void srvid_request_reply(struct ctdb_context *ctdb,
  62                                 struct ctdb_srvid_message *request,
  63                                 TDB_DATA result)
  64 {
  65         /* Someone that sent srvid==0 does not want a reply */
  66         if (request->srvid == 0) {
  67                 talloc_free(request);
  68                 return;
  69         }
  70
  71         if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
  72                                      result) == 0) {
  73                 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
  74                                   (unsigned)request->pnn,
  75                                   (unsigned long long)request->srvid));
  76         } else {
  77                 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
  78                                  (unsigned)request->pnn,
  79                                  (unsigned long long)request->srvid));
  80         }
  81
  82         talloc_free(request);
  83 }
  84
  85 static void srvid_requests_reply(struct ctdb_context *ctdb,
  86                                  struct srvid_requests **requests,
  87                                  TDB_DATA result)
  88 {
  89         struct srvid_list *r;
  90
  91         if (*requests == NULL) {
  92                 return;
  93         }
  94
  95         for (r = (*requests)->requests; r != NULL; r = r->next) {
  96                 srvid_request_reply(ctdb, r->request, result);
  97         }
  98
  99         /* Free the list structure... */
 100         TALLOC_FREE(*requests);
 101 }
 102
 103 static void srvid_request_add(struct ctdb_context *ctdb,
 104                               struct srvid_requests **requests,
 105                               struct ctdb_srvid_message *request)
 106 {
 107         struct srvid_list *t;
 108         int32_t ret;
 109         TDB_DATA result;
 110
 111         if (*requests == NULL) {
 112                 *requests = talloc_zero(ctdb, struct srvid_requests);
 113                 if (*requests == NULL) {
 114                         goto nomem;
 115                 }
 116         }
 117
 118         t = talloc_zero(*requests, struct srvid_list);
 119         if (t == NULL) {
 120                 /* If *requests was just allocated above then free it */
 121                 if ((*requests)->requests == NULL) {
 122                         TALLOC_FREE(*requests);
 123                 }
 124                 goto nomem;
 125         }
 126
 127         t->request = (struct ctdb_srvid_message *)talloc_steal(t, request);
 128         DLIST_ADD((*requests)->requests, t);
 129
 130         return;
 131
 132 nomem:
 133         /* Failed to add the request to the list.  Send a fail. */
 134         DEBUG(DEBUG_ERR, (__location__
 135                           " Out of memory, failed to queue SRVID request\n"));
 136         ret = -ENOMEM;
 137         result.dsize = sizeof(ret);
 138         result.dptr = (uint8_t *)&ret;
 139         srvid_request_reply(ctdb, request, result);
 140 }
 141
 142 /* An abstraction to allow an operation (takeover runs, recoveries,
 143  * ...) to be disabled for a given timeout */
 144 struct ctdb_op_state {
 145         struct tevent_timer *timer;
 146         bool in_progress;
 147         const char *name;
 148 };
 149
 150 static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
 151 {
 152         struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
 153
 154         if (state != NULL) {
 155                 state->in_progress = false;
 156                 state->name = name;
 157         }
 158
 159         return state;
 160 }
 161
 162 static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
 163 {
 164         return state->timer != NULL;
 165 }
 166
 167 static bool ctdb_op_begin(struct ctdb_op_state *state)
 168 {
 169         if (ctdb_op_is_disabled(state)) {
 170                 DEBUG(DEBUG_NOTICE,
 171                       ("Unable to begin - %s are disabled\n", state->name));
 172                 return false;
 173         }
 174
 175         state->in_progress = true;
 176         return true;
 177 }
 178
 179 static bool ctdb_op_end(struct ctdb_op_state *state)
 180 {
 181         return state->in_progress = false;
 182 }
 183
 184 static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
 185 {
 186         return state->in_progress;
 187 }
 188
 189 static void ctdb_op_enable(struct ctdb_op_state *state)
 190 {
 191         TALLOC_FREE(state->timer);
 192 }
 193
 194 static void ctdb_op_timeout_handler(struct tevent_context *ev,
 195                                     struct tevent_timer *te,
 196                                     struct timeval yt, void *p)
 197 {
 198         struct ctdb_op_state *state =
 199                 talloc_get_type(p, struct ctdb_op_state);
 200
 201         DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
 202         ctdb_op_enable(state);
 203 }
 204
 205 static int ctdb_op_disable(struct ctdb_op_state *state,
 206                            struct tevent_context *ev,
 207                            uint32_t timeout)
 208 {
 209         if (timeout == 0) {
 210                 DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
 211                 ctdb_op_enable(state);
 212                 return 0;
 213         }
 214
 215         if (state->in_progress) {
 216                 DEBUG(DEBUG_ERR,
 217                       ("Unable to disable %s - in progress\n", state->name));
 218                 return -EAGAIN;
 219         }
 220
 221         DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
 222                             state->name, timeout));
 223
 224         /* Clear any old timers */
 225         talloc_free(state->timer);
 226
 227         /* Arrange for the timeout to occur */
 228         state->timer = tevent_add_timer(ev, state,
 229                                         timeval_current_ofs(timeout, 0),
 230                                         ctdb_op_timeout_handler, state);
 231         if (state->timer == NULL) {
 232                 DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
 233                 return -ENOMEM;
 234         }
 235
 236         return 0;
 237 }
 238
 239 struct ctdb_banning_state {
 240         uint32_t count;
 241         struct timeval last_reported_time;
 242 };
 243
 244 struct ctdb_cluster_lock_handle;
 245
 246 /*
 247   private state of recovery daemon
 248  */
 249 struct ctdb_recoverd {
 250         struct ctdb_context *ctdb;
 251         uint32_t leader;
 252         struct tevent_timer *leader_broadcast_te;
 253         struct tevent_timer *leader_broadcast_timeout_te;
 254         uint32_t pnn;
 255         uint32_t last_culprit_node;
 256         struct ctdb_node_map_old *nodemap;
 257         struct timeval priority_time;
 258         bool need_takeover_run;
 259         bool need_recovery;
 260         uint32_t node_flags;
 261         struct tevent_timer *send_election_te;
 262         bool election_in_progress;
 263         struct tevent_timer *election_timeout;
 264         struct srvid_requests *reallocate_requests;
 265         struct ctdb_op_state *takeover_run;
 266         struct ctdb_op_state *recovery;
 267         struct ctdb_iface_list_old *ifaces;
 268         uint32_t *force_rebalance_nodes;
 269         struct ctdb_node_capabilities *caps;
 270         bool frozen_on_inactive;
 271         struct ctdb_cluster_lock_handle *cluster_lock_handle;
 272         pid_t helper_pid;
 273 };
 274
 275 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
 276 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
 277
 278 static void ctdb_restart_recd(struct tevent_context *ev,
 279                               struct tevent_timer *te, struct timeval t,
 280                               void *private_data);
 281
 282 static bool this_node_is_leader(struct ctdb_recoverd *rec)
 283 {
 284         return rec->leader == rec->pnn;
 285 }
 286
 287 static bool this_node_can_be_leader(struct ctdb_recoverd *rec)
 288 {
 289         return (rec->node_flags & NODE_FLAGS_INACTIVE) == 0 &&
 290                 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) != 0;
 291 }
 292
 293 /*
 294   ban a node for a period of time
 295  */
 296 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn)
 297 {
 298         int ret;
 299         struct ctdb_context *ctdb = rec->ctdb;
 300         uint32_t ban_time = ctdb->tunable.recovery_ban_period;
 301         struct ctdb_ban_state bantime;
 302
 303         if (!ctdb_validate_pnn(ctdb, pnn)) {
 304                 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
 305                 return;
 306         }
 307
 308         DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
 309
 310         bantime.pnn  = pnn;
 311         bantime.time = ban_time;
 312
 313         ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
 314         if (ret != 0) {
 315                 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
 316                 return;
 317         }
 318
 319 }
 320
 321 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
 322
 323
 324 /*
 325   remember the trouble maker
 326  */
 327 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
 328 {
 329         struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
 330         struct ctdb_banning_state *ban_state;
 331
 332         if (culprit > ctdb->num_nodes) {
 333                 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
 334                 return;
 335         }
 336
 337         /* If we are banned or stopped, do not set other nodes as culprits */
 338         if (rec->node_flags & NODE_FLAGS_INACTIVE) {
 339                 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
 340                 return;
 341         }
 342
 343         if (ctdb->nodes[culprit]->ban_state == NULL) {
 344                 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
 345                 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
 346
 347
 348         }
 349         ban_state = ctdb->nodes[culprit]->ban_state;
 350         if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
 351                 /* this was the first time in a long while this node
 352                    misbehaved so we will forgive any old transgressions.
 353                 */
 354                 ban_state->count = 0;
 355         }
 356
 357         ban_state->count += count;
 358         ban_state->last_reported_time = timeval_current();
 359         rec->last_culprit_node = culprit;
 360 }
 361
 362 /*
 363   remember the trouble maker
 364  */
 365 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
 366 {
 367         ctdb_set_culprit_count(rec, culprit, 1);
 368 }
 369
 370 /*
 371   Retrieve capabilities from all connected nodes
 372  */
 373 static int update_capabilities(struct ctdb_recoverd *rec,
 374                                struct ctdb_node_map_old *nodemap)
 375 {
 376         uint32_t *capp;
 377         TALLOC_CTX *tmp_ctx;
 378         struct ctdb_node_capabilities *caps;
 379         struct ctdb_context *ctdb = rec->ctdb;
 380
 381         tmp_ctx = talloc_new(rec);
 382         CTDB_NO_MEMORY(ctdb, tmp_ctx);
 383
 384         caps = ctdb_get_capabilities(ctdb, tmp_ctx,
 385                                      CONTROL_TIMEOUT(), nodemap);
 386
 387         if (caps == NULL) {
 388                 DEBUG(DEBUG_ERR,
 389                       (__location__ " Failed to get node capabilities\n"));
 390                 talloc_free(tmp_ctx);
 391                 return -1;
 392         }
 393
 394         capp = ctdb_get_node_capabilities(caps, rec->pnn);
 395         if (capp == NULL) {
 396                 DEBUG(DEBUG_ERR,
 397                       (__location__
 398                        " Capabilities don't include current node.\n"));
 399                 talloc_free(tmp_ctx);
 400                 return -1;
 401         }
 402         ctdb->capabilities = *capp;
 403
 404         TALLOC_FREE(rec->caps);
 405         rec->caps = talloc_steal(rec, caps);
 406
 407         talloc_free(tmp_ctx);
 408         return 0;
 409 }
 410
 411 /*
 412   change recovery mode on all nodes
 413  */
 414 static int set_recovery_mode(struct ctdb_context *ctdb,
 415                              struct ctdb_recoverd *rec,
 416                              struct ctdb_node_map_old *nodemap,
 417                              uint32_t rec_mode)
 418 {
 419         TDB_DATA data;
 420         uint32_t *nodes;
 421         TALLOC_CTX *tmp_ctx;
 422
 423         tmp_ctx = talloc_new(ctdb);
 424         CTDB_NO_MEMORY(ctdb, tmp_ctx);
 425
 426         nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
 427
 428         data.dsize = sizeof(uint32_t);
 429         data.dptr = (unsigned char *)&rec_mode;
 430
 431         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
 432                                         nodes, 0,
 433                                         CONTROL_TIMEOUT(),
 434                                         false, data,
 435                                         NULL, NULL,
 436                                         NULL) != 0) {
 437                 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
 438                 talloc_free(tmp_ctx);
 439                 return -1;
 440         }
 441
 442         talloc_free(tmp_ctx);
 443         return 0;
 444 }
 445
 446 /*
 447  * Update flags on all connected nodes
 448  */
 449 static int update_flags_on_all_nodes(struct ctdb_recoverd *rec,
 450                                      uint32_t pnn,
 451                                      uint32_t flags)
 452 {
 453         struct ctdb_context *ctdb = rec->ctdb;
 454         struct timeval timeout = CONTROL_TIMEOUT();
 455         TDB_DATA data;
 456         struct ctdb_node_map_old *nodemap=NULL;
 457         struct ctdb_node_flag_change c;
 458         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
 459         uint32_t *nodes;
 460         uint32_t i;
 461         int ret;
 462
 463         nodemap = rec->nodemap;
 464
 465         for (i = 0; i < nodemap->num; i++) {
 466                 if (pnn == nodemap->nodes[i].pnn) {
 467                         break;
 468                 }
 469         }
 470         if (i >= nodemap->num) {
 471                 DBG_ERR("Nodemap does not contain node %d\n", pnn);
 472                 talloc_free(tmp_ctx);
 473                 return -1;
 474         }
 475
 476         c.pnn       = pnn;
 477         c.old_flags = nodemap->nodes[i].flags;
 478         c.new_flags = flags;
 479
 480         data.dsize = sizeof(c);
 481         data.dptr = (unsigned char *)&c;
 482
 483         /* send the flags update to all connected nodes */
 484         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
 485
 486         ret = ctdb_client_async_control(ctdb,
 487                                         CTDB_CONTROL_MODIFY_FLAGS,
 488                                         nodes,
 489                                         0,
 490                                         timeout,
 491                                         false,
 492                                         data,
 493                                         NULL,
 494                                         NULL,
 495                                         NULL);
 496         if (ret != 0) {
 497                 DBG_ERR("Unable to update flags on remote nodes\n");
 498                 talloc_free(tmp_ctx);
 499                 return -1;
 500         }
 501
 502         talloc_free(tmp_ctx);
 503         return 0;
 504 }
 505
 506 static bool _cluster_lock_lock(struct ctdb_recoverd *rec);
 507 static bool cluster_lock_held(struct ctdb_recoverd *rec);
 508
 509 static bool cluster_lock_enabled(struct ctdb_recoverd *rec)
 510 {
 511         return rec->ctdb->recovery_lock != NULL;
 512 }
 513
 514 static bool cluster_lock_take(struct ctdb_recoverd *rec)
 515 {
 516         struct ctdb_context *ctdb = rec->ctdb;
 517         bool have_lock;
 518
 519         if (!cluster_lock_enabled(rec)) {
 520                 return true;
 521         }
 522
 523         if (cluster_lock_held(rec)) {
 524                 D_NOTICE("Already holding cluster lock\n");
 525                 return true;
 526         }
 527
 528         D_NOTICE("Attempting to take cluster lock (%s)\n", ctdb->recovery_lock);
 529         have_lock = _cluster_lock_lock(rec);
 530         if (!have_lock) {
 531                 return false;
 532         }
 533
 534         D_NOTICE("Cluster lock taken successfully\n");
 535         return true;
 536 }
 537
 538 /*
 539   called when ctdb_wait_timeout should finish
 540  */
 541 static void ctdb_wait_handler(struct tevent_context *ev,
 542                               struct tevent_timer *te,
 543                               struct timeval yt, void *p)
 544 {
 545         uint32_t *timed_out = (uint32_t *)p;
 546         (*timed_out) = 1;
 547 }
 548
 549 /*
 550   wait for a given number of seconds
 551  */
 552 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
 553 {
 554         uint32_t timed_out = 0;
 555         time_t usecs = (secs - (time_t)secs) * 1000000;
 556         tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
 557                          ctdb_wait_handler, &timed_out);
 558         while (!timed_out) {
 559                 tevent_loop_once(ctdb->ev);
 560         }
 561 }
 562
 563 /*
 564  * Broadcast cluster leader
 565  */
 566
 567 static int leader_broadcast_send(struct ctdb_recoverd *rec, uint32_t pnn)
 568 {
 569         struct ctdb_context *ctdb = rec->ctdb;
 570         TDB_DATA data;
 571         int ret;
 572
 573         data.dptr = (uint8_t *)&pnn;
 574         data.dsize = sizeof(pnn);
 575
 576         ret = ctdb_client_send_message(ctdb,
 577                                        CTDB_BROADCAST_CONNECTED,
 578                                        CTDB_SRVID_LEADER,
 579                                        data);
 580         return ret;
 581 }
 582
 583 static int leader_broadcast_loop(struct ctdb_recoverd *rec);
 584 static void cluster_lock_release(struct ctdb_recoverd *rec);
 585
 586 /* This runs continously but only sends the broadcast when leader */
 587 static void leader_broadcast_loop_handler(struct tevent_context *ev,
 588                                           struct tevent_timer *te,
 589                                           struct timeval current_time,
 590                                           void *private_data)
 591 {
 592         struct ctdb_recoverd *rec = talloc_get_type_abort(
 593                 private_data, struct ctdb_recoverd);
 594         int ret;
 595
 596         if (!this_node_can_be_leader(rec)) {
 597                 if (this_node_is_leader(rec)) {
 598                         rec->leader = CTDB_UNKNOWN_PNN;
 599                 }
 600                 if (cluster_lock_enabled(rec) && cluster_lock_held(rec)) {
 601                         cluster_lock_release(rec);
 602                 }
 603                 goto done;
 604         }
 605
 606         if (!this_node_is_leader(rec)) {
 607                 goto done;
 608         }
 609
 610         if (rec->election_in_progress) {
 611                 goto done;
 612         }
 613
 614         ret = leader_broadcast_send(rec, rec->leader);
 615         if (ret != 0) {
 616                 DBG_WARNING("Failed to send leader broadcast\n");
 617         }
 618
 619 done:
 620         ret = leader_broadcast_loop(rec);
 621         if (ret != 0) {
 622                 D_WARNING("Failed to set up leader broadcast\n");
 623         }
 624 }
 625
 626 static int leader_broadcast_loop(struct ctdb_recoverd *rec)
 627 {
 628         struct ctdb_context *ctdb = rec->ctdb;
 629
 630         TALLOC_FREE(rec->leader_broadcast_te);
 631         rec->leader_broadcast_te =
 632                 tevent_add_timer(ctdb->ev,
 633                                  rec,
 634                                  timeval_current_ofs(1, 0),
 635                                  leader_broadcast_loop_handler,
 636                                  rec);
 637         if (rec->leader_broadcast_te == NULL) {
 638                 return ENOMEM;
 639         }
 640
 641         return 0;
 642 }
 643
 644 static bool leader_broadcast_loop_active(struct ctdb_recoverd *rec)
 645 {
 646         return rec->leader_broadcast_te != NULL;
 647 }
 648
 649 /*
 650   called when an election times out (ends)
 651  */
 652 static void ctdb_election_timeout(struct tevent_context *ev,
 653                                   struct tevent_timer *te,
 654                                   struct timeval t, void *p)
 655 {
 656         struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
 657         bool ok;
 658
 659         rec->election_in_progress = false;
 660         rec->election_timeout = NULL;
 661         fast_start = false;
 662
 663         D_WARNING("Election period ended, leader=%u\n", rec->leader);
 664
 665         if (!this_node_is_leader(rec)) {
 666                 return;
 667         }
 668
 669         ok = cluster_lock_take(rec);
 670         if (!ok) {
 671                 D_ERR("Unable to get cluster lock, banning node\n");
 672                 ctdb_ban_node(rec, rec->pnn);
 673         }
 674 }
 675
 676
 677 /*
 678   wait for an election to finish. It finished election_timeout seconds after
 679   the last election packet is received
 680  */
 681 static void ctdb_wait_election(struct ctdb_recoverd *rec)
 682 {
 683         struct ctdb_context *ctdb = rec->ctdb;
 684         while (rec->election_in_progress) {
 685                 tevent_loop_once(ctdb->ev);
 686         }
 687 }
 688
 689 /*
 690  * Update local flags from all remote connected nodes and push out
 691  * flags changes to all nodes.  This is only run by the leader.
 692  */
 693 static int update_flags(struct ctdb_recoverd *rec,
 694                         struct ctdb_node_map_old *nodemap,
 695                         struct ctdb_node_map_old **remote_nodemaps)
 696 {
 697         unsigned int j;
 698         struct ctdb_context *ctdb = rec->ctdb;
 699         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
 700
 701         /* Check flags from remote nodes */
 702         for (j=0; j<nodemap->num; j++) {
 703                 struct ctdb_node_map_old *remote_nodemap=NULL;
 704                 uint32_t local_flags = nodemap->nodes[j].flags;
 705                 uint32_t remote_pnn = nodemap->nodes[j].pnn;
 706                 uint32_t remote_flags;
 707                 unsigned int i;
 708                 int ret;
 709
 710                 if (local_flags & NODE_FLAGS_DISCONNECTED) {
 711                         continue;
 712                 }
 713                 if (remote_pnn == rec->pnn) {
 714                         /*
 715                          * No remote nodemap for this node since this
 716                          * is the local nodemap.  However, still need
 717                          * to check this against the remote nodes and
 718                          * push it if they are out-of-date.
 719                          */
 720                         goto compare_remotes;
 721                 }
 722
 723                 remote_nodemap = remote_nodemaps[j];
 724                 remote_flags = remote_nodemap->nodes[j].flags;
 725
 726                 if (local_flags != remote_flags) {
 727                         /*
 728                          * Update the local copy of the flags in the
 729                          * recovery daemon.
 730                          */
 731                         D_NOTICE("Remote node %u had flags 0x%x, "
 732                                  "local had 0x%x - updating local\n",
 733                                  remote_pnn,
 734                                  remote_flags,
 735                                  local_flags);
 736                         nodemap->nodes[j].flags = remote_flags;
 737                         local_flags = remote_flags;
 738                         goto push;
 739                 }
 740
 741 compare_remotes:
 742                 for (i = 0; i < nodemap->num; i++) {
 743                         if (i == j) {
 744                                 continue;
 745                         }
 746                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
 747                                 continue;
 748                         }
 749                         if (nodemap->nodes[i].pnn == rec->pnn) {
 750                                 continue;
 751                         }
 752
 753                         remote_nodemap = remote_nodemaps[i];
 754                         remote_flags = remote_nodemap->nodes[j].flags;
 755
 756                         if (local_flags != remote_flags) {
 757                                 goto push;
 758                         }
 759                 }
 760
 761                 continue;
 762
 763 push:
 764                 D_NOTICE("Pushing updated flags for node %u (0x%x)\n",
 765                          remote_pnn,
 766                          local_flags);
 767                 ret = update_flags_on_all_nodes(rec, remote_pnn, local_flags);
 768                 if (ret != 0) {
 769                         DBG_ERR("Unable to update flags on remote nodes\n");
 770                         talloc_free(mem_ctx);
 771                         return -1;
 772                 }
 773         }
 774         talloc_free(mem_ctx);
 775         return 0;
 776 }
 777
 778
 779 /* Create a new random generation id.
 780    The generation id can not be the INVALID_GENERATION id
 781 */
 782 static uint32_t new_generation(void)
 783 {
 784         uint32_t generation;
 785
 786         while (1) {
 787                 generation = random();
 788
 789                 if (generation != INVALID_GENERATION) {
 790                         break;
 791                 }
 792         }
 793
 794         return generation;
 795 }
 796
 797 static bool cluster_lock_held(struct ctdb_recoverd *rec)
 798 {
 799         return (rec->cluster_lock_handle != NULL);
 800 }
 801
 802 struct ctdb_cluster_lock_handle {
 803         bool done;
 804         bool locked;
 805         double latency;
 806         struct ctdb_cluster_mutex_handle *h;
 807         struct ctdb_recoverd *rec;
 808 };
 809
 810 static void take_cluster_lock_handler(char status,
 811                                       double latency,
 812                                       void *private_data)
 813 {
 814         struct ctdb_cluster_lock_handle *s =
 815                 (struct ctdb_cluster_lock_handle *) private_data;
 816
 817         s->locked = (status == '0') ;
 818
 819         /*
 820          * If unsuccessful then ensure the process has exited and that
 821          * the file descriptor event handler has been cancelled
 822          */
 823         if (! s->locked) {
 824                 TALLOC_FREE(s->h);
 825         }
 826
 827         switch (status) {
 828         case '0':
 829                 s->latency = latency;
 830                 break;
 831
 832         case '1':
 833                 D_ERR("Unable to take cluster lock - contention\n");
 834                 break;
 835
 836         case '2':
 837                 D_ERR("Unable to take cluster lock - timeout\n");
 838                 break;
 839
 840         default:
 841                 D_ERR("Unable to take cluster lock - unknown error\n");
 842
 843                 {
 844                         struct ctdb_recoverd *rec = s->rec;
 845
 846                         D_ERR("Banning this node\n");
 847                         ctdb_ban_node(rec, rec->pnn);
 848                 }
 849         }
 850
 851         s->done = true;
 852 }
 853
 854 static void force_election(struct ctdb_recoverd *rec);
 855
 856 static void lost_cluster_lock_handler(void *private_data)
 857 {
 858         struct ctdb_recoverd *rec = talloc_get_type_abort(
 859                 private_data, struct ctdb_recoverd);
 860
 861         D_ERR("Cluster lock helper terminated\n");
 862         TALLOC_FREE(rec->cluster_lock_handle);
 863
 864         if (this_node_can_be_leader(rec)) {
 865                 force_election(rec);
 866         }
 867 }
 868
 869 static bool _cluster_lock_lock(struct ctdb_recoverd *rec)
 870 {
 871         struct ctdb_context *ctdb = rec->ctdb;
 872         struct ctdb_cluster_mutex_handle *h;
 873         struct ctdb_cluster_lock_handle *s;
 874
 875         s = talloc_zero(rec, struct ctdb_cluster_lock_handle);
 876         if (s == NULL) {
 877                 DBG_ERR("Memory allocation error\n");
 878                 return false;
 879         };
 880
 881         s->rec = rec;
 882
 883         h = ctdb_cluster_mutex(s,
 884                                ctdb,
 885                                ctdb->recovery_lock,
 886                                120,
 887                                take_cluster_lock_handler,
 888                                s,
 889                                lost_cluster_lock_handler,
 890                                rec);
 891         if (h == NULL) {
 892                 talloc_free(s);
 893                 return false;
 894         }
 895
 896         rec->cluster_lock_handle = s;
 897         s->h = h;
 898
 899         while (! s->done) {
 900                 tevent_loop_once(ctdb->ev);
 901         }
 902
 903         if (! s->locked) {
 904                 TALLOC_FREE(rec->cluster_lock_handle);
 905                 return false;
 906         }
 907
 908         ctdb_ctrl_report_recd_lock_latency(ctdb,
 909                                            CONTROL_TIMEOUT(),
 910                                            s->latency);
 911
 912         return true;
 913 }
 914
 915 static void cluster_lock_release(struct ctdb_recoverd *rec)
 916 {
 917         if (rec->cluster_lock_handle == NULL) {
 918                 return;
 919         }
 920
 921         if (! rec->cluster_lock_handle->done) {
 922                 /*
 923                  * Taking of cluster lock still in progress.  Free
 924                  * the cluster mutex handle to release it but leave
 925                  * the cluster lock handle in place to allow taking
 926                  * of the lock to fail.
 927                  */
 928                 D_NOTICE("Cancelling cluster lock\n");
 929                 TALLOC_FREE(rec->cluster_lock_handle->h);
 930                 rec->cluster_lock_handle->done = true;
 931                 rec->cluster_lock_handle->locked = false;
 932                 return;
 933         }
 934
 935         D_NOTICE("Releasing cluster lock\n");
 936         TALLOC_FREE(rec->cluster_lock_handle);
 937 }
 938
 939 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
 940 {
 941         struct ctdb_context *ctdb = rec->ctdb;
 942         unsigned int i;
 943         struct ctdb_banning_state *ban_state;
 944
 945         *self_ban = false;
 946         for (i=0; i<ctdb->num_nodes; i++) {
 947                 if (ctdb->nodes[i]->ban_state == NULL) {
 948                         continue;
 949                 }
 950                 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
 951                 if (ban_state->count < 2*ctdb->num_nodes) {
 952                         continue;
 953                 }
 954
 955                 D_NOTICE("Node %u reached %u banning credits\n",
 956                          ctdb->nodes[i]->pnn,
 957                          ban_state->count);
 958                 ctdb_ban_node(rec, ctdb->nodes[i]->pnn);
 959                 ban_state->count = 0;
 960
 961                 /* Banning ourself? */
 962                 if (ctdb->nodes[i]->pnn == rec->pnn) {
 963                         *self_ban = true;
 964                 }
 965         }
 966 }
 967
 968 struct helper_state {
 969         int fd[2];
 970         pid_t pid;
 971         int result;
 972         bool done;
 973 };
 974
 975 static void helper_handler(struct tevent_context *ev,
 976                            struct tevent_fd *fde,
 977                            uint16_t flags, void *private_data)
 978 {
 979         struct helper_state *state = talloc_get_type_abort(
 980                 private_data, struct helper_state);
 981         int ret;
 982
 983         ret = sys_read(state->fd[0], &state->result, sizeof(state->result));
 984         if (ret != sizeof(state->result)) {
 985                 state->result = EPIPE;
 986         }
 987
 988         state->done = true;
 989 }
 990
 991 static int helper_run(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
 992                       const char *prog, const char *arg, const char *type)
 993 {
 994         struct helper_state *state;
 995         struct tevent_fd *fde;
 996         const char **args;
 997         int nargs, ret;
 998
 999         state = talloc_zero(mem_ctx, struct helper_state);
1000         if (state == NULL) {
1001                 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1002                 return -1;
1003         }
1004
1005         state->pid = -1;
1006
1007         ret = pipe(state->fd);
1008         if (ret != 0) {
1009                 DEBUG(DEBUG_ERR,
1010                       ("Failed to create pipe for %s helper\n", type));
1011                 goto fail;
1012         }
1013
1014         set_close_on_exec(state->fd[0]);
1015
1016         nargs = 4;
1017         args = talloc_array(state, const char *, nargs);
1018         if (args == NULL) {
1019                 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1020                 goto fail;
1021         }
1022
1023         args[0] = talloc_asprintf(args, "%d", state->fd[1]);
1024         if (args[0] == NULL) {
1025                 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1026                 goto fail;
1027         }
1028         args[1] = rec->ctdb->daemon.name;
1029         args[2] = arg;
1030         args[3] = NULL;
1031
1032         if (args[2] == NULL) {
1033                 nargs = 3;
1034         }
1035
1036         state->pid = ctdb_vfork_exec(state, rec->ctdb, prog, nargs, args);
1037         if (state->pid == -1) {
1038                 DEBUG(DEBUG_ERR,
1039                       ("Failed to create child for %s helper\n", type));
1040                 goto fail;
1041         }
1042
1043         close(state->fd[1]);
1044         state->fd[1] = -1;
1045
1046         rec->helper_pid = state->pid;
1047         state->done = false;
1048
1049         fde = tevent_add_fd(rec->ctdb->ev, state, state->fd[0],
1050                             TEVENT_FD_READ, helper_handler, state);
1051         if (fde == NULL) {
1052                 goto fail;
1053         }
1054         tevent_fd_set_auto_close(fde);
1055
1056         while (!state->done) {
1057                 tevent_loop_once(rec->ctdb->ev);
1058
1059                 if (!this_node_is_leader(rec)) {
1060                         D_ERR("Leader changed to %u, aborting %s\n",
1061                               rec->leader,
1062                               type);
1063                         state->result = 1;
1064                         break;
1065                 }
1066         }
1067
1068         close(state->fd[0]);
1069         state->fd[0] = -1;
1070
1071         if (state->result != 0) {
1072                 goto fail;
1073         }
1074
1075         rec->helper_pid = -1;
1076         ctdb_kill(rec->ctdb, state->pid, SIGKILL);
1077         talloc_free(state);
1078         return 0;
1079
1080 fail:
1081         if (state->fd[0] != -1) {
1082                 close(state->fd[0]);
1083         }
1084         if (state->fd[1] != -1) {
1085                 close(state->fd[1]);
1086         }
1087         rec->helper_pid = -1;
1088         if (state->pid != -1) {
1089                 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
1090         }
1091         talloc_free(state);
1092         return -1;
1093 }
1094
1095
1096 static int ctdb_takeover(struct ctdb_recoverd *rec,
1097                          uint32_t *force_rebalance_nodes)
1098 {
1099         static char prog[PATH_MAX+1] = "";
1100         char *arg;
1101         unsigned int i;
1102         int ret;
1103
1104         if (!ctdb_set_helper("takeover_helper", prog, sizeof(prog),
1105                              "CTDB_TAKEOVER_HELPER", CTDB_HELPER_BINDIR,
1106                              "ctdb_takeover_helper")) {
1107                 ctdb_die(rec->ctdb, "Unable to set takeover helper\n");
1108         }
1109
1110         arg = NULL;
1111         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1112                 uint32_t pnn = force_rebalance_nodes[i];
1113                 if (arg == NULL) {
1114                         arg = talloc_asprintf(rec, "%u", pnn);
1115                 } else {
1116                         arg = talloc_asprintf_append(arg, ",%u", pnn);
1117                 }
1118                 if (arg == NULL) {
1119                         DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1120                         return -1;
1121                 }
1122         }
1123
1124         if (ctdb_config.failover_disabled) {
1125                 ret = setenv("CTDB_DISABLE_IP_FAILOVER", "1", 1);
1126                 if (ret != 0) {
1127                         D_ERR("Failed to set CTDB_DISABLE_IP_FAILOVER variable\n");
1128                         return -1;
1129                 }
1130         }
1131
1132         return helper_run(rec, rec, prog, arg, "takeover");
1133 }
1134
1135 static bool do_takeover_run(struct ctdb_recoverd *rec,
1136                             struct ctdb_node_map_old *nodemap)
1137 {
1138         uint32_t *nodes = NULL;
1139         struct ctdb_disable_message dtr;
1140         TDB_DATA data;
1141         size_t i;
1142         uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
1143         int ret;
1144         bool ok;
1145
1146         DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
1147
1148         if (ctdb_op_is_in_progress(rec->takeover_run)) {
1149                 DEBUG(DEBUG_ERR, (__location__
1150                                   " takeover run already in progress \n"));
1151                 ok = false;
1152                 goto done;
1153         }
1154
1155         if (!ctdb_op_begin(rec->takeover_run)) {
1156                 ok = false;
1157                 goto done;
1158         }
1159
1160         /* Disable IP checks (takeover runs, really) on other nodes
1161          * while doing this takeover run.  This will stop those other
1162          * nodes from triggering takeover runs when think they should
1163          * be hosting an IP but it isn't yet on an interface.  Don't
1164          * wait for replies since a failure here might cause some
1165          * noise in the logs but will not actually cause a problem.
1166          */
1167         ZERO_STRUCT(dtr);
1168         dtr.srvid = 0; /* No reply */
1169         dtr.pnn = -1;
1170
1171         data.dptr  = (uint8_t*)&dtr;
1172         data.dsize = sizeof(dtr);
1173
1174         nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
1175
1176         /* Disable for 60 seconds.  This can be a tunable later if
1177          * necessary.
1178          */
1179         dtr.timeout = 60;
1180         for (i = 0; i < talloc_array_length(nodes); i++) {
1181                 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1182                                              CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1183                                              data) != 0) {
1184                         DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
1185                 }
1186         }
1187
1188         ret = ctdb_takeover(rec, rec->force_rebalance_nodes);
1189
1190         /* Reenable takeover runs and IP checks on other nodes */
1191         dtr.timeout = 0;
1192         for (i = 0; i < talloc_array_length(nodes); i++) {
1193                 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1194                                              CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1195                                              data) != 0) {
1196                         DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n"));
1197                 }
1198         }
1199
1200         if (ret != 0) {
1201                 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
1202                 ok = false;
1203                 goto done;
1204         }
1205
1206         ok = true;
1207         /* Takeover run was successful so clear force rebalance targets */
1208         if (rebalance_nodes == rec->force_rebalance_nodes) {
1209                 TALLOC_FREE(rec->force_rebalance_nodes);
1210         } else {
1211                 DEBUG(DEBUG_WARNING,
1212                       ("Rebalance target nodes changed during takeover run - not clearing\n"));
1213         }
1214 done:
1215         rec->need_takeover_run = !ok;
1216         talloc_free(nodes);
1217         ctdb_op_end(rec->takeover_run);
1218
1219         DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
1220         return ok;
1221 }
1222
1223 static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
1224 {
1225         static char prog[PATH_MAX+1] = "";
1226         const char *arg;
1227
1228         if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog),
1229                              "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
1230                              "ctdb_recovery_helper")) {
1231                 ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
1232         }
1233
1234         arg = talloc_asprintf(mem_ctx, "%u", new_generation());
1235         if (arg == NULL) {
1236                 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1237                 return -1;
1238         }
1239
1240         setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1);
1241
1242         return helper_run(rec, mem_ctx, prog, arg, "recovery");
1243 }
1244
1245 /*
1246  * Main recovery function, only run by leader
1247  */
1248 static int do_recovery(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
1249 {
1250         struct ctdb_context *ctdb = rec->ctdb;
1251         struct ctdb_node_map_old *nodemap = rec->nodemap;
1252         unsigned int i;
1253         int ret;
1254         bool self_ban;
1255
1256         DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1257
1258         /* Check if the current node is still the leader.  It's possible that
1259          * re-election has changed the leader.
1260          */
1261         if (!this_node_is_leader(rec)) {
1262                 D_NOTICE("Leader changed to %u, aborting recovery\n",
1263                          rec->leader);
1264                 return -1;
1265         }
1266
1267         /* if recovery fails, force it again */
1268         rec->need_recovery = true;
1269
1270         if (!ctdb_op_begin(rec->recovery)) {
1271                 return -1;
1272         }
1273
1274         if (rec->election_in_progress) {
1275                 /* an election is in progress */
1276                 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
1277                 goto fail;
1278         }
1279
1280         ban_misbehaving_nodes(rec, &self_ban);
1281         if (self_ban) {
1282                 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
1283                 goto fail;
1284         }
1285
1286         if (cluster_lock_enabled(rec) && !cluster_lock_held(rec)) {
1287                 /* Leader can change in ban_misbehaving_nodes() */
1288                 if (!this_node_is_leader(rec)) {
1289                         D_NOTICE("Leader changed to %u, aborting recovery\n",
1290                                  rec->leader);
1291                         rec->need_recovery = false;
1292                         goto fail;
1293                 }
1294
1295                 D_ERR("Cluster lock not held - abort recovery, ban node\n");
1296                 ctdb_ban_node(rec, rec->pnn);
1297                 goto fail;
1298         }
1299
1300         DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1301
1302         /* Retrieve capabilities from all connected nodes */
1303         ret = update_capabilities(rec, nodemap);
1304         if (ret!=0) {
1305                 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1306                 return -1;
1307         }
1308
1309         /*
1310           update all nodes to have the same flags that we have
1311          */
1312         for (i=0;i<nodemap->num;i++) {
1313                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1314                         continue;
1315                 }
1316
1317                 ret = update_flags_on_all_nodes(rec,
1318                                                 nodemap->nodes[i].pnn,
1319                                                 nodemap->nodes[i].flags);
1320                 if (ret != 0) {
1321                         if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1322                                 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
1323                         } else {
1324                                 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1325                                 return -1;
1326                         }
1327                 }
1328         }
1329
1330         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1331
1332         ret = db_recovery_parallel(rec, mem_ctx);
1333         if (ret != 0) {
1334                 goto fail;
1335         }
1336
1337         do_takeover_run(rec, nodemap);
1338
1339         /* send a message to all clients telling them that the cluster
1340            has been reconfigured */
1341         ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
1342                                        CTDB_SRVID_RECONFIGURE, tdb_null);
1343         if (ret != 0) {
1344                 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
1345                 goto fail;
1346         }
1347
1348         DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1349
1350         rec->need_recovery = false;
1351         ctdb_op_end(rec->recovery);
1352
1353         /* we managed to complete a full recovery, make sure to forgive
1354            any past sins by the nodes that could now participate in the
1355            recovery.
1356         */
1357         DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1358         for (i=0;i<nodemap->num;i++) {
1359                 struct ctdb_banning_state *ban_state;
1360
1361                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1362                         continue;
1363                 }
1364
1365                 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1366                 if (ban_state == NULL) {
1367                         continue;
1368                 }
1369
1370                 ban_state->count = 0;
1371         }
1372
1373         /* We just finished a recovery successfully.
1374            We now wait for rerecovery_timeout before we allow
1375            another recovery to take place.
1376         */
1377         DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
1378         ctdb_op_disable(rec->recovery, ctdb->ev,
1379                         ctdb->tunable.rerecovery_timeout);
1380         return 0;
1381
1382 fail:
1383         ctdb_op_end(rec->recovery);
1384         return -1;
1385 }
1386
1387
1388 /*
1389   elections are won by first checking the number of connected nodes, then
1390   the priority time, then the pnn
1391  */
1392 struct election_message {
1393         uint32_t num_connected;
1394         struct timeval priority_time;
1395         uint32_t pnn;
1396         uint32_t node_flags;
1397 };
1398
1399 /*
1400   form this nodes election data
1401  */
1402 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1403 {
1404         unsigned int i;
1405         int ret;
1406         struct ctdb_node_map_old *nodemap;
1407         struct ctdb_context *ctdb = rec->ctdb;
1408
1409         ZERO_STRUCTP(em);
1410
1411         em->pnn = rec->pnn;
1412         em->priority_time = rec->priority_time;
1413
1414         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1415         if (ret != 0) {
1416                 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
1417                 return;
1418         }
1419
1420         rec->node_flags = nodemap->nodes[rec->pnn].flags;
1421         em->node_flags = rec->node_flags;
1422
1423         for (i=0;i<nodemap->num;i++) {
1424                 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1425                         em->num_connected++;
1426                 }
1427         }
1428
1429         if (!this_node_can_be_leader(rec)) {
1430                 /* Try to lose... */
1431                 em->num_connected = 0;
1432                 em->priority_time = timeval_current();
1433         }
1434
1435         talloc_free(nodemap);
1436 }
1437
1438 /*
1439   see if the given election data wins
1440  */
1441 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1442 {
1443         struct election_message myem;
1444         int cmp = 0;
1445
1446         ctdb_election_data(rec, &myem);
1447
1448         if (!this_node_can_be_leader(rec)) {
1449                 return false;
1450         }
1451
1452         /* Automatically win if other node is banned or stopped */
1453         if (em->node_flags & NODE_FLAGS_INACTIVE) {
1454                 return true;
1455         }
1456
1457         /* then the longest running node */
1458         if (cmp == 0) {
1459                 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
1460         }
1461
1462         if (cmp == 0) {
1463                 cmp = (int)myem.pnn - (int)em->pnn;
1464         }
1465
1466         return cmp > 0;
1467 }
1468
1469 /*
1470   send out an election request
1471  */
1472 static int send_election_request(struct ctdb_recoverd *rec)
1473 {
1474         TDB_DATA election_data;
1475         struct election_message emsg;
1476         uint64_t srvid;
1477         struct ctdb_context *ctdb = rec->ctdb;
1478
1479         srvid = CTDB_SRVID_ELECTION;
1480
1481         ctdb_election_data(rec, &emsg);
1482
1483         election_data.dsize = sizeof(struct election_message);
1484         election_data.dptr  = (unsigned char *)&emsg;
1485
1486
1487         /* Assume this node will win the election, set leader accordingly */
1488         rec->leader = rec->pnn;
1489
1490         /* send an election message to all active nodes */
1491         DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1492         return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1493 }
1494
1495 /*
1496   we think we are winning the election - send a broadcast election request
1497  */
1498 static void election_send_request(struct tevent_context *ev,
1499                                   struct tevent_timer *te,
1500                                   struct timeval t, void *p)
1501 {
1502         struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1503         int ret;
1504
1505         ret = send_election_request(rec);
1506         if (ret != 0) {
1507                 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
1508         }
1509
1510         TALLOC_FREE(rec->send_election_te);
1511 }
1512
1513 /*
1514   handler for memory dumps
1515 */
1516 static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1517 {
1518         struct ctdb_recoverd *rec = talloc_get_type(
1519                 private_data, struct ctdb_recoverd);
1520         struct ctdb_context *ctdb = rec->ctdb;
1521         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1522         TDB_DATA *dump;
1523         int ret;
1524         struct ctdb_srvid_message *rd;
1525
1526         if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1527                 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1528                 talloc_free(tmp_ctx);
1529                 return;
1530         }
1531         rd = (struct ctdb_srvid_message *)data.dptr;
1532
1533         dump = talloc_zero(tmp_ctx, TDB_DATA);
1534         if (dump == NULL) {
1535                 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1536                 talloc_free(tmp_ctx);
1537                 return;
1538         }
1539         ret = ctdb_dump_memory(ctdb, dump);
1540         if (ret != 0) {
1541                 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1542                 talloc_free(tmp_ctx);
1543                 return;
1544         }
1545
1546         DBG_ERR("recovery daemon memory dump\n");
1547
1548         ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1549         if (ret != 0) {
1550                 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1551                 talloc_free(tmp_ctx);
1552                 return;
1553         }
1554
1555         talloc_free(tmp_ctx);
1556 }
1557
1558 /*
1559   handler for reload_nodes
1560 */
1561 static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
1562                                  void *private_data)
1563 {
1564         struct ctdb_recoverd *rec = talloc_get_type(
1565                 private_data, struct ctdb_recoverd);
1566
1567         DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1568
1569         ctdb_load_nodes_file(rec->ctdb);
1570 }
1571
1572
1573 static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
1574                                         void *private_data)
1575 {
1576         struct ctdb_recoverd *rec = talloc_get_type(
1577                 private_data, struct ctdb_recoverd);
1578         struct ctdb_context *ctdb = rec->ctdb;
1579         uint32_t pnn;
1580         uint32_t *t;
1581         int len;
1582
1583         if (!this_node_is_leader(rec)) {
1584                 return;
1585         }
1586
1587         if (data.dsize != sizeof(uint32_t)) {
1588                 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
1589                 return;
1590         }
1591
1592         pnn = *(uint32_t *)&data.dptr[0];
1593
1594         DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
1595
1596         /* Copy any existing list of nodes.  There's probably some
1597          * sort of realloc variant that will do this but we need to
1598          * make sure that freeing the old array also cancels the timer
1599          * event for the timeout... not sure if realloc will do that.
1600          */
1601         len = (rec->force_rebalance_nodes != NULL) ?
1602                 talloc_array_length(rec->force_rebalance_nodes) :
1603                 0;
1604
1605         /* This allows duplicates to be added but they don't cause
1606          * harm.  A call to add a duplicate PNN arguably means that
1607          * the timeout should be reset, so this is the simplest
1608          * solution.
1609          */
1610         t = talloc_zero_array(rec, uint32_t, len+1);
1611         CTDB_NO_MEMORY_VOID(ctdb, t);
1612         if (len > 0) {
1613                 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
1614         }
1615         t[len] = pnn;
1616
1617         talloc_free(rec->force_rebalance_nodes);
1618
1619         rec->force_rebalance_nodes = t;
1620 }
1621
1622
1623
1624 static void srvid_disable_and_reply(struct ctdb_recoverd *rec,
1625                                     TDB_DATA data,
1626                                     struct ctdb_op_state *op_state)
1627 {
1628         struct ctdb_context *ctdb = rec->ctdb;
1629         struct ctdb_disable_message *r;
1630         uint32_t timeout;
1631         TDB_DATA result;
1632         int32_t ret = 0;
1633
1634         /* Validate input data */
1635         if (data.dsize != sizeof(struct ctdb_disable_message)) {
1636                 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1637                                  "expecting %lu\n", (long unsigned)data.dsize,
1638                                  (long unsigned)sizeof(struct ctdb_srvid_message)));
1639                 return;
1640         }
1641         if (data.dptr == NULL) {
1642                 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1643                 return;
1644         }
1645
1646         r = (struct ctdb_disable_message *)data.dptr;
1647         timeout = r->timeout;
1648
1649         ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
1650         if (ret != 0) {
1651                 goto done;
1652         }
1653
1654         /* Returning our PNN tells the caller that we succeeded */
1655         ret = rec->pnn;
1656 done:
1657         result.dsize = sizeof(int32_t);
1658         result.dptr  = (uint8_t *)&ret;
1659         srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
1660 }
1661
1662 static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
1663                                           void *private_data)
1664 {
1665         struct ctdb_recoverd *rec = talloc_get_type(
1666                 private_data, struct ctdb_recoverd);
1667
1668         srvid_disable_and_reply(rec, data, rec->takeover_run);
1669 }
1670
1671 /* Backward compatibility for this SRVID */
1672 static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
1673                                      void *private_data)
1674 {
1675         struct ctdb_recoverd *rec = talloc_get_type(
1676                 private_data, struct ctdb_recoverd);
1677         uint32_t timeout;
1678
1679         if (data.dsize != sizeof(uint32_t)) {
1680                 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1681                                  "expecting %lu\n", (long unsigned)data.dsize,
1682                                  (long unsigned)sizeof(uint32_t)));
1683                 return;
1684         }
1685         if (data.dptr == NULL) {
1686                 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1687                 return;
1688         }
1689
1690         timeout = *((uint32_t *)data.dptr);
1691
1692         ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout);
1693 }
1694
1695 static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
1696                                        void *private_data)
1697 {
1698         struct ctdb_recoverd *rec = talloc_get_type(
1699                 private_data, struct ctdb_recoverd);
1700
1701         srvid_disable_and_reply(rec, data, rec->recovery);
1702 }
1703
1704 /*
1705   handler for ip reallocate, just add it to the list of requests and
1706   handle this later in the monitor_cluster loop so we do not recurse
1707   with other requests to takeover_run()
1708 */
1709 static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
1710                                   void *private_data)
1711 {
1712         struct ctdb_srvid_message *request;
1713         struct ctdb_recoverd *rec = talloc_get_type(
1714                 private_data, struct ctdb_recoverd);
1715
1716         if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1717                 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1718                 return;
1719         }
1720
1721         request = (struct ctdb_srvid_message *)data.dptr;
1722
1723         srvid_request_add(rec->ctdb, &rec->reallocate_requests, request);
1724 }
1725
1726 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
1727                                           struct ctdb_recoverd *rec)
1728 {
1729         TDB_DATA result;
1730         int32_t ret;
1731         struct srvid_requests *current;
1732
1733         /* Only process requests that are currently pending.  More
1734          * might come in while the takeover run is in progress and
1735          * they will need to be processed later since they might
1736          * be in response flag changes.
1737          */
1738         current = rec->reallocate_requests;
1739         rec->reallocate_requests = NULL;
1740
1741         if (do_takeover_run(rec, rec->nodemap)) {
1742                 ret = rec->pnn;
1743         } else {
1744                 ret = -1;
1745         }
1746
1747         result.dsize = sizeof(int32_t);
1748         result.dptr  = (uint8_t *)&ret;
1749
1750         srvid_requests_reply(ctdb, &current, result);
1751 }
1752
1753 /*
1754  * handler for assigning banning credits
1755  */
1756 static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1757 {
1758         struct ctdb_recoverd *rec = talloc_get_type(
1759                 private_data, struct ctdb_recoverd);
1760         uint32_t ban_pnn;
1761
1762         /* Ignore if we are not leader */
1763         if (!this_node_is_leader(rec)) {
1764                 return;
1765         }
1766
1767         if (data.dsize != sizeof(uint32_t)) {
1768                 DEBUG(DEBUG_ERR, (__location__ "invalid data size %zu\n",
1769                                   data.dsize));
1770                 return;
1771         }
1772
1773         ban_pnn = *(uint32_t *)data.dptr;
1774
1775         ctdb_set_culprit_count(rec, ban_pnn, rec->nodemap->num);
1776 }
1777
1778 /*
1779  * Handler for leader elections
1780  */
1781 static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1782 {
1783         struct ctdb_recoverd *rec = talloc_get_type(
1784                 private_data, struct ctdb_recoverd);
1785         struct ctdb_context *ctdb = rec->ctdb;
1786         struct election_message *em = (struct election_message *)data.dptr;
1787
1788         /* Ignore election packets from ourself */
1789         if (rec->pnn == em->pnn) {
1790                 return;
1791         }
1792
1793         /* we got an election packet - update the timeout for the election */
1794         talloc_free(rec->election_timeout);
1795         rec->election_in_progress = true;
1796         rec->election_timeout = tevent_add_timer(
1797                         ctdb->ev, ctdb,
1798                         fast_start ?
1799                                 timeval_current_ofs(0, 500000) :
1800                                 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1801                         ctdb_election_timeout, rec);
1802
1803         /* someone called an election. check their election data
1804            and if we disagree and we would rather be the elected node,
1805            send a new election message to all other nodes
1806          */
1807         if (ctdb_election_win(rec, em)) {
1808                 if (!rec->send_election_te) {
1809                         rec->send_election_te = tevent_add_timer(
1810                                         ctdb->ev, rec,
1811                                         timeval_current_ofs(0, 500000),
1812                                         election_send_request, rec);
1813                 }
1814                 return;
1815         }
1816
1817         /* we didn't win */
1818         TALLOC_FREE(rec->send_election_te);
1819
1820         /* Release the cluster lock file */
1821         if (cluster_lock_held(rec)) {
1822                 cluster_lock_release(rec);
1823         }
1824
1825         /* Set leader to the winner of this round */
1826         rec->leader = em->pnn;
1827
1828         return;
1829 }
1830
1831 static void cluster_lock_election(struct ctdb_recoverd *rec)
1832 {
1833         bool ok;
1834
1835         if (!this_node_can_be_leader(rec)) {
1836                 if (cluster_lock_held(rec)) {
1837                         cluster_lock_release(rec);
1838                 }
1839                 goto done;
1840         }
1841
1842         /*
1843          * Don't need to unconditionally release the lock and then
1844          * attempt to retake it.  This provides stability.
1845          */
1846         if (cluster_lock_held(rec)) {
1847                 goto done;
1848         }
1849
1850         rec->leader = CTDB_UNKNOWN_PNN;
1851         rec->election_in_progress = true;
1852
1853         ok = cluster_lock_take(rec);
1854         if (ok) {
1855                 rec->leader = rec->pnn;
1856                 D_WARNING("Took cluster lock, leader=%"PRIu32"\n", rec->leader);
1857         }
1858
1859 done:
1860         rec->election_in_progress = false;
1861 }
1862
1863 /*
1864   force the start of the election process
1865  */
1866 static void force_election(struct ctdb_recoverd *rec)
1867 {
1868         int ret;
1869         struct ctdb_context *ctdb = rec->ctdb;
1870
1871         DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
1872
1873         /* set all nodes to recovery mode to stop all internode traffic */
1874         ret = set_recovery_mode(ctdb, rec, rec->nodemap, CTDB_RECOVERY_ACTIVE);
1875         if (ret != 0) {
1876                 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1877                 return;
1878         }
1879
1880         if (cluster_lock_enabled(rec)) {
1881                 cluster_lock_election(rec);
1882                 return;
1883         }
1884
1885         talloc_free(rec->election_timeout);
1886         rec->election_in_progress = true;
1887         rec->election_timeout = tevent_add_timer(
1888                         ctdb->ev, ctdb,
1889                         fast_start ?
1890                                 timeval_current_ofs(0, 500000) :
1891                                 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1892                         ctdb_election_timeout, rec);
1893
1894         ret = send_election_request(rec);
1895         if (ret!=0) {
1896                 DBG_ERR("Failed to initiate leader election");
1897                 return;
1898         }
1899
1900         /* wait for a few seconds to collect all responses */
1901         ctdb_wait_election(rec);
1902 }
1903
1904
1905 static void srvid_not_implemented(uint64_t srvid,
1906                                   TDB_DATA data,
1907                                   void *private_data)
1908 {
1909         const char *s;
1910
1911         switch (srvid) {
1912         case CTDB_SRVID_SET_NODE_FLAGS:
1913                 s = "CTDB_SRVID_SET_NODE_FLAGS";
1914                 break;
1915         default:
1916                 s = "UNKNOWN";
1917         }
1918
1919         D_WARNING("SRVID %s (0x%" PRIx64 ") is obsolete\n", s, srvid);
1920 }
1921
1922 /*
1923   handler for when we need to push out flag changes to all other nodes
1924 */
1925 static void push_flags_handler(uint64_t srvid, TDB_DATA data,
1926                                void *private_data)
1927 {
1928         struct ctdb_recoverd *rec = talloc_get_type(
1929                 private_data, struct ctdb_recoverd);
1930         struct ctdb_context *ctdb = rec->ctdb;
1931         int ret;
1932         struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1933         struct ctdb_node_map_old *nodemap=NULL;
1934         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1935         uint32_t *nodes;
1936
1937         /* read the node flags from the leader */
1938         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->leader,
1939                                    tmp_ctx, &nodemap);
1940         if (ret != 0) {
1941                 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
1942                 talloc_free(tmp_ctx);
1943                 return;
1944         }
1945         if (c->pnn >= nodemap->num) {
1946                 DBG_ERR("Nodemap from leader does not contain node %d\n",
1947                         c->pnn);
1948                 talloc_free(tmp_ctx);
1949                 return;
1950         }
1951
1952         /* send the flags update to all connected nodes */
1953         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
1954
1955         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
1956                                       nodes, 0, CONTROL_TIMEOUT(),
1957                                       false, data,
1958                                       NULL, NULL,
1959                                       NULL) != 0) {
1960                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
1961
1962                 talloc_free(tmp_ctx);
1963                 return;
1964         }
1965
1966         talloc_free(tmp_ctx);
1967 }
1968
1969 static void leader_broadcast_timeout_handler(struct tevent_context *ev,
1970                                              struct tevent_timer *te,
1971                                              struct timeval current_time,
1972                                              void *private_data)
1973 {
1974         struct ctdb_recoverd *rec = talloc_get_type_abort(
1975                 private_data, struct ctdb_recoverd);
1976
1977         rec->leader_broadcast_timeout_te = NULL;
1978
1979         /* Let other nodes know that an election is underway */
1980         leader_broadcast_send(rec, CTDB_UNKNOWN_PNN);
1981
1982         D_NOTICE("Leader broadcast timeout. Force election\n");
1983         force_election(rec);
1984 }
1985
1986 static void leader_broadcast_timeout_cancel(struct ctdb_recoverd *rec)
1987 {
1988         TALLOC_FREE(rec->leader_broadcast_timeout_te);
1989 }
1990
1991 static int leader_broadcast_timeout_start(struct ctdb_recoverd *rec)
1992 {
1993         struct ctdb_context *ctdb = rec->ctdb;
1994
1995         /*
1996          * This should not be necessary.  However, there will be
1997          * interactions with election code here.  It will want to
1998          * cancel and restart the timer around potentially long
1999          * elections.
2000          */
2001         leader_broadcast_timeout_cancel(rec);
2002
2003         rec->leader_broadcast_timeout_te =
2004                 tevent_add_timer(
2005                         ctdb->ev,
2006                         rec,
2007                         timeval_current_ofs(ctdb_config.leader_timeout, 0),
2008                         leader_broadcast_timeout_handler,
2009                         rec);
2010         if (rec->leader_broadcast_timeout_te == NULL) {
2011                 D_ERR("Unable to start leader broadcast timeout\n");
2012                 return ENOMEM;
2013         }
2014
2015         return 0;
2016 }
2017
2018 static bool leader_broadcast_timeout_active(struct ctdb_recoverd *rec)
2019 {
2020         return rec->leader_broadcast_timeout_te != NULL;
2021 }
2022
2023 static void leader_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2024 {
2025         struct ctdb_recoverd *rec = talloc_get_type_abort(
2026                 private_data, struct ctdb_recoverd);
2027         uint32_t pnn;
2028         size_t npull;
2029         int ret;
2030
2031         ret = ctdb_uint32_pull(data.dptr, data.dsize, &pnn, &npull);
2032         if (ret != 0) {
2033                 DBG_WARNING("Unable to parse leader broadcast, ret=%d\n", ret);
2034                 return;
2035         }
2036
2037         leader_broadcast_timeout_cancel(rec);
2038
2039         if (pnn == rec->leader) {
2040                 goto done;
2041         }
2042
2043         if (pnn == CTDB_UNKNOWN_PNN) {
2044                 bool was_election_in_progress = rec->election_in_progress;
2045
2046                 /*
2047                  * Leader broadcast timeout was cancelled above - stop
2048                  * main loop from restarting it until election is
2049                  * complete
2050                  */
2051                 rec->election_in_progress = true;
2052
2053                 /*
2054                  * This is the only notification for a cluster lock
2055                  * election, so handle it here...
2056                  */
2057                 if (cluster_lock_enabled(rec) && !was_election_in_progress) {
2058                         cluster_lock_election(rec);
2059                 }
2060
2061                 return;
2062         }
2063
2064         D_NOTICE("Received leader broadcast, leader=%"PRIu32"\n", pnn);
2065         rec->leader = pnn;
2066
2067 done:
2068         leader_broadcast_timeout_start(rec);
2069 }
2070
2071 struct verify_recmode_normal_data {
2072         uint32_t count;
2073         enum monitor_result status;
2074 };
2075
2076 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2077 {
2078         struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2079
2080
2081         /* one more node has responded with recmode data*/
2082         rmdata->count--;
2083
2084         /* if we failed to get the recmode, then return an error and let
2085            the main loop try again.
2086         */
2087         if (state->state != CTDB_CONTROL_DONE) {
2088                 if (rmdata->status == MONITOR_OK) {
2089                         rmdata->status = MONITOR_FAILED;
2090                 }
2091                 return;
2092         }
2093
2094         /* if we got a response, then the recmode will be stored in the
2095            status field
2096         */
2097         if (state->status != CTDB_RECOVERY_NORMAL) {
2098                 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2099                 rmdata->status = MONITOR_RECOVERY_NEEDED;
2100         }
2101
2102         return;
2103 }
2104
2105
2106 /* verify that all nodes are in normal recovery mode */
2107 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
2108 {
2109         struct verify_recmode_normal_data *rmdata;
2110         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2111         struct ctdb_client_control_state *state;
2112         enum monitor_result status;
2113         unsigned int j;
2114
2115         rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2116         CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2117         rmdata->count  = 0;
2118         rmdata->status = MONITOR_OK;
2119
2120         /* loop over all active nodes and send an async getrecmode call to
2121            them*/
2122         for (j=0; j<nodemap->num; j++) {
2123                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2124                         continue;
2125                 }
2126                 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2127                                         CONTROL_TIMEOUT(),
2128                                         nodemap->nodes[j].pnn);
2129                 if (state == NULL) {
2130                         /* we failed to send the control, treat this as
2131                            an error and try again next iteration
2132                         */
2133                         DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2134                         talloc_free(mem_ctx);
2135                         return MONITOR_FAILED;
2136                 }
2137
2138                 /* set up the callback functions */
2139                 state->async.fn = verify_recmode_normal_callback;
2140                 state->async.private_data = rmdata;
2141
2142                 /* one more control to wait for to complete */
2143                 rmdata->count++;
2144         }
2145
2146
2147         /* now wait for up to the maximum number of seconds allowed
2148            or until all nodes we expect a response from has replied
2149         */
2150         while (rmdata->count > 0) {
2151                 tevent_loop_once(ctdb->ev);
2152         }
2153
2154         status = rmdata->status;
2155         talloc_free(mem_ctx);
2156         return status;
2157 }
2158
2159
2160 static bool interfaces_have_changed(struct ctdb_context *ctdb,
2161                                     struct ctdb_recoverd *rec)
2162 {
2163         struct ctdb_iface_list_old *ifaces = NULL;
2164         TALLOC_CTX *mem_ctx;
2165         bool ret = false;
2166
2167         mem_ctx = talloc_new(NULL);
2168
2169         /* Read the interfaces from the local node */
2170         if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
2171                                  CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
2172                 D_ERR("Unable to get interfaces from local node %u\n", rec->pnn);
2173                 /* We could return an error.  However, this will be
2174                  * rare so we'll decide that the interfaces have
2175                  * actually changed, just in case.
2176                  */
2177                 talloc_free(mem_ctx);
2178                 return true;
2179         }
2180
2181         if (!rec->ifaces) {
2182                 /* We haven't been here before so things have changed */
2183                 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
2184                 ret = true;
2185         } else if (rec->ifaces->num != ifaces->num) {
2186                 /* Number of interfaces has changed */
2187                 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
2188                                      rec->ifaces->num, ifaces->num));
2189                 ret = true;
2190         } else {
2191                 /* See if interface names or link states have changed */
2192                 unsigned int i;
2193                 for (i = 0; i < rec->ifaces->num; i++) {
2194                         struct ctdb_iface * iface = &rec->ifaces->ifaces[i];
2195                         if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
2196                                 DEBUG(DEBUG_NOTICE,
2197                                       ("Interface in slot %d changed: %s => %s\n",
2198                                        i, iface->name, ifaces->ifaces[i].name));
2199                                 ret = true;
2200                                 break;
2201                         }
2202                         if (iface->link_state != ifaces->ifaces[i].link_state) {
2203                                 DEBUG(DEBUG_NOTICE,
2204                                       ("Interface %s changed state: %d => %d\n",
2205                                        iface->name, iface->link_state,
2206                                        ifaces->ifaces[i].link_state));
2207                                 ret = true;
2208                                 break;
2209                         }
2210                 }
2211         }
2212
2213         talloc_free(rec->ifaces);
2214         rec->ifaces = talloc_steal(rec, ifaces);
2215
2216         talloc_free(mem_ctx);
2217         return ret;
2218 }
2219
2220 /* Check that the local allocation of public IP addresses is correct
2221  * and do some house-keeping */
2222 static int verify_local_ip_allocation(struct ctdb_recoverd *rec)
2223 {
2224         TALLOC_CTX *mem_ctx = talloc_new(NULL);
2225         struct ctdb_context *ctdb = rec->ctdb;
2226         unsigned int j;
2227         int ret;
2228         bool need_takeover_run = false;
2229         struct ctdb_public_ip_list_old *ips = NULL;
2230
2231         /* If we are not the leader then do some housekeeping */
2232         if (!this_node_is_leader(rec)) {
2233                 /* Ignore any IP reallocate requests - only leader
2234                  * processes them
2235                  */
2236                 TALLOC_FREE(rec->reallocate_requests);
2237                 /* Clear any nodes that should be force rebalanced in
2238                  * the next takeover run.  If the leader has changed
2239                  * then we don't want to process these some time in
2240                  * the future.
2241                  */
2242                 TALLOC_FREE(rec->force_rebalance_nodes);
2243         }
2244
2245         /* Return early if disabled... */
2246         if (ctdb_config.failover_disabled ||
2247             ctdb_op_is_disabled(rec->takeover_run)) {
2248                 talloc_free(mem_ctx);
2249                 return  0;
2250         }
2251
2252         if (interfaces_have_changed(ctdb, rec)) {
2253                 need_takeover_run = true;
2254         }
2255
2256         /* If there are unhosted IPs but this node can host them then
2257          * trigger an IP reallocation */
2258
2259         /* Read *available* IPs from local node */
2260         ret = ctdb_ctrl_get_public_ips_flags(
2261                 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx,
2262                 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
2263         if (ret != 0) {
2264                 DEBUG(DEBUG_ERR, ("Unable to retrieve available public IPs\n"));
2265                 talloc_free(mem_ctx);
2266                 return -1;
2267         }
2268
2269         for (j=0; j<ips->num; j++) {
2270                 if (ips->ips[j].pnn == CTDB_UNKNOWN_PNN &&
2271                     rec->nodemap->nodes[rec->pnn].flags == 0) {
2272                         DEBUG(DEBUG_WARNING,
2273                               ("Unassigned IP %s can be served by this node\n",
2274                                ctdb_addr_to_str(&ips->ips[j].addr)));
2275                         need_takeover_run = true;
2276                 }
2277         }
2278
2279         talloc_free(ips);
2280
2281         if (!ctdb->do_checkpublicip) {
2282                 goto done;
2283         }
2284
2285         /* Validate the IP addresses that this node has on network
2286          * interfaces.  If there is an inconsistency between reality
2287          * and the state expected by CTDB then try to fix it by
2288          * triggering an IP reallocation or releasing extraneous IP
2289          * addresses. */
2290
2291         /* Read *known* IPs from local node */
2292         ret = ctdb_ctrl_get_public_ips_flags(
2293                 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
2294         if (ret != 0) {
2295                 DEBUG(DEBUG_ERR, ("Unable to retrieve known public IPs\n"));
2296                 talloc_free(mem_ctx);
2297                 return -1;
2298         }
2299
2300         for (j=0; j<ips->num; j++) {
2301                 if (ips->ips[j].pnn == rec->pnn) {
2302                         if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2303                                 DEBUG(DEBUG_ERR,
2304                                       ("Assigned IP %s not on an interface\n",
2305                                        ctdb_addr_to_str(&ips->ips[j].addr)));
2306                                 need_takeover_run = true;
2307                         }
2308                 } else {
2309                         if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2310                                 DEBUG(DEBUG_ERR,
2311                                       ("IP %s incorrectly on an interface\n",
2312                                        ctdb_addr_to_str(&ips->ips[j].addr)));
2313                                 need_takeover_run = true;
2314                         }
2315                 }
2316         }
2317
2318 done:
2319         if (need_takeover_run) {
2320                 struct ctdb_srvid_message rd;
2321                 TDB_DATA data;
2322
2323                 DEBUG(DEBUG_NOTICE,("Trigger takeoverrun\n"));
2324
2325                 ZERO_STRUCT(rd);
2326                 rd.pnn = rec->pnn;
2327                 rd.srvid = 0;
2328                 data.dptr = (uint8_t *)&rd;
2329                 data.dsize = sizeof(rd);
2330
2331                 ret = ctdb_client_send_message(ctdb,
2332                                                CTDB_BROADCAST_CONNECTED,
2333                                                CTDB_SRVID_TAKEOVER_RUN,
2334                                                data);
2335                 if (ret != 0) {
2336                         D_ERR("Failed to send takeover run request\n");
2337                 }
2338         }
2339         talloc_free(mem_ctx);
2340         return 0;
2341 }
2342
2343
2344 struct remote_nodemaps_state {
2345         struct ctdb_node_map_old **remote_nodemaps;
2346         struct ctdb_recoverd *rec;
2347 };
2348
2349 static void async_getnodemap_callback(struct ctdb_context *ctdb,
2350                                       uint32_t node_pnn,
2351                                       int32_t res,
2352                                       TDB_DATA outdata,
2353                                       void *callback_data)
2354 {
2355         struct remote_nodemaps_state *state =
2356                 (struct remote_nodemaps_state *)callback_data;
2357         struct ctdb_node_map_old **remote_nodemaps = state->remote_nodemaps;
2358         struct ctdb_node_map_old *nodemap = state->rec->nodemap;
2359         size_t i;
2360
2361         for (i = 0; i < nodemap->num; i++) {
2362                 if (nodemap->nodes[i].pnn == node_pnn) {
2363                         break;
2364                 }
2365         }
2366
2367         if (i >= nodemap->num) {
2368                 DBG_ERR("Invalid PNN %"PRIu32"\n", node_pnn);
2369                 return;
2370         }
2371
2372         remote_nodemaps[i] = (struct ctdb_node_map_old *)talloc_steal(
2373                                         remote_nodemaps, outdata.dptr);
2374
2375 }
2376
2377 static void async_getnodemap_error(struct ctdb_context *ctdb,
2378                                    uint32_t node_pnn,
2379                                    int32_t res,
2380                                    TDB_DATA outdata,
2381                                    void *callback_data)
2382 {
2383         struct remote_nodemaps_state *state =
2384                 (struct remote_nodemaps_state *)callback_data;
2385         struct ctdb_recoverd *rec = state->rec;
2386
2387         DBG_ERR("Failed to retrieve nodemap from node %u\n", node_pnn);
2388         ctdb_set_culprit(rec, node_pnn);
2389 }
2390
2391 static int get_remote_nodemaps(struct ctdb_recoverd *rec,
2392                                TALLOC_CTX *mem_ctx,
2393                                struct ctdb_node_map_old ***remote_nodemaps)
2394 {
2395         struct ctdb_context *ctdb = rec->ctdb;
2396         struct ctdb_node_map_old **t;
2397         uint32_t *nodes;
2398         struct remote_nodemaps_state state;
2399         int ret;
2400
2401         t = talloc_zero_array(mem_ctx,
2402                               struct ctdb_node_map_old *,
2403                               rec->nodemap->num);
2404         if (t == NULL) {
2405                 DBG_ERR("Memory allocation error\n");
2406                 return -1;
2407         }
2408
2409         nodes = list_of_connected_nodes(ctdb, rec->nodemap, mem_ctx, false);
2410
2411         state.remote_nodemaps = t;
2412         state.rec = rec;
2413
2414         ret = ctdb_client_async_control(ctdb,
2415                                         CTDB_CONTROL_GET_NODEMAP,
2416                                         nodes,
2417                                         0,
2418                                         CONTROL_TIMEOUT(),
2419                                         false,
2420                                         tdb_null,
2421                                         async_getnodemap_callback,
2422                                         async_getnodemap_error,
2423                                         &state);
2424         talloc_free(nodes);
2425
2426         if (ret != 0) {
2427                 talloc_free(t);
2428                 return ret;
2429         }
2430
2431         *remote_nodemaps = t;
2432         return 0;
2433 }
2434
2435 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
2436                       TALLOC_CTX *mem_ctx)
2437 {
2438         struct ctdb_node_map_old *nodemap=NULL;
2439         struct ctdb_node_map_old **remote_nodemaps=NULL;
2440         struct ctdb_vnn_map *vnnmap=NULL;
2441         struct ctdb_vnn_map *remote_vnnmap=NULL;
2442         uint32_t num_lmasters;
2443         int32_t debug_level;
2444         unsigned int i, j;
2445         int ret;
2446         bool self_ban;
2447
2448
2449         /* verify that the main daemon is still running */
2450         if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
2451                 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2452                 exit(-1);
2453         }
2454
2455         /* ping the local daemon to tell it we are alive */
2456         ctdb_ctrl_recd_ping(ctdb);
2457
2458         if (rec->election_in_progress) {
2459                 /* an election is in progress */
2460                 return;
2461         }
2462
2463         /*
2464          * Start leader broadcasts if they are not active (1st time
2465          * through main loop?  Memory allocation error?)
2466          */
2467         if (!leader_broadcast_loop_active(rec)) {
2468                 ret = leader_broadcast_loop(rec);
2469                 if (ret != 0) {
2470                         D_ERR("Failed to set up leader broadcast\n");
2471                         ctdb_set_culprit(rec, rec->pnn);
2472                 }
2473         }
2474         /*
2475          * Similar for leader broadcast timeouts.  These can also have
2476          * been stopped by another node receiving a leader broadcast
2477          * timeout and transmitting an "unknown leader broadcast".
2478          * Note that this should never be done during an election - at
2479          * the moment there is nothing between here and the above
2480          * election-in-progress check that can process an election
2481          * result (i.e. no event loop).
2482          */
2483         if (!leader_broadcast_timeout_active(rec)) {
2484                 ret = leader_broadcast_timeout_start(rec);
2485                 if (ret != 0) {
2486                         ctdb_set_culprit(rec, rec->pnn);
2487                 }
2488         }
2489
2490
2491         /* read the debug level from the parent and update locally */
2492         ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2493         if (ret !=0) {
2494                 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2495                 return;
2496         }
2497         debuglevel_set(debug_level);
2498
2499         /* get relevant tunables */
2500         ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2501         if (ret != 0) {
2502                 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2503                 return;
2504         }
2505
2506         /* get runstate */
2507         ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
2508                                      CTDB_CURRENT_NODE, &ctdb->runstate);
2509         if (ret != 0) {
2510                 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
2511                 return;
2512         }
2513
2514         /* get nodemap */
2515         ret = ctdb_ctrl_getnodemap(ctdb,
2516                                    CONTROL_TIMEOUT(),
2517                                    rec->pnn,
2518                                    rec,
2519                                    &nodemap);
2520         if (ret != 0) {
2521                 DBG_ERR("Unable to get nodemap from node %"PRIu32"\n", rec->pnn);
2522                 return;
2523         }
2524         talloc_free(rec->nodemap);
2525         rec->nodemap = nodemap;
2526
2527         /* remember our own node flags */
2528         rec->node_flags = nodemap->nodes[rec->pnn].flags;
2529
2530         ban_misbehaving_nodes(rec, &self_ban);
2531         if (self_ban) {
2532                 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
2533                 return;
2534         }
2535
2536         ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2537                                    CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2538         if (ret != 0) {
2539                 D_ERR("Failed to read recmode from local node\n");
2540                 return;
2541         }
2542
2543         /* if the local daemon is STOPPED or BANNED, we verify that the databases are
2544            also frozen and that the recmode is set to active.
2545         */
2546         if (rec->node_flags & NODE_FLAGS_INACTIVE) {
2547                 /* If this node has become inactive then we want to
2548                  * reduce the chances of it taking over the leader
2549                  * role when it becomes active again.  This
2550                  * helps to stabilise the leader role so that
2551                  * it stays on the most stable node.
2552                  */
2553                 rec->priority_time = timeval_current();
2554
2555                 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2556                         DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
2557
2558                         ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2559                         if (ret != 0) {
2560                                 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
2561
2562                                 return;
2563                         }
2564                 }
2565                 if (! rec->frozen_on_inactive) {
2566                         ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(),
2567                                                CTDB_CURRENT_NODE);
2568                         if (ret != 0) {
2569                                 DEBUG(DEBUG_ERR,
2570                                       (__location__ " Failed to freeze node "
2571                                        "in STOPPED or BANNED state\n"));
2572                                 return;
2573                         }
2574
2575                         rec->frozen_on_inactive = true;
2576                 }
2577
2578                 /* If this node is stopped or banned then it is not the recovery
2579                  * master, so don't do anything. This prevents stopped or banned
2580                  * node from starting election and sending unnecessary controls.
2581                  */
2582                 return;
2583         }
2584
2585         rec->frozen_on_inactive = false;
2586
2587         /* Retrieve capabilities from all connected nodes */
2588         ret = update_capabilities(rec, nodemap);
2589         if (ret != 0) {
2590                 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2591                 return;
2592         }
2593
2594         if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2595                 /* Check if an IP takeover run is needed and trigger one if
2596                  * necessary */
2597                 verify_local_ip_allocation(rec);
2598         }
2599
2600         /* If this node is not the leader then skip recovery checks */
2601         if (!this_node_is_leader(rec)) {
2602                 return;
2603         }
2604
2605
2606         /* Get the nodemaps for all connected remote nodes */
2607         ret = get_remote_nodemaps(rec, mem_ctx, &remote_nodemaps);
2608         if (ret != 0) {
2609                 DBG_ERR("Failed to read remote nodemaps\n");
2610                 return;
2611         }
2612
2613         /* Ensure our local and remote flags are correct */
2614         ret = update_flags(rec, nodemap, remote_nodemaps);
2615         if (ret != 0) {
2616                 D_ERR("Unable to update flags\n");
2617                 return;
2618         }
2619
2620         if (ctdb->num_nodes != nodemap->num) {
2621                 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
2622                 ctdb_load_nodes_file(ctdb);
2623                 return;
2624         }
2625
2626         /* get the vnnmap */
2627         ret = ctdb_ctrl_getvnnmap(ctdb,
2628                                   CONTROL_TIMEOUT(),
2629                                   rec->pnn,
2630                                   mem_ctx,
2631                                   &vnnmap);
2632         if (ret != 0) {
2633                 DBG_ERR("Unable to get vnnmap from node %u\n", rec->pnn);
2634                 return;
2635         }
2636
2637         if (rec->need_recovery) {
2638                 /* a previous recovery didn't finish */
2639                 do_recovery(rec, mem_ctx);
2640                 return;
2641         }
2642
2643         /* verify that all active nodes are in normal mode
2644            and not in recovery mode
2645         */
2646         switch (verify_recmode(ctdb, nodemap)) {
2647         case MONITOR_RECOVERY_NEEDED:
2648                 do_recovery(rec, mem_ctx);
2649                 return;
2650         case MONITOR_FAILED:
2651                 return;
2652         case MONITOR_ELECTION_NEEDED:
2653                 /* can not happen */
2654         case MONITOR_OK:
2655                 break;
2656         }
2657
2658         if (cluster_lock_enabled(rec)) {
2659                 /* We must already hold the cluster lock */
2660                 if (!cluster_lock_held(rec)) {
2661                         D_ERR("Failed cluster lock sanity check\n");
2662                         ctdb_set_culprit(rec, rec->pnn);
2663                         do_recovery(rec, mem_ctx);
2664                         return;
2665                 }
2666         }
2667
2668
2669         /* If recoveries are disabled then there is no use doing any
2670          * nodemap or flags checks.  Recoveries might be disabled due
2671          * to "reloadnodes", so doing these checks might cause an
2672          * unnecessary recovery.  */
2673         if (ctdb_op_is_disabled(rec->recovery)) {
2674                 goto takeover_run_checks;
2675         }
2676
2677         /* verify that all other nodes have the same nodemap as we have
2678         */
2679         for (j=0; j<nodemap->num; j++) {
2680                 if (nodemap->nodes[j].pnn == rec->pnn) {
2681                         continue;
2682                 }
2683                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2684                         continue;
2685                 }
2686
2687                 /* if the nodes disagree on how many nodes there are
2688                    then this is a good reason to try recovery
2689                  */
2690                 if (remote_nodemaps[j]->num != nodemap->num) {
2691                         DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
2692                                   nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
2693                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2694                         do_recovery(rec, mem_ctx);
2695                         return;
2696                 }
2697
2698                 /* if the nodes disagree on which nodes exist and are
2699                    active, then that is also a good reason to do recovery
2700                  */
2701                 for (i=0;i<nodemap->num;i++) {
2702                         if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
2703                                 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
2704                                           nodemap->nodes[j].pnn, i,
2705                                           remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
2706                                 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2707                                 do_recovery(rec, mem_ctx);
2708                                 return;
2709                         }
2710                 }
2711         }
2712
2713         /* count how many active nodes there are */
2714         num_lmasters  = 0;
2715         for (i=0; i<nodemap->num; i++) {
2716                 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2717                         if (ctdb_node_has_capabilities(rec->caps,
2718                                                        ctdb->nodes[i]->pnn,
2719                                                        CTDB_CAP_LMASTER)) {
2720                                 num_lmasters++;
2721                         }
2722                 }
2723         }
2724
2725
2726         /* There must be the same number of lmasters in the vnn map as
2727          * there are active nodes with the lmaster capability...  or
2728          * do a recovery.
2729          */
2730         if (vnnmap->size != num_lmasters) {
2731                 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
2732                           vnnmap->size, num_lmasters));
2733                 ctdb_set_culprit(rec, rec->pnn);
2734                 do_recovery(rec, mem_ctx);
2735                 return;
2736         }
2737
2738         /*
2739          * Verify that all active lmaster nodes in the nodemap also
2740          * exist in the vnnmap
2741          */
2742         for (j=0; j<nodemap->num; j++) {
2743                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2744                         continue;
2745                 }
2746                 if (! ctdb_node_has_capabilities(rec->caps,
2747                                                  nodemap->nodes[j].pnn,
2748                                                  CTDB_CAP_LMASTER)) {
2749                         continue;
2750                 }
2751                 if (nodemap->nodes[j].pnn == rec->pnn) {
2752                         continue;
2753                 }
2754
2755                 for (i=0; i<vnnmap->size; i++) {
2756                         if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
2757                                 break;
2758                         }
2759                 }
2760                 if (i == vnnmap->size) {
2761                         D_ERR("Active LMASTER node %u is not in the vnnmap\n",
2762                               nodemap->nodes[j].pnn);
2763                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2764                         do_recovery(rec, mem_ctx);
2765                         return;
2766                 }
2767         }
2768
2769
2770         /* verify that all other nodes have the same vnnmap
2771            and are from the same generation
2772          */
2773         for (j=0; j<nodemap->num; j++) {
2774                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2775                         continue;
2776                 }
2777                 if (nodemap->nodes[j].pnn == rec->pnn) {
2778                         continue;
2779                 }
2780
2781                 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2782                                           mem_ctx, &remote_vnnmap);
2783                 if (ret != 0) {
2784                         DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
2785                                   nodemap->nodes[j].pnn));
2786                         return;
2787                 }
2788
2789                 /* verify the vnnmap generation is the same */
2790                 if (vnnmap->generation != remote_vnnmap->generation) {
2791                         DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
2792                                   nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
2793                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2794                         do_recovery(rec, mem_ctx);
2795                         return;
2796                 }
2797
2798                 /* verify the vnnmap size is the same */
2799                 if (vnnmap->size != remote_vnnmap->size) {
2800                         DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
2801                                   nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
2802                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2803                         do_recovery(rec, mem_ctx);
2804                         return;
2805                 }
2806
2807                 /* verify the vnnmap is the same */
2808                 for (i=0;i<vnnmap->size;i++) {
2809                         if (remote_vnnmap->map[i] != vnnmap->map[i]) {
2810                                 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
2811                                           nodemap->nodes[j].pnn));
2812                                 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2813                                 do_recovery(rec, mem_ctx);
2814                                 return;
2815                         }
2816                 }
2817         }
2818
2819         /* FIXME: Add remote public IP checking to ensure that nodes
2820          * have the IP addresses that are allocated to them. */
2821
2822 takeover_run_checks:
2823
2824         /* If there are IP takeover runs requested or the previous one
2825          * failed then perform one and notify the waiters */
2826         if (!ctdb_op_is_disabled(rec->takeover_run) &&
2827             (rec->reallocate_requests || rec->need_takeover_run)) {
2828                 process_ipreallocate_requests(ctdb, rec);
2829         }
2830 }
2831
2832 static void recd_sig_term_handler(struct tevent_context *ev,
2833                                   struct tevent_signal *se, int signum,
2834                                   int count, void *dont_care,
2835                                   void *private_data)
2836 {
2837         struct ctdb_recoverd *rec = talloc_get_type_abort(
2838                 private_data, struct ctdb_recoverd);
2839
2840         DEBUG(DEBUG_ERR, ("Received SIGTERM, exiting\n"));
2841         cluster_lock_release(rec);
2842         exit(0);
2843 }
2844
2845 /*
2846  * Periodically log elements of the cluster state
2847  *
2848  * This can be used to confirm a split brain has occurred
2849  */
2850 static void maybe_log_cluster_state(struct tevent_context *ev,
2851                                     struct tevent_timer *te,
2852                                     struct timeval current_time,
2853                                     void *private_data)
2854 {
2855         struct ctdb_recoverd *rec = talloc_get_type_abort(
2856                 private_data, struct ctdb_recoverd);
2857         struct ctdb_context *ctdb = rec->ctdb;
2858         struct tevent_timer *tt;
2859
2860         static struct timeval start_incomplete = {
2861                 .tv_sec = 0,
2862         };
2863
2864         bool is_complete;
2865         bool was_complete;
2866         unsigned int i;
2867         double seconds;
2868         unsigned int minutes;
2869         unsigned int num_connected;
2870
2871         if (!this_node_is_leader(rec)) {
2872                 goto done;
2873         }
2874
2875         if (rec->nodemap == NULL) {
2876                 goto done;
2877         }
2878
2879         is_complete = true;
2880         num_connected = 0;
2881         for (i = 0; i < rec->nodemap->num; i++) {
2882                 struct ctdb_node_and_flags *n = &rec->nodemap->nodes[i];
2883
2884                 if (n->pnn == rec->pnn) {
2885                         continue;
2886                 }
2887                 if ((n->flags & NODE_FLAGS_DELETED) != 0) {
2888                         continue;
2889                 }
2890                 if ((n->flags & NODE_FLAGS_DISCONNECTED) != 0) {
2891                         is_complete = false;
2892                         continue;
2893                 }
2894
2895                 num_connected++;
2896         }
2897
2898         was_complete = timeval_is_zero(&start_incomplete);
2899
2900         if (is_complete) {
2901                 if (! was_complete) {
2902                         D_WARNING("Cluster complete with leader=%u\n",
2903                                   rec->leader);
2904                         start_incomplete = timeval_zero();
2905                 }
2906                 goto done;
2907         }
2908
2909         /* Cluster is newly incomplete... */
2910         if (was_complete) {
2911                 start_incomplete = current_time;
2912                 minutes = 0;
2913                 goto log;
2914         }
2915
2916         /*
2917          * Cluster has been incomplete since previous check, so figure
2918          * out how long (in minutes) and decide whether to log anything
2919          */
2920         seconds = timeval_elapsed2(&start_incomplete, &current_time);
2921         minutes = (unsigned int)seconds / 60;
2922         if (minutes >= 60) {
2923                 /* Over an hour, log every hour */
2924                 if (minutes % 60 != 0) {
2925                         goto done;
2926                 }
2927         } else if (minutes >= 10) {
2928                 /* Over 10 minutes, log every 10 minutes */
2929                 if (minutes % 10 != 0) {
2930                         goto done;
2931                 }
2932         }
2933
2934 log:
2935         D_WARNING("Cluster incomplete with leader=%u, elapsed=%u minutes, "
2936                   "connected=%u\n",
2937                   rec->leader,
2938                   minutes,
2939                   num_connected);
2940
2941 done:
2942         tt = tevent_add_timer(ctdb->ev,
2943                               rec,
2944                               timeval_current_ofs(60, 0),
2945                               maybe_log_cluster_state,
2946                               rec);
2947         if (tt == NULL) {
2948                 DBG_WARNING("Failed to set up cluster state timer\n");
2949         }
2950 }
2951
2952 static void recd_sighup_hook(void *private_data)
2953 {
2954         struct ctdb_recoverd *rec = talloc_get_type_abort(
2955                 private_data, struct ctdb_recoverd);
2956
2957         if (rec->helper_pid > 0) {
2958                 kill(rec->helper_pid, SIGHUP);
2959         }
2960 }
2961
2962 /*
2963   the main monitoring loop
2964  */
2965 static void monitor_cluster(struct ctdb_context *ctdb)
2966 {
2967         struct tevent_signal *se;
2968         struct ctdb_recoverd *rec;
2969         bool status;
2970
2971         DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
2972
2973         rec = talloc_zero(ctdb, struct ctdb_recoverd);
2974         CTDB_NO_MEMORY_FATAL(ctdb, rec);
2975
2976         rec->ctdb = ctdb;
2977         rec->leader = CTDB_UNKNOWN_PNN;
2978         rec->pnn = ctdb_get_pnn(ctdb);
2979         rec->cluster_lock_handle = NULL;
2980         rec->helper_pid = -1;
2981
2982         rec->takeover_run = ctdb_op_init(rec, "takeover runs");
2983         CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);
2984
2985         rec->recovery = ctdb_op_init(rec, "recoveries");
2986         CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);
2987
2988         rec->priority_time = timeval_current();
2989         rec->frozen_on_inactive = false;
2990
2991         status = logging_setup_sighup_handler(rec->ctdb->ev,
2992                                               rec,
2993                                               recd_sighup_hook,
2994                                               rec);
2995         if (!status) {
2996                 D_ERR("Failed to install SIGHUP handler\n");
2997                 exit(1);
2998         }
2999
3000         se = tevent_add_signal(ctdb->ev, ctdb, SIGTERM, 0,
3001                                recd_sig_term_handler, rec);
3002         if (se == NULL) {
3003                 DEBUG(DEBUG_ERR, ("Failed to install SIGTERM handler\n"));
3004                 exit(1);
3005         }
3006
3007         if (!cluster_lock_enabled(rec)) {
3008                 struct tevent_timer *tt;
3009
3010                 tt = tevent_add_timer(ctdb->ev,
3011                                       rec,
3012                                       timeval_current_ofs(60, 0),
3013                                       maybe_log_cluster_state,
3014                                       rec);
3015                 if (tt == NULL) {
3016                         DBG_WARNING("Failed to set up cluster state timer\n");
3017                 }
3018         }
3019
3020         /* register a message port for sending memory dumps */
3021         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
3022
3023         /* when a node is assigned banning credits */
3024         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING,
3025                                         banning_handler, rec);
3026
3027         /* register a message port for recovery elections */
3028         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec);
3029
3030         ctdb_client_set_message_handler(ctdb,
3031                                         CTDB_SRVID_SET_NODE_FLAGS,
3032                                         srvid_not_implemented,
3033                                         rec);
3034
3035         /* when we are asked to puch out a flag change */
3036         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
3037
3038         /* register a message port for reloadnodes  */
3039         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
3040
3041         /* register a message port for performing a takeover run */
3042         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
3043
3044         /* register a message port for disabling the ip check for a short while */
3045         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
3046
3047         /* register a message port for forcing a rebalance of a node next
3048            reallocation */
3049         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
3050
3051         /* Register a message port for disabling takeover runs */
3052         ctdb_client_set_message_handler(ctdb,
3053                                         CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
3054                                         disable_takeover_runs_handler, rec);
3055
3056         /* Register a message port for disabling recoveries */
3057         ctdb_client_set_message_handler(ctdb,
3058                                         CTDB_SRVID_DISABLE_RECOVERIES,
3059                                         disable_recoveries_handler, rec);
3060
3061         ctdb_client_set_message_handler(ctdb,
3062                                         CTDB_SRVID_LEADER,
3063                                         leader_handler,
3064                                         rec);
3065
3066         for (;;) {
3067                 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3068                 struct timeval start;
3069                 double elapsed;
3070
3071                 if (!mem_ctx) {
3072                         DEBUG(DEBUG_CRIT,(__location__
3073                                           " Failed to create temp context\n"));
3074                         exit(-1);
3075                 }
3076
3077                 start = timeval_current();
3078                 main_loop(ctdb, rec, mem_ctx);
3079                 talloc_free(mem_ctx);
3080
3081                 /* we only check for recovery once every second */
3082                 elapsed = timeval_elapsed(&start);
3083                 if (elapsed < ctdb->tunable.recover_interval) {
3084                         ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
3085                                           - elapsed);
3086                 }
3087         }
3088 }
3089
3090 /*
3091   event handler for when the main ctdbd dies
3092  */
3093 static void ctdb_recoverd_parent(struct tevent_context *ev,
3094                                  struct tevent_fd *fde,
3095                                  uint16_t flags, void *private_data)
3096 {
3097         DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3098         _exit(1);
3099 }
3100
3101 /*
3102   called regularly to verify that the recovery daemon is still running
3103  */
3104 static void ctdb_check_recd(struct tevent_context *ev,
3105                             struct tevent_timer *te,
3106                             struct timeval yt, void *p)
3107 {
3108         struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
3109
3110         if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
3111                 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
3112
3113                 tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
3114                                  ctdb_restart_recd, ctdb);
3115
3116                 return;
3117         }
3118
3119         tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3120                          timeval_current_ofs(30, 0),
3121                          ctdb_check_recd, ctdb);
3122 }
3123
3124 static void recd_sig_child_handler(struct tevent_context *ev,
3125                                    struct tevent_signal *se, int signum,
3126                                    int count, void *dont_care,
3127                                    void *private_data)
3128 {
3129 //      struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3130         int status;
3131         pid_t pid = -1;
3132
3133         while (pid != 0) {
3134                 pid = waitpid(-1, &status, WNOHANG);
3135                 if (pid == -1) {
3136                         if (errno != ECHILD) {
3137                                 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3138                         }
3139                         return;
3140                 }
3141                 if (pid > 0) {
3142                         DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3143                 }
3144         }
3145 }
3146
3147 /*
3148   startup the recovery daemon as a child of the main ctdb daemon
3149  */
3150 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3151 {
3152         int fd[2];
3153         struct tevent_signal *se;
3154         struct tevent_fd *fde;
3155         int ret;
3156
3157         if (pipe(fd) != 0) {
3158                 return -1;
3159         }
3160
3161         ctdb->recoverd_pid = ctdb_fork(ctdb);
3162         if (ctdb->recoverd_pid == -1) {
3163                 return -1;
3164         }
3165
3166         if (ctdb->recoverd_pid != 0) {
3167                 talloc_free(ctdb->recd_ctx);
3168                 ctdb->recd_ctx = talloc_new(ctdb);
3169                 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
3170
3171                 close(fd[0]);
3172                 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3173                                  timeval_current_ofs(30, 0),
3174                                  ctdb_check_recd, ctdb);
3175                 return 0;
3176         }
3177
3178         close(fd[1]);
3179
3180         srandom(getpid() ^ time(NULL));
3181
3182         ret = logging_init(ctdb, NULL, NULL, "ctdb-recoverd");
3183         if (ret != 0) {
3184                 return -1;
3185         }
3186
3187         prctl_set_comment("ctdb_recoverd");
3188         if (switch_from_server_to_client(ctdb) != 0) {
3189                 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3190                 exit(1);
3191         }
3192
3193         DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
3194
3195         fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
3196                             ctdb_recoverd_parent, &fd[0]);
3197         tevent_fd_set_auto_close(fde);
3198
3199         /* set up a handler to pick up sigchld */
3200         se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
3201                                recd_sig_child_handler, ctdb);
3202         if (se == NULL) {
3203                 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3204                 exit(1);
3205         }
3206
3207         monitor_cluster(ctdb);
3208
3209         DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3210         return -1;
3211 }
3212
3213 /*
3214   shutdown the recovery daemon
3215  */
3216 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3217 {
3218         if (ctdb->recoverd_pid == 0) {
3219                 return;
3220         }
3221
3222         DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3223         ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
3224
3225         TALLOC_FREE(ctdb->recd_ctx);
3226         TALLOC_FREE(ctdb->recd_ping_count);
3227 }
3228
3229 static void ctdb_restart_recd(struct tevent_context *ev,
3230                               struct tevent_timer *te,
3231                               struct timeval t, void *private_data)
3232 {
3233         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3234
3235         DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
3236         ctdb_stop_recoverd(ctdb);
3237         ctdb_start_recoverd(ctdb);
3238 }