ctdb-recoverd: Avoid dereferencing NULL rec->nodemap
ctdb/server/ctdb_recoverd.c
1 /*
2 ctdb recovery daemon
4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
20 #include "replace.h"
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
26 #include <popt.h>
27 #include <talloc.h>
28 #include <tevent.h>
29 #include <tdb.h>
31 #include "lib/tdb_wrap/tdb_wrap.h"
32 #include "lib/util/dlinklist.h"
33 #include "lib/util/debug.h"
34 #include "lib/util/samba_util.h"
35 #include "lib/util/sys_rw.h"
36 #include "lib/util/util_process.h"
38 #include "ctdb_private.h"
39 #include "ctdb_client.h"
41 #include "common/system_socket.h"
42 #include "common/common.h"
43 #include "common/logging.h"
45 #include "server/ctdb_config.h"
47 #include "ctdb_cluster_mutex.h"
49 /* List of SRVID requests that need to be processed */
50 struct srvid_list {
51 struct srvid_list *next, *prev;
52 struct ctdb_srvid_message *request;
55 struct srvid_requests {
56 struct srvid_list *requests;
59 static void srvid_request_reply(struct ctdb_context *ctdb,
60 struct ctdb_srvid_message *request,
61 TDB_DATA result)
63 /* Someone that sent srvid==0 does not want a reply */
64 if (request->srvid == 0) {
65 talloc_free(request);
66 return;
69 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
70 result) == 0) {
71 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
72 (unsigned)request->pnn,
73 (unsigned long long)request->srvid));
74 } else {
75 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
76 (unsigned)request->pnn,
77 (unsigned long long)request->srvid));
80 talloc_free(request);
83 static void srvid_requests_reply(struct ctdb_context *ctdb,
84 struct srvid_requests **requests,
85 TDB_DATA result)
87 struct srvid_list *r;
89 if (*requests == NULL) {
90 return;
93 for (r = (*requests)->requests; r != NULL; r = r->next) {
94 srvid_request_reply(ctdb, r->request, result);
97 /* Free the list structure... */
98 TALLOC_FREE(*requests);
101 static void srvid_request_add(struct ctdb_context *ctdb,
102 struct srvid_requests **requests,
103 struct ctdb_srvid_message *request)
105 struct srvid_list *t;
106 int32_t ret;
107 TDB_DATA result;
109 if (*requests == NULL) {
110 *requests = talloc_zero(ctdb, struct srvid_requests);
111 if (*requests == NULL) {
112 goto nomem;
116 t = talloc_zero(*requests, struct srvid_list);
117 if (t == NULL) {
118 /* If *requests was just allocated above then free it */
119 if ((*requests)->requests == NULL) {
120 TALLOC_FREE(*requests);
122 goto nomem;
125 t->request = (struct ctdb_srvid_message *)talloc_steal(t, request);
126 DLIST_ADD((*requests)->requests, t);
128 return;
130 nomem:
131 /* Failed to add the request to the list. Send a fail. */
132 DEBUG(DEBUG_ERR, (__location__
133 " Out of memory, failed to queue SRVID request\n"));
134 ret = -ENOMEM;
135 result.dsize = sizeof(ret);
136 result.dptr = (uint8_t *)&ret;
137 srvid_request_reply(ctdb, request, result);
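/* Note: SRVID replies in this file carry a single int32 as the TDB_DATA
 * payload - the responding node's PNN on success or a negative errno on
 * failure.  A request sent with srvid == 0 is fire-and-forget and never
 * receives a reply (see srvid_request_reply above). */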
140 /* An abstraction to allow an operation (takeover runs, recoveries,
141 * ...) to be disabled for a given timeout */
142 struct ctdb_op_state {
143 struct tevent_timer *timer;
144 bool in_progress;
145 const char *name;
148 static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
150 struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
152 if (state != NULL) {
153 state->in_progress = false;
154 state->name = name;
157 return state;
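/* An operation is considered disabled while its re-enable timer is pending;
 * the timer is armed by ctdb_op_disable() and cleared by ctdb_op_enable(). */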
160 static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
162 return state->timer != NULL;
165 static bool ctdb_op_begin(struct ctdb_op_state *state)
167 if (ctdb_op_is_disabled(state)) {
168 DEBUG(DEBUG_NOTICE,
169 ("Unable to begin - %s are disabled\n", state->name));
170 return false;
173 state->in_progress = true;
174 return true;
177 static bool ctdb_op_end(struct ctdb_op_state *state)
179 return state->in_progress = false;
182 static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
184 return state->in_progress;
187 static void ctdb_op_enable(struct ctdb_op_state *state)
189 TALLOC_FREE(state->timer);
192 static void ctdb_op_timeout_handler(struct tevent_context *ev,
193 struct tevent_timer *te,
194 struct timeval yt, void *p)
196 struct ctdb_op_state *state =
197 talloc_get_type(p, struct ctdb_op_state);
199 DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
200 ctdb_op_enable(state);
203 static int ctdb_op_disable(struct ctdb_op_state *state,
204 struct tevent_context *ev,
205 uint32_t timeout)
207 if (timeout == 0) {
208 DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
209 ctdb_op_enable(state);
210 return 0;
213 if (state->in_progress) {
214 DEBUG(DEBUG_ERR,
215 ("Unable to disable %s - in progress\n", state->name));
216 return -EAGAIN;
219 DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
220 state->name, timeout));
222 /* Clear any old timers */
223 talloc_free(state->timer);
225 /* Arrange for the timeout to occur */
226 state->timer = tevent_add_timer(ev, state,
227 timeval_current_ofs(timeout, 0),
228 ctdb_op_timeout_handler, state);
229 if (state->timer == NULL) {
230 DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
231 return -ENOMEM;
234 return 0;
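/* Typical flow (sketch): a SRVID handler such as srvid_disable_and_reply()
 * calls ctdb_op_disable(state, ev, timeout) to suspend takeover runs or
 * recoveries; while the timer is pending ctdb_op_begin() refuses to start the
 * operation, and ctdb_op_timeout_handler() re-enables it when it expires. */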
237 struct ctdb_banning_state {
238 uint32_t count;
239 struct timeval last_reported_time;
242 struct ctdb_recovery_lock_handle;
245 private state of recovery daemon
247 struct ctdb_recoverd {
248 struct ctdb_context *ctdb;
249 uint32_t recmaster;
250 uint32_t last_culprit_node;
251 struct ctdb_node_map_old *nodemap;
252 struct timeval priority_time;
253 bool need_takeover_run;
254 bool need_recovery;
255 uint32_t node_flags;
256 struct tevent_timer *send_election_te;
257 struct tevent_timer *election_timeout;
258 struct srvid_requests *reallocate_requests;
259 struct ctdb_op_state *takeover_run;
260 struct ctdb_op_state *recovery;
261 struct ctdb_iface_list_old *ifaces;
262 uint32_t *force_rebalance_nodes;
263 struct ctdb_node_capabilities *caps;
264 bool frozen_on_inactive;
265 struct ctdb_recovery_lock_handle *recovery_lock_handle;
268 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
269 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
271 static void ctdb_restart_recd(struct tevent_context *ev,
272 struct tevent_timer *te, struct timeval t,
273 void *private_data);
276 ban a node for a period of time
278 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
280 int ret;
281 struct ctdb_context *ctdb = rec->ctdb;
282 struct ctdb_ban_state bantime;
284 if (!ctdb_validate_pnn(ctdb, pnn)) {
285 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
286 return;
289 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
291 bantime.pnn = pnn;
292 bantime.time = ban_time;
294 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
295 if (ret != 0) {
296 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
297 return;
302 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
306 remember the troublemaker
308 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
310 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
311 struct ctdb_banning_state *ban_state;
313 if (culprit > ctdb->num_nodes) {
314 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
315 return;
318 /* If we are banned or stopped, do not set other nodes as culprits */
319 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
320 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
321 return;
324 if (ctdb->nodes[culprit]->ban_state == NULL) {
325 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
326 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
330 ban_state = ctdb->nodes[culprit]->ban_state;
331 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
332 /* this was the first time in a long while this node
333 misbehaved so we will forgive any old transgressions.
335 ban_state->count = 0;
338 ban_state->count += count;
339 ban_state->last_reported_time = timeval_current();
340 rec->last_culprit_node = culprit;
344 remember the troublemaker
346 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
348 ctdb_set_culprit_count(rec, culprit, 1);
352 Retrieve capabilities from all connected nodes
354 static int update_capabilities(struct ctdb_recoverd *rec,
355 struct ctdb_node_map_old *nodemap)
357 uint32_t *capp;
358 TALLOC_CTX *tmp_ctx;
359 struct ctdb_node_capabilities *caps;
360 struct ctdb_context *ctdb = rec->ctdb;
362 tmp_ctx = talloc_new(rec);
363 CTDB_NO_MEMORY(ctdb, tmp_ctx);
365 caps = ctdb_get_capabilities(ctdb, tmp_ctx,
366 CONTROL_TIMEOUT(), nodemap);
368 if (caps == NULL) {
369 DEBUG(DEBUG_ERR,
370 (__location__ " Failed to get node capabilities\n"));
371 talloc_free(tmp_ctx);
372 return -1;
375 capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
376 if (capp == NULL) {
377 DEBUG(DEBUG_ERR,
378 (__location__
379 " Capabilities don't include current node.\n"));
380 talloc_free(tmp_ctx);
381 return -1;
383 ctdb->capabilities = *capp;
385 TALLOC_FREE(rec->caps);
386 rec->caps = talloc_steal(rec, caps);
388 talloc_free(tmp_ctx);
389 return 0;
393 change recovery mode on all nodes
395 static int set_recovery_mode(struct ctdb_context *ctdb,
396 struct ctdb_recoverd *rec,
397 struct ctdb_node_map_old *nodemap,
398 uint32_t rec_mode)
400 TDB_DATA data;
401 uint32_t *nodes;
402 TALLOC_CTX *tmp_ctx;
404 tmp_ctx = talloc_new(ctdb);
405 CTDB_NO_MEMORY(ctdb, tmp_ctx);
407 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
409 data.dsize = sizeof(uint32_t);
410 data.dptr = (unsigned char *)&rec_mode;
412 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
413 nodes, 0,
414 CONTROL_TIMEOUT(),
415 false, data,
416 NULL, NULL,
417 NULL) != 0) {
418 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
419 talloc_free(tmp_ctx);
420 return -1;
423 talloc_free(tmp_ctx);
424 return 0;
428 update flags on all active nodes
430 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap, uint32_t pnn, uint32_t flags)
432 int ret;
434 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
435 if (ret != 0) {
436 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
437 return -1;
440 return 0;
444 called when ctdb_wait_timeout should finish
446 static void ctdb_wait_handler(struct tevent_context *ev,
447 struct tevent_timer *te,
448 struct timeval yt, void *p)
450 uint32_t *timed_out = (uint32_t *)p;
451 (*timed_out) = 1;
455 wait for a given number of seconds
457 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
459 uint32_t timed_out = 0;
460 time_t usecs = (secs - (time_t)secs) * 1000000;
461 tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
462 ctdb_wait_handler, &timed_out);
463 while (!timed_out) {
464 tevent_loop_once(ctdb->ev);
469 called when an election times out (ends)
471 static void ctdb_election_timeout(struct tevent_context *ev,
472 struct tevent_timer *te,
473 struct timeval t, void *p)
475 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
476 rec->election_timeout = NULL;
477 fast_start = false;
479 D_WARNING("Election period ended, master=%u\n", rec->recmaster);
484 wait for an election to finish. It finished election_timeout seconds after
485 the last election packet is received
487 static void ctdb_wait_election(struct ctdb_recoverd *rec)
489 struct ctdb_context *ctdb = rec->ctdb;
490 while (rec->election_timeout) {
491 tevent_loop_once(ctdb->ev);
496 Update our local flags from all remote connected nodes.
497 This is only run when we are, or believe we are, the recovery master
499 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
501 unsigned int j;
502 struct ctdb_context *ctdb = rec->ctdb;
503 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
505 /* get the nodemap for all active remote nodes and verify
506 they are the same as for this node
508 for (j=0; j<nodemap->num; j++) {
509 struct ctdb_node_map_old *remote_nodemap=NULL;
510 int ret;
512 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
513 continue;
515 if (nodemap->nodes[j].pnn == ctdb->pnn) {
516 continue;
519 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
520 mem_ctx, &remote_nodemap);
521 if (ret != 0) {
522 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
523 nodemap->nodes[j].pnn));
524 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
525 talloc_free(mem_ctx);
526 return -1;
528 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
529 /* We should tell our daemon about this so it
530 updates its flags or else we will log the same
531 message again in the next iteration of recovery.
532 Since we are the recovery master we can just as
533 well update the flags on all nodes.
535 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
536 if (ret != 0) {
537 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
538 return -1;
541 /* Update our local copy of the flags in the recovery
542 daemon.
544 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
545 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
546 nodemap->nodes[j].flags));
547 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
549 talloc_free(remote_nodemap);
551 talloc_free(mem_ctx);
552 return 0;
556 /* Create a new random generation id.
557 The generation id can not be the INVALID_GENERATION id
559 static uint32_t new_generation(void)
561 uint32_t generation;
563 while (1) {
564 generation = random();
566 if (generation != INVALID_GENERATION) {
567 break;
571 return generation;
574 static bool ctdb_recovery_have_lock(struct ctdb_recoverd *rec)
576 return (rec->recovery_lock_handle != NULL);
579 struct ctdb_recovery_lock_handle {
580 bool done;
581 bool locked;
582 double latency;
583 struct ctdb_cluster_mutex_handle *h;
584 struct ctdb_recoverd *rec;
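/* Callback from the cluster mutex helper.  The single-character status codes
 * handled below are: '0' = lock taken, '1' = contention, '2' = timeout; any
 * other value is treated as an unknown error and this node bans itself. */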
587 static void take_reclock_handler(char status,
588 double latency,
589 void *private_data)
591 struct ctdb_recovery_lock_handle *s =
592 (struct ctdb_recovery_lock_handle *) private_data;
594 s->locked = (status == '0') ;
597 * If unsuccessful then ensure the process has exited and that
598 * the file descriptor event handler has been cancelled
600 if (! s->locked) {
601 TALLOC_FREE(s->h);
604 switch (status) {
605 case '0':
606 s->latency = latency;
607 break;
609 case '1':
610 D_ERR("Unable to take recovery lock - contention\n");
611 break;
613 case '2':
614 D_ERR("Unable to take recovery lock - timeout\n");
615 break;
617 default:
618 D_ERR("Unable to take recovery lock - unknown error\n");
621 struct ctdb_recoverd *rec = s->rec;
622 struct ctdb_context *ctdb = rec->ctdb;
623 uint32_t pnn = ctdb_get_pnn(ctdb);
625 D_ERR("Banning this node\n");
626 ctdb_ban_node(rec,
627 pnn,
628 ctdb->tunable.recovery_ban_period);
632 s->done = true;
635 static void force_election(struct ctdb_recoverd *rec,
636 uint32_t pnn,
637 struct ctdb_node_map_old *nodemap);
639 static void lost_reclock_handler(void *private_data)
641 struct ctdb_recoverd *rec = talloc_get_type_abort(
642 private_data, struct ctdb_recoverd);
644 D_ERR("Recovery lock helper terminated, triggering an election\n");
645 TALLOC_FREE(rec->recovery_lock_handle);
647 force_election(rec, ctdb_get_pnn(rec->ctdb), rec->nodemap);
650 static bool ctdb_recovery_lock(struct ctdb_recoverd *rec)
652 struct ctdb_context *ctdb = rec->ctdb;
653 struct ctdb_cluster_mutex_handle *h;
654 struct ctdb_recovery_lock_handle *s;
656 s = talloc_zero(rec, struct ctdb_recovery_lock_handle);
657 if (s == NULL) {
658 DBG_ERR("Memory allocation error\n");
659 return false;
662 s->rec = rec;
664 h = ctdb_cluster_mutex(s,
665 ctdb,
666 ctdb->recovery_lock,
667 120,
668 take_reclock_handler,
669 s,
670 lost_reclock_handler,
671 rec);
672 if (h == NULL) {
673 talloc_free(s);
674 return false;
677 rec->recovery_lock_handle = s;
678 s->h = h;
680 while (! s->done) {
681 tevent_loop_once(ctdb->ev);
684 if (! s->locked) {
685 TALLOC_FREE(rec->recovery_lock_handle);
686 return false;
689 ctdb_ctrl_report_recd_lock_latency(ctdb,
690 CONTROL_TIMEOUT(),
691 s->latency);
693 return true;
696 static void ctdb_recovery_unlock(struct ctdb_recoverd *rec)
698 if (rec->recovery_lock_handle == NULL) {
699 return;
702 if (! rec->recovery_lock_handle->done) {
704 * Taking of recovery lock still in progress. Free
705 * the cluster mutex handle to release it but leave
706 * the recovery lock handle in place to allow taking
707 * of the lock to fail.
709 D_NOTICE("Cancelling recovery lock\n");
710 TALLOC_FREE(rec->recovery_lock_handle->h);
711 rec->recovery_lock_handle->done = true;
712 rec->recovery_lock_handle->locked = false;
713 return;
716 D_NOTICE("Releasing recovery lock\n");
717 TALLOC_FREE(rec->recovery_lock_handle);
720 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
722 struct ctdb_context *ctdb = rec->ctdb;
723 unsigned int i;
724 struct ctdb_banning_state *ban_state;
726 *self_ban = false;
727 for (i=0; i<ctdb->num_nodes; i++) {
728 if (ctdb->nodes[i]->ban_state == NULL) {
729 continue;
731 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
732 if (ban_state->count < 2*ctdb->num_nodes) {
733 continue;
736 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
737 ctdb->nodes[i]->pnn, ban_state->count,
738 ctdb->tunable.recovery_ban_period));
739 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
740 ban_state->count = 0;
742 /* Banning ourself? */
743 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
744 *self_ban = true;
749 struct helper_state {
750 int fd[2];
751 pid_t pid;
752 int result;
753 bool done;
756 static void helper_handler(struct tevent_context *ev,
757 struct tevent_fd *fde,
758 uint16_t flags, void *private_data)
760 struct helper_state *state = talloc_get_type_abort(
761 private_data, struct helper_state);
762 int ret;
764 ret = sys_read(state->fd[0], &state->result, sizeof(state->result));
765 if (ret != sizeof(state->result)) {
766 state->result = EPIPE;
769 state->done = true;
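/* Run an external helper (takeover or recovery).  The helper is passed the
 * write end of a pipe as its first argument and reports a single int result
 * back over that pipe.  helper_run() loops the event system until the result
 * arrives, or aborts the run if the recovery master changes meanwhile. */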
772 static int helper_run(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
773 const char *prog, const char *arg, const char *type)
775 struct helper_state *state;
776 struct tevent_fd *fde;
777 const char **args;
778 int nargs, ret;
779 uint32_t recmaster = rec->recmaster;
781 state = talloc_zero(mem_ctx, struct helper_state);
782 if (state == NULL) {
783 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
784 return -1;
787 state->pid = -1;
789 ret = pipe(state->fd);
790 if (ret != 0) {
791 DEBUG(DEBUG_ERR,
792 ("Failed to create pipe for %s helper\n", type));
793 goto fail;
796 set_close_on_exec(state->fd[0]);
798 nargs = 4;
799 args = talloc_array(state, const char *, nargs);
800 if (args == NULL) {
801 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
802 goto fail;
805 args[0] = talloc_asprintf(args, "%d", state->fd[1]);
806 if (args[0] == NULL) {
807 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
808 goto fail;
810 args[1] = rec->ctdb->daemon.name;
811 args[2] = arg;
812 args[3] = NULL;
814 if (args[2] == NULL) {
815 nargs = 3;
818 state->pid = ctdb_vfork_exec(state, rec->ctdb, prog, nargs, args);
819 if (state->pid == -1) {
820 DEBUG(DEBUG_ERR,
821 ("Failed to create child for %s helper\n", type));
822 goto fail;
825 close(state->fd[1]);
826 state->fd[1] = -1;
828 state->done = false;
830 fde = tevent_add_fd(rec->ctdb->ev, state, state->fd[0],
831 TEVENT_FD_READ, helper_handler, state);
832 if (fde == NULL) {
833 goto fail;
835 tevent_fd_set_auto_close(fde);
837 while (!state->done) {
838 tevent_loop_once(rec->ctdb->ev);
840 /* If recmaster changes, we have lost election */
841 if (recmaster != rec->recmaster) {
842 D_ERR("Recmaster changed to %u, aborting %s\n",
843 rec->recmaster, type);
844 state->result = 1;
845 break;
849 close(state->fd[0]);
850 state->fd[0] = -1;
852 if (state->result != 0) {
853 goto fail;
856 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
857 talloc_free(state);
858 return 0;
860 fail:
861 if (state->fd[0] != -1) {
862 close(state->fd[0]);
864 if (state->fd[1] != -1) {
865 close(state->fd[1]);
867 if (state->pid != -1) {
868 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
870 talloc_free(state);
871 return -1;
875 static int ctdb_takeover(struct ctdb_recoverd *rec,
876 uint32_t *force_rebalance_nodes)
878 static char prog[PATH_MAX+1] = "";
879 char *arg;
880 unsigned int i;
881 int ret;
883 if (!ctdb_set_helper("takeover_helper", prog, sizeof(prog),
884 "CTDB_TAKEOVER_HELPER", CTDB_HELPER_BINDIR,
885 "ctdb_takeover_helper")) {
886 ctdb_die(rec->ctdb, "Unable to set takeover helper\n");
889 arg = NULL;
890 for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
891 uint32_t pnn = force_rebalance_nodes[i];
892 if (arg == NULL) {
893 arg = talloc_asprintf(rec, "%u", pnn);
894 } else {
895 arg = talloc_asprintf_append(arg, ",%u", pnn);
897 if (arg == NULL) {
898 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
899 return -1;
903 if (ctdb_config.failover_disabled) {
904 ret = setenv("CTDB_DISABLE_IP_FAILOVER", "1", 1);
905 if (ret != 0) {
906 D_ERR("Failed to set CTDB_DISABLE_IP_FAILOVER variable\n");
907 return -1;
911 return helper_run(rec, rec, prog, arg, "takeover");
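/* Perform a takeover run: takeover runs are temporarily disabled on the other
 * connected nodes (for 60 seconds), the takeover helper is executed, and then
 * takeover runs are re-enabled.  On success any force-rebalance targets are
 * cleared; on failure rec->need_takeover_run stays set so the run is retried. */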
914 static bool do_takeover_run(struct ctdb_recoverd *rec,
915 struct ctdb_node_map_old *nodemap)
917 uint32_t *nodes = NULL;
918 struct ctdb_disable_message dtr;
919 TDB_DATA data;
920 size_t i;
921 uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
922 int ret;
923 bool ok;
925 DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
927 if (ctdb_op_is_in_progress(rec->takeover_run)) {
928 DEBUG(DEBUG_ERR, (__location__
929 " takeover run already in progress \n"));
930 ok = false;
931 goto done;
934 if (!ctdb_op_begin(rec->takeover_run)) {
935 ok = false;
936 goto done;
939 /* Disable IP checks (takeover runs, really) on other nodes
940 * while doing this takeover run. This will stop those other
941 nodes from triggering takeover runs when they think they should
942 * be hosting an IP but it isn't yet on an interface. Don't
943 * wait for replies since a failure here might cause some
944 * noise in the logs but will not actually cause a problem.
946 ZERO_STRUCT(dtr);
947 dtr.srvid = 0; /* No reply */
948 dtr.pnn = -1;
950 data.dptr = (uint8_t*)&dtr;
951 data.dsize = sizeof(dtr);
953 nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
955 /* Disable for 60 seconds. This can be a tunable later if
956 * necessary.
958 dtr.timeout = 60;
959 for (i = 0; i < talloc_array_length(nodes); i++) {
960 if (ctdb_client_send_message(rec->ctdb, nodes[i],
961 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
962 data) != 0) {
963 DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
967 ret = ctdb_takeover(rec, rec->force_rebalance_nodes);
969 /* Reenable takeover runs and IP checks on other nodes */
970 dtr.timeout = 0;
971 for (i = 0; i < talloc_array_length(nodes); i++) {
972 if (ctdb_client_send_message(rec->ctdb, nodes[i],
973 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
974 data) != 0) {
975 DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n"));
979 if (ret != 0) {
980 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
981 ok = false;
982 goto done;
985 ok = true;
986 /* Takeover run was successful so clear force rebalance targets */
987 if (rebalance_nodes == rec->force_rebalance_nodes) {
988 TALLOC_FREE(rec->force_rebalance_nodes);
989 } else {
990 DEBUG(DEBUG_WARNING,
991 ("Rebalance target nodes changed during takeover run - not clearing\n"));
993 done:
994 rec->need_takeover_run = !ok;
995 talloc_free(nodes);
996 ctdb_op_end(rec->takeover_run);
998 DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
999 return ok;
1002 static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
1004 static char prog[PATH_MAX+1] = "";
1005 const char *arg;
1007 if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog),
1008 "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
1009 "ctdb_recovery_helper")) {
1010 ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
1013 arg = talloc_asprintf(mem_ctx, "%u", new_generation());
1014 if (arg == NULL) {
1015 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1016 return -1;
1019 setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1);
1021 return helper_run(rec, mem_ctx, prog, arg, "recovery");
1025 we are the recmaster, and recovery is needed - start a recovery run
1027 static int do_recovery(struct ctdb_recoverd *rec,
1028 TALLOC_CTX *mem_ctx, uint32_t pnn,
1029 struct ctdb_node_map_old *nodemap, struct ctdb_vnn_map *vnnmap)
1031 struct ctdb_context *ctdb = rec->ctdb;
1032 unsigned int i;
1033 int ret;
1034 bool self_ban;
1036 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1038 /* Check if the current node is still the recmaster. It's possible that
1039 * re-election has changed the recmaster.
1041 if (pnn != rec->recmaster) {
1042 DEBUG(DEBUG_NOTICE,
1043 ("Recovery master changed to %u, aborting recovery\n",
1044 rec->recmaster));
1045 return -1;
1048 /* if recovery fails, force it again */
1049 rec->need_recovery = true;
1051 if (!ctdb_op_begin(rec->recovery)) {
1052 return -1;
1055 if (rec->election_timeout) {
1056 /* an election is in progress */
1057 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
1058 goto fail;
1061 ban_misbehaving_nodes(rec, &self_ban);
1062 if (self_ban) {
1063 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
1064 goto fail;
1067 if (ctdb->recovery_lock != NULL) {
1068 if (ctdb_recovery_have_lock(rec)) {
1069 D_NOTICE("Already holding recovery lock\n");
1070 } else {
1071 bool ok;
1073 D_NOTICE("Attempting to take recovery lock (%s)\n",
1074 ctdb->recovery_lock);
1076 ok = ctdb_recovery_lock(rec);
1077 if (! ok) {
1078 D_ERR("Unable to take recovery lock\n");
1080 if (pnn != rec->recmaster) {
1081 D_NOTICE("Recovery master changed to %u,"
1082 " aborting recovery\n",
1083 rec->recmaster);
1084 rec->need_recovery = false;
1085 goto fail;
1088 if (ctdb->runstate ==
1089 CTDB_RUNSTATE_FIRST_RECOVERY) {
1091 * First recovery? Perhaps
1092 * current node does not yet
1093 * know who the recmaster is.
1095 D_ERR("Retrying recovery\n");
1096 goto fail;
1099 D_ERR("Abort recovery, "
1100 "ban this node for %u seconds\n",
1101 ctdb->tunable.recovery_ban_period);
1102 ctdb_ban_node(rec,
1103 pnn,
1104 ctdb->tunable.recovery_ban_period);
1105 goto fail;
1107 D_NOTICE("Recovery lock taken successfully\n");
1111 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1113 /* Retrieve capabilities from all connected nodes */
1114 ret = update_capabilities(rec, nodemap);
1115 if (ret!=0) {
1116 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1117 return -1;
1121 update all nodes to have the same flags that we have
1123 for (i=0;i<nodemap->num;i++) {
1124 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1125 continue;
1128 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1129 if (ret != 0) {
1130 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1131 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
1132 } else {
1133 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1134 return -1;
1139 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1141 ret = db_recovery_parallel(rec, mem_ctx);
1142 if (ret != 0) {
1143 goto fail;
1146 do_takeover_run(rec, nodemap);
1148 /* send a message to all clients telling them that the cluster
1149 has been reconfigured */
1150 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
1151 CTDB_SRVID_RECONFIGURE, tdb_null);
1152 if (ret != 0) {
1153 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
1154 goto fail;
1157 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1159 rec->need_recovery = false;
1160 ctdb_op_end(rec->recovery);
1162 /* we managed to complete a full recovery, make sure to forgive
1163 any past sins by the nodes that could now participate in the
1164 recovery.
1166 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1167 for (i=0;i<nodemap->num;i++) {
1168 struct ctdb_banning_state *ban_state;
1170 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1171 continue;
1174 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1175 if (ban_state == NULL) {
1176 continue;
1179 ban_state->count = 0;
1182 /* We just finished a recovery successfully.
1183 We now wait for rerecovery_timeout before we allow
1184 another recovery to take place.
1186 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
1187 ctdb_op_disable(rec->recovery, ctdb->ev,
1188 ctdb->tunable.rerecovery_timeout);
1189 return 0;
1191 fail:
1192 ctdb_op_end(rec->recovery);
1193 return -1;
1198 elections are won by first checking the number of connected nodes, then
1199 the priority time, then the pnn
1201 struct election_message {
1202 uint32_t num_connected;
1203 struct timeval priority_time;
1204 uint32_t pnn;
1205 uint32_t node_flags;
1209 form this nodes election data
1211 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1213 unsigned int i;
1214 int ret;
1215 struct ctdb_node_map_old *nodemap;
1216 struct ctdb_context *ctdb = rec->ctdb;
1218 ZERO_STRUCTP(em);
1220 em->pnn = rec->ctdb->pnn;
1221 em->priority_time = rec->priority_time;
1223 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1224 if (ret != 0) {
1225 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
1226 return;
1229 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1230 em->node_flags = rec->node_flags;
1232 for (i=0;i<nodemap->num;i++) {
1233 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1234 em->num_connected++;
1238 /* we shouldn't try to win this election if we can't be a recmaster */
1239 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1240 em->num_connected = 0;
1241 em->priority_time = timeval_current();
1244 talloc_free(nodemap);
1248 see if the given election data wins
1250 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1252 struct election_message myem;
1253 int cmp = 0;
1255 ctdb_election_data(rec, &myem);
1257 /* we can't win if we don't have the recmaster capability */
1258 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1259 return false;
1262 /* we can't win if we are banned */
1263 if (rec->node_flags & NODE_FLAGS_BANNED) {
1264 return false;
1267 /* we can't win if we are stopped */
1268 if (rec->node_flags & NODE_FLAGS_STOPPED) {
1269 return false;
1272 /* we will automatically win if the other node is banned */
1273 if (em->node_flags & NODE_FLAGS_BANNED) {
1274 return true;
1277 /* we will automatically win if the other node is stopped */
1278 if (em->node_flags & NODE_FLAGS_STOPPED) {
1279 return true;
1282 /* then the longest running node */
1283 if (cmp == 0) {
1284 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
1287 if (cmp == 0) {
1288 cmp = (int)myem.pnn - (int)em->pnn;
1291 return cmp > 0;
1295 send out an election request
1297 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
1299 int ret;
1300 TDB_DATA election_data;
1301 struct election_message emsg;
1302 uint64_t srvid;
1303 struct ctdb_context *ctdb = rec->ctdb;
1305 srvid = CTDB_SRVID_ELECTION;
1307 ctdb_election_data(rec, &emsg);
1309 election_data.dsize = sizeof(struct election_message);
1310 election_data.dptr = (unsigned char *)&emsg;
1313 /* first we assume we will win the election and set
1314 recoverymaster to be ourself on the current node
1316 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1317 CTDB_CURRENT_NODE, pnn);
1318 if (ret != 0) {
1319 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster\n"));
1320 return -1;
1322 rec->recmaster = pnn;
1324 /* send an election message to all active nodes */
1325 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1326 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1330 we think we are winning the election - send a broadcast election request
1332 static void election_send_request(struct tevent_context *ev,
1333 struct tevent_timer *te,
1334 struct timeval t, void *p)
1336 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1337 int ret;
1339 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
1340 if (ret != 0) {
1341 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
1344 TALLOC_FREE(rec->send_election_te);
1348 handler for memory dumps
1350 static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1352 struct ctdb_recoverd *rec = talloc_get_type(
1353 private_data, struct ctdb_recoverd);
1354 struct ctdb_context *ctdb = rec->ctdb;
1355 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1356 TDB_DATA *dump;
1357 int ret;
1358 struct ctdb_srvid_message *rd;
1360 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1361 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1362 talloc_free(tmp_ctx);
1363 return;
1365 rd = (struct ctdb_srvid_message *)data.dptr;
1367 dump = talloc_zero(tmp_ctx, TDB_DATA);
1368 if (dump == NULL) {
1369 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1370 talloc_free(tmp_ctx);
1371 return;
1373 ret = ctdb_dump_memory(ctdb, dump);
1374 if (ret != 0) {
1375 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1376 talloc_free(tmp_ctx);
1377 return;
1380 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
1382 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1383 if (ret != 0) {
1384 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1385 talloc_free(tmp_ctx);
1386 return;
1389 talloc_free(tmp_ctx);
1393 handler for reload_nodes
1395 static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
1396 void *private_data)
1398 struct ctdb_recoverd *rec = talloc_get_type(
1399 private_data, struct ctdb_recoverd);
1401 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1403 ctdb_load_nodes_file(rec->ctdb);
1407 static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
1408 void *private_data)
1410 struct ctdb_recoverd *rec = talloc_get_type(
1411 private_data, struct ctdb_recoverd);
1412 struct ctdb_context *ctdb = rec->ctdb;
1413 uint32_t pnn;
1414 uint32_t *t;
1415 int len;
1417 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
1418 return;
1421 if (data.dsize != sizeof(uint32_t)) {
1422 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
1423 return;
1426 pnn = *(uint32_t *)&data.dptr[0];
1428 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
1430 /* Copy any existing list of nodes. There's probably some
1431 * sort of realloc variant that will do this but we need to
1432 * make sure that freeing the old array also cancels the timer
1433 * event for the timeout... not sure if realloc will do that.
1435 len = (rec->force_rebalance_nodes != NULL) ?
1436 talloc_array_length(rec->force_rebalance_nodes) :
1437 0;
1439 /* This allows duplicates to be added but they don't cause
1440 * harm. A call to add a duplicate PNN arguably means that
1441 * the timeout should be reset, so this is the simplest
1442 * solution.
1444 t = talloc_zero_array(rec, uint32_t, len+1);
1445 CTDB_NO_MEMORY_VOID(ctdb, t);
1446 if (len > 0) {
1447 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
1449 t[len] = pnn;
1451 talloc_free(rec->force_rebalance_nodes);
1453 rec->force_rebalance_nodes = t;
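/* Common handler for the "disable takeover runs" and "disable recoveries"
 * SRVID messages: validate the ctdb_disable_message, disable the operation
 * for the requested timeout and reply with this node's PNN on success or a
 * negative error code on failure. */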
1458 static void srvid_disable_and_reply(struct ctdb_context *ctdb,
1459 TDB_DATA data,
1460 struct ctdb_op_state *op_state)
1462 struct ctdb_disable_message *r;
1463 uint32_t timeout;
1464 TDB_DATA result;
1465 int32_t ret = 0;
1467 /* Validate input data */
1468 if (data.dsize != sizeof(struct ctdb_disable_message)) {
1469 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1470 "expecting %lu\n", (long unsigned)data.dsize,
1471 (long unsigned)sizeof(struct ctdb_disable_message)));
1472 return;
1474 if (data.dptr == NULL) {
1475 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1476 return;
1479 r = (struct ctdb_disable_message *)data.dptr;
1480 timeout = r->timeout;
1482 ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
1483 if (ret != 0) {
1484 goto done;
1487 /* Returning our PNN tells the caller that we succeeded */
1488 ret = ctdb_get_pnn(ctdb);
1489 done:
1490 result.dsize = sizeof(int32_t);
1491 result.dptr = (uint8_t *)&ret;
1492 srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
1495 static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
1496 void *private_data)
1498 struct ctdb_recoverd *rec = talloc_get_type(
1499 private_data, struct ctdb_recoverd);
1501 srvid_disable_and_reply(rec->ctdb, data, rec->takeover_run);
1504 /* Backward compatibility for this SRVID */
1505 static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
1506 void *private_data)
1508 struct ctdb_recoverd *rec = talloc_get_type(
1509 private_data, struct ctdb_recoverd);
1510 uint32_t timeout;
1512 if (data.dsize != sizeof(uint32_t)) {
1513 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1514 "expecting %lu\n", (long unsigned)data.dsize,
1515 (long unsigned)sizeof(uint32_t)));
1516 return;
1518 if (data.dptr == NULL) {
1519 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1520 return;
1523 timeout = *((uint32_t *)data.dptr);
1525 ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout);
1528 static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
1529 void *private_data)
1531 struct ctdb_recoverd *rec = talloc_get_type(
1532 private_data, struct ctdb_recoverd);
1534 srvid_disable_and_reply(rec->ctdb, data, rec->recovery);
1538 handler for ip reallocate, just add it to the list of requests and
1539 handle this later in the monitor_cluster loop so we do not recurse
1540 with other requests to takeover_run()
1542 static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
1543 void *private_data)
1545 struct ctdb_srvid_message *request;
1546 struct ctdb_recoverd *rec = talloc_get_type(
1547 private_data, struct ctdb_recoverd);
1549 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1550 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1551 return;
1554 request = (struct ctdb_srvid_message *)data.dptr;
1556 srvid_request_add(rec->ctdb, &rec->reallocate_requests, request);
1559 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
1560 struct ctdb_recoverd *rec)
1562 TDB_DATA result;
1563 int32_t ret;
1564 struct srvid_requests *current;
1566 /* Only process requests that are currently pending. More
1567 * might come in while the takeover run is in progress and
1568 * they will need to be processed later since they might
1569 be in response to flag changes.
1571 current = rec->reallocate_requests;
1572 rec->reallocate_requests = NULL;
1574 if (do_takeover_run(rec, rec->nodemap)) {
1575 ret = ctdb_get_pnn(ctdb);
1576 } else {
1577 ret = -1;
1580 result.dsize = sizeof(int32_t);
1581 result.dptr = (uint8_t *)&ret;
1583 srvid_requests_reply(ctdb, &current, result);
1587 * handler for assigning banning credits
1589 static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1591 struct ctdb_recoverd *rec = talloc_get_type(
1592 private_data, struct ctdb_recoverd);
1593 uint32_t ban_pnn;
1595 /* Ignore if we are not recmaster */
1596 if (rec->ctdb->pnn != rec->recmaster) {
1597 return;
1600 if (data.dsize != sizeof(uint32_t)) {
1601 DEBUG(DEBUG_ERR, (__location__ "invalid data size %zu\n",
1602 data.dsize));
1603 return;
1606 ban_pnn = *(uint32_t *)data.dptr;
1608 ctdb_set_culprit_count(rec, ban_pnn, rec->nodemap->num);
1612 handler for recovery master elections
1614 static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1616 struct ctdb_recoverd *rec = talloc_get_type(
1617 private_data, struct ctdb_recoverd);
1618 struct ctdb_context *ctdb = rec->ctdb;
1619 int ret;
1620 struct election_message *em = (struct election_message *)data.dptr;
1622 /* Ignore election packets from ourself */
1623 if (ctdb->pnn == em->pnn) {
1624 return;
1627 /* we got an election packet - update the timeout for the election */
1628 talloc_free(rec->election_timeout);
1629 rec->election_timeout = tevent_add_timer(
1630 ctdb->ev, ctdb,
1631 fast_start ?
1632 timeval_current_ofs(0, 500000) :
1633 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1634 ctdb_election_timeout, rec);
1636 /* someone called an election. check their election data
1637 and if we disagree and we would rather be the elected node,
1638 send a new election message to all other nodes
1640 if (ctdb_election_win(rec, em)) {
1641 if (!rec->send_election_te) {
1642 rec->send_election_te = tevent_add_timer(
1643 ctdb->ev, rec,
1644 timeval_current_ofs(0, 500000),
1645 election_send_request, rec);
1647 return;
1650 /* we didn't win */
1651 TALLOC_FREE(rec->send_election_te);
1653 /* Release the recovery lock file */
1654 if (ctdb_recovery_have_lock(rec)) {
1655 ctdb_recovery_unlock(rec);
1658 /* ok, let that guy become recmaster then */
1659 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1660 CTDB_CURRENT_NODE, em->pnn);
1661 if (ret != 0) {
1662 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster"));
1663 return;
1665 rec->recmaster = em->pnn;
1667 return;
1672 force the start of the election process
1674 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
1675 struct ctdb_node_map_old *nodemap)
1677 int ret;
1678 struct ctdb_context *ctdb = rec->ctdb;
1680 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
1682 /* set all nodes to recovery mode to stop all internode traffic */
1683 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1684 if (ret != 0) {
1685 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1686 return;
1689 talloc_free(rec->election_timeout);
1690 rec->election_timeout = tevent_add_timer(
1691 ctdb->ev, ctdb,
1692 fast_start ?
1693 timeval_current_ofs(0, 500000) :
1694 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1695 ctdb_election_timeout, rec);
1697 ret = send_election_request(rec, pnn);
1698 if (ret!=0) {
1699 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
1700 return;
1703 /* wait for a few seconds to collect all responses */
1704 ctdb_wait_election(rec);
1710 handler for when a node changes its flags
1712 static void monitor_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1714 struct ctdb_recoverd *rec = talloc_get_type(
1715 private_data, struct ctdb_recoverd);
1716 struct ctdb_context *ctdb = rec->ctdb;
1717 int ret;
1718 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1719 struct ctdb_node_map_old *nodemap=NULL;
1720 TALLOC_CTX *tmp_ctx;
1721 unsigned int i;
1723 if (data.dsize != sizeof(*c)) {
1724 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
1725 return;
1728 tmp_ctx = talloc_new(ctdb);
1729 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
1731 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1732 if (ret != 0) {
1733 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
1734 talloc_free(tmp_ctx);
1735 return;
1739 for (i=0;i<nodemap->num;i++) {
1740 if (nodemap->nodes[i].pnn == c->pnn) break;
1743 if (i == nodemap->num) {
1744 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existent node %u\n", c->pnn));
1745 talloc_free(tmp_ctx);
1746 return;
1749 if (c->old_flags != c->new_flags) {
1750 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
1753 nodemap->nodes[i].flags = c->new_flags;
1755 talloc_free(tmp_ctx);
1759 handler for when we need to push out flag changes to all other nodes
1761 static void push_flags_handler(uint64_t srvid, TDB_DATA data,
1762 void *private_data)
1764 struct ctdb_recoverd *rec = talloc_get_type(
1765 private_data, struct ctdb_recoverd);
1766 struct ctdb_context *ctdb = rec->ctdb;
1767 int ret;
1768 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1769 struct ctdb_node_map_old *nodemap=NULL;
1770 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1771 uint32_t *nodes;
1773 /* read the node flags from the recmaster */
1774 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
1775 tmp_ctx, &nodemap);
1776 if (ret != 0) {
1777 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
1778 talloc_free(tmp_ctx);
1779 return;
1781 if (c->pnn >= nodemap->num) {
1782 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
1783 talloc_free(tmp_ctx);
1784 return;
1787 /* send the flags update to all connected nodes */
1788 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
1790 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
1791 nodes, 0, CONTROL_TIMEOUT(),
1792 false, data,
1793 NULL, NULL,
1794 NULL) != 0) {
1795 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
1797 talloc_free(tmp_ctx);
1798 return;
1801 talloc_free(tmp_ctx);
1805 struct verify_recmode_normal_data {
1806 uint32_t count;
1807 enum monitor_result status;
1810 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
1812 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
1815 /* one more node has responded with recmode data*/
1816 rmdata->count--;
1818 /* if we failed to get the recmode, then return an error and let
1819 the main loop try again.
1821 if (state->state != CTDB_CONTROL_DONE) {
1822 if (rmdata->status == MONITOR_OK) {
1823 rmdata->status = MONITOR_FAILED;
1825 return;
1828 /* if we got a response, then the recmode will be stored in the
1829 status field
1831 if (state->status != CTDB_RECOVERY_NORMAL) {
1832 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
1833 rmdata->status = MONITOR_RECOVERY_NEEDED;
1836 return;
1840 /* verify that all nodes are in normal recovery mode */
1841 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
1843 struct verify_recmode_normal_data *rmdata;
1844 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1845 struct ctdb_client_control_state *state;
1846 enum monitor_result status;
1847 unsigned int j;
1849 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
1850 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
1851 rmdata->count = 0;
1852 rmdata->status = MONITOR_OK;
1854 /* loop over all active nodes and send an async getrecmode call to
1855 them*/
1856 for (j=0; j<nodemap->num; j++) {
1857 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1858 continue;
1860 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
1861 CONTROL_TIMEOUT(),
1862 nodemap->nodes[j].pnn);
1863 if (state == NULL) {
1864 /* we failed to send the control, treat this as
1865 an error and try again next iteration
1867 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
1868 talloc_free(mem_ctx);
1869 return MONITOR_FAILED;
1872 /* set up the callback functions */
1873 state->async.fn = verify_recmode_normal_callback;
1874 state->async.private_data = rmdata;
1876 /* one more control to wait for to complete */
1877 rmdata->count++;
1881 /* now wait for up to the maximum number of seconds allowed
1882 or until all nodes we expect a response from have replied
1884 while (rmdata->count > 0) {
1885 tevent_loop_once(ctdb->ev);
1888 status = rmdata->status;
1889 talloc_free(mem_ctx);
1890 return status;
1894 struct verify_recmaster_data {
1895 struct ctdb_recoverd *rec;
1896 uint32_t count;
1897 uint32_t pnn;
1898 enum monitor_result status;
1901 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
1903 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
1906 /* one more node has responded with recmaster data*/
1907 rmdata->count--;
1909 /* if we failed to get the recmaster, then return an error and let
1910 the main loop try again.
1912 if (state->state != CTDB_CONTROL_DONE) {
1913 if (rmdata->status == MONITOR_OK) {
1914 rmdata->status = MONITOR_FAILED;
1916 return;
1919 /* if we got a response, then the recmaster will be stored in the
1920 status field
1922 if ((uint32_t)state->status != rmdata->pnn) {
1923 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
1924 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
1925 rmdata->status = MONITOR_ELECTION_NEEDED;
1928 return;
1932 /* verify that all nodes agree that we are the recmaster */
1933 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, uint32_t pnn)
1935 struct ctdb_context *ctdb = rec->ctdb;
1936 struct verify_recmaster_data *rmdata;
1937 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1938 struct ctdb_client_control_state *state;
1939 enum monitor_result status;
1940 unsigned int j;
1942 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
1943 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
1944 rmdata->rec = rec;
1945 rmdata->count = 0;
1946 rmdata->pnn = pnn;
1947 rmdata->status = MONITOR_OK;
1949 /* loop over all active nodes and send an async getrecmaster call to
1950 them*/
1951 for (j=0; j<nodemap->num; j++) {
1952 if (nodemap->nodes[j].pnn == rec->recmaster) {
1953 continue;
1955 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1956 continue;
1958 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
1959 CONTROL_TIMEOUT(),
1960 nodemap->nodes[j].pnn);
1961 if (state == NULL) {
1962 /* we failed to send the control, treat this as
1963 an error and try again next iteration
1965 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
1966 talloc_free(mem_ctx);
1967 return MONITOR_FAILED;
1970 /* set up the callback functions */
1971 state->async.fn = verify_recmaster_callback;
1972 state->async.private_data = rmdata;
1974 /* one more control to wait for to complete */
1975 rmdata->count++;
1979 /* now wait for up to the maximum number of seconds allowed
1980 or until all nodes we expect a response from have replied
1982 while (rmdata->count > 0) {
1983 tevent_loop_once(ctdb->ev);
1986 status = rmdata->status;
1987 talloc_free(mem_ctx);
1988 return status;
1991 static bool interfaces_have_changed(struct ctdb_context *ctdb,
1992 struct ctdb_recoverd *rec)
1994 struct ctdb_iface_list_old *ifaces = NULL;
1995 TALLOC_CTX *mem_ctx;
1996 bool ret = false;
1998 mem_ctx = talloc_new(NULL);
2000 /* Read the interfaces from the local node */
2001 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
2002 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
2003 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
2004 /* We could return an error. However, this will be
2005 * rare so we'll decide that the interfaces have
2006 * actually changed, just in case.
2008 talloc_free(mem_ctx);
2009 return true;
2012 if (!rec->ifaces) {
2013 /* We haven't been here before so things have changed */
2014 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
2015 ret = true;
2016 } else if (rec->ifaces->num != ifaces->num) {
2017 /* Number of interfaces has changed */
2018 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
2019 rec->ifaces->num, ifaces->num));
2020 ret = true;
2021 } else {
2022 /* See if interface names or link states have changed */
2023 unsigned int i;
2024 for (i = 0; i < rec->ifaces->num; i++) {
2025 struct ctdb_iface * iface = &rec->ifaces->ifaces[i];
2026 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
2027 DEBUG(DEBUG_NOTICE,
2028 ("Interface in slot %d changed: %s => %s\n",
2029 i, iface->name, ifaces->ifaces[i].name));
2030 ret = true;
2031 break;
2033 if (iface->link_state != ifaces->ifaces[i].link_state) {
2034 DEBUG(DEBUG_NOTICE,
2035 ("Interface %s changed state: %d => %d\n",
2036 iface->name, iface->link_state,
2037 ifaces->ifaces[i].link_state));
2038 ret = true;
2039 break;
2044 talloc_free(rec->ifaces);
2045 rec->ifaces = talloc_steal(rec, ifaces);
2047 talloc_free(mem_ctx);
2048 return ret;
2051 /* Check that the local allocation of public IP addresses is correct
2052 * and do some house-keeping */
2053 static int verify_local_ip_allocation(struct ctdb_context *ctdb,
2054 struct ctdb_recoverd *rec,
2055 uint32_t pnn,
2056 struct ctdb_node_map_old *nodemap)
2058 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2059 unsigned int j;
2060 int ret;
2061 bool need_takeover_run = false;
2062 struct ctdb_public_ip_list_old *ips = NULL;
2064 /* If we are not the recmaster then do some housekeeping */
2065 if (rec->recmaster != pnn) {
2066 /* Ignore any IP reallocate requests - only recmaster
2067 * processes them
2069 TALLOC_FREE(rec->reallocate_requests);
2070 /* Clear any nodes that should be force rebalanced in
2071 * the next takeover run. If the recovery master role
2072 * has moved then we don't want to process these some
2073 * time in the future.
2075 TALLOC_FREE(rec->force_rebalance_nodes);
2078 /* Return early if disabled... */
2079 if (ctdb_config.failover_disabled ||
2080 ctdb_op_is_disabled(rec->takeover_run)) {
2081 return 0;
2084 if (interfaces_have_changed(ctdb, rec)) {
2085 need_takeover_run = true;
2088 /* If there are unhosted IPs but this node can host them then
2089 * trigger an IP reallocation */
2091 /* Read *available* IPs from local node */
2092 ret = ctdb_ctrl_get_public_ips_flags(
2093 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx,
2094 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
2095 if (ret != 0) {
2096 DEBUG(DEBUG_ERR, ("Unable to retrieve available public IPs\n"));
2097 talloc_free(mem_ctx);
2098 return -1;
2101 for (j=0; j<ips->num; j++) {
2102 if (ips->ips[j].pnn == CTDB_UNKNOWN_PNN &&
2103 nodemap->nodes[pnn].flags == 0) {
2104 DEBUG(DEBUG_WARNING,
2105 ("Unassigned IP %s can be served by this node\n",
2106 ctdb_addr_to_str(&ips->ips[j].addr)));
2107 need_takeover_run = true;
2111 talloc_free(ips);
2113 if (!ctdb->do_checkpublicip) {
2114 goto done;
2117 /* Validate the IP addresses that this node has on network
2118 * interfaces. If there is an inconsistency between reality
2119 * and the state expected by CTDB then try to fix it by
2120 * triggering an IP reallocation or releasing extraneous IP
2121 * addresses. */
2123 /* Read *known* IPs from local node */
2124 ret = ctdb_ctrl_get_public_ips_flags(
2125 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
2126 if (ret != 0) {
2127 DEBUG(DEBUG_ERR, ("Unable to retrieve known public IPs\n"));
2128 talloc_free(mem_ctx);
2129 return -1;
2132 for (j=0; j<ips->num; j++) {
2133 if (ips->ips[j].pnn == pnn) {
2134 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2135 DEBUG(DEBUG_ERR,
2136 ("Assigned IP %s not on an interface\n",
2137 ctdb_addr_to_str(&ips->ips[j].addr)));
2138 need_takeover_run = true;
2140 } else {
2141 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2142 DEBUG(DEBUG_ERR,
2143 ("IP %s incorrectly on an interface\n",
2144 ctdb_addr_to_str(&ips->ips[j].addr)));
2145 need_takeover_run = true;
2150 done:
2151 if (need_takeover_run) {
2152 struct ctdb_srvid_message rd;
2153 TDB_DATA data;
2155 DEBUG(DEBUG_NOTICE,("Trigger takeoverrun\n"));
2157 ZERO_STRUCT(rd);
2158 rd.pnn = ctdb->pnn;
2159 rd.srvid = 0;
2160 data.dptr = (uint8_t *)&rd;
2161 data.dsize = sizeof(rd);
2163 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2164 if (ret != 0) {
2165 DEBUG(DEBUG_ERR,
2166 ("Failed to send takeover run request\n"));
2169 talloc_free(mem_ctx);
2170 return 0;
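/*
 * Illustrative sketch of the message used in the "done:" block above to ask
 * the recovery master for a takeover run.  request_takeover_run() is a
 * placeholder wrapper; the types and calls it uses all appear in this file.
 */
#if 0
static int request_takeover_run(struct ctdb_context *ctdb, uint32_t recmaster)
{
	struct ctdb_srvid_message rd;
	TDB_DATA data;

	ZERO_STRUCT(rd);
	rd.pnn = ctdb->pnn;	/* tell the recmaster who is asking */
	rd.srvid = 0;

	data.dptr = (uint8_t *)&rd;
	data.dsize = sizeof(rd);

	return ctdb_client_send_message(ctdb, recmaster,
					CTDB_SRVID_TAKEOVER_RUN, data);
}
#endif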
2174 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2176 struct ctdb_node_map_old **remote_nodemaps = callback_data;
2178 if (node_pnn >= ctdb->num_nodes) {
2179 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2180 return;
2183 remote_nodemaps[node_pnn] = (struct ctdb_node_map_old *)talloc_steal(remote_nodemaps, outdata.dptr);
2187 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2188 struct ctdb_node_map_old *nodemap,
2189 struct ctdb_node_map_old **remote_nodemaps)
2191 uint32_t *nodes;
2193 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2194 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2195 nodes, 0,
2196 CONTROL_TIMEOUT(), false, tdb_null,
2197 async_getnodemap_callback,
2198 NULL,
2199 remote_nodemaps) != 0) {
2200 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
2202 return -1;
2205 return 0;
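/*
 * Illustrative sketch of how a caller drives get_remote_nodemaps(): allocate
 * one slot per node, leave each slot NULL, and let the async callback fill in
 * the replies that arrive.  collect_remote_nodemaps() is a placeholder
 * wrapper; main_loop() below performs the same steps inline.
 */
#if 0
static struct ctdb_node_map_old **collect_remote_nodemaps(
	struct ctdb_context *ctdb,
	TALLOC_CTX *mem_ctx,
	struct ctdb_node_map_old *nodemap)
{
	struct ctdb_node_map_old **maps;
	unsigned int i;

	maps = talloc_array(mem_ctx, struct ctdb_node_map_old *, nodemap->num);
	if (maps == NULL) {
		return NULL;
	}
	for (i = 0; i < nodemap->num; i++) {
		maps[i] = NULL;	/* stays NULL for nodes that do not reply */
	}
	if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, maps) != 0) {
		return NULL;	/* caller treats this as "retry later" */
	}
	return maps;
}
#endif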
2208 static bool validate_recovery_master(struct ctdb_recoverd *rec,
2209 TALLOC_CTX *mem_ctx)
2211 struct ctdb_context *ctdb = rec->ctdb;
2212 uint32_t pnn = ctdb_get_pnn(ctdb);
2213 struct ctdb_node_map_old *nodemap = rec->nodemap;
2214 struct ctdb_node_map_old *recmaster_nodemap = NULL;
2215 int ret;
2217 /* When the recovery daemon is started, recmaster is set to
2218 * "unknown" so it knows to start an election.
2220 if (rec->recmaster == CTDB_UNKNOWN_PNN) {
2221 DEBUG(DEBUG_NOTICE,
2222 ("Initial recovery master set - forcing election\n"));
2223 force_election(rec, pnn, nodemap);
2224 return false;
2228 * If the current recmaster does not have CTDB_CAP_RECMASTER,
2229 * but we have, then force an election and try to become the new
2230 * recmaster.
2232 if (!ctdb_node_has_capabilities(rec->caps,
2233 rec->recmaster,
2234 CTDB_CAP_RECMASTER) &&
2235 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
2236 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
2237 DEBUG(DEBUG_ERR,
2238 (" Current recmaster node %u does not have CAP_RECMASTER,"
2239 " but we (node %u) have - force an election\n",
2240 rec->recmaster, pnn));
2241 force_election(rec, pnn, nodemap);
2242 return false;
2245 /* Verify that the master node has not been deleted. This
2246 * should not happen because a node should always be shut down
2247 * before being deleted, causing a new master to be elected
2248 * before now. However, if something strange has happened
2249 * then checking here will ensure we don't index beyond the
2250 * end of the nodemap array. */
2251 if (rec->recmaster >= nodemap->num) {
2252 DEBUG(DEBUG_ERR,
2253 ("Recmaster node %u has been deleted. Force election\n",
2254 rec->recmaster));
2255 force_election(rec, pnn, nodemap);
2256 return false;
2259 /* if recovery master is disconnected/deleted we must elect a new recmaster */
2260 if (nodemap->nodes[rec->recmaster].flags &
2261 (NODE_FLAGS_DISCONNECTED|NODE_FLAGS_DELETED)) {
2262 DEBUG(DEBUG_NOTICE,
2263 ("Recmaster node %u is disconnected/deleted. Force election\n",
2264 rec->recmaster));
2265 force_election(rec, pnn, nodemap);
2266 return false;
2269 /* get nodemap from the recovery master to check if it is inactive */
2270 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2271 mem_ctx, &recmaster_nodemap);
2272 if (ret != 0) {
2273 DEBUG(DEBUG_ERR,
2274 (__location__
2275 " Unable to get nodemap from recovery master %u\n",
2276 rec->recmaster));
2277 /* No election, just error */
2278 return false;
2282 if ((recmaster_nodemap->nodes[rec->recmaster].flags & NODE_FLAGS_INACTIVE) &&
2283 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
2284 DEBUG(DEBUG_NOTICE,
2285 ("Recmaster node %u is inactive. Force election\n",
2286 rec->recmaster));
2288 * update our nodemap to carry the recmaster's notion of
2289 * its own flags, so that we don't keep freezing the
2290 * inactive recmaster node...
2292 nodemap->nodes[rec->recmaster].flags =
2293 recmaster_nodemap->nodes[rec->recmaster].flags;
2294 force_election(rec, pnn, nodemap);
2295 return false;
2298 return true;
2301 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
2302 TALLOC_CTX *mem_ctx)
2304 uint32_t pnn;
2305 struct ctdb_node_map_old *nodemap=NULL;
2306 struct ctdb_node_map_old **remote_nodemaps=NULL;
2307 struct ctdb_vnn_map *vnnmap=NULL;
2308 struct ctdb_vnn_map *remote_vnnmap=NULL;
2309 uint32_t num_lmasters;
2310 int32_t debug_level;
2311 unsigned int i, j;
2312 int ret;
2313 bool self_ban;
2316 /* verify that the main daemon is still running */
2317 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
2318 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2319 exit(-1);
2322 /* ping the local daemon to tell it we are alive */
2323 ctdb_ctrl_recd_ping(ctdb);
2325 if (rec->election_timeout) {
2326 /* an election is in progress */
2327 return;
2330 /* read the debug level from the parent and update locally */
2331 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2332 if (ret !=0) {
2333 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2334 return;
2336 debuglevel_set(debug_level);
2338 /* get relevant tunables */
2339 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2340 if (ret != 0) {
2341 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2342 return;
2345 /* get runstate */
2346 ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
2347 CTDB_CURRENT_NODE, &ctdb->runstate);
2348 if (ret != 0) {
2349 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
2350 return;
2353 pnn = ctdb_get_pnn(ctdb);
2355 /* get nodemap */
2356 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &nodemap);
2357 if (ret != 0) {
2358 DBG_ERR("Unable to get nodemap from node %"PRIu32"\n", pnn);
2359 return;
2361 talloc_free(rec->nodemap);
2362 rec->nodemap = nodemap;
2364 /* remember our own node flags */
2365 rec->node_flags = nodemap->nodes[pnn].flags;
2367 ban_misbehaving_nodes(rec, &self_ban);
2368 if (self_ban) {
2369 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
2370 return;
2373 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2374 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2375 if (ret != 0) {
2376 D_ERR("Failed to read recmode from local node\n");
2377 return;
2380 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
2381 also frozen and that the recmode is set to active.
2383 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
2384 /* If this node has become inactive then we want to
2385 * reduce the chances of it taking over the recovery
2386 * master role when it becomes active again. This
2387 * helps to stabilise the recovery master role so that
2388 * it stays on the most stable node.
2390 rec->priority_time = timeval_current();
2392 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2393 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
2395 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2396 if (ret != 0) {
2397 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
2399 return;
2402 if (! rec->frozen_on_inactive) {
2403 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(),
2404 CTDB_CURRENT_NODE);
2405 if (ret != 0) {
2406 DEBUG(DEBUG_ERR,
2407 (__location__ " Failed to freeze node "
2408 "in STOPPED or BANNED state\n"));
2409 return;
2412 rec->frozen_on_inactive = true;
2415 /* If this node is stopped or banned then it is not the recovery
2416 * master, so don't do anything. This prevents a stopped or banned
2417 * node from starting an election and sending unnecessary controls.
2419 return;
2422 rec->frozen_on_inactive = false;
2424 /* Retrieve capabilities from all connected nodes */
2425 ret = update_capabilities(rec, nodemap);
2426 if (ret != 0) {
2427 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2428 return;
2431 if (! validate_recovery_master(rec, mem_ctx)) {
2432 return;
2435 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2436 /* Check if an IP takeover run is needed and trigger one if
2437 * necessary */
2438 verify_local_ip_allocation(ctdb, rec, pnn, nodemap);
2441 /* if we are not the recmaster then we do not need to check
2442 if recovery is needed
2444 if (pnn != rec->recmaster) {
2445 return;
2449 /* ensure our local copies of flags are right */
2450 ret = update_local_flags(rec, nodemap);
2451 if (ret != 0) {
2452 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
2453 return;
2456 if (ctdb->num_nodes != nodemap->num) {
2457 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
2458 ctdb_load_nodes_file(ctdb);
2459 return;
2462 /* verify that all active nodes agree that we are the recmaster */
2463 switch (verify_recmaster(rec, nodemap, pnn)) {
2464 case MONITOR_RECOVERY_NEEDED:
2465 /* can not happen */
2466 return;
2467 case MONITOR_ELECTION_NEEDED:
2468 force_election(rec, pnn, nodemap);
2469 return;
2470 case MONITOR_OK:
2471 break;
2472 case MONITOR_FAILED:
2473 return;
2477 /* get the vnnmap */
2478 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2479 if (ret != 0) {
2480 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2481 return;
2484 if (rec->need_recovery) {
2485 /* a previous recovery didn't finish */
2486 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2487 return;
2490 /* verify that all active nodes are in normal mode
2491 and not in recovery mode
2493 switch (verify_recmode(ctdb, nodemap)) {
2494 case MONITOR_RECOVERY_NEEDED:
2495 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2496 return;
2497 case MONITOR_FAILED:
2498 return;
2499 case MONITOR_ELECTION_NEEDED:
2500 /* can not happen */
2501 case MONITOR_OK:
2502 break;
2506 if (ctdb->recovery_lock != NULL) {
2507 /* We must already hold the recovery lock */
2508 if (!ctdb_recovery_have_lock(rec)) {
2509 DEBUG(DEBUG_ERR,("Failed recovery lock sanity check. Force a recovery\n"));
2510 ctdb_set_culprit(rec, ctdb->pnn);
2511 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2512 return;
2517 /* If recoveries are disabled then there is no use doing any
2518 * nodemap or flags checks. Recoveries might be disabled due
2519 * to "reloadnodes", so doing these checks might cause an
2520 * unnecessary recovery. */
2521 if (ctdb_op_is_disabled(rec->recovery)) {
2522 goto takeover_run_checks;
2525 /* get the nodemap for all active remote nodes
2527 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map_old *, nodemap->num);
2528 if (remote_nodemaps == NULL) {
2529 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
2530 return;
2532 for(i=0; i<nodemap->num; i++) {
2533 remote_nodemaps[i] = NULL;
2535 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
2536 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
2537 return;
2540 /* verify that all other nodes have the same nodemap as we have
2542 for (j=0; j<nodemap->num; j++) {
2543 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2544 continue;
2547 if (remote_nodemaps[j] == NULL) {
2548 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
2549 ctdb_set_culprit(rec, j);
2551 return;
2554 /* if the nodes disagree on how many nodes there are
2555 then this is a good reason to try recovery
2557 if (remote_nodemaps[j]->num != nodemap->num) {
2558 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
2559 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
2560 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2561 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2562 return;
2565 /* if the nodes disagree on which nodes exist and are
2566 active, then that is also a good reason to do recovery
2568 for (i=0;i<nodemap->num;i++) {
2569 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
2570 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
2571 nodemap->nodes[j].pnn, i,
2572 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
2573 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2574 do_recovery(rec, mem_ctx, pnn, nodemap,
2575 vnnmap);
2576 return;
2582 * Update node flags obtained from each active node. This ensures we have
2583 * up-to-date information for all the nodes.
2585 for (j=0; j<nodemap->num; j++) {
2586 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2587 continue;
2589 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
2592 for (j=0; j<nodemap->num; j++) {
2593 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2594 continue;
2597 /* verify the flags are consistent
2599 for (i=0; i<nodemap->num; i++) {
2600 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2601 continue;
2604 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
2605 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
2606 nodemap->nodes[j].pnn,
2607 nodemap->nodes[i].pnn,
2608 remote_nodemaps[j]->nodes[i].flags,
2609 nodemap->nodes[i].flags));
2610 if (i == j) {
2611 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
2612 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
2613 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2614 do_recovery(rec, mem_ctx, pnn, nodemap,
2615 vnnmap);
2616 return;
2617 } else {
2618 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
2619 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
2620 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2621 do_recovery(rec, mem_ctx, pnn, nodemap,
2622 vnnmap);
2623 return;
2630 /* count how many active nodes with the lmaster capability there are */
2631 num_lmasters = 0;
2632 for (i=0; i<nodemap->num; i++) {
2633 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2634 if (ctdb_node_has_capabilities(rec->caps,
2635 ctdb->nodes[i]->pnn,
2636 CTDB_CAP_LMASTER)) {
2637 num_lmasters++;
2643 /* There must be the same number of lmasters in the vnn map as
2644 * there are active nodes with the lmaster capability... or
2645 * do a recovery.
2647 if (vnnmap->size != num_lmasters) {
2648 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
2649 vnnmap->size, num_lmasters));
2650 ctdb_set_culprit(rec, ctdb->pnn);
2651 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2652 return;
2656 * Verify that all active lmaster nodes in the nodemap also
2657 * exist in the vnnmap
2659 for (j=0; j<nodemap->num; j++) {
2660 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2661 continue;
2663 if (! ctdb_node_has_capabilities(rec->caps,
2664 nodemap->nodes[j].pnn,
2665 CTDB_CAP_LMASTER)) {
2666 continue;
2668 if (nodemap->nodes[j].pnn == pnn) {
2669 continue;
2672 for (i=0; i<vnnmap->size; i++) {
2673 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
2674 break;
2677 if (i == vnnmap->size) {
2678 D_ERR("Active LMASTER node %u is not in the vnnmap\n",
2679 nodemap->nodes[j].pnn);
2680 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2681 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2682 return;
2687 /* verify that all other nodes have the same vnnmap
2688 and are from the same generation
2690 for (j=0; j<nodemap->num; j++) {
2691 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2692 continue;
2694 if (nodemap->nodes[j].pnn == pnn) {
2695 continue;
2698 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2699 mem_ctx, &remote_vnnmap);
2700 if (ret != 0) {
2701 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
2702 nodemap->nodes[j].pnn));
2703 return;
2706 /* verify the vnnmap generation is the same */
2707 if (vnnmap->generation != remote_vnnmap->generation) {
2708 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
2709 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
2710 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2711 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2712 return;
2715 /* verify the vnnmap size is the same */
2716 if (vnnmap->size != remote_vnnmap->size) {
2717 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
2718 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
2719 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2720 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2721 return;
2724 /* verify the vnnmap is the same */
2725 for (i=0;i<vnnmap->size;i++) {
2726 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
2727 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
2728 nodemap->nodes[j].pnn));
2729 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2730 do_recovery(rec, mem_ctx, pnn, nodemap,
2731 vnnmap);
2732 return;
2737 /* FIXME: Add remote public IP checking to ensure that nodes
2738 * have the IP addresses that are allocated to them. */
2740 takeover_run_checks:
2742 /* If there are IP takeover runs requested or the previous one
2743 * failed then perform one and notify the waiters */
2744 if (!ctdb_op_is_disabled(rec->takeover_run) &&
2745 (rec->reallocate_requests || rec->need_takeover_run)) {
2746 process_ipreallocate_requests(ctdb, rec);
2750 static void recd_sig_term_handler(struct tevent_context *ev,
2751 struct tevent_signal *se, int signum,
2752 int count, void *dont_care,
2753 void *private_data)
2755 struct ctdb_recoverd *rec = talloc_get_type_abort(
2756 private_data, struct ctdb_recoverd);
2758 DEBUG(DEBUG_ERR, ("Received SIGTERM, exiting\n"));
2759 ctdb_recovery_unlock(rec);
2760 exit(0);
2764 * Periodically log elements of the cluster state
2766 * This can be used to confirm a split brain has occurred
2768 static void maybe_log_cluster_state(struct tevent_context *ev,
2769 struct tevent_timer *te,
2770 struct timeval current_time,
2771 void *private_data)
2773 struct ctdb_recoverd *rec = talloc_get_type_abort(
2774 private_data, struct ctdb_recoverd);
2775 struct ctdb_context *ctdb = rec->ctdb;
2776 struct tevent_timer *tt;
2778 static struct timeval start_incomplete = {
2779 .tv_sec = 0,
2782 bool is_complete;
2783 bool was_complete;
2784 unsigned int i;
2785 double seconds;
2786 unsigned int minutes;
2787 unsigned int num_connected;
2789 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
2790 goto done;
2793 if (rec->nodemap == NULL) {
2794 goto done;
2797 is_complete = true;
2798 num_connected = 0;
2799 for (i = 0; i < rec->nodemap->num; i++) {
2800 struct ctdb_node_and_flags *n = &rec->nodemap->nodes[i];
2802 if (n->pnn == ctdb_get_pnn(ctdb)) {
2803 continue;
2805 if ((n->flags & NODE_FLAGS_DELETED) != 0) {
2806 continue;
2808 if ((n->flags & NODE_FLAGS_DISCONNECTED) != 0) {
2809 is_complete = false;
2810 continue;
2813 num_connected++;
2816 was_complete = timeval_is_zero(&start_incomplete);
2818 if (is_complete) {
2819 if (! was_complete) {
2820 D_WARNING("Cluster complete with master=%u\n",
2821 rec->recmaster);
2822 start_incomplete = timeval_zero();
2824 goto done;
2827 /* Cluster is newly incomplete... */
2828 if (was_complete) {
2829 start_incomplete = current_time;
2830 minutes = 0;
2831 goto log;
2835 * Cluster has been incomplete since previous check, so figure
2836 * out how long (in minutes) and decide whether to log anything
2838 seconds = timeval_elapsed2(&start_incomplete, &current_time);
2839 minutes = (unsigned int)seconds / 60;
2840 if (minutes >= 60) {
2841 /* Over an hour, log every hour */
2842 if (minutes % 60 != 0) {
2843 goto done;
2845 } else if (minutes >= 10) {
2846 /* Over 10 minutes, log every 10 minutes */
2847 if (minutes % 10 != 0) {
2848 goto done;
2852 log:
2853 D_WARNING("Cluster incomplete with master=%u, elapsed=%u minutes, "
2854 "connected=%u\n",
2855 rec->recmaster,
2856 minutes,
2857 num_connected);
2859 done:
2860 tt = tevent_add_timer(ctdb->ev,
2861 rec,
2862 timeval_current_ofs(60, 0),
2863 maybe_log_cluster_state,
2864 rec);
2865 if (tt == NULL) {
2866 DBG_WARNING("Failed to set up cluster state timer\n");
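/*
 * Illustrative sketch of the logging back-off used above, in isolation: with
 * the timer firing once a minute, an incomplete cluster is logged on every
 * check for the first 10 minutes, then every 10 minutes, then hourly.
 * should_log_incomplete() is a placeholder name.
 */
#if 0
static bool should_log_incomplete(unsigned int minutes)
{
	if (minutes >= 60) {
		return (minutes % 60) == 0;	/* hourly after an hour */
	}
	if (minutes >= 10) {
		return (minutes % 10) == 0;	/* every 10 minutes after 10 */
	}
	return true;				/* every check before that */
}
#endif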
2871 the main monitoring loop
2873 static void monitor_cluster(struct ctdb_context *ctdb)
2875 struct tevent_signal *se;
2876 struct ctdb_recoverd *rec;
2878 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
2880 rec = talloc_zero(ctdb, struct ctdb_recoverd);
2881 CTDB_NO_MEMORY_FATAL(ctdb, rec);
2883 rec->ctdb = ctdb;
2884 rec->recmaster = CTDB_UNKNOWN_PNN;
2885 rec->recovery_lock_handle = NULL;
2887 rec->takeover_run = ctdb_op_init(rec, "takeover runs");
2888 CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);
2890 rec->recovery = ctdb_op_init(rec, "recoveries");
2891 CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);
2893 rec->priority_time = timeval_current();
2894 rec->frozen_on_inactive = false;
2896 se = tevent_add_signal(ctdb->ev, ctdb, SIGTERM, 0,
2897 recd_sig_term_handler, rec);
2898 if (se == NULL) {
2899 DEBUG(DEBUG_ERR, ("Failed to install SIGTERM handler\n"));
2900 exit(1);
2903 if (ctdb->recovery_lock == NULL) {
2904 struct tevent_timer *tt;
2906 tt = tevent_add_timer(ctdb->ev,
2907 rec,
2908 timeval_current_ofs(60, 0),
2909 maybe_log_cluster_state,
2910 rec);
2911 if (tt == NULL) {
2912 DBG_WARNING("Failed to set up cluster state timer\n");
2916 /* register a message port for sending memory dumps */
2917 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
2919 /* when a node is assigned banning credits */
2920 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING,
2921 banning_handler, rec);
2923 /* register a message port for recovery elections */
2924 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec);
2926 /* when nodes are disabled/enabled */
2927 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
2929 /* when we are asked to push out a flag change */
2930 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
2932 /* register a message port for reloadnodes */
2933 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
2935 /* register a message port for performing a takeover run */
2936 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
2938 /* register a message port for disabling the ip check for a short while */
2939 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
2941 /* register a message port for forcing a rebalance of a node at the
2942 next reallocation */
2943 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
2945 /* Register a message port for disabling takeover runs */
2946 ctdb_client_set_message_handler(ctdb,
2947 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
2948 disable_takeover_runs_handler, rec);
2950 /* Register a message port for disabling recoveries */
2951 ctdb_client_set_message_handler(ctdb,
2952 CTDB_SRVID_DISABLE_RECOVERIES,
2953 disable_recoveries_handler, rec);
2955 for (;;) {
2956 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2957 struct timeval start;
2958 double elapsed;
2960 if (!mem_ctx) {
2961 DEBUG(DEBUG_CRIT,(__location__
2962 " Failed to create temp context\n"));
2963 exit(-1);
2966 start = timeval_current();
2967 main_loop(ctdb, rec, mem_ctx);
2968 talloc_free(mem_ctx);
2970 /* we only check for recovery once every second */
2971 elapsed = timeval_elapsed(&start);
2972 if (elapsed < ctdb->tunable.recover_interval) {
2973 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
2974 - elapsed);
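/*
 * Illustrative sketch of the pacing above: measure how long one pass took and
 * sleep off the remainder so passes are at least one interval apart.
 * paced_loop(), do_one_pass() and interval are placeholders.
 */
#if 0
static void paced_loop(struct ctdb_context *ctdb, double interval)
{
	for (;;) {
		struct timeval start = timeval_current();
		double elapsed;

		do_one_pass();			/* placeholder work */

		elapsed = timeval_elapsed(&start);
		if (elapsed < interval) {
			ctdb_wait_timeout(ctdb, interval - elapsed);
		}
	}
}
#endif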
2980 event handler for when the main ctdbd dies
2982 static void ctdb_recoverd_parent(struct tevent_context *ev,
2983 struct tevent_fd *fde,
2984 uint16_t flags, void *private_data)
2986 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
2987 _exit(1);
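/*
 * Illustrative sketch of how this handler gets triggered: the parent keeps
 * the write end of a pipe open and never writes to it, while the child
 * watches the read end.  When the parent exits, the read end hits EOF,
 * becomes readable and the handler fires.  ctdb_start_recoverd() below sets
 * this up with ctdb_fork(); the plain fork() here is a placeholder.
 */
#if 0
	int fd[2];

	if (pipe(fd) != 0) {
		return -1;
	}
	if (fork() != 0) {
		/* parent: keep the write end open for the child's lifetime */
		close(fd[0]);
	} else {
		/* child: watch the read end for EOF */
		struct tevent_fd *fde;

		close(fd[1]);
		fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
				    ctdb_recoverd_parent, &fd[0]);
		tevent_fd_set_auto_close(fde);
	}
#endif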
2991 called regularly to verify that the recovery daemon is still running
2993 static void ctdb_check_recd(struct tevent_context *ev,
2994 struct tevent_timer *te,
2995 struct timeval yt, void *p)
2997 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
2999 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
3000 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
3002 tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
3003 ctdb_restart_recd, ctdb);
3005 return;
3008 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3009 timeval_current_ofs(30, 0),
3010 ctdb_check_recd, ctdb);
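/*
 * Illustrative sketch of the liveness probe this watchdog relies on: signal 0
 * performs the existence and permission checks without delivering anything.
 * pid_is_alive() is a placeholder; ctdb_kill() is assumed to wrap kill() in
 * the same way.
 */
#if 0
static bool pid_is_alive(pid_t pid)
{
	if (kill(pid, 0) == 0) {
		return true;		/* process exists and is signallable */
	}
	return errno == EPERM;		/* exists but belongs to someone else */
}
#endif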
3013 static void recd_sig_child_handler(struct tevent_context *ev,
3014 struct tevent_signal *se, int signum,
3015 int count, void *dont_care,
3016 void *private_data)
3018 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3019 int status;
3020 pid_t pid = -1;
3022 while (pid != 0) {
3023 pid = waitpid(-1, &status, WNOHANG);
3024 if (pid == -1) {
3025 if (errno != ECHILD) {
3026 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3028 return;
3030 if (pid > 0) {
3031 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
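/*
 * Illustrative sketch of the non-blocking reap loop above, in isolation:
 * with WNOHANG, waitpid() returns 0 when children exist but none have exited
 * yet, -1 (ECHILD) when there are no children left, and a pid when one was
 * reaped.  reap_children() is a placeholder name.
 */
#if 0
static void reap_children(void)
{
	for (;;) {
		int status;
		pid_t pid = waitpid(-1, &status, WNOHANG);

		if (pid == 0 || pid == -1) {
			break;	/* nothing (more) to reap right now */
		}
		/* pid > 0: one child reaped; loop again for any others */
	}
}
#endif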
3037 start up the recovery daemon as a child of the main ctdb daemon
3039 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3041 int fd[2];
3042 struct tevent_signal *se;
3043 struct tevent_fd *fde;
3044 int ret;
3046 if (pipe(fd) != 0) {
3047 return -1;
3050 ctdb->recoverd_pid = ctdb_fork(ctdb);
3051 if (ctdb->recoverd_pid == -1) {
3052 return -1;
3055 if (ctdb->recoverd_pid != 0) {
3056 talloc_free(ctdb->recd_ctx);
3057 ctdb->recd_ctx = talloc_new(ctdb);
3058 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
3060 close(fd[0]);
3061 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3062 timeval_current_ofs(30, 0),
3063 ctdb_check_recd, ctdb);
3064 return 0;
3067 close(fd[1]);
3069 srandom(getpid() ^ time(NULL));
3071 ret = logging_init(ctdb, NULL, NULL, "ctdb-recoverd");
3072 if (ret != 0) {
3073 return -1;
3076 prctl_set_comment("ctdb_recoverd");
3077 if (switch_from_server_to_client(ctdb) != 0) {
3078 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3079 exit(1);
3082 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
3084 fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
3085 ctdb_recoverd_parent, &fd[0]);
3086 tevent_fd_set_auto_close(fde);
3088 /* set up a handler to pick up sigchld */
3089 se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
3090 recd_sig_child_handler, ctdb);
3091 if (se == NULL) {
3092 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3093 exit(1);
3096 monitor_cluster(ctdb);
3098 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3099 return -1;
3103 shut down the recovery daemon
3105 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3107 if (ctdb->recoverd_pid == 0) {
3108 return;
3111 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3112 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
3114 TALLOC_FREE(ctdb->recd_ctx);
3115 TALLOC_FREE(ctdb->recd_ping_count);
3118 static void ctdb_restart_recd(struct tevent_context *ev,
3119 struct tevent_timer *te,
3120 struct timeval t, void *private_data)
3122 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3124 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
3125 ctdb_stop_recoverd(ctdb);
3126 ctdb_start_recoverd(ctdb);