ctdb/server/ctdb_recoverd.c

   1 /*
   2    ctdb recovery daemon
   3
   4    Copyright (C) Ronnie Sahlberg  2007
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20 #include "replace.h"
  21 #include "system/filesys.h"
  22 #include "system/time.h"
  23 #include "system/network.h"
  24 #include "system/wait.h"
  25
  26 #include <popt.h>
  27 #include <talloc.h>
  28 #include <tevent.h>
  29 #include <tdb.h>
  30
  31 #include "lib/tdb_wrap/tdb_wrap.h"
  32 #include "lib/util/dlinklist.h"
  33 #include "lib/util/debug.h"
  34 #include "lib/util/samba_util.h"
  35 #include "lib/util/sys_rw.h"
  36 #include "lib/util/util_process.h"
  37
  38 #include "ctdb_private.h"
  39 #include "ctdb_client.h"
  40
  41 #include "common/system_socket.h"
  42 #include "common/common.h"
  43 #include "common/logging.h"
  44
  45 #include "server/ctdb_config.h"
  46
  47 #include "ctdb_cluster_mutex.h"
  48
  49 /* List of SRVID requests that need to be processed */
  50 struct srvid_list {
  51         struct srvid_list *next, *prev;
  52         struct ctdb_srvid_message *request;
  53 };
  54
/* Container holding the head of a list of queued SRVID requests */
struct srvid_requests {
        struct srvid_list *requests;
};
  58
  59 static void srvid_request_reply(struct ctdb_context *ctdb,
  60                                 struct ctdb_srvid_message *request,
  61                                 TDB_DATA result)
  62 {
  63         /* Someone that sent srvid==0 does not want a reply */
  64         if (request->srvid == 0) {
  65                 talloc_free(request);
  66                 return;
  67         }
  68
  69         if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
  70                                      result) == 0) {
  71                 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
  72                                   (unsigned)request->pnn,
  73                                   (unsigned long long)request->srvid));
  74         } else {
  75                 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
  76                                  (unsigned)request->pnn,
  77                                  (unsigned long long)request->srvid));
  78         }
  79
  80         talloc_free(request);
  81 }
  82
  83 static void srvid_requests_reply(struct ctdb_context *ctdb,
  84                                  struct srvid_requests **requests,
  85                                  TDB_DATA result)
  86 {
  87         struct srvid_list *r;
  88
  89         if (*requests == NULL) {
  90                 return;
  91         }
  92
  93         for (r = (*requests)->requests; r != NULL; r = r->next) {
  94                 srvid_request_reply(ctdb, r->request, result);
  95         }
  96
  97         /* Free the list structure... */
  98         TALLOC_FREE(*requests);
  99 }
 100
 101 static void srvid_request_add(struct ctdb_context *ctdb,
 102                               struct srvid_requests **requests,
 103                               struct ctdb_srvid_message *request)
 104 {
 105         struct srvid_list *t;
 106         int32_t ret;
 107         TDB_DATA result;
 108
 109         if (*requests == NULL) {
 110                 *requests = talloc_zero(ctdb, struct srvid_requests);
 111                 if (*requests == NULL) {
 112                         goto nomem;
 113                 }
 114         }
 115
 116         t = talloc_zero(*requests, struct srvid_list);
 117         if (t == NULL) {
 118                 /* If *requests was just allocated above then free it */
 119                 if ((*requests)->requests == NULL) {
 120                         TALLOC_FREE(*requests);
 121                 }
 122                 goto nomem;
 123         }
 124
 125         t->request = (struct ctdb_srvid_message *)talloc_steal(t, request);
 126         DLIST_ADD((*requests)->requests, t);
 127
 128         return;
 129
 130 nomem:
 131         /* Failed to add the request to the list.  Send a fail. */
 132         DEBUG(DEBUG_ERR, (__location__
 133                           " Out of memory, failed to queue SRVID request\n"));
 134         ret = -ENOMEM;
 135         result.dsize = sizeof(ret);
 136         result.dptr = (uint8_t *)&ret;
 137         srvid_request_reply(ctdb, request, result);
 138 }
 139
/* An abstraction to allow an operation (takeover runs, recoveries,
 * ...) to be disabled for a given timeout */
struct ctdb_op_state {
        /* Non-NULL while the operation is disabled; fires to re-enable it */
        struct tevent_timer *timer;
        /* Set by ctdb_op_begin(), cleared by ctdb_op_end() */
        bool in_progress;
        /* Name of the operation, used in log messages */
        const char *name;
};
 147
 148 static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
 149 {
 150         struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
 151
 152         if (state != NULL) {
 153                 state->in_progress = false;
 154                 state->name = name;
 155         }
 156
 157         return state;
 158 }
 159
 160 static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
 161 {
 162         return state->timer != NULL;
 163 }
 164
 165 static bool ctdb_op_begin(struct ctdb_op_state *state)
 166 {
 167         if (ctdb_op_is_disabled(state)) {
 168                 DEBUG(DEBUG_NOTICE,
 169                       ("Unable to begin - %s are disabled\n", state->name));
 170                 return false;
 171         }
 172
 173         state->in_progress = true;
 174         return true;
 175 }
 176
 177 static bool ctdb_op_end(struct ctdb_op_state *state)
 178 {
 179         return state->in_progress = false;
 180 }
 181
 182 static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
 183 {
 184         return state->in_progress;
 185 }
 186
 187 static void ctdb_op_enable(struct ctdb_op_state *state)
 188 {
 189         TALLOC_FREE(state->timer);
 190 }
 191
 192 static void ctdb_op_timeout_handler(struct tevent_context *ev,
 193                                     struct tevent_timer *te,
 194                                     struct timeval yt, void *p)
 195 {
 196         struct ctdb_op_state *state =
 197                 talloc_get_type(p, struct ctdb_op_state);
 198
 199         DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
 200         ctdb_op_enable(state);
 201 }
 202
 203 static int ctdb_op_disable(struct ctdb_op_state *state,
 204                            struct tevent_context *ev,
 205                            uint32_t timeout)
 206 {
 207         if (timeout == 0) {
 208                 DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
 209                 ctdb_op_enable(state);
 210                 return 0;
 211         }
 212
 213         if (state->in_progress) {
 214                 DEBUG(DEBUG_ERR,
 215                       ("Unable to disable %s - in progress\n", state->name));
 216                 return -EAGAIN;
 217         }
 218
 219         DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
 220                             state->name, timeout));
 221
 222         /* Clear any old timers */
 223         talloc_free(state->timer);
 224
 225         /* Arrange for the timeout to occur */
 226         state->timer = tevent_add_timer(ev, state,
 227                                         timeval_current_ofs(timeout, 0),
 228                                         ctdb_op_timeout_handler, state);
 229         if (state->timer == NULL) {
 230                 DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
 231                 return -ENOMEM;
 232         }
 233
 234         return 0;
 235 }
 236
/* Per-node record of accumulated "banning credits" */
struct ctdb_banning_state {
        /* Credits accumulated so far; reset when the node is banned or
         * when it has behaved for longer than the grace period */
        uint32_t count;
        /* When the node was last reported as a culprit */
        struct timeval last_reported_time;
};
 241
 242 struct ctdb_recovery_lock_handle;
 243
/*
  private state of recovery daemon
 */
struct ctdb_recoverd {
        struct ctdb_context *ctdb;
        /* PNN of the node currently regarded as recovery master */
        uint32_t recmaster;
        /* Most recent node passed to ctdb_set_culprit_count() */
        uint32_t last_culprit_node;
        struct ctdb_node_map_old *nodemap;
        /* NOTE(review): presumably used to rank nodes in elections - confirm */
        struct timeval priority_time;
        bool need_takeover_run;
        bool need_recovery;
        /* Flags of this node; tested against NODE_FLAGS_INACTIVE */
        uint32_t node_flags;
        struct tevent_timer *send_election_te;
        /* Non-NULL while an election is in progress */
        struct tevent_timer *election_timeout;
        struct srvid_requests *reallocate_requests;
        /* Enable/disable and in-progress tracking for takeover runs */
        struct ctdb_op_state *takeover_run;
        /* Enable/disable and in-progress tracking for recoveries */
        struct ctdb_op_state *recovery;
        struct ctdb_iface_list_old *ifaces;
        /* talloc array of PNNs handed to the takeover helper */
        uint32_t *force_rebalance_nodes;
        /* Node capabilities as last fetched by update_capabilities() */
        struct ctdb_node_capabilities *caps;
        bool frozen_on_inactive;
        /* Non-NULL while the recovery lock is held or being taken */
        struct ctdb_recovery_lock_handle *recovery_lock_handle;
};
 267
/*
 * Absolute timeouts derived from tunables.  Both macros expand to an
 * expression that reads a variable named "ctdb", so they can only be
 * used where such a variable is in scope.
 */
#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
#define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
 270
 271 static void ctdb_restart_recd(struct tevent_context *ev,
 272                               struct tevent_timer *te, struct timeval t,
 273                               void *private_data);
 274
 275 /*
 276   ban a node for a period of time
 277  */
 278 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
 279 {
 280         int ret;
 281         struct ctdb_context *ctdb = rec->ctdb;
 282         struct ctdb_ban_state bantime;
 283
 284         if (!ctdb_validate_pnn(ctdb, pnn)) {
 285                 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
 286                 return;
 287         }
 288
 289         DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
 290
 291         bantime.pnn  = pnn;
 292         bantime.time = ban_time;
 293
 294         ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
 295         if (ret != 0) {
 296                 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
 297                 return;
 298         }
 299
 300 }
 301
 302 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
 303
 304
 305 /*
 306   remember the trouble maker
 307  */
 308 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
 309 {
 310         struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
 311         struct ctdb_banning_state *ban_state;
 312
 313         if (culprit > ctdb->num_nodes) {
 314                 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
 315                 return;
 316         }
 317
 318         /* If we are banned or stopped, do not set other nodes as culprits */
 319         if (rec->node_flags & NODE_FLAGS_INACTIVE) {
 320                 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
 321                 return;
 322         }
 323
 324         if (ctdb->nodes[culprit]->ban_state == NULL) {
 325                 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
 326                 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
 327
 328
 329         }
 330         ban_state = ctdb->nodes[culprit]->ban_state;
 331         if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
 332                 /* this was the first time in a long while this node
 333                    misbehaved so we will forgive any old transgressions.
 334                 */
 335                 ban_state->count = 0;
 336         }
 337
 338         ban_state->count += count;
 339         ban_state->last_reported_time = timeval_current();
 340         rec->last_culprit_node = culprit;
 341 }
 342
 343 /*
 344   remember the trouble maker
 345  */
 346 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
 347 {
 348         ctdb_set_culprit_count(rec, culprit, 1);
 349 }
 350
 351 /*
 352   Retrieve capabilities from all connected nodes
 353  */
 354 static int update_capabilities(struct ctdb_recoverd *rec,
 355                                struct ctdb_node_map_old *nodemap)
 356 {
 357         uint32_t *capp;
 358         TALLOC_CTX *tmp_ctx;
 359         struct ctdb_node_capabilities *caps;
 360         struct ctdb_context *ctdb = rec->ctdb;
 361
 362         tmp_ctx = talloc_new(rec);
 363         CTDB_NO_MEMORY(ctdb, tmp_ctx);
 364
 365         caps = ctdb_get_capabilities(ctdb, tmp_ctx,
 366                                      CONTROL_TIMEOUT(), nodemap);
 367
 368         if (caps == NULL) {
 369                 DEBUG(DEBUG_ERR,
 370                       (__location__ " Failed to get node capabilities\n"));
 371                 talloc_free(tmp_ctx);
 372                 return -1;
 373         }
 374
 375         capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
 376         if (capp == NULL) {
 377                 DEBUG(DEBUG_ERR,
 378                       (__location__
 379                        " Capabilities don't include current node.\n"));
 380                 talloc_free(tmp_ctx);
 381                 return -1;
 382         }
 383         ctdb->capabilities = *capp;
 384
 385         TALLOC_FREE(rec->caps);
 386         rec->caps = talloc_steal(rec, caps);
 387
 388         talloc_free(tmp_ctx);
 389         return 0;
 390 }
 391
 392 /*
 393   change recovery mode on all nodes
 394  */
 395 static int set_recovery_mode(struct ctdb_context *ctdb,
 396                              struct ctdb_recoverd *rec,
 397                              struct ctdb_node_map_old *nodemap,
 398                              uint32_t rec_mode)
 399 {
 400         TDB_DATA data;
 401         uint32_t *nodes;
 402         TALLOC_CTX *tmp_ctx;
 403
 404         tmp_ctx = talloc_new(ctdb);
 405         CTDB_NO_MEMORY(ctdb, tmp_ctx);
 406
 407         nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
 408
 409         data.dsize = sizeof(uint32_t);
 410         data.dptr = (unsigned char *)&rec_mode;
 411
 412         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
 413                                         nodes, 0,
 414                                         CONTROL_TIMEOUT(),
 415                                         false, data,
 416                                         NULL, NULL,
 417                                         NULL) != 0) {
 418                 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
 419                 talloc_free(tmp_ctx);
 420                 return -1;
 421         }
 422
 423         talloc_free(tmp_ctx);
 424         return 0;
 425 }
 426
 427 /*
 428   update flags on all active nodes
 429  */
 430 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap, uint32_t pnn, uint32_t flags)
 431 {
 432         int ret;
 433
 434         ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
 435                 if (ret != 0) {
 436                 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
 437                 return -1;
 438         }
 439
 440         return 0;
 441 }
 442
 443 /*
 444   called when ctdb_wait_timeout should finish
 445  */
 446 static void ctdb_wait_handler(struct tevent_context *ev,
 447                               struct tevent_timer *te,
 448                               struct timeval yt, void *p)
 449 {
 450         uint32_t *timed_out = (uint32_t *)p;
 451         (*timed_out) = 1;
 452 }
 453
 454 /*
 455   wait for a given number of seconds
 456  */
 457 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
 458 {
 459         uint32_t timed_out = 0;
 460         time_t usecs = (secs - (time_t)secs) * 1000000;
 461         tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
 462                          ctdb_wait_handler, &timed_out);
 463         while (!timed_out) {
 464                 tevent_loop_once(ctdb->ev);
 465         }
 466 }
 467
 468 /*
 469   called when an election times out (ends)
 470  */
 471 static void ctdb_election_timeout(struct tevent_context *ev,
 472                                   struct tevent_timer *te,
 473                                   struct timeval t, void *p)
 474 {
 475         struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
 476         rec->election_timeout = NULL;
 477         fast_start = false;
 478
 479         D_WARNING("Election period ended, master=%u\n", rec->recmaster);
 480 }
 481
 482
 483 /*
 484   wait for an election to finish. It finished election_timeout seconds after
 485   the last election packet is received
 486  */
 487 static void ctdb_wait_election(struct ctdb_recoverd *rec)
 488 {
 489         struct ctdb_context *ctdb = rec->ctdb;
 490         while (rec->election_timeout) {
 491                 tevent_loop_once(ctdb->ev);
 492         }
 493 }
 494
 495 /*
 496   Update our local flags from all remote connected nodes.
 497   This is only run when we are or we belive we are the recovery master
 498  */
 499 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
 500 {
 501         unsigned int j;
 502         struct ctdb_context *ctdb = rec->ctdb;
 503         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
 504
 505         /* get the nodemap for all active remote nodes and verify
 506            they are the same as for this node
 507          */
 508         for (j=0; j<nodemap->num; j++) {
 509                 struct ctdb_node_map_old *remote_nodemap=NULL;
 510                 int ret;
 511
 512                 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
 513                         continue;
 514                 }
 515                 if (nodemap->nodes[j].pnn == ctdb->pnn) {
 516                         continue;
 517                 }
 518
 519                 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
 520                                            mem_ctx, &remote_nodemap);
 521                 if (ret != 0) {
 522                         DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
 523                                   nodemap->nodes[j].pnn));
 524                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
 525                         talloc_free(mem_ctx);
 526                         return -1;
 527                 }
 528                 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
 529                         /* We should tell our daemon about this so it
 530                            updates its flags or else we will log the same
 531                            message again in the next iteration of recovery.
 532                            Since we are the recovery master we can just as
 533                            well update the flags on all nodes.
 534                         */
 535                         ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
 536                         if (ret != 0) {
 537                                 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
 538                                 return -1;
 539                         }
 540
 541                         /* Update our local copy of the flags in the recovery
 542                            daemon.
 543                         */
 544                         DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
 545                                  nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
 546                                  nodemap->nodes[j].flags));
 547                         nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
 548                 }
 549                 talloc_free(remote_nodemap);
 550         }
 551         talloc_free(mem_ctx);
 552         return 0;
 553 }
 554
 555
 556 /* Create a new random generation id.
 557    The generation id can not be the INVALID_GENERATION id
 558 */
 559 static uint32_t new_generation(void)
 560 {
 561         uint32_t generation;
 562
 563         while (1) {
 564                 generation = random();
 565
 566                 if (generation != INVALID_GENERATION) {
 567                         break;
 568                 }
 569         }
 570
 571         return generation;
 572 }
 573
 574 static bool ctdb_recovery_have_lock(struct ctdb_recoverd *rec)
 575 {
 576         return (rec->recovery_lock_handle != NULL);
 577 }
 578
/* State of one attempt to take, and then hold, the recovery lock */
struct ctdb_recovery_lock_handle {
        /* Set once the attempt has finished or has been cancelled */
        bool done;
        /* True only if the lock was successfully taken */
        bool locked;
        /* Latency reported by the lock handler on success */
        double latency;
        /* Underlying cluster mutex; freed on failure or cancellation */
        struct ctdb_cluster_mutex_handle *h;
        /* Back pointer to the recovery daemon state */
        struct ctdb_recoverd *rec;
};
 586
 587 static void take_reclock_handler(char status,
 588                                  double latency,
 589                                  void *private_data)
 590 {
 591         struct ctdb_recovery_lock_handle *s =
 592                 (struct ctdb_recovery_lock_handle *) private_data;
 593
 594         s->locked = (status == '0') ;
 595
 596         /*
 597          * If unsuccessful then ensure the process has exited and that
 598          * the file descriptor event handler has been cancelled
 599          */
 600         if (! s->locked) {
 601                 TALLOC_FREE(s->h);
 602         }
 603
 604         switch (status) {
 605         case '0':
 606                 s->latency = latency;
 607                 break;
 608
 609         case '1':
 610                 D_ERR("Unable to take recovery lock - contention\n");
 611                 break;
 612
 613         case '2':
 614                 D_ERR("Unable to take recovery lock - timeout\n");
 615                 break;
 616
 617         default:
 618                 D_ERR("Unable to take recover lock - unknown error\n");
 619
 620                 {
 621                         struct ctdb_recoverd *rec = s->rec;
 622                         struct ctdb_context *ctdb = rec->ctdb;
 623                         uint32_t pnn = ctdb_get_pnn(ctdb);
 624
 625                         D_ERR("Banning this node\n");
 626                         ctdb_ban_node(rec,
 627                                       pnn,
 628                                       ctdb->tunable.recovery_ban_period);
 629                 }
 630         }
 631
 632         s->done = true;
 633 }
 634
 635 static void force_election(struct ctdb_recoverd *rec,
 636                            uint32_t pnn,
 637                            struct ctdb_node_map_old *nodemap);
 638
 639 static void lost_reclock_handler(void *private_data)
 640 {
 641         struct ctdb_recoverd *rec = talloc_get_type_abort(
 642                 private_data, struct ctdb_recoverd);
 643
 644         D_ERR("Recovery lock helper terminated, triggering an election\n");
 645         TALLOC_FREE(rec->recovery_lock_handle);
 646
 647         force_election(rec, ctdb_get_pnn(rec->ctdb), rec->nodemap);
 648 }
 649
 650 static bool ctdb_recovery_lock(struct ctdb_recoverd *rec)
 651 {
 652         struct ctdb_context *ctdb = rec->ctdb;
 653         struct ctdb_cluster_mutex_handle *h;
 654         struct ctdb_recovery_lock_handle *s;
 655
 656         s = talloc_zero(rec, struct ctdb_recovery_lock_handle);
 657         if (s == NULL) {
 658                 DBG_ERR("Memory allocation error\n");
 659                 return false;
 660         };
 661
 662         s->rec = rec;
 663
 664         h = ctdb_cluster_mutex(s,
 665                                ctdb,
 666                                ctdb->recovery_lock,
 667                                120,
 668                                take_reclock_handler,
 669                                s,
 670                                lost_reclock_handler,
 671                                rec);
 672         if (h == NULL) {
 673                 talloc_free(s);
 674                 return false;
 675         }
 676
 677         rec->recovery_lock_handle = s;
 678         s->h = h;
 679
 680         while (! s->done) {
 681                 tevent_loop_once(ctdb->ev);
 682         }
 683
 684         if (! s->locked) {
 685                 TALLOC_FREE(rec->recovery_lock_handle);
 686                 return false;
 687         }
 688
 689         ctdb_ctrl_report_recd_lock_latency(ctdb,
 690                                            CONTROL_TIMEOUT(),
 691                                            s->latency);
 692
 693         return true;
 694 }
 695
 696 static void ctdb_recovery_unlock(struct ctdb_recoverd *rec)
 697 {
 698         if (rec->recovery_lock_handle == NULL) {
 699                 return;
 700         }
 701
 702         if (! rec->recovery_lock_handle->done) {
 703                 /*
 704                  * Taking of recovery lock still in progress.  Free
 705                  * the cluster mutex handle to release it but leave
 706                  * the recovery lock handle in place to allow taking
 707                  * of the lock to fail.
 708                  */
 709                 D_NOTICE("Cancelling recovery lock\n");
 710                 TALLOC_FREE(rec->recovery_lock_handle->h);
 711                 rec->recovery_lock_handle->done = true;
 712                 rec->recovery_lock_handle->locked = false;
 713                 return;
 714         }
 715
 716         D_NOTICE("Releasing recovery lock\n");
 717         TALLOC_FREE(rec->recovery_lock_handle);
 718 }
 719
 720 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
 721 {
 722         struct ctdb_context *ctdb = rec->ctdb;
 723         unsigned int i;
 724         struct ctdb_banning_state *ban_state;
 725
 726         *self_ban = false;
 727         for (i=0; i<ctdb->num_nodes; i++) {
 728                 if (ctdb->nodes[i]->ban_state == NULL) {
 729                         continue;
 730                 }
 731                 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
 732                 if (ban_state->count < 2*ctdb->num_nodes) {
 733                         continue;
 734                 }
 735
 736                 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
 737                         ctdb->nodes[i]->pnn, ban_state->count,
 738                         ctdb->tunable.recovery_ban_period));
 739                 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
 740                 ban_state->count = 0;
 741
 742                 /* Banning ourself? */
 743                 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
 744                         *self_ban = true;
 745                 }
 746         }
 747 }
 748
/* Bookkeeping for one run of an external helper program */
struct helper_state {
        /* Pipe to the helper: fd[0] is read here, fd[1] is passed to
         * the helper */
        int fd[2];
        /* Process id of the helper, -1 while none has been started */
        pid_t pid;
        /* Result read from the pipe; 0 is treated as success */
        int result;
        /* Set once a result has been read (or reading failed) */
        bool done;
};
 755
 756 static void helper_handler(struct tevent_context *ev,
 757                            struct tevent_fd *fde,
 758                            uint16_t flags, void *private_data)
 759 {
 760         struct helper_state *state = talloc_get_type_abort(
 761                 private_data, struct helper_state);
 762         int ret;
 763
 764         ret = sys_read(state->fd[0], &state->result, sizeof(state->result));
 765         if (ret != sizeof(state->result)) {
 766                 state->result = EPIPE;
 767         }
 768
 769         state->done = true;
 770 }
 771
 772 static int helper_run(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
 773                       const char *prog, const char *arg, const char *type)
 774 {
 775         struct helper_state *state;
 776         struct tevent_fd *fde;
 777         const char **args;
 778         int nargs, ret;
 779         uint32_t recmaster = rec->recmaster;
 780
 781         state = talloc_zero(mem_ctx, struct helper_state);
 782         if (state == NULL) {
 783                 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
 784                 return -1;
 785         }
 786
 787         state->pid = -1;
 788
 789         ret = pipe(state->fd);
 790         if (ret != 0) {
 791                 DEBUG(DEBUG_ERR,
 792                       ("Failed to create pipe for %s helper\n", type));
 793                 goto fail;
 794         }
 795
 796         set_close_on_exec(state->fd[0]);
 797
 798         nargs = 4;
 799         args = talloc_array(state, const char *, nargs);
 800         if (args == NULL) {
 801                 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
 802                 goto fail;
 803         }
 804
 805         args[0] = talloc_asprintf(args, "%d", state->fd[1]);
 806         if (args[0] == NULL) {
 807                 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
 808                 goto fail;
 809         }
 810         args[1] = rec->ctdb->daemon.name;
 811         args[2] = arg;
 812         args[3] = NULL;
 813
 814         if (args[2] == NULL) {
 815                 nargs = 3;
 816         }
 817
 818         state->pid = ctdb_vfork_exec(state, rec->ctdb, prog, nargs, args);
 819         if (state->pid == -1) {
 820                 DEBUG(DEBUG_ERR,
 821                       ("Failed to create child for %s helper\n", type));
 822                 goto fail;
 823         }
 824
 825         close(state->fd[1]);
 826         state->fd[1] = -1;
 827
 828         state->done = false;
 829
 830         fde = tevent_add_fd(rec->ctdb->ev, state, state->fd[0],
 831                             TEVENT_FD_READ, helper_handler, state);
 832         if (fde == NULL) {
 833                 goto fail;
 834         }
 835         tevent_fd_set_auto_close(fde);
 836
 837         while (!state->done) {
 838                 tevent_loop_once(rec->ctdb->ev);
 839
 840                 /* If recmaster changes, we have lost election */
 841                 if (recmaster != rec->recmaster) {
 842                         D_ERR("Recmaster changed to %u, aborting %s\n",
 843                               rec->recmaster, type);
 844                         state->result = 1;
 845                         break;
 846                 }
 847         }
 848
 849         close(state->fd[0]);
 850         state->fd[0] = -1;
 851
 852         if (state->result != 0) {
 853                 goto fail;
 854         }
 855
 856         ctdb_kill(rec->ctdb, state->pid, SIGKILL);
 857         talloc_free(state);
 858         return 0;
 859
 860 fail:
 861         if (state->fd[0] != -1) {
 862                 close(state->fd[0]);
 863         }
 864         if (state->fd[1] != -1) {
 865                 close(state->fd[1]);
 866         }
 867         if (state->pid != -1) {
 868                 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
 869         }
 870         talloc_free(state);
 871         return -1;
 872 }
 873
 874
 875 static int ctdb_takeover(struct ctdb_recoverd *rec,
 876                          uint32_t *force_rebalance_nodes)
 877 {
 878         static char prog[PATH_MAX+1] = "";
 879         char *arg;
 880         unsigned int i;
 881         int ret;
 882
 883         if (!ctdb_set_helper("takeover_helper", prog, sizeof(prog),
 884                              "CTDB_TAKEOVER_HELPER", CTDB_HELPER_BINDIR,
 885                              "ctdb_takeover_helper")) {
 886                 ctdb_die(rec->ctdb, "Unable to set takeover helper\n");
 887         }
 888
 889         arg = NULL;
 890         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
 891                 uint32_t pnn = force_rebalance_nodes[i];
 892                 if (arg == NULL) {
 893                         arg = talloc_asprintf(rec, "%u", pnn);
 894                 } else {
 895                         arg = talloc_asprintf_append(arg, ",%u", pnn);
 896                 }
 897                 if (arg == NULL) {
 898                         DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
 899                         return -1;
 900                 }
 901         }
 902
 903         if (ctdb_config.failover_disabled) {
 904                 ret = setenv("CTDB_DISABLE_IP_FAILOVER", "1", 1);
 905                 if (ret != 0) {
 906                         D_ERR("Failed to set CTDB_DISABLE_IP_FAILOVER variable\n");
 907                         return -1;
 908                 }
 909         }
 910
 911         return helper_run(rec, rec, prog, arg, "takeover");
 912 }
 913
 914 static bool do_takeover_run(struct ctdb_recoverd *rec,
 915                             struct ctdb_node_map_old *nodemap)
 916 {
 917         uint32_t *nodes = NULL;
 918         struct ctdb_disable_message dtr;
 919         TDB_DATA data;
 920         size_t i;
 921         uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
 922         int ret;
 923         bool ok;
 924
 925         DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
 926
 927         if (ctdb_op_is_in_progress(rec->takeover_run)) {
 928                 DEBUG(DEBUG_ERR, (__location__
 929                                   " takeover run already in progress \n"));
 930                 ok = false;
 931                 goto done;
 932         }
 933
 934         if (!ctdb_op_begin(rec->takeover_run)) {
 935                 ok = false;
 936                 goto done;
 937         }
 938
 939         /* Disable IP checks (takeover runs, really) on other nodes
 940          * while doing this takeover run.  This will stop those other
 941          * nodes from triggering takeover runs when think they should
 942          * be hosting an IP but it isn't yet on an interface.  Don't
 943          * wait for replies since a failure here might cause some
 944          * noise in the logs but will not actually cause a problem.
 945          */
 946         ZERO_STRUCT(dtr);
 947         dtr.srvid = 0; /* No reply */
 948         dtr.pnn = -1;
 949
 950         data.dptr  = (uint8_t*)&dtr;
 951         data.dsize = sizeof(dtr);
 952
 953         nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
 954
 955         /* Disable for 60 seconds.  This can be a tunable later if
 956          * necessary.
 957          */
 958         dtr.timeout = 60;
 959         for (i = 0; i < talloc_array_length(nodes); i++) {
 960                 if (ctdb_client_send_message(rec->ctdb, nodes[i],
 961                                              CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
 962                                              data) != 0) {
 963                         DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
 964                 }
 965         }
 966
 967         ret = ctdb_takeover(rec, rec->force_rebalance_nodes);
 968
 969         /* Reenable takeover runs and IP checks on other nodes */
 970         dtr.timeout = 0;
 971         for (i = 0; i < talloc_array_length(nodes); i++) {
 972                 if (ctdb_client_send_message(rec->ctdb, nodes[i],
 973                                              CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
 974                                              data) != 0) {
 975                         DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n"));
 976                 }
 977         }
 978
 979         if (ret != 0) {
 980                 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
 981                 ok = false;
 982                 goto done;
 983         }
 984
 985         ok = true;
 986         /* Takeover run was successful so clear force rebalance targets */
 987         if (rebalance_nodes == rec->force_rebalance_nodes) {
 988                 TALLOC_FREE(rec->force_rebalance_nodes);
 989         } else {
 990                 DEBUG(DEBUG_WARNING,
 991                       ("Rebalance target nodes changed during takeover run - not clearing\n"));
 992         }
 993 done:
 994         rec->need_takeover_run = !ok;
 995         talloc_free(nodes);
 996         ctdb_op_end(rec->takeover_run);
 997
 998         DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
 999         return ok;
1000 }
1001
1002 static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
1003 {
1004         static char prog[PATH_MAX+1] = "";
1005         const char *arg;
1006
1007         if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog),
1008                              "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
1009                              "ctdb_recovery_helper")) {
1010                 ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
1011         }
1012
1013         arg = talloc_asprintf(mem_ctx, "%u", new_generation());
1014         if (arg == NULL) {
1015                 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1016                 return -1;
1017         }
1018
1019         setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1);
1020
1021         return helper_run(rec, mem_ctx, prog, arg, "recovery");
1022 }
1023
1024 /*
1025   we are the recmaster, and recovery is needed - start a recovery run
1026  */
1027 static int do_recovery(struct ctdb_recoverd *rec,
1028                        TALLOC_CTX *mem_ctx, uint32_t pnn,
1029                        struct ctdb_node_map_old *nodemap, struct ctdb_vnn_map *vnnmap)
1030 {
1031         struct ctdb_context *ctdb = rec->ctdb;
1032         unsigned int i;
1033         int ret;
1034         bool self_ban;
1035
1036         DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1037
1038         /* Check if the current node is still the recmaster.  It's possible that
1039          * re-election has changed the recmaster.
1040          */
1041         if (pnn != rec->recmaster) {
1042                 DEBUG(DEBUG_NOTICE,
1043                       ("Recovery master changed to %u, aborting recovery\n",
1044                        rec->recmaster));
1045                 return -1;
1046         }
1047
1048         /* if recovery fails, force it again */
1049         rec->need_recovery = true;
1050
1051         if (!ctdb_op_begin(rec->recovery)) {
1052                 return -1;
1053         }
1054
1055         if (rec->election_timeout) {
1056                 /* an election is in progress */
1057                 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
1058                 goto fail;
1059         }
1060
1061         ban_misbehaving_nodes(rec, &self_ban);
1062         if (self_ban) {
1063                 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
1064                 goto fail;
1065         }
1066
1067         if (ctdb->recovery_lock != NULL) {
1068                 if (ctdb_recovery_have_lock(rec)) {
1069                         D_NOTICE("Already holding recovery lock\n");
1070                 } else {
1071                         bool ok;
1072
1073                         D_NOTICE("Attempting to take recovery lock (%s)\n",
1074                                  ctdb->recovery_lock);
1075
1076                         ok = ctdb_recovery_lock(rec);
1077                         if (! ok) {
1078                                 D_ERR("Unable to take recovery lock\n");
1079
1080                                 if (pnn != rec->recmaster) {
1081                                         D_NOTICE("Recovery master changed to %u,"
1082                                                  " aborting recovery\n",
1083                                                  rec->recmaster);
1084                                         rec->need_recovery = false;
1085                                         goto fail;
1086                                 }
1087
1088                                 if (ctdb->runstate ==
1089                                     CTDB_RUNSTATE_FIRST_RECOVERY) {
1090                                         /*
1091                                          * First recovery?  Perhaps
1092                                          * current node does not yet
1093                                          * know who the recmaster is.
1094                                          */
1095                                         D_ERR("Retrying recovery\n");
1096                                         goto fail;
1097                                 }
1098
1099                                 D_ERR("Abort recovery, "
1100                                       "ban this node for %u seconds\n",
1101                                       ctdb->tunable.recovery_ban_period);
1102                                 ctdb_ban_node(rec,
1103                                               pnn,
1104                                               ctdb->tunable.recovery_ban_period);
1105                                 goto fail;
1106                         }
1107                         D_NOTICE("Recovery lock taken successfully\n");
1108                 }
1109         }
1110
1111         DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1112
1113         /* Retrieve capabilities from all connected nodes */
1114         ret = update_capabilities(rec, nodemap);
1115         if (ret!=0) {
1116                 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1117                 return -1;
1118         }
1119
1120         /*
1121           update all nodes to have the same flags that we have
1122          */
1123         for (i=0;i<nodemap->num;i++) {
1124                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1125                         continue;
1126                 }
1127
1128                 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1129                 if (ret != 0) {
1130                         if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1131                                 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
1132                         } else {
1133                                 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1134                                 return -1;
1135                         }
1136                 }
1137         }
1138
1139         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1140
1141         ret = db_recovery_parallel(rec, mem_ctx);
1142         if (ret != 0) {
1143                 goto fail;
1144         }
1145
1146         do_takeover_run(rec, nodemap);
1147
1148         /* send a message to all clients telling them that the cluster
1149            has been reconfigured */
1150         ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
1151                                        CTDB_SRVID_RECONFIGURE, tdb_null);
1152         if (ret != 0) {
1153                 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
1154                 goto fail;
1155         }
1156
1157         DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1158
1159         rec->need_recovery = false;
1160         ctdb_op_end(rec->recovery);
1161
1162         /* we managed to complete a full recovery, make sure to forgive
1163            any past sins by the nodes that could now participate in the
1164            recovery.
1165         */
1166         DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1167         for (i=0;i<nodemap->num;i++) {
1168                 struct ctdb_banning_state *ban_state;
1169
1170                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1171                         continue;
1172                 }
1173
1174                 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1175                 if (ban_state == NULL) {
1176                         continue;
1177                 }
1178
1179                 ban_state->count = 0;
1180         }
1181
1182         /* We just finished a recovery successfully.
1183            We now wait for rerecovery_timeout before we allow
1184            another recovery to take place.
1185         */
1186         DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
1187         ctdb_op_disable(rec->recovery, ctdb->ev,
1188                         ctdb->tunable.rerecovery_timeout);
1189         return 0;
1190
1191 fail:
1192         ctdb_op_end(rec->recovery);
1193         return -1;
1194 }
1195
1196
/*
  Election data that a node broadcasts when standing for recmaster
  (see send_election_request()).

  ctdb_election_win() decides on the banned/stopped node flags first,
  then the priority time, then the pnn.  num_connected is filled in by
  ctdb_election_data() but is not used in that comparison.
 */
struct election_message {
        /* number of nodes not flagged NODE_FLAGS_DISCONNECTED; forced to
         * 0 when the sender lacks CTDB_CAP_RECMASTER */
        uint32_t num_connected;
        /* sender's priority time; set to the current time when the
         * sender lacks CTDB_CAP_RECMASTER */
        struct timeval priority_time;
        /* PNN of the sending node */
        uint32_t pnn;
        /* sender's own node flags as found in its node map */
        uint32_t node_flags;
};
1207
1208 /*
1209   form this nodes election data
1210  */
1211 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1212 {
1213         unsigned int i;
1214         int ret;
1215         struct ctdb_node_map_old *nodemap;
1216         struct ctdb_context *ctdb = rec->ctdb;
1217
1218         ZERO_STRUCTP(em);
1219
1220         em->pnn = rec->ctdb->pnn;
1221         em->priority_time = rec->priority_time;
1222
1223         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1224         if (ret != 0) {
1225                 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
1226                 return;
1227         }
1228
1229         rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1230         em->node_flags = rec->node_flags;
1231
1232         for (i=0;i<nodemap->num;i++) {
1233                 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1234                         em->num_connected++;
1235                 }
1236         }
1237
1238         /* we shouldnt try to win this election if we cant be a recmaster */
1239         if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1240                 em->num_connected = 0;
1241                 em->priority_time = timeval_current();
1242         }
1243
1244         talloc_free(nodemap);
1245 }
1246
1247 /*
1248   see if the given election data wins
1249  */
1250 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1251 {
1252         struct election_message myem;
1253         int cmp = 0;
1254
1255         ctdb_election_data(rec, &myem);
1256
1257         /* we cant win if we don't have the recmaster capability */
1258         if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1259                 return false;
1260         }
1261
1262         /* we cant win if we are banned */
1263         if (rec->node_flags & NODE_FLAGS_BANNED) {
1264                 return false;
1265         }
1266
1267         /* we cant win if we are stopped */
1268         if (rec->node_flags & NODE_FLAGS_STOPPED) {
1269                 return false;
1270         }
1271
1272         /* we will automatically win if the other node is banned */
1273         if (em->node_flags & NODE_FLAGS_BANNED) {
1274                 return true;
1275         }
1276
1277         /* we will automatically win if the other node is banned */
1278         if (em->node_flags & NODE_FLAGS_STOPPED) {
1279                 return true;
1280         }
1281
1282         /* then the longest running node */
1283         if (cmp == 0) {
1284                 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
1285         }
1286
1287         if (cmp == 0) {
1288                 cmp = (int)myem.pnn - (int)em->pnn;
1289         }
1290
1291         return cmp > 0;
1292 }
1293
1294 /*
1295   send out an election request
1296  */
1297 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
1298 {
1299         int ret;
1300         TDB_DATA election_data;
1301         struct election_message emsg;
1302         uint64_t srvid;
1303         struct ctdb_context *ctdb = rec->ctdb;
1304
1305         srvid = CTDB_SRVID_ELECTION;
1306
1307         ctdb_election_data(rec, &emsg);
1308
1309         election_data.dsize = sizeof(struct election_message);
1310         election_data.dptr  = (unsigned char *)&emsg;
1311
1312
1313         /* first we assume we will win the election and set
1314            recoverymaster to be ourself on the current node
1315          */
1316         ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1317                                      CTDB_CURRENT_NODE, pnn);
1318         if (ret != 0) {
1319                 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster\n"));
1320                 return -1;
1321         }
1322         rec->recmaster = pnn;
1323
1324         /* send an election message to all active nodes */
1325         DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1326         return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1327 }
1328
1329 /*
1330   we think we are winning the election - send a broadcast election request
1331  */
1332 static void election_send_request(struct tevent_context *ev,
1333                                   struct tevent_timer *te,
1334                                   struct timeval t, void *p)
1335 {
1336         struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1337         int ret;
1338
1339         ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
1340         if (ret != 0) {
1341                 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
1342         }
1343
1344         TALLOC_FREE(rec->send_election_te);
1345 }
1346
1347 /*
1348   handler for memory dumps
1349 */
1350 static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1351 {
1352         struct ctdb_recoverd *rec = talloc_get_type(
1353                 private_data, struct ctdb_recoverd);
1354         struct ctdb_context *ctdb = rec->ctdb;
1355         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1356         TDB_DATA *dump;
1357         int ret;
1358         struct ctdb_srvid_message *rd;
1359
1360         if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1361                 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1362                 talloc_free(tmp_ctx);
1363                 return;
1364         }
1365         rd = (struct ctdb_srvid_message *)data.dptr;
1366
1367         dump = talloc_zero(tmp_ctx, TDB_DATA);
1368         if (dump == NULL) {
1369                 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1370                 talloc_free(tmp_ctx);
1371                 return;
1372         }
1373         ret = ctdb_dump_memory(ctdb, dump);
1374         if (ret != 0) {
1375                 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1376                 talloc_free(tmp_ctx);
1377                 return;
1378         }
1379
1380 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
1381
1382         ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1383         if (ret != 0) {
1384                 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1385                 talloc_free(tmp_ctx);
1386                 return;
1387         }
1388
1389         talloc_free(tmp_ctx);
1390 }
1391
1392 /*
1393   handler for reload_nodes
1394 */
1395 static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
1396                                  void *private_data)
1397 {
1398         struct ctdb_recoverd *rec = talloc_get_type(
1399                 private_data, struct ctdb_recoverd);
1400
1401         DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1402
1403         ctdb_load_nodes_file(rec->ctdb);
1404 }
1405
1406
1407 static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
1408                                         void *private_data)
1409 {
1410         struct ctdb_recoverd *rec = talloc_get_type(
1411                 private_data, struct ctdb_recoverd);
1412         struct ctdb_context *ctdb = rec->ctdb;
1413         uint32_t pnn;
1414         uint32_t *t;
1415         int len;
1416
1417         if (rec->recmaster != ctdb_get_pnn(ctdb)) {
1418                 return;
1419         }
1420
1421         if (data.dsize != sizeof(uint32_t)) {
1422                 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
1423                 return;
1424         }
1425
1426         pnn = *(uint32_t *)&data.dptr[0];
1427
1428         DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
1429
1430         /* Copy any existing list of nodes.  There's probably some
1431          * sort of realloc variant that will do this but we need to
1432          * make sure that freeing the old array also cancels the timer
1433          * event for the timeout... not sure if realloc will do that.
1434          */
1435         len = (rec->force_rebalance_nodes != NULL) ?
1436                 talloc_array_length(rec->force_rebalance_nodes) :
1437                 0;
1438
1439         /* This allows duplicates to be added but they don't cause
1440          * harm.  A call to add a duplicate PNN arguably means that
1441          * the timeout should be reset, so this is the simplest
1442          * solution.
1443          */
1444         t = talloc_zero_array(rec, uint32_t, len+1);
1445         CTDB_NO_MEMORY_VOID(ctdb, t);
1446         if (len > 0) {
1447                 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
1448         }
1449         t[len] = pnn;
1450
1451         talloc_free(rec->force_rebalance_nodes);
1452
1453         rec->force_rebalance_nodes = t;
1454 }
1455
1456
1457
1458 static void srvid_disable_and_reply(struct ctdb_context *ctdb,
1459                                     TDB_DATA data,
1460                                     struct ctdb_op_state *op_state)
1461 {
1462         struct ctdb_disable_message *r;
1463         uint32_t timeout;
1464         TDB_DATA result;
1465         int32_t ret = 0;
1466
1467         /* Validate input data */
1468         if (data.dsize != sizeof(struct ctdb_disable_message)) {
1469                 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1470                                  "expecting %lu\n", (long unsigned)data.dsize,
1471                                  (long unsigned)sizeof(struct ctdb_srvid_message)));
1472                 return;
1473         }
1474         if (data.dptr == NULL) {
1475                 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1476                 return;
1477         }
1478
1479         r = (struct ctdb_disable_message *)data.dptr;
1480         timeout = r->timeout;
1481
1482         ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
1483         if (ret != 0) {
1484                 goto done;
1485         }
1486
1487         /* Returning our PNN tells the caller that we succeeded */
1488         ret = ctdb_get_pnn(ctdb);
1489 done:
1490         result.dsize = sizeof(int32_t);
1491         result.dptr  = (uint8_t *)&ret;
1492         srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
1493 }
1494
1495 static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
1496                                           void *private_data)
1497 {
1498         struct ctdb_recoverd *rec = talloc_get_type(
1499                 private_data, struct ctdb_recoverd);
1500
1501         srvid_disable_and_reply(rec->ctdb, data, rec->takeover_run);
1502 }
1503
1504 /* Backward compatibility for this SRVID */
1505 static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
1506                                      void *private_data)
1507 {
1508         struct ctdb_recoverd *rec = talloc_get_type(
1509                 private_data, struct ctdb_recoverd);
1510         uint32_t timeout;
1511
1512         if (data.dsize != sizeof(uint32_t)) {
1513                 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1514                                  "expecting %lu\n", (long unsigned)data.dsize,
1515                                  (long unsigned)sizeof(uint32_t)));
1516                 return;
1517         }
1518         if (data.dptr == NULL) {
1519                 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1520                 return;
1521         }
1522
1523         timeout = *((uint32_t *)data.dptr);
1524
1525         ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout);
1526 }
1527
1528 static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
1529                                        void *private_data)
1530 {
1531         struct ctdb_recoverd *rec = talloc_get_type(
1532                 private_data, struct ctdb_recoverd);
1533
1534         srvid_disable_and_reply(rec->ctdb, data, rec->recovery);
1535 }
1536
1537 /*
1538   handler for ip reallocate, just add it to the list of requests and
1539   handle this later in the monitor_cluster loop so we do not recurse
1540   with other requests to takeover_run()
1541 */
1542 static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
1543                                   void *private_data)
1544 {
1545         struct ctdb_srvid_message *request;
1546         struct ctdb_recoverd *rec = talloc_get_type(
1547                 private_data, struct ctdb_recoverd);
1548
1549         if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1550                 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1551                 return;
1552         }
1553
1554         request = (struct ctdb_srvid_message *)data.dptr;
1555
1556         srvid_request_add(rec->ctdb, &rec->reallocate_requests, request);
1557 }
1558
1559 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
1560                                           struct ctdb_recoverd *rec)
1561 {
1562         TDB_DATA result;
1563         int32_t ret;
1564         struct srvid_requests *current;
1565
1566         /* Only process requests that are currently pending.  More
1567          * might come in while the takeover run is in progress and
1568          * they will need to be processed later since they might
1569          * be in response flag changes.
1570          */
1571         current = rec->reallocate_requests;
1572         rec->reallocate_requests = NULL;
1573
1574         if (do_takeover_run(rec, rec->nodemap)) {
1575                 ret = ctdb_get_pnn(ctdb);
1576         } else {
1577                 ret = -1;
1578         }
1579
1580         result.dsize = sizeof(int32_t);
1581         result.dptr  = (uint8_t *)&ret;
1582
1583         srvid_requests_reply(ctdb, &current, result);
1584 }
1585
1586 /*
1587  * handler for assigning banning credits
1588  */
1589 static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1590 {
1591         struct ctdb_recoverd *rec = talloc_get_type(
1592                 private_data, struct ctdb_recoverd);
1593         uint32_t ban_pnn;
1594
1595         /* Ignore if we are not recmaster */
1596         if (rec->ctdb->pnn != rec->recmaster) {
1597                 return;
1598         }
1599
1600         if (data.dsize != sizeof(uint32_t)) {
1601                 DEBUG(DEBUG_ERR, (__location__ "invalid data size %zu\n",
1602                                   data.dsize));
1603                 return;
1604         }
1605
1606         ban_pnn = *(uint32_t *)data.dptr;
1607
1608         ctdb_set_culprit_count(rec, ban_pnn, rec->nodemap->num);
1609 }
1610
1611 /*
1612   handler for recovery master elections
1613 */
1614 static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1615 {
1616         struct ctdb_recoverd *rec = talloc_get_type(
1617                 private_data, struct ctdb_recoverd);
1618         struct ctdb_context *ctdb = rec->ctdb;
1619         int ret;
1620         struct election_message *em = (struct election_message *)data.dptr;
1621
1622         /* Ignore election packets from ourself */
1623         if (ctdb->pnn == em->pnn) {
1624                 return;
1625         }
1626
1627         /* we got an election packet - update the timeout for the election */
1628         talloc_free(rec->election_timeout);
1629         rec->election_timeout = tevent_add_timer(
1630                         ctdb->ev, ctdb,
1631                         fast_start ?
1632                                 timeval_current_ofs(0, 500000) :
1633                                 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1634                         ctdb_election_timeout, rec);
1635
1636         /* someone called an election. check their election data
1637            and if we disagree and we would rather be the elected node,
1638            send a new election message to all other nodes
1639          */
1640         if (ctdb_election_win(rec, em)) {
1641                 if (!rec->send_election_te) {
1642                         rec->send_election_te = tevent_add_timer(
1643                                         ctdb->ev, rec,
1644                                         timeval_current_ofs(0, 500000),
1645                                         election_send_request, rec);
1646                 }
1647                 return;
1648         }
1649
1650         /* we didn't win */
1651         TALLOC_FREE(rec->send_election_te);
1652
1653         /* Release the recovery lock file */
1654         if (ctdb_recovery_have_lock(rec)) {
1655                 ctdb_recovery_unlock(rec);
1656         }
1657
1658         /* ok, let that guy become recmaster then */
1659         ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1660                                      CTDB_CURRENT_NODE, em->pnn);
1661         if (ret != 0) {
1662                 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster"));
1663                 return;
1664         }
1665         rec->recmaster = em->pnn;
1666
1667         return;
1668 }
1669
1670
1671 /*
1672   force the start of the election process
1673  */
1674 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
1675                            struct ctdb_node_map_old *nodemap)
1676 {
1677         int ret;
1678         struct ctdb_context *ctdb = rec->ctdb;
1679
1680         DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
1681
1682         /* set all nodes to recovery mode to stop all internode traffic */
1683         ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1684         if (ret != 0) {
1685                 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1686                 return;
1687         }
1688
1689         talloc_free(rec->election_timeout);
1690         rec->election_timeout = tevent_add_timer(
1691                         ctdb->ev, ctdb,
1692                         fast_start ?
1693                                 timeval_current_ofs(0, 500000) :
1694                                 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1695                         ctdb_election_timeout, rec);
1696
1697         ret = send_election_request(rec, pnn);
1698         if (ret!=0) {
1699                 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
1700                 return;
1701         }
1702
1703         /* wait for a few seconds to collect all responses */
1704         ctdb_wait_election(rec);
1705 }
1706
1707
1708
1709 /*
1710   handler for when a node changes its flags
1711 */
1712 static void monitor_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1713 {
1714         struct ctdb_recoverd *rec = talloc_get_type(
1715                 private_data, struct ctdb_recoverd);
1716         struct ctdb_context *ctdb = rec->ctdb;
1717         int ret;
1718         struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1719         struct ctdb_node_map_old *nodemap=NULL;
1720         TALLOC_CTX *tmp_ctx;
1721         unsigned int i;
1722
1723         if (data.dsize != sizeof(*c)) {
1724                 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
1725                 return;
1726         }
1727
1728         tmp_ctx = talloc_new(ctdb);
1729         CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
1730
1731         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1732         if (ret != 0) {
1733                 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
1734                 talloc_free(tmp_ctx);
1735                 return;
1736         }
1737
1738
1739         for (i=0;i<nodemap->num;i++) {
1740                 if (nodemap->nodes[i].pnn == c->pnn) break;
1741         }
1742
1743         if (i == nodemap->num) {
1744                 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
1745                 talloc_free(tmp_ctx);
1746                 return;
1747         }
1748
1749         if (c->old_flags != c->new_flags) {
1750                 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x  was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
1751         }
1752
1753         nodemap->nodes[i].flags = c->new_flags;
1754
1755         talloc_free(tmp_ctx);
1756 }
1757
1758 /*
1759   handler for when we need to push out flag changes to all other nodes
1760 */
1761 static void push_flags_handler(uint64_t srvid, TDB_DATA data,
1762                                void *private_data)
1763 {
1764         struct ctdb_recoverd *rec = talloc_get_type(
1765                 private_data, struct ctdb_recoverd);
1766         struct ctdb_context *ctdb = rec->ctdb;
1767         int ret;
1768         struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1769         struct ctdb_node_map_old *nodemap=NULL;
1770         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1771         uint32_t *nodes;
1772
1773         /* read the node flags from the recmaster */
1774         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
1775                                    tmp_ctx, &nodemap);
1776         if (ret != 0) {
1777                 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
1778                 talloc_free(tmp_ctx);
1779                 return;
1780         }
1781         if (c->pnn >= nodemap->num) {
1782                 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
1783                 talloc_free(tmp_ctx);
1784                 return;
1785         }
1786
1787         /* send the flags update to all connected nodes */
1788         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
1789
1790         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
1791                                       nodes, 0, CONTROL_TIMEOUT(),
1792                                       false, data,
1793                                       NULL, NULL,
1794                                       NULL) != 0) {
1795                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
1796
1797                 talloc_free(tmp_ctx);
1798                 return;
1799         }
1800
1801         talloc_free(tmp_ctx);
1802 }
1803
1804
1805 struct verify_recmode_normal_data {
1806         uint32_t count;
1807         enum monitor_result status;
1808 };
1809
1810 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
1811 {
1812         struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
1813
1814
1815         /* one more node has responded with recmode data*/
1816         rmdata->count--;
1817
1818         /* if we failed to get the recmode, then return an error and let
1819            the main loop try again.
1820         */
1821         if (state->state != CTDB_CONTROL_DONE) {
1822                 if (rmdata->status == MONITOR_OK) {
1823                         rmdata->status = MONITOR_FAILED;
1824                 }
1825                 return;
1826         }
1827
1828         /* if we got a response, then the recmode will be stored in the
1829            status field
1830         */
1831         if (state->status != CTDB_RECOVERY_NORMAL) {
1832                 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
1833                 rmdata->status = MONITOR_RECOVERY_NEEDED;
1834         }
1835
1836         return;
1837 }
1838
1839
1840 /* verify that all nodes are in normal recovery mode */
1841 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
1842 {
1843         struct verify_recmode_normal_data *rmdata;
1844         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1845         struct ctdb_client_control_state *state;
1846         enum monitor_result status;
1847         unsigned int j;
1848
1849         rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
1850         CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
1851         rmdata->count  = 0;
1852         rmdata->status = MONITOR_OK;
1853
1854         /* loop over all active nodes and send an async getrecmode call to
1855            them*/
1856         for (j=0; j<nodemap->num; j++) {
1857                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1858                         continue;
1859                 }
1860                 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
1861                                         CONTROL_TIMEOUT(),
1862                                         nodemap->nodes[j].pnn);
1863                 if (state == NULL) {
1864                         /* we failed to send the control, treat this as
1865                            an error and try again next iteration
1866                         */
1867                         DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
1868                         talloc_free(mem_ctx);
1869                         return MONITOR_FAILED;
1870                 }
1871
1872                 /* set up the callback functions */
1873                 state->async.fn = verify_recmode_normal_callback;
1874                 state->async.private_data = rmdata;
1875
1876                 /* one more control to wait for to complete */
1877                 rmdata->count++;
1878         }
1879
1880
1881         /* now wait for up to the maximum number of seconds allowed
1882            or until all nodes we expect a response from has replied
1883         */
1884         while (rmdata->count > 0) {
1885                 tevent_loop_once(ctdb->ev);
1886         }
1887
1888         status = rmdata->status;
1889         talloc_free(mem_ctx);
1890         return status;
1891 }
1892
1893
1894 struct verify_recmaster_data {
1895         struct ctdb_recoverd *rec;
1896         uint32_t count;
1897         uint32_t pnn;
1898         enum monitor_result status;
1899 };
1900
1901 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
1902 {
1903         struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
1904
1905
1906         /* one more node has responded with recmaster data*/
1907         rmdata->count--;
1908
1909         /* if we failed to get the recmaster, then return an error and let
1910            the main loop try again.
1911         */
1912         if (state->state != CTDB_CONTROL_DONE) {
1913                 if (rmdata->status == MONITOR_OK) {
1914                         rmdata->status = MONITOR_FAILED;
1915                 }
1916                 return;
1917         }
1918
1919         /* if we got a response, then the recmaster will be stored in the
1920            status field
1921         */
1922         if ((uint32_t)state->status != rmdata->pnn) {
1923                 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
1924                 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
1925                 rmdata->status = MONITOR_ELECTION_NEEDED;
1926         }
1927
1928         return;
1929 }
1930
1931
1932 /* verify that all nodes agree that we are the recmaster */
1933 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, uint32_t pnn)
1934 {
1935         struct ctdb_context *ctdb = rec->ctdb;
1936         struct verify_recmaster_data *rmdata;
1937         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1938         struct ctdb_client_control_state *state;
1939         enum monitor_result status;
1940         unsigned int j;
1941
1942         rmdata = talloc(mem_ctx, struct verify_recmaster_data);
1943         CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
1944         rmdata->rec    = rec;
1945         rmdata->count  = 0;
1946         rmdata->pnn    = pnn;
1947         rmdata->status = MONITOR_OK;
1948
1949         /* loop over all active nodes and send an async getrecmaster call to
1950            them*/
1951         for (j=0; j<nodemap->num; j++) {
1952                 if (nodemap->nodes[j].pnn == rec->recmaster) {
1953                         continue;
1954                 }
1955                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1956                         continue;
1957                 }
1958                 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
1959                                         CONTROL_TIMEOUT(),
1960                                         nodemap->nodes[j].pnn);
1961                 if (state == NULL) {
1962                         /* we failed to send the control, treat this as
1963                            an error and try again next iteration
1964                         */
1965                         DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
1966                         talloc_free(mem_ctx);
1967                         return MONITOR_FAILED;
1968                 }
1969
1970                 /* set up the callback functions */
1971                 state->async.fn = verify_recmaster_callback;
1972                 state->async.private_data = rmdata;
1973
1974                 /* one more control to wait for to complete */
1975                 rmdata->count++;
1976         }
1977
1978
1979         /* now wait for up to the maximum number of seconds allowed
1980            or until all nodes we expect a response from has replied
1981         */
1982         while (rmdata->count > 0) {
1983                 tevent_loop_once(ctdb->ev);
1984         }
1985
1986         status = rmdata->status;
1987         talloc_free(mem_ctx);
1988         return status;
1989 }
1990
1991 static bool interfaces_have_changed(struct ctdb_context *ctdb,
1992                                     struct ctdb_recoverd *rec)
1993 {
1994         struct ctdb_iface_list_old *ifaces = NULL;
1995         TALLOC_CTX *mem_ctx;
1996         bool ret = false;
1997
1998         mem_ctx = talloc_new(NULL);
1999
2000         /* Read the interfaces from the local node */
2001         if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
2002                                  CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
2003                 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
2004                 /* We could return an error.  However, this will be
2005                  * rare so we'll decide that the interfaces have
2006                  * actually changed, just in case.
2007                  */
2008                 talloc_free(mem_ctx);
2009                 return true;
2010         }
2011
2012         if (!rec->ifaces) {
2013                 /* We haven't been here before so things have changed */
2014                 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
2015                 ret = true;
2016         } else if (rec->ifaces->num != ifaces->num) {
2017                 /* Number of interfaces has changed */
2018                 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
2019                                      rec->ifaces->num, ifaces->num));
2020                 ret = true;
2021         } else {
2022                 /* See if interface names or link states have changed */
2023                 unsigned int i;
2024                 for (i = 0; i < rec->ifaces->num; i++) {
2025                         struct ctdb_iface * iface = &rec->ifaces->ifaces[i];
2026                         if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
2027                                 DEBUG(DEBUG_NOTICE,
2028                                       ("Interface in slot %d changed: %s => %s\n",
2029                                        i, iface->name, ifaces->ifaces[i].name));
2030                                 ret = true;
2031                                 break;
2032                         }
2033                         if (iface->link_state != ifaces->ifaces[i].link_state) {
2034                                 DEBUG(DEBUG_NOTICE,
2035                                       ("Interface %s changed state: %d => %d\n",
2036                                        iface->name, iface->link_state,
2037                                        ifaces->ifaces[i].link_state));
2038                                 ret = true;
2039                                 break;
2040                         }
2041                 }
2042         }
2043
2044         talloc_free(rec->ifaces);
2045         rec->ifaces = talloc_steal(rec, ifaces);
2046
2047         talloc_free(mem_ctx);
2048         return ret;
2049 }
2050
2051 /* Check that the local allocation of public IP addresses is correct
2052  * and do some house-keeping */
2053 static int verify_local_ip_allocation(struct ctdb_context *ctdb,
2054                                       struct ctdb_recoverd *rec,
2055                                       uint32_t pnn,
2056                                       struct ctdb_node_map_old *nodemap)
2057 {
2058         TALLOC_CTX *mem_ctx = talloc_new(NULL);
2059         unsigned int j;
2060         int ret;
2061         bool need_takeover_run = false;
2062         struct ctdb_public_ip_list_old *ips = NULL;
2063
2064         /* If we are not the recmaster then do some housekeeping */
2065         if (rec->recmaster != pnn) {
2066                 /* Ignore any IP reallocate requests - only recmaster
2067                  * processes them
2068                  */
2069                 TALLOC_FREE(rec->reallocate_requests);
2070                 /* Clear any nodes that should be force rebalanced in
2071                  * the next takeover run.  If the recovery master role
2072                  * has moved then we don't want to process these some
2073                  * time in the future.
2074                  */
2075                 TALLOC_FREE(rec->force_rebalance_nodes);
2076         }
2077
2078         /* Return early if disabled... */
2079         if (ctdb_config.failover_disabled ||
2080             ctdb_op_is_disabled(rec->takeover_run)) {
2081                 talloc_free(mem_ctx);
2082                 return  0;
2083         }
2084
2085         if (interfaces_have_changed(ctdb, rec)) {
2086                 need_takeover_run = true;
2087         }
2088
2089         /* If there are unhosted IPs but this node can host them then
2090          * trigger an IP reallocation */
2091
2092         /* Read *available* IPs from local node */
2093         ret = ctdb_ctrl_get_public_ips_flags(
2094                 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx,
2095                 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
2096         if (ret != 0) {
2097                 DEBUG(DEBUG_ERR, ("Unable to retrieve available public IPs\n"));
2098                 talloc_free(mem_ctx);
2099                 return -1;
2100         }
2101
2102         for (j=0; j<ips->num; j++) {
2103                 if (ips->ips[j].pnn == CTDB_UNKNOWN_PNN &&
2104                     nodemap->nodes[pnn].flags == 0) {
2105                         DEBUG(DEBUG_WARNING,
2106                               ("Unassigned IP %s can be served by this node\n",
2107                                ctdb_addr_to_str(&ips->ips[j].addr)));
2108                         need_takeover_run = true;
2109                 }
2110         }
2111
2112         talloc_free(ips);
2113
2114         if (!ctdb->do_checkpublicip) {
2115                 goto done;
2116         }
2117
2118         /* Validate the IP addresses that this node has on network
2119          * interfaces.  If there is an inconsistency between reality
2120          * and the state expected by CTDB then try to fix it by
2121          * triggering an IP reallocation or releasing extraneous IP
2122          * addresses. */
2123
2124         /* Read *known* IPs from local node */
2125         ret = ctdb_ctrl_get_public_ips_flags(
2126                 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
2127         if (ret != 0) {
2128                 DEBUG(DEBUG_ERR, ("Unable to retrieve known public IPs\n"));
2129                 talloc_free(mem_ctx);
2130                 return -1;
2131         }
2132
2133         for (j=0; j<ips->num; j++) {
2134                 if (ips->ips[j].pnn == pnn) {
2135                         if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2136                                 DEBUG(DEBUG_ERR,
2137                                       ("Assigned IP %s not on an interface\n",
2138                                        ctdb_addr_to_str(&ips->ips[j].addr)));
2139                                 need_takeover_run = true;
2140                         }
2141                 } else {
2142                         if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2143                                 DEBUG(DEBUG_ERR,
2144                                       ("IP %s incorrectly on an interface\n",
2145                                        ctdb_addr_to_str(&ips->ips[j].addr)));
2146                                 need_takeover_run = true;
2147                         }
2148                 }
2149         }
2150
2151 done:
2152         if (need_takeover_run) {
2153                 struct ctdb_srvid_message rd;
2154                 TDB_DATA data;
2155
2156                 DEBUG(DEBUG_NOTICE,("Trigger takeoverrun\n"));
2157
2158                 ZERO_STRUCT(rd);
2159                 rd.pnn = ctdb->pnn;
2160                 rd.srvid = 0;
2161                 data.dptr = (uint8_t *)&rd;
2162                 data.dsize = sizeof(rd);
2163
2164                 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2165                 if (ret != 0) {
2166                         DEBUG(DEBUG_ERR,
2167                               ("Failed to send takeover run request\n"));
2168                 }
2169         }
2170         talloc_free(mem_ctx);
2171         return 0;
2172 }
2173
2174
2175 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2176 {
2177         struct ctdb_node_map_old **remote_nodemaps = callback_data;
2178
2179         if (node_pnn >= ctdb->num_nodes) {
2180                 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2181                 return;
2182         }
2183
2184         remote_nodemaps[node_pnn] = (struct ctdb_node_map_old *)talloc_steal(remote_nodemaps, outdata.dptr);
2185
2186 }
2187
2188 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2189         struct ctdb_node_map_old *nodemap,
2190         struct ctdb_node_map_old **remote_nodemaps)
2191 {
2192         uint32_t *nodes;
2193
2194         nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2195         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2196                                         nodes, 0,
2197                                         CONTROL_TIMEOUT(), false, tdb_null,
2198                                         async_getnodemap_callback,
2199                                         NULL,
2200                                         remote_nodemaps) != 0) {
2201                 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
2202
2203                 return -1;
2204         }
2205
2206         return 0;
2207 }
2208
2209 static bool validate_recovery_master(struct ctdb_recoverd *rec,
2210                                      TALLOC_CTX *mem_ctx)
2211 {
2212         struct ctdb_context *ctdb = rec->ctdb;
2213         uint32_t pnn = ctdb_get_pnn(ctdb);
2214         struct ctdb_node_map_old *nodemap = rec->nodemap;
2215         struct ctdb_node_map_old *recmaster_nodemap = NULL;
2216         int ret;
2217
2218         /* When recovery daemon is started, recmaster is set to
2219          * "unknown" so it knows to start an election.
2220          */
2221         if (rec->recmaster == CTDB_UNKNOWN_PNN) {
2222                 DEBUG(DEBUG_NOTICE,
2223                       ("Initial recovery master set - forcing election\n"));
2224                 force_election(rec, pnn, nodemap);
2225                 return false;
2226         }
2227
2228         /*
2229          * If the current recmaster does not have CTDB_CAP_RECMASTER,
2230          * but we have, then force an election and try to become the new
2231          * recmaster.
2232          */
2233         if (!ctdb_node_has_capabilities(rec->caps,
2234                                         rec->recmaster,
2235                                         CTDB_CAP_RECMASTER) &&
2236             (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
2237             !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
2238                 DEBUG(DEBUG_ERR,
2239                       (" Current recmaster node %u does not have CAP_RECMASTER,"
2240                        " but we (node %u) have - force an election\n",
2241                        rec->recmaster, pnn));
2242                 force_election(rec, pnn, nodemap);
2243                 return false;
2244         }
2245
2246         /* Verify that the master node has not been deleted.  This
2247          * should not happen because a node should always be shutdown
2248          * before being deleted, causing a new master to be elected
2249          * before now.  However, if something strange has happened
2250          * then checking here will ensure we don't index beyond the
2251          * end of the nodemap array. */
2252         if (rec->recmaster >= nodemap->num) {
2253                 DEBUG(DEBUG_ERR,
2254                       ("Recmaster node %u has been deleted. Force election\n",
2255                        rec->recmaster));
2256                 force_election(rec, pnn, nodemap);
2257                 return false;
2258         }
2259
2260         /* if recovery master is disconnected/deleted we must elect a new recmaster */
2261         if (nodemap->nodes[rec->recmaster].flags &
2262             (NODE_FLAGS_DISCONNECTED|NODE_FLAGS_DELETED)) {
2263                 DEBUG(DEBUG_NOTICE,
2264                       ("Recmaster node %u is disconnected/deleted. Force election\n",
2265                        rec->recmaster));
2266                 force_election(rec, pnn, nodemap);
2267                 return false;
2268         }
2269
2270         /* get nodemap from the recovery master to check if it is inactive */
2271         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2272                                    mem_ctx, &recmaster_nodemap);
2273         if (ret != 0) {
2274                 DEBUG(DEBUG_ERR,
2275                       (__location__
2276                        " Unable to get nodemap from recovery master %u\n",
2277                           rec->recmaster));
2278                 /* No election, just error */
2279                 return false;
2280         }
2281
2282
2283         if ((recmaster_nodemap->nodes[rec->recmaster].flags & NODE_FLAGS_INACTIVE) &&
2284             (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
2285                 DEBUG(DEBUG_NOTICE,
2286                       ("Recmaster node %u is inactive. Force election\n",
2287                        rec->recmaster));
2288                 /*
2289                  * update our nodemap to carry the recmaster's notion of
2290                  * its own flags, so that we don't keep freezing the
2291                  * inactive recmaster node...
2292                  */
2293                 nodemap->nodes[rec->recmaster].flags =
2294                         recmaster_nodemap->nodes[rec->recmaster].flags;
2295                 force_election(rec, pnn, nodemap);
2296                 return false;
2297         }
2298
2299         return true;
2300 }
2301
2302 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
2303                       TALLOC_CTX *mem_ctx)
2304 {
2305         uint32_t pnn;
2306         struct ctdb_node_map_old *nodemap=NULL;
2307         struct ctdb_node_map_old **remote_nodemaps=NULL;
2308         struct ctdb_vnn_map *vnnmap=NULL;
2309         struct ctdb_vnn_map *remote_vnnmap=NULL;
2310         uint32_t num_lmasters;
2311         int32_t debug_level;
2312         unsigned int i, j;
2313         int ret;
2314         bool self_ban;
2315
2316
2317         /* verify that the main daemon is still running */
2318         if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
2319                 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2320                 exit(-1);
2321         }
2322
2323         /* ping the local daemon to tell it we are alive */
2324         ctdb_ctrl_recd_ping(ctdb);
2325
2326         if (rec->election_timeout) {
2327                 /* an election is in progress */
2328                 return;
2329         }
2330
2331         /* read the debug level from the parent and update locally */
2332         ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2333         if (ret !=0) {
2334                 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2335                 return;
2336         }
2337         debuglevel_set(debug_level);
2338
2339         /* get relevant tunables */
2340         ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2341         if (ret != 0) {
2342                 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2343                 return;
2344         }
2345
2346         /* get runstate */
2347         ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
2348                                      CTDB_CURRENT_NODE, &ctdb->runstate);
2349         if (ret != 0) {
2350                 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
2351                 return;
2352         }
2353
2354         pnn = ctdb_get_pnn(ctdb);
2355
2356         /* get nodemap */
2357         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &nodemap);
2358         if (ret != 0) {
2359                 DBG_ERR("Unable to get nodemap from node %"PRIu32"\n", pnn);
2360                 return;
2361         }
2362         talloc_free(rec->nodemap);
2363         rec->nodemap = nodemap;
2364
2365         /* remember our own node flags */
2366         rec->node_flags = nodemap->nodes[pnn].flags;
2367
2368         ban_misbehaving_nodes(rec, &self_ban);
2369         if (self_ban) {
2370                 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
2371                 return;
2372         }
2373
2374         ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2375                                    CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2376         if (ret != 0) {
2377                 D_ERR("Failed to read recmode from local node\n");
2378                 return;
2379         }
2380
2381         /* if the local daemon is STOPPED or BANNED, we verify that the databases are
2382            also frozen and that the recmode is set to active.
2383         */
2384         if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
2385                 /* If this node has become inactive then we want to
2386                  * reduce the chances of it taking over the recovery
2387                  * master role when it becomes active again.  This
2388                  * helps to stabilise the recovery master role so that
2389                  * it stays on the most stable node.
2390                  */
2391                 rec->priority_time = timeval_current();
2392
2393                 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2394                         DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
2395
2396                         ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2397                         if (ret != 0) {
2398                                 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
2399
2400                                 return;
2401                         }
2402                 }
2403                 if (! rec->frozen_on_inactive) {
2404                         ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(),
2405                                                CTDB_CURRENT_NODE);
2406                         if (ret != 0) {
2407                                 DEBUG(DEBUG_ERR,
2408                                       (__location__ " Failed to freeze node "
2409                                        "in STOPPED or BANNED state\n"));
2410                                 return;
2411                         }
2412
2413                         rec->frozen_on_inactive = true;
2414                 }
2415
2416                 /* If this node is stopped or banned then it is not the recovery
2417                  * master, so don't do anything. This prevents stopped or banned
2418                  * node from starting election and sending unnecessary controls.
2419                  */
2420                 return;
2421         }
2422
2423         rec->frozen_on_inactive = false;
2424
2425         /* Retrieve capabilities from all connected nodes */
2426         ret = update_capabilities(rec, nodemap);
2427         if (ret != 0) {
2428                 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2429                 return;
2430         }
2431
2432         if (! validate_recovery_master(rec, mem_ctx)) {
2433                 return;
2434         }
2435
2436         if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2437                 /* Check if an IP takeover run is needed and trigger one if
2438                  * necessary */
2439                 verify_local_ip_allocation(ctdb, rec, pnn, nodemap);
2440         }
2441
2442         /* if we are not the recmaster then we do not need to check
2443            if recovery is needed
2444          */
2445         if (pnn != rec->recmaster) {
2446                 return;
2447         }
2448
2449
2450         /* ensure our local copies of flags are right */
2451         ret = update_local_flags(rec, nodemap);
2452         if (ret != 0) {
2453                 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
2454                 return;
2455         }
2456
2457         if (ctdb->num_nodes != nodemap->num) {
2458                 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
2459                 ctdb_load_nodes_file(ctdb);
2460                 return;
2461         }
2462
2463         /* verify that all active nodes agree that we are the recmaster */
2464         switch (verify_recmaster(rec, nodemap, pnn)) {
2465         case MONITOR_RECOVERY_NEEDED:
2466                 /* can not happen */
2467                 return;
2468         case MONITOR_ELECTION_NEEDED:
2469                 force_election(rec, pnn, nodemap);
2470                 return;
2471         case MONITOR_OK:
2472                 break;
2473         case MONITOR_FAILED:
2474                 return;
2475         }
2476
2477
2478         /* get the vnnmap */
2479         ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2480         if (ret != 0) {
2481                 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2482                 return;
2483         }
2484
2485         if (rec->need_recovery) {
2486                 /* a previous recovery didn't finish */
2487                 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2488                 return;
2489         }
2490
2491         /* verify that all active nodes are in normal mode
2492            and not in recovery mode
2493         */
2494         switch (verify_recmode(ctdb, nodemap)) {
2495         case MONITOR_RECOVERY_NEEDED:
2496                 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2497                 return;
2498         case MONITOR_FAILED:
2499                 return;
2500         case MONITOR_ELECTION_NEEDED:
2501                 /* can not happen */
2502         case MONITOR_OK:
2503                 break;
2504         }
2505
2506
2507         if (ctdb->recovery_lock != NULL) {
2508                 /* We must already hold the recovery lock */
2509                 if (!ctdb_recovery_have_lock(rec)) {
2510                         DEBUG(DEBUG_ERR,("Failed recovery lock sanity check.  Force a recovery\n"));
2511                         ctdb_set_culprit(rec, ctdb->pnn);
2512                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2513                         return;
2514                 }
2515         }
2516
2517
2518         /* If recoveries are disabled then there is no use doing any
2519          * nodemap or flags checks.  Recoveries might be disabled due
2520          * to "reloadnodes", so doing these checks might cause an
2521          * unnecessary recovery.  */
2522         if (ctdb_op_is_disabled(rec->recovery)) {
2523                 goto takeover_run_checks;
2524         }
2525
2526         /* get the nodemap for all active remote nodes
2527          */
2528         remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map_old *, nodemap->num);
2529         if (remote_nodemaps == NULL) {
2530                 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
2531                 return;
2532         }
2533         for(i=0; i<nodemap->num; i++) {
2534                 remote_nodemaps[i] = NULL;
2535         }
2536         if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
2537                 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
2538                 return;
2539         }
2540
2541         /* verify that all other nodes have the same nodemap as we have
2542         */
2543         for (j=0; j<nodemap->num; j++) {
2544                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2545                         continue;
2546                 }
2547
2548                 if (remote_nodemaps[j] == NULL) {
2549                         DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
2550                         ctdb_set_culprit(rec, j);
2551
2552                         return;
2553                 }
2554
2555                 /* if the nodes disagree on how many nodes there are
2556                    then this is a good reason to try recovery
2557                  */
2558                 if (remote_nodemaps[j]->num != nodemap->num) {
2559                         DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
2560                                   nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
2561                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2562                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2563                         return;
2564                 }
2565
2566                 /* if the nodes disagree on which nodes exist and are
2567                    active, then that is also a good reason to do recovery
2568                  */
2569                 for (i=0;i<nodemap->num;i++) {
2570                         if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
2571                                 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
2572                                           nodemap->nodes[j].pnn, i,
2573                                           remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
2574                                 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2575                                 do_recovery(rec, mem_ctx, pnn, nodemap,
2576                                             vnnmap);
2577                                 return;
2578                         }
2579                 }
2580         }
2581
2582         /*
2583          * Update node flags obtained from each active node. This ensure we have
2584          * up-to-date information for all the nodes.
2585          */
2586         for (j=0; j<nodemap->num; j++) {
2587                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2588                         continue;
2589                 }
2590                 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
2591         }
2592
2593         for (j=0; j<nodemap->num; j++) {
2594                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2595                         continue;
2596                 }
2597
2598                 /* verify the flags are consistent
2599                 */
2600                 for (i=0; i<nodemap->num; i++) {
2601                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2602                                 continue;
2603                         }
2604
2605                         if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
2606                                 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
2607                                   nodemap->nodes[j].pnn,
2608                                   nodemap->nodes[i].pnn,
2609                                   remote_nodemaps[j]->nodes[i].flags,
2610                                   nodemap->nodes[i].flags));
2611                                 if (i == j) {
2612                                         DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
2613                                         update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
2614                                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2615                                         do_recovery(rec, mem_ctx, pnn, nodemap,
2616                                                     vnnmap);
2617                                         return;
2618                                 } else {
2619                                         DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
2620                                         update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
2621                                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2622                                         do_recovery(rec, mem_ctx, pnn, nodemap,
2623                                                     vnnmap);
2624                                         return;
2625                                 }
2626                         }
2627                 }
2628         }
2629
2630
2631         /* count how many active nodes there are */
2632         num_lmasters  = 0;
2633         for (i=0; i<nodemap->num; i++) {
2634                 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2635                         if (ctdb_node_has_capabilities(rec->caps,
2636                                                        ctdb->nodes[i]->pnn,
2637                                                        CTDB_CAP_LMASTER)) {
2638                                 num_lmasters++;
2639                         }
2640                 }
2641         }
2642
2643
2644         /* There must be the same number of lmasters in the vnn map as
2645          * there are active nodes with the lmaster capability...  or
2646          * do a recovery.
2647          */
2648         if (vnnmap->size != num_lmasters) {
2649                 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
2650                           vnnmap->size, num_lmasters));
2651                 ctdb_set_culprit(rec, ctdb->pnn);
2652                 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2653                 return;
2654         }
2655
2656         /*
2657          * Verify that all active lmaster nodes in the nodemap also
2658          * exist in the vnnmap
2659          */
2660         for (j=0; j<nodemap->num; j++) {
2661                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2662                         continue;
2663                 }
2664                 if (! ctdb_node_has_capabilities(rec->caps,
2665                                                  nodemap->nodes[j].pnn,
2666                                                  CTDB_CAP_LMASTER)) {
2667                         continue;
2668                 }
2669                 if (nodemap->nodes[j].pnn == pnn) {
2670                         continue;
2671                 }
2672
2673                 for (i=0; i<vnnmap->size; i++) {
2674                         if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
2675                                 break;
2676                         }
2677                 }
2678                 if (i == vnnmap->size) {
2679                         D_ERR("Active LMASTER node %u is not in the vnnmap\n",
2680                               nodemap->nodes[j].pnn);
2681                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2682                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2683                         return;
2684                 }
2685         }
2686
2687
2688         /* verify that all other nodes have the same vnnmap
2689            and are from the same generation
2690          */
2691         for (j=0; j<nodemap->num; j++) {
2692                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2693                         continue;
2694                 }
2695                 if (nodemap->nodes[j].pnn == pnn) {
2696                         continue;
2697                 }
2698
2699                 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2700                                           mem_ctx, &remote_vnnmap);
2701                 if (ret != 0) {
2702                         DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
2703                                   nodemap->nodes[j].pnn));
2704                         return;
2705                 }
2706
2707                 /* verify the vnnmap generation is the same */
2708                 if (vnnmap->generation != remote_vnnmap->generation) {
2709                         DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
2710                                   nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
2711                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2712                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2713                         return;
2714                 }
2715
2716                 /* verify the vnnmap size is the same */
2717                 if (vnnmap->size != remote_vnnmap->size) {
2718                         DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
2719                                   nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
2720                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2721                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2722                         return;
2723                 }
2724
2725                 /* verify the vnnmap is the same */
2726                 for (i=0;i<vnnmap->size;i++) {
2727                         if (remote_vnnmap->map[i] != vnnmap->map[i]) {
2728                                 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
2729                                           nodemap->nodes[j].pnn));
2730                                 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2731                                 do_recovery(rec, mem_ctx, pnn, nodemap,
2732                                             vnnmap);
2733                                 return;
2734                         }
2735                 }
2736         }
2737
2738         /* FIXME: Add remote public IP checking to ensure that nodes
2739          * have the IP addresses that are allocated to them. */
2740
2741 takeover_run_checks:
2742
2743         /* If there are IP takeover runs requested or the previous one
2744          * failed then perform one and notify the waiters */
2745         if (!ctdb_op_is_disabled(rec->takeover_run) &&
2746             (rec->reallocate_requests || rec->need_takeover_run)) {
2747                 process_ipreallocate_requests(ctdb, rec);
2748         }
2749 }
2750
2751 static void recd_sig_term_handler(struct tevent_context *ev,
2752                                   struct tevent_signal *se, int signum,
2753                                   int count, void *dont_care,
2754                                   void *private_data)
2755 {
2756         struct ctdb_recoverd *rec = talloc_get_type_abort(
2757                 private_data, struct ctdb_recoverd);
2758
2759         DEBUG(DEBUG_ERR, ("Received SIGTERM, exiting\n"));
2760         ctdb_recovery_unlock(rec);
2761         exit(0);
2762 }
2763
2764 /*
2765  * Periodically log elements of the cluster state
2766  *
2767  * This can be used to confirm a split brain has occurred
2768  */
2769 static void maybe_log_cluster_state(struct tevent_context *ev,
2770                                     struct tevent_timer *te,
2771                                     struct timeval current_time,
2772                                     void *private_data)
2773 {
2774         struct ctdb_recoverd *rec = talloc_get_type_abort(
2775                 private_data, struct ctdb_recoverd);
2776         struct ctdb_context *ctdb = rec->ctdb;
2777         struct tevent_timer *tt;
2778
2779         static struct timeval start_incomplete = {
2780                 .tv_sec = 0,
2781         };
2782
2783         bool is_complete;
2784         bool was_complete;
2785         unsigned int i;
2786         double seconds;
2787         unsigned int minutes;
2788         unsigned int num_connected;
2789
2790         if (rec->recmaster != ctdb_get_pnn(ctdb)) {
2791                 goto done;
2792         }
2793
2794         if (rec->nodemap == NULL) {
2795                 goto done;
2796         }
2797
2798         is_complete = true;
2799         num_connected = 0;
2800         for (i = 0; i < rec->nodemap->num; i++) {
2801                 struct ctdb_node_and_flags *n = &rec->nodemap->nodes[i];
2802
2803                 if (n->pnn == ctdb_get_pnn(ctdb)) {
2804                         continue;
2805                 }
2806                 if ((n->flags & NODE_FLAGS_DELETED) != 0) {
2807                         continue;
2808                 }
2809                 if ((n->flags & NODE_FLAGS_DISCONNECTED) != 0) {
2810                         is_complete = false;
2811                         continue;
2812                 }
2813
2814                 num_connected++;
2815         }
2816
2817         was_complete = timeval_is_zero(&start_incomplete);
2818
2819         if (is_complete) {
2820                 if (! was_complete) {
2821                         D_WARNING("Cluster complete with master=%u\n",
2822                                   rec->recmaster);
2823                         start_incomplete = timeval_zero();
2824                 }
2825                 goto done;
2826         }
2827
2828         /* Cluster is newly incomplete... */
2829         if (was_complete) {
2830                 start_incomplete = current_time;
2831                 minutes = 0;
2832                 goto log;
2833         }
2834
2835         /*
2836          * Cluster has been incomplete since previous check, so figure
2837          * out how long (in minutes) and decide whether to log anything
2838          */
2839         seconds = timeval_elapsed2(&start_incomplete, &current_time);
2840         minutes = (unsigned int)seconds / 60;
2841         if (minutes >= 60) {
2842                 /* Over an hour, log every hour */
2843                 if (minutes % 60 != 0) {
2844                         goto done;
2845                 }
2846         } else if (minutes >= 10) {
2847                 /* Over 10 minutes, log every 10 minutes */
2848                 if (minutes % 10 != 0) {
2849                         goto done;
2850                 }
2851         }
2852
2853 log:
2854         D_WARNING("Cluster incomplete with master=%u, elapsed=%u minutes, "
2855                   "connected=%u\n",
2856                   rec->recmaster,
2857                   minutes,
2858                   num_connected);
2859
2860 done:
2861         tt = tevent_add_timer(ctdb->ev,
2862                               rec,
2863                               timeval_current_ofs(60, 0),
2864                               maybe_log_cluster_state,
2865                               rec);
2866         if (tt == NULL) {
2867                 DBG_WARNING("Failed to set up cluster state timer\n");
2868         }
2869 }
2870
2871 /*
2872   the main monitoring loop
2873  */
2874 static void monitor_cluster(struct ctdb_context *ctdb)
2875 {
2876         struct tevent_signal *se;
2877         struct ctdb_recoverd *rec;
2878
2879         DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
2880
2881         rec = talloc_zero(ctdb, struct ctdb_recoverd);
2882         CTDB_NO_MEMORY_FATAL(ctdb, rec);
2883
2884         rec->ctdb = ctdb;
2885         rec->recmaster = CTDB_UNKNOWN_PNN;
2886         rec->recovery_lock_handle = NULL;
2887
2888         rec->takeover_run = ctdb_op_init(rec, "takeover runs");
2889         CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);
2890
2891         rec->recovery = ctdb_op_init(rec, "recoveries");
2892         CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);
2893
2894         rec->priority_time = timeval_current();
2895         rec->frozen_on_inactive = false;
2896
2897         se = tevent_add_signal(ctdb->ev, ctdb, SIGTERM, 0,
2898                                recd_sig_term_handler, rec);
2899         if (se == NULL) {
2900                 DEBUG(DEBUG_ERR, ("Failed to install SIGTERM handler\n"));
2901                 exit(1);
2902         }
2903
2904         if (ctdb->recovery_lock == NULL) {
2905                 struct tevent_timer *tt;
2906
2907                 tt = tevent_add_timer(ctdb->ev,
2908                                       rec,
2909                                       timeval_current_ofs(60, 0),
2910                                       maybe_log_cluster_state,
2911                                       rec);
2912                 if (tt == NULL) {
2913                         DBG_WARNING("Failed to set up cluster state timer\n");
2914                 }
2915         }
2916
2917         /* register a message port for sending memory dumps */
2918         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
2919
2920         /* when a node is assigned banning credits */
2921         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING,
2922                                         banning_handler, rec);
2923
2924         /* register a message port for recovery elections */
2925         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec);
2926
2927         /* when nodes are disabled/enabled */
2928         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
2929
2930         /* when we are asked to puch out a flag change */
2931         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
2932
2933         /* register a message port for reloadnodes  */
2934         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
2935
2936         /* register a message port for performing a takeover run */
2937         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
2938
2939         /* register a message port for disabling the ip check for a short while */
2940         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
2941
2942         /* register a message port for forcing a rebalance of a node next
2943            reallocation */
2944         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
2945
2946         /* Register a message port for disabling takeover runs */
2947         ctdb_client_set_message_handler(ctdb,
2948                                         CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
2949                                         disable_takeover_runs_handler, rec);
2950
2951         /* Register a message port for disabling recoveries */
2952         ctdb_client_set_message_handler(ctdb,
2953                                         CTDB_SRVID_DISABLE_RECOVERIES,
2954                                         disable_recoveries_handler, rec);
2955
2956         for (;;) {
2957                 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2958                 struct timeval start;
2959                 double elapsed;
2960
2961                 if (!mem_ctx) {
2962                         DEBUG(DEBUG_CRIT,(__location__
2963                                           " Failed to create temp context\n"));
2964                         exit(-1);
2965                 }
2966
2967                 start = timeval_current();
2968                 main_loop(ctdb, rec, mem_ctx);
2969                 talloc_free(mem_ctx);
2970
2971                 /* we only check for recovery once every second */
2972                 elapsed = timeval_elapsed(&start);
2973                 if (elapsed < ctdb->tunable.recover_interval) {
2974                         ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
2975                                           - elapsed);
2976                 }
2977         }
2978 }
2979
2980 /*
2981   event handler for when the main ctdbd dies
2982  */
2983 static void ctdb_recoverd_parent(struct tevent_context *ev,
2984                                  struct tevent_fd *fde,
2985                                  uint16_t flags, void *private_data)
2986 {
2987         DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
2988         _exit(1);
2989 }
2990
2991 /*
2992   called regularly to verify that the recovery daemon is still running
2993  */
2994 static void ctdb_check_recd(struct tevent_context *ev,
2995                             struct tevent_timer *te,
2996                             struct timeval yt, void *p)
2997 {
2998         struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
2999
3000         if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
3001                 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
3002
3003                 tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
3004                                  ctdb_restart_recd, ctdb);
3005
3006                 return;
3007         }
3008
3009         tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3010                          timeval_current_ofs(30, 0),
3011                          ctdb_check_recd, ctdb);
3012 }
3013
3014 static void recd_sig_child_handler(struct tevent_context *ev,
3015                                    struct tevent_signal *se, int signum,
3016                                    int count, void *dont_care,
3017                                    void *private_data)
3018 {
3019 //      struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3020         int status;
3021         pid_t pid = -1;
3022
3023         while (pid != 0) {
3024                 pid = waitpid(-1, &status, WNOHANG);
3025                 if (pid == -1) {
3026                         if (errno != ECHILD) {
3027                                 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3028                         }
3029                         return;
3030                 }
3031                 if (pid > 0) {
3032                         DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3033                 }
3034         }
3035 }
3036
3037 /*
3038   startup the recovery daemon as a child of the main ctdb daemon
3039  */
3040 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3041 {
3042         int fd[2];
3043         struct tevent_signal *se;
3044         struct tevent_fd *fde;
3045         int ret;
3046
3047         if (pipe(fd) != 0) {
3048                 return -1;
3049         }
3050
3051         ctdb->recoverd_pid = ctdb_fork(ctdb);
3052         if (ctdb->recoverd_pid == -1) {
3053                 return -1;
3054         }
3055
3056         if (ctdb->recoverd_pid != 0) {
3057                 talloc_free(ctdb->recd_ctx);
3058                 ctdb->recd_ctx = talloc_new(ctdb);
3059                 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
3060
3061                 close(fd[0]);
3062                 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3063                                  timeval_current_ofs(30, 0),
3064                                  ctdb_check_recd, ctdb);
3065                 return 0;
3066         }
3067
3068         close(fd[1]);
3069
3070         srandom(getpid() ^ time(NULL));
3071
3072         ret = logging_init(ctdb, NULL, NULL, "ctdb-recoverd");
3073         if (ret != 0) {
3074                 return -1;
3075         }
3076
3077         prctl_set_comment("ctdb_recoverd");
3078         if (switch_from_server_to_client(ctdb) != 0) {
3079                 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3080                 exit(1);
3081         }
3082
3083         DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
3084
3085         fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
3086                             ctdb_recoverd_parent, &fd[0]);
3087         tevent_fd_set_auto_close(fde);
3088
3089         /* set up a handler to pick up sigchld */
3090         se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
3091                                recd_sig_child_handler, ctdb);
3092         if (se == NULL) {
3093                 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3094                 exit(1);
3095         }
3096
3097         monitor_cluster(ctdb);
3098
3099         DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3100         return -1;
3101 }
3102
3103 /*
3104   shutdown the recovery daemon
3105  */
3106 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3107 {
3108         if (ctdb->recoverd_pid == 0) {
3109                 return;
3110         }
3111
3112         DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3113         ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
3114
3115         TALLOC_FREE(ctdb->recd_ctx);
3116         TALLOC_FREE(ctdb->recd_ping_count);
3117 }
3118
3119 static void ctdb_restart_recd(struct tevent_context *ev,
3120                               struct tevent_timer *te,
3121                               struct timeval t, void *private_data)
3122 {
3123         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3124
3125         DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
3126         ctdb_stop_recoverd(ctdb);
3127         ctdb_start_recoverd(ctdb);
3128 }