ctdb-recoverd: Drop unused nodemap argument from update_flags_on_all_nodes()
ctdb/server/ctdb_recoverd.c
1 /*
2 ctdb recovery daemon
4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
20 #include "replace.h"
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
26 #include <popt.h>
27 #include <talloc.h>
28 #include <tevent.h>
29 #include <tdb.h>
31 #include "lib/tdb_wrap/tdb_wrap.h"
32 #include "lib/util/dlinklist.h"
33 #include "lib/util/debug.h"
34 #include "lib/util/samba_util.h"
35 #include "lib/util/sys_rw.h"
36 #include "lib/util/util_process.h"
38 #include "ctdb_private.h"
39 #include "ctdb_client.h"
41 #include "common/system_socket.h"
42 #include "common/common.h"
43 #include "common/logging.h"
45 #include "server/ctdb_config.h"
47 #include "ctdb_cluster_mutex.h"
49 /* List of SRVID requests that need to be processed */
50 struct srvid_list {
51 struct srvid_list *next, *prev;
52 struct ctdb_srvid_message *request;
55 struct srvid_requests {
56 struct srvid_list *requests;
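/*
 * Helpers for queueing SRVID requests (e.g. IP reallocation requests)
 * so they can all be answered later with a single result.  A request
 * sent with srvid == 0 expects no reply and is simply freed.
 */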
59 static void srvid_request_reply(struct ctdb_context *ctdb,
60 struct ctdb_srvid_message *request,
61 TDB_DATA result)
63 /* Someone that sent srvid==0 does not want a reply */
64 if (request->srvid == 0) {
65 talloc_free(request);
66 return;
69 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
70 result) == 0) {
71 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
72 (unsigned)request->pnn,
73 (unsigned long long)request->srvid));
74 } else {
75 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
76 (unsigned)request->pnn,
77 (unsigned long long)request->srvid));
80 talloc_free(request);
83 static void srvid_requests_reply(struct ctdb_context *ctdb,
84 struct srvid_requests **requests,
85 TDB_DATA result)
87 struct srvid_list *r;
89 if (*requests == NULL) {
90 return;
93 for (r = (*requests)->requests; r != NULL; r = r->next) {
94 srvid_request_reply(ctdb, r->request, result);
97 /* Free the list structure... */
98 TALLOC_FREE(*requests);
101 static void srvid_request_add(struct ctdb_context *ctdb,
102 struct srvid_requests **requests,
103 struct ctdb_srvid_message *request)
105 struct srvid_list *t;
106 int32_t ret;
107 TDB_DATA result;
109 if (*requests == NULL) {
110 *requests = talloc_zero(ctdb, struct srvid_requests);
111 if (*requests == NULL) {
112 goto nomem;
116 t = talloc_zero(*requests, struct srvid_list);
117 if (t == NULL) {
118 /* If *requests was just allocated above then free it */
119 if ((*requests)->requests == NULL) {
120 TALLOC_FREE(*requests);
122 goto nomem;
125 t->request = (struct ctdb_srvid_message *)talloc_steal(t, request);
126 DLIST_ADD((*requests)->requests, t);
128 return;
130 nomem:
131 /* Failed to add the request to the list. Send a fail. */
132 DEBUG(DEBUG_ERR, (__location__
133 " Out of memory, failed to queue SRVID request\n"));
134 ret = -ENOMEM;
135 result.dsize = sizeof(ret);
136 result.dptr = (uint8_t *)&ret;
137 srvid_request_reply(ctdb, request, result);
140 /* An abstraction to allow an operation (takeover runs, recoveries,
141 * ...) to be disabled for a given timeout */
142 struct ctdb_op_state {
143 struct tevent_timer *timer;
144 bool in_progress;
145 const char *name;
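/*
 * Typical lifecycle: ctdb_op_init() creates the state, ctdb_op_begin()
 * and ctdb_op_end() bracket one run of the operation, and
 * ctdb_op_disable() arms a timer that keeps ctdb_op_is_disabled()
 * returning true until the timeout re-enables the operation.
 */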
148 static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
150 struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
152 if (state != NULL) {
153 state->in_progress = false;
154 state->name = name;
157 return state;
160 static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
162 return state->timer != NULL;
165 static bool ctdb_op_begin(struct ctdb_op_state *state)
167 if (ctdb_op_is_disabled(state)) {
168 DEBUG(DEBUG_NOTICE,
169 ("Unable to begin - %s are disabled\n", state->name));
170 return false;
173 state->in_progress = true;
174 return true;
177 static bool ctdb_op_end(struct ctdb_op_state *state)
179 return state->in_progress = false;
182 static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
184 return state->in_progress;
187 static void ctdb_op_enable(struct ctdb_op_state *state)
189 TALLOC_FREE(state->timer);
192 static void ctdb_op_timeout_handler(struct tevent_context *ev,
193 struct tevent_timer *te,
194 struct timeval yt, void *p)
196 struct ctdb_op_state *state =
197 talloc_get_type(p, struct ctdb_op_state);
199 DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
200 ctdb_op_enable(state);
203 static int ctdb_op_disable(struct ctdb_op_state *state,
204 struct tevent_context *ev,
205 uint32_t timeout)
207 if (timeout == 0) {
208 DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
209 ctdb_op_enable(state);
210 return 0;
213 if (state->in_progress) {
214 DEBUG(DEBUG_ERR,
215 ("Unable to disable %s - in progress\n", state->name));
216 return -EAGAIN;
219 DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
220 state->name, timeout));
222 /* Clear any old timers */
223 talloc_free(state->timer);
225 /* Arrange for the timeout to occur */
226 state->timer = tevent_add_timer(ev, state,
227 timeval_current_ofs(timeout, 0),
228 ctdb_op_timeout_handler, state);
229 if (state->timer == NULL) {
230 DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
231 return -ENOMEM;
234 return 0;
237 struct ctdb_banning_state {
238 uint32_t count;
239 struct timeval last_reported_time;
242 struct ctdb_recovery_lock_handle;
245 private state of recovery daemon
247 struct ctdb_recoverd {
248 struct ctdb_context *ctdb;
249 uint32_t recmaster;
250 uint32_t last_culprit_node;
251 struct ctdb_node_map_old *nodemap;
252 struct timeval priority_time;
253 bool need_takeover_run;
254 bool need_recovery;
255 uint32_t node_flags;
256 struct tevent_timer *send_election_te;
257 struct tevent_timer *election_timeout;
258 struct srvid_requests *reallocate_requests;
259 struct ctdb_op_state *takeover_run;
260 struct ctdb_op_state *recovery;
261 struct ctdb_iface_list_old *ifaces;
262 uint32_t *force_rebalance_nodes;
263 struct ctdb_node_capabilities *caps;
264 bool frozen_on_inactive;
265 struct ctdb_recovery_lock_handle *recovery_lock_handle;
268 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
269 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
271 static void ctdb_restart_recd(struct tevent_context *ev,
272 struct tevent_timer *te, struct timeval t,
273 void *private_data);
276 ban a node for a period of time
278 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
280 int ret;
281 struct ctdb_context *ctdb = rec->ctdb;
282 struct ctdb_ban_state bantime;
284 if (!ctdb_validate_pnn(ctdb, pnn)) {
285 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
286 return;
289 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
291 bantime.pnn = pnn;
292 bantime.time = ban_time;
294 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
295 if (ret != 0) {
296 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
297 return;
302 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
306 remember the trouble maker
308 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
310 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
311 struct ctdb_banning_state *ban_state;
313 if (culprit >= ctdb->num_nodes) {
314 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
315 return;
318 /* If we are banned or stopped, do not set other nodes as culprits */
319 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
320 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
321 return;
324 if (ctdb->nodes[culprit]->ban_state == NULL) {
325 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
326 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
330 ban_state = ctdb->nodes[culprit]->ban_state;
331 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
332 /* this was the first time in a long while this node
333 misbehaved so we will forgive any old transgressions.
335 ban_state->count = 0;
338 ban_state->count += count;
339 ban_state->last_reported_time = timeval_current();
340 rec->last_culprit_node = culprit;
344 remember the trouble maker
346 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
348 ctdb_set_culprit_count(rec, culprit, 1);
352 Retrieve capabilities from all connected nodes
354 static int update_capabilities(struct ctdb_recoverd *rec,
355 struct ctdb_node_map_old *nodemap)
357 uint32_t *capp;
358 TALLOC_CTX *tmp_ctx;
359 struct ctdb_node_capabilities *caps;
360 struct ctdb_context *ctdb = rec->ctdb;
362 tmp_ctx = talloc_new(rec);
363 CTDB_NO_MEMORY(ctdb, tmp_ctx);
365 caps = ctdb_get_capabilities(ctdb, tmp_ctx,
366 CONTROL_TIMEOUT(), nodemap);
368 if (caps == NULL) {
369 DEBUG(DEBUG_ERR,
370 (__location__ " Failed to get node capabilities\n"));
371 talloc_free(tmp_ctx);
372 return -1;
375 capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
376 if (capp == NULL) {
377 DEBUG(DEBUG_ERR,
378 (__location__
379 " Capabilities don't include current node.\n"));
380 talloc_free(tmp_ctx);
381 return -1;
383 ctdb->capabilities = *capp;
385 TALLOC_FREE(rec->caps);
386 rec->caps = talloc_steal(rec, caps);
388 talloc_free(tmp_ctx);
389 return 0;
393 change recovery mode on all nodes
395 static int set_recovery_mode(struct ctdb_context *ctdb,
396 struct ctdb_recoverd *rec,
397 struct ctdb_node_map_old *nodemap,
398 uint32_t rec_mode)
400 TDB_DATA data;
401 uint32_t *nodes;
402 TALLOC_CTX *tmp_ctx;
404 tmp_ctx = talloc_new(ctdb);
405 CTDB_NO_MEMORY(ctdb, tmp_ctx);
407 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
409 data.dsize = sizeof(uint32_t);
410 data.dptr = (unsigned char *)&rec_mode;
412 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
413 nodes, 0,
414 CONTROL_TIMEOUT(),
415 false, data,
416 NULL, NULL,
417 NULL) != 0) {
418 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
419 talloc_free(tmp_ctx);
420 return -1;
423 talloc_free(tmp_ctx);
424 return 0;
428 * Update flags on all connected nodes
430 static int update_flags_on_all_nodes(struct ctdb_context *ctdb,
431 uint32_t pnn,
432 uint32_t flags)
434 int ret;
436 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
437 if (ret != 0) {
438 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
439 return -1;
442 return 0;
446 called when ctdb_wait_timeout should finish
448 static void ctdb_wait_handler(struct tevent_context *ev,
449 struct tevent_timer *te,
450 struct timeval yt, void *p)
452 uint32_t *timed_out = (uint32_t *)p;
453 (*timed_out) = 1;
457 wait for a given number of seconds
459 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
461 uint32_t timed_out = 0;
462 time_t usecs = (secs - (time_t)secs) * 1000000;
463 tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
464 ctdb_wait_handler, &timed_out);
465 while (!timed_out) {
466 tevent_loop_once(ctdb->ev);
471 called when an election times out (ends)
473 static void ctdb_election_timeout(struct tevent_context *ev,
474 struct tevent_timer *te,
475 struct timeval t, void *p)
477 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
478 rec->election_timeout = NULL;
479 fast_start = false;
481 D_WARNING("Election period ended, master=%u\n", rec->recmaster);
486 wait for an election to finish. It finishes election_timeout seconds after
487 the last election packet is received
489 static void ctdb_wait_election(struct ctdb_recoverd *rec)
491 struct ctdb_context *ctdb = rec->ctdb;
492 while (rec->election_timeout) {
493 tevent_loop_once(ctdb->ev);
498 Update our local flags from all remote connected nodes.
499 This is only run when we are, or we believe we are, the recovery master
501 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
503 unsigned int j;
504 struct ctdb_context *ctdb = rec->ctdb;
505 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
507 /* get the nodemap for all active remote nodes and verify
508 they are the same as for this node
510 for (j=0; j<nodemap->num; j++) {
511 struct ctdb_node_map_old *remote_nodemap=NULL;
512 int ret;
514 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
515 continue;
517 if (nodemap->nodes[j].pnn == ctdb->pnn) {
518 continue;
521 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
522 mem_ctx, &remote_nodemap);
523 if (ret != 0) {
524 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
525 nodemap->nodes[j].pnn));
526 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
527 talloc_free(mem_ctx);
528 return -1;
530 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
531 /* We should tell our daemon about this so it
532 updates its flags or else we will log the same
533 message again in the next iteration of recovery.
534 Since we are the recovery master we can just as
535 well update the flags on all nodes.
537 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
538 if (ret != 0) {
539 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
540 return -1;
543 /* Update our local copy of the flags in the recovery
544 daemon.
546 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
547 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
548 nodemap->nodes[j].flags));
549 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
551 talloc_free(remote_nodemap);
553 talloc_free(mem_ctx);
554 return 0;
558 /* Create a new random generation id.
559 The generation id can not be the INVALID_GENERATION id
561 static uint32_t new_generation(void)
563 uint32_t generation;
565 while (1) {
566 generation = random();
568 if (generation != INVALID_GENERATION) {
569 break;
573 return generation;
576 static bool ctdb_recovery_have_lock(struct ctdb_recoverd *rec)
578 return (rec->recovery_lock_handle != NULL);
581 struct ctdb_recovery_lock_handle {
582 bool done;
583 bool locked;
584 double latency;
585 struct ctdb_cluster_mutex_handle *h;
586 struct ctdb_recoverd *rec;
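/*
 * Callback from the cluster mutex helper.  Status '0' means the
 * recovery lock was taken, '1' means contention and '2' means timeout;
 * an unknown status additionally causes this node to ban itself.
 */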
589 static void take_reclock_handler(char status,
590 double latency,
591 void *private_data)
593 struct ctdb_recovery_lock_handle *s =
594 (struct ctdb_recovery_lock_handle *) private_data;
596 s->locked = (status == '0') ;
599 * If unsuccessful then ensure the process has exited and that
600 * the file descriptor event handler has been cancelled
602 if (! s->locked) {
603 TALLOC_FREE(s->h);
606 switch (status) {
607 case '0':
608 s->latency = latency;
609 break;
611 case '1':
612 D_ERR("Unable to take recovery lock - contention\n");
613 break;
615 case '2':
616 D_ERR("Unable to take recovery lock - timeout\n");
617 break;
619 default:
620 D_ERR("Unable to take recover lock - unknown error\n");
623 struct ctdb_recoverd *rec = s->rec;
624 struct ctdb_context *ctdb = rec->ctdb;
625 uint32_t pnn = ctdb_get_pnn(ctdb);
627 D_ERR("Banning this node\n");
628 ctdb_ban_node(rec,
629 pnn,
630 ctdb->tunable.recovery_ban_period);
634 s->done = true;
637 static void force_election(struct ctdb_recoverd *rec,
638 uint32_t pnn,
639 struct ctdb_node_map_old *nodemap);
641 static void lost_reclock_handler(void *private_data)
643 struct ctdb_recoverd *rec = talloc_get_type_abort(
644 private_data, struct ctdb_recoverd);
646 D_ERR("Recovery lock helper terminated, triggering an election\n");
647 TALLOC_FREE(rec->recovery_lock_handle);
649 force_election(rec, ctdb_get_pnn(rec->ctdb), rec->nodemap);
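/*
 * Attempt to take the recovery lock via the cluster mutex helper.
 * This spins the event loop until the attempt completes and, on
 * success, reports the lock latency to the main daemon.
 */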
652 static bool ctdb_recovery_lock(struct ctdb_recoverd *rec)
654 struct ctdb_context *ctdb = rec->ctdb;
655 struct ctdb_cluster_mutex_handle *h;
656 struct ctdb_recovery_lock_handle *s;
658 s = talloc_zero(rec, struct ctdb_recovery_lock_handle);
659 if (s == NULL) {
660 DBG_ERR("Memory allocation error\n");
661 return false;
664 s->rec = rec;
666 h = ctdb_cluster_mutex(s,
667 ctdb,
668 ctdb->recovery_lock,
669 120,
670 take_reclock_handler,
671 s,
672 lost_reclock_handler,
673 rec);
674 if (h == NULL) {
675 talloc_free(s);
676 return false;
679 rec->recovery_lock_handle = s;
680 s->h = h;
682 while (! s->done) {
683 tevent_loop_once(ctdb->ev);
686 if (! s->locked) {
687 TALLOC_FREE(rec->recovery_lock_handle);
688 return false;
691 ctdb_ctrl_report_recd_lock_latency(ctdb,
692 CONTROL_TIMEOUT(),
693 s->latency);
695 return true;
698 static void ctdb_recovery_unlock(struct ctdb_recoverd *rec)
700 if (rec->recovery_lock_handle == NULL) {
701 return;
704 if (! rec->recovery_lock_handle->done) {
706 * Taking of recovery lock still in progress. Free
707 * the cluster mutex handle to release it but leave
708 * the recovery lock handle in place to allow taking
709 * of the lock to fail.
711 D_NOTICE("Cancelling recovery lock\n");
712 TALLOC_FREE(rec->recovery_lock_handle->h);
713 rec->recovery_lock_handle->done = true;
714 rec->recovery_lock_handle->locked = false;
715 return;
718 D_NOTICE("Releasing recovery lock\n");
719 TALLOC_FREE(rec->recovery_lock_handle);
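/*
 * Ban every node that has accumulated at least 2 * num_nodes banning
 * credits and reset its credit count.  *self_ban is set to true if
 * this node banned itself, so the caller can abort recovery.
 */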
722 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
724 struct ctdb_context *ctdb = rec->ctdb;
725 unsigned int i;
726 struct ctdb_banning_state *ban_state;
728 *self_ban = false;
729 for (i=0; i<ctdb->num_nodes; i++) {
730 if (ctdb->nodes[i]->ban_state == NULL) {
731 continue;
733 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
734 if (ban_state->count < 2*ctdb->num_nodes) {
735 continue;
738 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
739 ctdb->nodes[i]->pnn, ban_state->count,
740 ctdb->tunable.recovery_ban_period));
741 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
742 ban_state->count = 0;
744 /* Banning ourself? */
745 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
746 *self_ban = true;
751 struct helper_state {
752 int fd[2];
753 pid_t pid;
754 int result;
755 bool done;
758 static void helper_handler(struct tevent_context *ev,
759 struct tevent_fd *fde,
760 uint16_t flags, void *private_data)
762 struct helper_state *state = talloc_get_type_abort(
763 private_data, struct helper_state);
764 int ret;
766 ret = sys_read(state->fd[0], &state->result, sizeof(state->result));
767 if (ret != sizeof(state->result)) {
768 state->result = EPIPE;
771 state->done = true;
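/*
 * Fork an external helper and wait for it to write its result to a
 * pipe.  The event loop keeps running while waiting; if the recovery
 * master changes in the meantime the helper run is abandoned and
 * treated as a failure.
 */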
774 static int helper_run(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
775 const char *prog, const char *arg, const char *type)
777 struct helper_state *state;
778 struct tevent_fd *fde;
779 const char **args;
780 int nargs, ret;
781 uint32_t recmaster = rec->recmaster;
783 state = talloc_zero(mem_ctx, struct helper_state);
784 if (state == NULL) {
785 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
786 return -1;
789 state->pid = -1;
791 ret = pipe(state->fd);
792 if (ret != 0) {
793 DEBUG(DEBUG_ERR,
794 ("Failed to create pipe for %s helper\n", type));
795 goto fail;
798 set_close_on_exec(state->fd[0]);
800 nargs = 4;
801 args = talloc_array(state, const char *, nargs);
802 if (args == NULL) {
803 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
804 goto fail;
807 args[0] = talloc_asprintf(args, "%d", state->fd[1]);
808 if (args[0] == NULL) {
809 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
810 goto fail;
812 args[1] = rec->ctdb->daemon.name;
813 args[2] = arg;
814 args[3] = NULL;
816 if (args[2] == NULL) {
817 nargs = 3;
820 state->pid = ctdb_vfork_exec(state, rec->ctdb, prog, nargs, args);
821 if (state->pid == -1) {
822 DEBUG(DEBUG_ERR,
823 ("Failed to create child for %s helper\n", type));
824 goto fail;
827 close(state->fd[1]);
828 state->fd[1] = -1;
830 state->done = false;
832 fde = tevent_add_fd(rec->ctdb->ev, state, state->fd[0],
833 TEVENT_FD_READ, helper_handler, state);
834 if (fde == NULL) {
835 goto fail;
837 tevent_fd_set_auto_close(fde);
839 while (!state->done) {
840 tevent_loop_once(rec->ctdb->ev);
842 /* If recmaster changes, we have lost election */
843 if (recmaster != rec->recmaster) {
844 D_ERR("Recmaster changed to %u, aborting %s\n",
845 rec->recmaster, type);
846 state->result = 1;
847 break;
851 close(state->fd[0]);
852 state->fd[0] = -1;
854 if (state->result != 0) {
855 goto fail;
858 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
859 talloc_free(state);
860 return 0;
862 fail:
863 if (state->fd[0] != -1) {
864 close(state->fd[0]);
866 if (state->fd[1] != -1) {
867 close(state->fd[1]);
869 if (state->pid != -1) {
870 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
872 talloc_free(state);
873 return -1;
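/*
 * Run the takeover helper (ctdb_takeover_helper), passing it an
 * optional comma-separated list of nodes whose IPs should be force
 * rebalanced, and exporting CTDB_DISABLE_IP_FAILOVER when failover is
 * disabled in the configuration.
 */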
877 static int ctdb_takeover(struct ctdb_recoverd *rec,
878 uint32_t *force_rebalance_nodes)
880 static char prog[PATH_MAX+1] = "";
881 char *arg;
882 unsigned int i;
883 int ret;
885 if (!ctdb_set_helper("takeover_helper", prog, sizeof(prog),
886 "CTDB_TAKEOVER_HELPER", CTDB_HELPER_BINDIR,
887 "ctdb_takeover_helper")) {
888 ctdb_die(rec->ctdb, "Unable to set takeover helper\n");
891 arg = NULL;
892 for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
893 uint32_t pnn = force_rebalance_nodes[i];
894 if (arg == NULL) {
895 arg = talloc_asprintf(rec, "%u", pnn);
896 } else {
897 arg = talloc_asprintf_append(arg, ",%u", pnn);
899 if (arg == NULL) {
900 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
901 return -1;
905 if (ctdb_config.failover_disabled) {
906 ret = setenv("CTDB_DISABLE_IP_FAILOVER", "1", 1);
907 if (ret != 0) {
908 D_ERR("Failed to set CTDB_DISABLE_IP_FAILOVER variable\n");
909 return -1;
913 return helper_run(rec, rec, prog, arg, "takeover");
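/*
 * Perform a takeover run (public IP reallocation).  Takeover runs are
 * temporarily disabled on the other connected nodes for the duration
 * of the run and re-enabled afterwards; on success any pending force
 * rebalance targets are cleared.
 */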
916 static bool do_takeover_run(struct ctdb_recoverd *rec,
917 struct ctdb_node_map_old *nodemap)
919 uint32_t *nodes = NULL;
920 struct ctdb_disable_message dtr;
921 TDB_DATA data;
922 size_t i;
923 uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
924 int ret;
925 bool ok;
927 DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
929 if (ctdb_op_is_in_progress(rec->takeover_run)) {
930 DEBUG(DEBUG_ERR, (__location__
931 " takeover run already in progress \n"));
932 ok = false;
933 goto done;
936 if (!ctdb_op_begin(rec->takeover_run)) {
937 ok = false;
938 goto done;
941 /* Disable IP checks (takeover runs, really) on other nodes
942 * while doing this takeover run. This will stop those other
943 nodes from triggering takeover runs when they think they should
944 * be hosting an IP but it isn't yet on an interface. Don't
945 * wait for replies since a failure here might cause some
946 * noise in the logs but will not actually cause a problem.
948 ZERO_STRUCT(dtr);
949 dtr.srvid = 0; /* No reply */
950 dtr.pnn = -1;
952 data.dptr = (uint8_t*)&dtr;
953 data.dsize = sizeof(dtr);
955 nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
957 /* Disable for 60 seconds. This can be a tunable later if
958 * necessary.
960 dtr.timeout = 60;
961 for (i = 0; i < talloc_array_length(nodes); i++) {
962 if (ctdb_client_send_message(rec->ctdb, nodes[i],
963 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
964 data) != 0) {
965 DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
969 ret = ctdb_takeover(rec, rec->force_rebalance_nodes);
971 /* Reenable takeover runs and IP checks on other nodes */
972 dtr.timeout = 0;
973 for (i = 0; i < talloc_array_length(nodes); i++) {
974 if (ctdb_client_send_message(rec->ctdb, nodes[i],
975 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
976 data) != 0) {
977 DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n"));
981 if (ret != 0) {
982 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
983 ok = false;
984 goto done;
987 ok = true;
988 /* Takeover run was successful so clear force rebalance targets */
989 if (rebalance_nodes == rec->force_rebalance_nodes) {
990 TALLOC_FREE(rec->force_rebalance_nodes);
991 } else {
992 DEBUG(DEBUG_WARNING,
993 ("Rebalance target nodes changed during takeover run - not clearing\n"));
995 done:
996 rec->need_takeover_run = !ok;
997 talloc_free(nodes);
998 ctdb_op_end(rec->takeover_run);
1000 DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
1001 return ok;
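/*
 * Run the database recovery helper (ctdb_recovery_helper) with a
 * freshly generated generation id as its argument.
 */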
1004 static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
1006 static char prog[PATH_MAX+1] = "";
1007 const char *arg;
1009 if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog),
1010 "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
1011 "ctdb_recovery_helper")) {
1012 ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
1015 arg = talloc_asprintf(mem_ctx, "%u", new_generation());
1016 if (arg == NULL) {
1017 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1018 return -1;
1021 setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1);
1023 return helper_run(rec, mem_ctx, prog, arg, "recovery");
1027 we are the recmaster, and recovery is needed - start a recovery run
1029 static int do_recovery(struct ctdb_recoverd *rec,
1030 TALLOC_CTX *mem_ctx, uint32_t pnn,
1031 struct ctdb_node_map_old *nodemap, struct ctdb_vnn_map *vnnmap)
1033 struct ctdb_context *ctdb = rec->ctdb;
1034 unsigned int i;
1035 int ret;
1036 bool self_ban;
1038 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1040 /* Check if the current node is still the recmaster. It's possible that
1041 * re-election has changed the recmaster.
1043 if (pnn != rec->recmaster) {
1044 DEBUG(DEBUG_NOTICE,
1045 ("Recovery master changed to %u, aborting recovery\n",
1046 rec->recmaster));
1047 return -1;
1050 /* if recovery fails, force it again */
1051 rec->need_recovery = true;
1053 if (!ctdb_op_begin(rec->recovery)) {
1054 return -1;
1057 if (rec->election_timeout) {
1058 /* an election is in progress */
1059 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
1060 goto fail;
1063 ban_misbehaving_nodes(rec, &self_ban);
1064 if (self_ban) {
1065 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
1066 goto fail;
1069 if (ctdb->recovery_lock != NULL) {
1070 if (ctdb_recovery_have_lock(rec)) {
1071 D_NOTICE("Already holding recovery lock\n");
1072 } else {
1073 bool ok;
1075 D_NOTICE("Attempting to take recovery lock (%s)\n",
1076 ctdb->recovery_lock);
1078 ok = ctdb_recovery_lock(rec);
1079 if (! ok) {
1080 D_ERR("Unable to take recovery lock\n");
1082 if (pnn != rec->recmaster) {
1083 D_NOTICE("Recovery master changed to %u,"
1084 " aborting recovery\n",
1085 rec->recmaster);
1086 rec->need_recovery = false;
1087 goto fail;
1090 if (ctdb->runstate ==
1091 CTDB_RUNSTATE_FIRST_RECOVERY) {
1093 * First recovery? Perhaps
1094 * current node does not yet
1095 * know who the recmaster is.
1097 D_ERR("Retrying recovery\n");
1098 goto fail;
1101 D_ERR("Abort recovery, "
1102 "ban this node for %u seconds\n",
1103 ctdb->tunable.recovery_ban_period);
1104 ctdb_ban_node(rec,
1105 pnn,
1106 ctdb->tunable.recovery_ban_period);
1107 goto fail;
1109 D_NOTICE("Recovery lock taken successfully\n");
1113 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1115 /* Retrieve capabilities from all connected nodes */
1116 ret = update_capabilities(rec, nodemap);
1117 if (ret!=0) {
1118 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1119 return -1;
1123 update all nodes to have the same flags that we have
1125 for (i=0;i<nodemap->num;i++) {
1126 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1127 continue;
1130 ret = update_flags_on_all_nodes(ctdb,
1131 nodemap->nodes[i].pnn,
1132 nodemap->nodes[i].flags);
1133 if (ret != 0) {
1134 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1135 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
1136 } else {
1137 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1138 return -1;
1143 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1145 ret = db_recovery_parallel(rec, mem_ctx);
1146 if (ret != 0) {
1147 goto fail;
1150 do_takeover_run(rec, nodemap);
1152 /* send a message to all clients telling them that the cluster
1153 has been reconfigured */
1154 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
1155 CTDB_SRVID_RECONFIGURE, tdb_null);
1156 if (ret != 0) {
1157 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
1158 goto fail;
1161 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1163 rec->need_recovery = false;
1164 ctdb_op_end(rec->recovery);
1166 /* we managed to complete a full recovery, make sure to forgive
1167 any past sins by the nodes that could now participate in the
1168 recovery.
1170 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1171 for (i=0;i<nodemap->num;i++) {
1172 struct ctdb_banning_state *ban_state;
1174 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1175 continue;
1178 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1179 if (ban_state == NULL) {
1180 continue;
1183 ban_state->count = 0;
1186 /* We just finished a recovery successfully.
1187 We now wait for rerecovery_timeout before we allow
1188 another recovery to take place.
1190 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
1191 ctdb_op_disable(rec->recovery, ctdb->ev,
1192 ctdb->tunable.rerecovery_timeout);
1193 return 0;
1195 fail:
1196 ctdb_op_end(rec->recovery);
1197 return -1;
1202 elections are won by first checking the number of connected nodes, then
1203 the priority time, then the pnn
1205 struct election_message {
1206 uint32_t num_connected;
1207 struct timeval priority_time;
1208 uint32_t pnn;
1209 uint32_t node_flags;
1213 form this node's election data
1215 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1217 unsigned int i;
1218 int ret;
1219 struct ctdb_node_map_old *nodemap;
1220 struct ctdb_context *ctdb = rec->ctdb;
1222 ZERO_STRUCTP(em);
1224 em->pnn = rec->ctdb->pnn;
1225 em->priority_time = rec->priority_time;
1227 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1228 if (ret != 0) {
1229 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
1230 return;
1233 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1234 em->node_flags = rec->node_flags;
1236 for (i=0;i<nodemap->num;i++) {
1237 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1238 em->num_connected++;
1242 /* we shouldn't try to win this election if we can't be a recmaster */
1243 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1244 em->num_connected = 0;
1245 em->priority_time = timeval_current();
1248 talloc_free(nodemap);
1252 see if the given election data wins
1254 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1256 struct election_message myem;
1257 int cmp = 0;
1259 ctdb_election_data(rec, &myem);
1261 /* we can't win if we don't have the recmaster capability */
1262 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1263 return false;
1266 /* we can't win if we are banned */
1267 if (rec->node_flags & NODE_FLAGS_BANNED) {
1268 return false;
1271 /* we can't win if we are stopped */
1272 if (rec->node_flags & NODE_FLAGS_STOPPED) {
1273 return false;
1276 /* we will automatically win if the other node is banned */
1277 if (em->node_flags & NODE_FLAGS_BANNED) {
1278 return true;
1281 /* we will automatically win if the other node is stopped */
1282 if (em->node_flags & NODE_FLAGS_STOPPED) {
1283 return true;
1286 /* then the longest running node */
1287 if (cmp == 0) {
1288 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
1291 if (cmp == 0) {
1292 cmp = (int)myem.pnn - (int)em->pnn;
1295 return cmp > 0;
1299 send out an election request
1301 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
1303 int ret;
1304 TDB_DATA election_data;
1305 struct election_message emsg;
1306 uint64_t srvid;
1307 struct ctdb_context *ctdb = rec->ctdb;
1309 srvid = CTDB_SRVID_ELECTION;
1311 ctdb_election_data(rec, &emsg);
1313 election_data.dsize = sizeof(struct election_message);
1314 election_data.dptr = (unsigned char *)&emsg;
1317 /* first we assume we will win the election and set
1318 recoverymaster to be ourself on the current node
1320 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1321 CTDB_CURRENT_NODE, pnn);
1322 if (ret != 0) {
1323 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster\n"));
1324 return -1;
1326 rec->recmaster = pnn;
1328 /* send an election message to all active nodes */
1329 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1330 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1334 we think we are winning the election - send a broadcast election request
1336 static void election_send_request(struct tevent_context *ev,
1337 struct tevent_timer *te,
1338 struct timeval t, void *p)
1340 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1341 int ret;
1343 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
1344 if (ret != 0) {
1345 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
1348 TALLOC_FREE(rec->send_election_te);
1352 handler for memory dumps
1354 static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1356 struct ctdb_recoverd *rec = talloc_get_type(
1357 private_data, struct ctdb_recoverd);
1358 struct ctdb_context *ctdb = rec->ctdb;
1359 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1360 TDB_DATA *dump;
1361 int ret;
1362 struct ctdb_srvid_message *rd;
1364 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1365 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1366 talloc_free(tmp_ctx);
1367 return;
1369 rd = (struct ctdb_srvid_message *)data.dptr;
1371 dump = talloc_zero(tmp_ctx, TDB_DATA);
1372 if (dump == NULL) {
1373 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1374 talloc_free(tmp_ctx);
1375 return;
1377 ret = ctdb_dump_memory(ctdb, dump);
1378 if (ret != 0) {
1379 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1380 talloc_free(tmp_ctx);
1381 return;
1384 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
1386 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1387 if (ret != 0) {
1388 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1389 talloc_free(tmp_ctx);
1390 return;
1393 talloc_free(tmp_ctx);
1397 handler for reload_nodes
1399 static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
1400 void *private_data)
1402 struct ctdb_recoverd *rec = talloc_get_type(
1403 private_data, struct ctdb_recoverd);
1405 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1407 ctdb_load_nodes_file(rec->ctdb);
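/*
 * Handler for node rebalance requests: remember the requested PNN so
 * that its IPs are force rebalanced in the next takeover run.  Only
 * the recovery master acts on these messages.
 */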
1411 static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
1412 void *private_data)
1414 struct ctdb_recoverd *rec = talloc_get_type(
1415 private_data, struct ctdb_recoverd);
1416 struct ctdb_context *ctdb = rec->ctdb;
1417 uint32_t pnn;
1418 uint32_t *t;
1419 int len;
1421 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
1422 return;
1425 if (data.dsize != sizeof(uint32_t)) {
1426 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
1427 return;
1430 pnn = *(uint32_t *)&data.dptr[0];
1432 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
1434 /* Copy any existing list of nodes. There's probably some
1435 * sort of realloc variant that will do this but we need to
1436 * make sure that freeing the old array also cancels the timer
1437 * event for the timeout... not sure if realloc will do that.
1439 len = (rec->force_rebalance_nodes != NULL) ?
1440 talloc_array_length(rec->force_rebalance_nodes) :
1441 0;
1443 /* This allows duplicates to be added but they don't cause
1444 * harm. A call to add a duplicate PNN arguably means that
1445 * the timeout should be reset, so this is the simplest
1446 * solution.
1448 t = talloc_zero_array(rec, uint32_t, len+1);
1449 CTDB_NO_MEMORY_VOID(ctdb, t);
1450 if (len > 0) {
1451 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
1453 t[len] = pnn;
1455 talloc_free(rec->force_rebalance_nodes);
1457 rec->force_rebalance_nodes = t;
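/*
 * Common helper for "disable takeover runs" and "disable recoveries"
 * messages: disable the operation for the requested timeout and reply
 * with this node's PNN on success, or a negative error code on failure.
 */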
1462 static void srvid_disable_and_reply(struct ctdb_context *ctdb,
1463 TDB_DATA data,
1464 struct ctdb_op_state *op_state)
1466 struct ctdb_disable_message *r;
1467 uint32_t timeout;
1468 TDB_DATA result;
1469 int32_t ret = 0;
1471 /* Validate input data */
1472 if (data.dsize != sizeof(struct ctdb_disable_message)) {
1473 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1474 "expecting %lu\n", (long unsigned)data.dsize,
1475 (long unsigned)sizeof(struct ctdb_disable_message)));
1476 return;
1478 if (data.dptr == NULL) {
1479 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1480 return;
1483 r = (struct ctdb_disable_message *)data.dptr;
1484 timeout = r->timeout;
1486 ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
1487 if (ret != 0) {
1488 goto done;
1491 /* Returning our PNN tells the caller that we succeeded */
1492 ret = ctdb_get_pnn(ctdb);
1493 done:
1494 result.dsize = sizeof(int32_t);
1495 result.dptr = (uint8_t *)&ret;
1496 srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
1499 static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
1500 void *private_data)
1502 struct ctdb_recoverd *rec = talloc_get_type(
1503 private_data, struct ctdb_recoverd);
1505 srvid_disable_and_reply(rec->ctdb, data, rec->takeover_run);
1508 /* Backward compatibility for this SRVID */
1509 static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
1510 void *private_data)
1512 struct ctdb_recoverd *rec = talloc_get_type(
1513 private_data, struct ctdb_recoverd);
1514 uint32_t timeout;
1516 if (data.dsize != sizeof(uint32_t)) {
1517 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1518 "expecting %lu\n", (long unsigned)data.dsize,
1519 (long unsigned)sizeof(uint32_t)));
1520 return;
1522 if (data.dptr == NULL) {
1523 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1524 return;
1527 timeout = *((uint32_t *)data.dptr);
1529 ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout);
1532 static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
1533 void *private_data)
1535 struct ctdb_recoverd *rec = talloc_get_type(
1536 private_data, struct ctdb_recoverd);
1538 srvid_disable_and_reply(rec->ctdb, data, rec->recovery);
1542 handler for ip reallocate, just add it to the list of requests and
1543 handle this later in the monitor_cluster loop so we do not recurse
1544 with other requests to takeover_run()
1546 static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
1547 void *private_data)
1549 struct ctdb_srvid_message *request;
1550 struct ctdb_recoverd *rec = talloc_get_type(
1551 private_data, struct ctdb_recoverd);
1553 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1554 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1555 return;
1558 request = (struct ctdb_srvid_message *)data.dptr;
1560 srvid_request_add(rec->ctdb, &rec->reallocate_requests, request);
1563 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
1564 struct ctdb_recoverd *rec)
1566 TDB_DATA result;
1567 int32_t ret;
1568 struct srvid_requests *current;
1570 /* Only process requests that are currently pending. More
1571 * might come in while the takeover run is in progress and
1572 * they will need to be processed later since they might
1573 be in response to flag changes.
1575 current = rec->reallocate_requests;
1576 rec->reallocate_requests = NULL;
1578 if (do_takeover_run(rec, rec->nodemap)) {
1579 ret = ctdb_get_pnn(ctdb);
1580 } else {
1581 ret = -1;
1584 result.dsize = sizeof(int32_t);
1585 result.dptr = (uint8_t *)&ret;
1587 srvid_requests_reply(ctdb, &current, result);
1591 * handler for assigning banning credits
1593 static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1595 struct ctdb_recoverd *rec = talloc_get_type(
1596 private_data, struct ctdb_recoverd);
1597 uint32_t ban_pnn;
1599 /* Ignore if we are not recmaster */
1600 if (rec->ctdb->pnn != rec->recmaster) {
1601 return;
1604 if (data.dsize != sizeof(uint32_t)) {
1605 DEBUG(DEBUG_ERR, (__location__ "invalid data size %zu\n",
1606 data.dsize));
1607 return;
1610 ban_pnn = *(uint32_t *)data.dptr;
1612 ctdb_set_culprit_count(rec, ban_pnn, rec->nodemap->num);
1616 handler for recovery master elections
1618 static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1620 struct ctdb_recoverd *rec = talloc_get_type(
1621 private_data, struct ctdb_recoverd);
1622 struct ctdb_context *ctdb = rec->ctdb;
1623 int ret;
1624 struct election_message *em = (struct election_message *)data.dptr;
1626 /* Ignore election packets from ourself */
1627 if (ctdb->pnn == em->pnn) {
1628 return;
1631 /* we got an election packet - update the timeout for the election */
1632 talloc_free(rec->election_timeout);
1633 rec->election_timeout = tevent_add_timer(
1634 ctdb->ev, ctdb,
1635 fast_start ?
1636 timeval_current_ofs(0, 500000) :
1637 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1638 ctdb_election_timeout, rec);
1640 /* someone called an election. check their election data
1641 and if we disagree and we would rather be the elected node,
1642 send a new election message to all other nodes
1644 if (ctdb_election_win(rec, em)) {
1645 if (!rec->send_election_te) {
1646 rec->send_election_te = tevent_add_timer(
1647 ctdb->ev, rec,
1648 timeval_current_ofs(0, 500000),
1649 election_send_request, rec);
1651 return;
1654 /* we didn't win */
1655 TALLOC_FREE(rec->send_election_te);
1657 /* Release the recovery lock file */
1658 if (ctdb_recovery_have_lock(rec)) {
1659 ctdb_recovery_unlock(rec);
1662 /* ok, let that guy become recmaster then */
1663 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1664 CTDB_CURRENT_NODE, em->pnn);
1665 if (ret != 0) {
1666 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster"));
1667 return;
1669 rec->recmaster = em->pnn;
1671 return;
1676 force the start of the election process
1678 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
1679 struct ctdb_node_map_old *nodemap)
1681 int ret;
1682 struct ctdb_context *ctdb = rec->ctdb;
1684 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
1686 /* set all nodes to recovery mode to stop all internode traffic */
1687 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1688 if (ret != 0) {
1689 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1690 return;
1693 talloc_free(rec->election_timeout);
1694 rec->election_timeout = tevent_add_timer(
1695 ctdb->ev, ctdb,
1696 fast_start ?
1697 timeval_current_ofs(0, 500000) :
1698 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1699 ctdb_election_timeout, rec);
1701 ret = send_election_request(rec, pnn);
1702 if (ret!=0) {
1703 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
1704 return;
1707 /* wait for a few seconds to collect all responses */
1708 ctdb_wait_election(rec);
1714 handler for when a node changes its flags
1716 static void monitor_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1718 struct ctdb_recoverd *rec = talloc_get_type(
1719 private_data, struct ctdb_recoverd);
1720 struct ctdb_context *ctdb = rec->ctdb;
1721 int ret;
1722 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1723 struct ctdb_node_map_old *nodemap=NULL;
1724 TALLOC_CTX *tmp_ctx;
1725 unsigned int i;
1727 if (data.dsize != sizeof(*c)) {
1728 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
1729 return;
1732 tmp_ctx = talloc_new(ctdb);
1733 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
1735 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1736 if (ret != 0) {
1737 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
1738 talloc_free(tmp_ctx);
1739 return;
1743 for (i=0;i<nodemap->num;i++) {
1744 if (nodemap->nodes[i].pnn == c->pnn) break;
1747 if (i == nodemap->num) {
1748 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
1749 talloc_free(tmp_ctx);
1750 return;
1753 if (c->old_flags != c->new_flags) {
1754 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
1757 nodemap->nodes[i].flags = c->new_flags;
1759 talloc_free(tmp_ctx);
1763 handler for when we need to push out flag changes to all other nodes
1765 static void push_flags_handler(uint64_t srvid, TDB_DATA data,
1766 void *private_data)
1768 struct ctdb_recoverd *rec = talloc_get_type(
1769 private_data, struct ctdb_recoverd);
1770 struct ctdb_context *ctdb = rec->ctdb;
1771 int ret;
1772 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1773 struct ctdb_node_map_old *nodemap=NULL;
1774 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1775 uint32_t *nodes;
1777 /* read the node flags from the recmaster */
1778 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
1779 tmp_ctx, &nodemap);
1780 if (ret != 0) {
1781 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
1782 talloc_free(tmp_ctx);
1783 return;
1785 if (c->pnn >= nodemap->num) {
1786 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
1787 talloc_free(tmp_ctx);
1788 return;
1791 /* send the flags update to all connected nodes */
1792 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
1794 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
1795 nodes, 0, CONTROL_TIMEOUT(),
1796 false, data,
1797 NULL, NULL,
1798 NULL) != 0) {
1799 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
1801 talloc_free(tmp_ctx);
1802 return;
1805 talloc_free(tmp_ctx);
1809 struct verify_recmode_normal_data {
1810 uint32_t count;
1811 enum monitor_result status;
1814 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
1816 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
1819 /* one more node has responded with recmode data*/
1820 rmdata->count--;
1822 /* if we failed to get the recmode, then return an error and let
1823 the main loop try again.
1825 if (state->state != CTDB_CONTROL_DONE) {
1826 if (rmdata->status == MONITOR_OK) {
1827 rmdata->status = MONITOR_FAILED;
1829 return;
1832 /* if we got a response, then the recmode will be stored in the
1833 status field
1835 if (state->status != CTDB_RECOVERY_NORMAL) {
1836 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
1837 rmdata->status = MONITOR_RECOVERY_NEEDED;
1840 return;
1844 /* verify that all nodes are in normal recovery mode */
1845 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
1847 struct verify_recmode_normal_data *rmdata;
1848 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1849 struct ctdb_client_control_state *state;
1850 enum monitor_result status;
1851 unsigned int j;
1853 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
1854 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
1855 rmdata->count = 0;
1856 rmdata->status = MONITOR_OK;
1858 /* loop over all active nodes and send an async getrecmode call to
1859 them*/
1860 for (j=0; j<nodemap->num; j++) {
1861 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1862 continue;
1864 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
1865 CONTROL_TIMEOUT(),
1866 nodemap->nodes[j].pnn);
1867 if (state == NULL) {
1868 /* we failed to send the control, treat this as
1869 an error and try again next iteration
1871 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
1872 talloc_free(mem_ctx);
1873 return MONITOR_FAILED;
1876 /* set up the callback functions */
1877 state->async.fn = verify_recmode_normal_callback;
1878 state->async.private_data = rmdata;
1880 /* one more control to wait for to complete */
1881 rmdata->count++;
1885 /* now wait for up to the maximum number of seconds allowed
1886 or until all nodes we expect a response from have replied
1888 while (rmdata->count > 0) {
1889 tevent_loop_once(ctdb->ev);
1892 status = rmdata->status;
1893 talloc_free(mem_ctx);
1894 return status;
1898 struct verify_recmaster_data {
1899 struct ctdb_recoverd *rec;
1900 uint32_t count;
1901 uint32_t pnn;
1902 enum monitor_result status;
1905 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
1907 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
1910 /* one more node has responded with recmaster data*/
1911 rmdata->count--;
1913 /* if we failed to get the recmaster, then return an error and let
1914 the main loop try again.
1916 if (state->state != CTDB_CONTROL_DONE) {
1917 if (rmdata->status == MONITOR_OK) {
1918 rmdata->status = MONITOR_FAILED;
1920 return;
1923 /* if we got a response, then the recmaster will be stored in the
1924 status field
1926 if ((uint32_t)state->status != rmdata->pnn) {
1927 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
1928 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
1929 rmdata->status = MONITOR_ELECTION_NEEDED;
1932 return;
1936 /* verify that all nodes agree that we are the recmaster */
1937 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, uint32_t pnn)
1939 struct ctdb_context *ctdb = rec->ctdb;
1940 struct verify_recmaster_data *rmdata;
1941 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1942 struct ctdb_client_control_state *state;
1943 enum monitor_result status;
1944 unsigned int j;
1946 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
1947 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
1948 rmdata->rec = rec;
1949 rmdata->count = 0;
1950 rmdata->pnn = pnn;
1951 rmdata->status = MONITOR_OK;
1953 /* loop over all active nodes and send an async getrecmaster call to
1954 them*/
1955 for (j=0; j<nodemap->num; j++) {
1956 if (nodemap->nodes[j].pnn == rec->recmaster) {
1957 continue;
1959 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1960 continue;
1962 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
1963 CONTROL_TIMEOUT(),
1964 nodemap->nodes[j].pnn);
1965 if (state == NULL) {
1966 /* we failed to send the control, treat this as
1967 an error and try again next iteration
1969 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
1970 talloc_free(mem_ctx);
1971 return MONITOR_FAILED;
1974 /* set up the callback functions */
1975 state->async.fn = verify_recmaster_callback;
1976 state->async.private_data = rmdata;
1978 /* one more control to wait for to complete */
1979 rmdata->count++;
1983 /* now wait for up to the maximum number of seconds allowed
1984 or until all nodes we expect a response from have replied
1986 while (rmdata->count > 0) {
1987 tevent_loop_once(ctdb->ev);
1990 status = rmdata->status;
1991 talloc_free(mem_ctx);
1992 return status;
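/*
 * Compare the local interfaces (names and link states) against the set
 * seen on the previous iteration.  Any difference is reported as a
 * change, which callers use to trigger a takeover run.
 */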
1995 static bool interfaces_have_changed(struct ctdb_context *ctdb,
1996 struct ctdb_recoverd *rec)
1998 struct ctdb_iface_list_old *ifaces = NULL;
1999 TALLOC_CTX *mem_ctx;
2000 bool ret = false;
2002 mem_ctx = talloc_new(NULL);
2004 /* Read the interfaces from the local node */
2005 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
2006 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
2007 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
2008 /* We could return an error. However, this will be
2009 * rare so we'll decide that the interfaces have
2010 * actually changed, just in case.
2012 talloc_free(mem_ctx);
2013 return true;
2016 if (!rec->ifaces) {
2017 /* We haven't been here before so things have changed */
2018 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
2019 ret = true;
2020 } else if (rec->ifaces->num != ifaces->num) {
2021 /* Number of interfaces has changed */
2022 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
2023 rec->ifaces->num, ifaces->num));
2024 ret = true;
2025 } else {
2026 /* See if interface names or link states have changed */
2027 unsigned int i;
2028 for (i = 0; i < rec->ifaces->num; i++) {
2029 struct ctdb_iface * iface = &rec->ifaces->ifaces[i];
2030 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
2031 DEBUG(DEBUG_NOTICE,
2032 ("Interface in slot %d changed: %s => %s\n",
2033 i, iface->name, ifaces->ifaces[i].name));
2034 ret = true;
2035 break;
2037 if (iface->link_state != ifaces->ifaces[i].link_state) {
2038 DEBUG(DEBUG_NOTICE,
2039 ("Interface %s changed state: %d => %d\n",
2040 iface->name, iface->link_state,
2041 ifaces->ifaces[i].link_state));
2042 ret = true;
2043 break;
2048 talloc_free(rec->ifaces);
2049 rec->ifaces = talloc_steal(rec, ifaces);
2051 talloc_free(mem_ctx);
2052 return ret;
2055 /* Check that the local allocation of public IP addresses is correct
2056 * and do some house-keeping */
2057 static int verify_local_ip_allocation(struct ctdb_context *ctdb,
2058 struct ctdb_recoverd *rec,
2059 uint32_t pnn,
2060 struct ctdb_node_map_old *nodemap)
2062 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2063 unsigned int j;
2064 int ret;
2065 bool need_takeover_run = false;
2066 struct ctdb_public_ip_list_old *ips = NULL;
2068 /* If we are not the recmaster then do some housekeeping */
2069 if (rec->recmaster != pnn) {
2070 /* Ignore any IP reallocate requests - only recmaster
2071 * processes them
2073 TALLOC_FREE(rec->reallocate_requests);
2074 /* Clear any nodes that should be force rebalanced in
2075 * the next takeover run. If the recovery master role
2076 * has moved then we don't want to process these some
2077 * time in the future.
2079 TALLOC_FREE(rec->force_rebalance_nodes);
2082 /* Return early if disabled... */
2083 if (ctdb_config.failover_disabled ||
2084 ctdb_op_is_disabled(rec->takeover_run)) {
2085 talloc_free(mem_ctx);
2086 return 0;
2089 if (interfaces_have_changed(ctdb, rec)) {
2090 need_takeover_run = true;
2093 /* If there are unhosted IPs but this node can host them then
2094 * trigger an IP reallocation */
2096 /* Read *available* IPs from local node */
2097 ret = ctdb_ctrl_get_public_ips_flags(
2098 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx,
2099 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
2100 if (ret != 0) {
2101 DEBUG(DEBUG_ERR, ("Unable to retrieve available public IPs\n"));
2102 talloc_free(mem_ctx);
2103 return -1;
2106 for (j=0; j<ips->num; j++) {
2107 if (ips->ips[j].pnn == CTDB_UNKNOWN_PNN &&
2108 nodemap->nodes[pnn].flags == 0) {
2109 DEBUG(DEBUG_WARNING,
2110 ("Unassigned IP %s can be served by this node\n",
2111 ctdb_addr_to_str(&ips->ips[j].addr)));
2112 need_takeover_run = true;
2116 talloc_free(ips);
2118 if (!ctdb->do_checkpublicip) {
2119 goto done;
2122 /* Validate the IP addresses that this node has on network
2123 * interfaces. If there is an inconsistency between reality
2124 * and the state expected by CTDB then try to fix it by
2125 * triggering an IP reallocation or releasing extraneous IP
2126 * addresses. */
2128 /* Read *known* IPs from local node */
2129 ret = ctdb_ctrl_get_public_ips_flags(
2130 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
2131 if (ret != 0) {
2132 DEBUG(DEBUG_ERR, ("Unable to retrieve known public IPs\n"));
2133 talloc_free(mem_ctx);
2134 return -1;
2137 for (j=0; j<ips->num; j++) {
2138 if (ips->ips[j].pnn == pnn) {
2139 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2140 DEBUG(DEBUG_ERR,
2141 ("Assigned IP %s not on an interface\n",
2142 ctdb_addr_to_str(&ips->ips[j].addr)));
2143 need_takeover_run = true;
2145 } else {
2146 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2147 DEBUG(DEBUG_ERR,
2148 ("IP %s incorrectly on an interface\n",
2149 ctdb_addr_to_str(&ips->ips[j].addr)));
2150 need_takeover_run = true;
2155 done:
2156 if (need_takeover_run) {
2157 struct ctdb_srvid_message rd;
2158 TDB_DATA data;
2160 DEBUG(DEBUG_NOTICE,("Trigger takeoverrun\n"));
2162 ZERO_STRUCT(rd);
2163 rd.pnn = ctdb->pnn;
2164 rd.srvid = 0;
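/* A zero srvid indicates that no reply to this request is expected */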
2165 data.dptr = (uint8_t *)&rd;
2166 data.dsize = sizeof(rd);
2168 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2169 if (ret != 0) {
2170 DEBUG(DEBUG_ERR,
2171 ("Failed to send takeover run request\n"));
2174 talloc_free(mem_ctx);
2175 return 0;
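/* Store each remote node's nodemap, indexed by PNN, as the asynchronous
 * CTDB_CONTROL_GET_NODEMAP replies arrive.
 */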
2179 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2181 struct ctdb_node_map_old **remote_nodemaps = callback_data;
2183 if (node_pnn >= ctdb->num_nodes) {
2184 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2185 return;
2188 remote_nodemaps[node_pnn] = (struct ctdb_node_map_old *)talloc_steal(remote_nodemaps, outdata.dptr);
2192 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2193 struct ctdb_node_map_old *nodemap,
2194 struct ctdb_node_map_old **remote_nodemaps)
2196 uint32_t *nodes;
2198 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2199 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2200 nodes, 0,
2201 CONTROL_TIMEOUT(), false, tdb_null,
2202 async_getnodemap_callback,
2203 NULL,
2204 remote_nodemaps) != 0) {
2205 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
2207 return -1;
2210 return 0;
2213 static bool validate_recovery_master(struct ctdb_recoverd *rec,
2214 TALLOC_CTX *mem_ctx)
2216 struct ctdb_context *ctdb = rec->ctdb;
2217 uint32_t pnn = ctdb_get_pnn(ctdb);
2218 struct ctdb_node_map_old *nodemap = rec->nodemap;
2219 struct ctdb_node_map_old *recmaster_nodemap = NULL;
2220 int ret;
2222 /* When recovery daemon is started, recmaster is set to
2223 * "unknown" so it knows to start an election.
2225 if (rec->recmaster == CTDB_UNKNOWN_PNN) {
2226 DEBUG(DEBUG_NOTICE,
2227 ("Initial recovery master set - forcing election\n"));
2228 force_election(rec, pnn, nodemap);
2229 return false;
2233 * If the current recmaster does not have CTDB_CAP_RECMASTER,
2234 * but we have, then force an election and try to become the new
2235 * recmaster.
2237 if (!ctdb_node_has_capabilities(rec->caps,
2238 rec->recmaster,
2239 CTDB_CAP_RECMASTER) &&
2240 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
2241 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
2242 DEBUG(DEBUG_ERR,
2243 (" Current recmaster node %u does not have CAP_RECMASTER,"
2244 " but we (node %u) have - force an election\n",
2245 rec->recmaster, pnn));
2246 force_election(rec, pnn, nodemap);
2247 return false;
2250 /* Verify that the master node has not been deleted. This
2251 * should not happen because a node should always be shutdown
2252 * before being deleted, causing a new master to be elected
2253 * before now. However, if something strange has happened
2254 * then checking here will ensure we don't index beyond the
2255 * end of the nodemap array. */
2256 if (rec->recmaster >= nodemap->num) {
2257 DEBUG(DEBUG_ERR,
2258 ("Recmaster node %u has been deleted. Force election\n",
2259 rec->recmaster));
2260 force_election(rec, pnn, nodemap);
2261 return false;
2264 /* if recovery master is disconnected/deleted we must elect a new recmaster */
2265 if (nodemap->nodes[rec->recmaster].flags &
2266 (NODE_FLAGS_DISCONNECTED|NODE_FLAGS_DELETED)) {
2267 DEBUG(DEBUG_NOTICE,
2268 ("Recmaster node %u is disconnected/deleted. Force election\n",
2269 rec->recmaster));
2270 force_election(rec, pnn, nodemap);
2271 return false;
2274 /* get nodemap from the recovery master to check if it is inactive */
2275 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2276 mem_ctx, &recmaster_nodemap);
2277 if (ret != 0) {
2278 DEBUG(DEBUG_ERR,
2279 (__location__
2280 " Unable to get nodemap from recovery master %u\n",
2281 rec->recmaster));
2282 /* No election, just error */
2283 return false;
2287 if ((recmaster_nodemap->nodes[rec->recmaster].flags & NODE_FLAGS_INACTIVE) &&
2288 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
2289 DEBUG(DEBUG_NOTICE,
2290 ("Recmaster node %u is inactive. Force election\n",
2291 rec->recmaster));
2293 * update our nodemap to carry the recmaster's notion of
2294 * its own flags, so that we don't keep freezing the
2295 * inactive recmaster node...
2297 nodemap->nodes[rec->recmaster].flags =
2298 recmaster_nodemap->nodes[rec->recmaster].flags;
2299 force_election(rec, pnn, nodemap);
2300 return false;
2303 return true;
2306 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
2307 TALLOC_CTX *mem_ctx)
2309 uint32_t pnn;
2310 struct ctdb_node_map_old *nodemap=NULL;
2311 struct ctdb_node_map_old **remote_nodemaps=NULL;
2312 struct ctdb_vnn_map *vnnmap=NULL;
2313 struct ctdb_vnn_map *remote_vnnmap=NULL;
2314 uint32_t num_lmasters;
2315 int32_t debug_level;
2316 unsigned int i, j;
2317 int ret;
2318 bool self_ban;
2321 /* verify that the main daemon is still running */
2322 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
2323 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2324 exit(-1);
2327 /* ping the local daemon to tell it we are alive */
2328 ctdb_ctrl_recd_ping(ctdb);
2330 if (rec->election_timeout) {
2331 /* an election is in progress */
2332 return;
2335 /* read the debug level from the parent and update locally */
2336 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2337 if (ret !=0) {
2338 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2339 return;
2341 debuglevel_set(debug_level);
2343 /* get relevant tunables */
2344 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2345 if (ret != 0) {
2346 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2347 return;
2350 /* get runstate */
2351 ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
2352 CTDB_CURRENT_NODE, &ctdb->runstate);
2353 if (ret != 0) {
2354 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
2355 return;
2358 pnn = ctdb_get_pnn(ctdb);
2360 /* get nodemap */
2361 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &nodemap);
2362 if (ret != 0) {
2363 DBG_ERR("Unable to get nodemap from node %"PRIu32"\n", pnn);
2364 return;
2366 talloc_free(rec->nodemap);
2367 rec->nodemap = nodemap;
2369 /* remember our own node flags */
2370 rec->node_flags = nodemap->nodes[pnn].flags;
2372 ban_misbehaving_nodes(rec, &self_ban);
2373 if (self_ban) {
2374 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
2375 return;
2378 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2379 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2380 if (ret != 0) {
2381 D_ERR("Failed to read recmode from local node\n");
2382 return;
2385 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
2386 also frozen and that the recmode is set to active.
2388 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
2389 /* If this node has become inactive then we want to
2390 * reduce the chances of it taking over the recovery
2391 * master role when it becomes active again. This
2392 * helps to stabilise the recovery master role so that
2393 * it stays on the most stable node.
2395 rec->priority_time = timeval_current();
2397 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2398 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
2400 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2401 if (ret != 0) {
2402 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
2404 return;
2407 if (! rec->frozen_on_inactive) {
2408 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(),
2409 CTDB_CURRENT_NODE);
2410 if (ret != 0) {
2411 DEBUG(DEBUG_ERR,
2412 (__location__ " Failed to freeze node "
2413 "in STOPPED or BANNED state\n"));
2414 return;
2417 rec->frozen_on_inactive = true;
2420 /* If this node is stopped or banned then it is not the recovery
2421 * master, so don't do anything. This prevents a stopped or banned
2422 * node from starting an election and sending unnecessary controls.
2424 return;
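/* This node is not stopped or banned, so clear the inactive-freeze marker */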
2427 rec->frozen_on_inactive = false;
2429 /* Retrieve capabilities from all connected nodes */
2430 ret = update_capabilities(rec, nodemap);
2431 if (ret != 0) {
2432 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2433 return;
2436 if (! validate_recovery_master(rec, mem_ctx)) {
2437 return;
2440 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2441 /* Check if an IP takeover run is needed and trigger one if
2442 * necessary */
2443 verify_local_ip_allocation(ctdb, rec, pnn, nodemap);
2446 /* if we are not the recmaster then we do not need to check
2447 if recovery is needed
2449 if (pnn != rec->recmaster) {
2450 return;
2454 /* ensure our local copies of flags are right */
2455 ret = update_local_flags(rec, nodemap);
2456 if (ret != 0) {
2457 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
2458 return;
2461 if (ctdb->num_nodes != nodemap->num) {
2462 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
2463 ctdb_load_nodes_file(ctdb);
2464 return;
2467 /* verify that all active nodes agree that we are the recmaster */
2468 switch (verify_recmaster(rec, nodemap, pnn)) {
2469 case MONITOR_RECOVERY_NEEDED:
2470 /* can not happen */
2471 return;
2472 case MONITOR_ELECTION_NEEDED:
2473 force_election(rec, pnn, nodemap);
2474 return;
2475 case MONITOR_OK:
2476 break;
2477 case MONITOR_FAILED:
2478 return;
2482 /* get the vnnmap */
2483 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2484 if (ret != 0) {
2485 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2486 return;
2489 if (rec->need_recovery) {
2490 /* a previous recovery didn't finish */
2491 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2492 return;
2495 /* verify that all active nodes are in normal mode
2496 and not in recovery mode
2498 switch (verify_recmode(ctdb, nodemap)) {
2499 case MONITOR_RECOVERY_NEEDED:
2500 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2501 return;
2502 case MONITOR_FAILED:
2503 return;
2504 case MONITOR_ELECTION_NEEDED:
2505 /* can not happen */
2506 case MONITOR_OK:
2507 break;
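/* When a recovery lock is configured the recovery master must already
 * hold it; losing it may mean another node has taken over, so force a
 * recovery.
 */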
2511 if (ctdb->recovery_lock != NULL) {
2512 /* We must already hold the recovery lock */
2513 if (!ctdb_recovery_have_lock(rec)) {
2514 DEBUG(DEBUG_ERR,("Failed recovery lock sanity check. Force a recovery\n"));
2515 ctdb_set_culprit(rec, ctdb->pnn);
2516 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2517 return;
2522 /* If recoveries are disabled then there is no use doing any
2523 * nodemap or flags checks. Recoveries might be disabled due
2524 * to "reloadnodes", so doing these checks might cause an
2525 * unnecessary recovery. */
2526 if (ctdb_op_is_disabled(rec->recovery)) {
2527 goto takeover_run_checks;
2530 /* get the nodemap for all active remote nodes
2532 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map_old *, nodemap->num);
2533 if (remote_nodemaps == NULL) {
2534 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
2535 return;
2537 for(i=0; i<nodemap->num; i++) {
2538 remote_nodemaps[i] = NULL;
2540 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
2541 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
2542 return;
2545 /* verify that all other nodes have the same nodemap as we have
2547 for (j=0; j<nodemap->num; j++) {
2548 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2549 continue;
2552 if (remote_nodemaps[j] == NULL) {
2553 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
2554 ctdb_set_culprit(rec, j);
2556 return;
2559 /* if the nodes disagree on how many nodes there are
2560 then this is a good reason to try recovery
2562 if (remote_nodemaps[j]->num != nodemap->num) {
2563 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
2564 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
2565 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2566 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2567 return;
2570 /* if the nodes disagree on which nodes exist and are
2571 active, then that is also a good reason to do recovery
2573 for (i=0;i<nodemap->num;i++) {
2574 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
2575 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
2576 nodemap->nodes[j].pnn, i,
2577 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
2578 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2579 do_recovery(rec, mem_ctx, pnn, nodemap,
2580 vnnmap);
2581 return;
2587 * Update node flags obtained from each active node. This ensures we have
2588 * up-to-date information for all the nodes.
2590 for (j=0; j<nodemap->num; j++) {
2591 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2592 continue;
2594 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
2597 for (j=0; j<nodemap->num; j++) {
2598 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2599 continue;
2602 /* verify the flags are consistent
2604 for (i=0; i<nodemap->num; i++) {
2605 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2606 continue;
2609 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
2610 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
2611 nodemap->nodes[j].pnn,
2612 nodemap->nodes[i].pnn,
2613 remote_nodemaps[j]->nodes[i].flags,
2614 nodemap->nodes[i].flags));
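/* A node is authoritative for its own flags (i == j); for any other
 * node the recovery master's local view is pushed out instead.
 */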
2615 if (i == j) {
2616 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
2617 update_flags_on_all_nodes(
2618 ctdb,
2619 nodemap->nodes[i].pnn,
2620 remote_nodemaps[j]->nodes[i].flags);
2621 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2622 do_recovery(rec, mem_ctx, pnn, nodemap,
2623 vnnmap);
2624 return;
2625 } else {
2626 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
2627 update_flags_on_all_nodes(
2628 ctdb,
2629 nodemap->nodes[i].pnn,
2630 nodemap->nodes[i].flags);
2631 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2632 do_recovery(rec, mem_ctx, pnn, nodemap,
2633 vnnmap);
2634 return;
2641 /* count how many active nodes have the lmaster capability */
2642 num_lmasters = 0;
2643 for (i=0; i<nodemap->num; i++) {
2644 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2645 if (ctdb_node_has_capabilities(rec->caps,
2646 ctdb->nodes[i]->pnn,
2647 CTDB_CAP_LMASTER)) {
2648 num_lmasters++;
2654 /* There must be the same number of lmasters in the vnn map as
2655 * there are active nodes with the lmaster capability... or
2656 * do a recovery.
2658 if (vnnmap->size != num_lmasters) {
2659 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
2660 vnnmap->size, num_lmasters));
2661 ctdb_set_culprit(rec, ctdb->pnn);
2662 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2663 return;
2667 * Verify that all active lmaster nodes in the nodemap also
2668 * exist in the vnnmap
2670 for (j=0; j<nodemap->num; j++) {
2671 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2672 continue;
2674 if (! ctdb_node_has_capabilities(rec->caps,
2675 nodemap->nodes[j].pnn,
2676 CTDB_CAP_LMASTER)) {
2677 continue;
2679 if (nodemap->nodes[j].pnn == pnn) {
2680 continue;
2683 for (i=0; i<vnnmap->size; i++) {
2684 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
2685 break;
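/* The loop ran to completion without a match, so this lmaster is missing from the vnnmap */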
2688 if (i == vnnmap->size) {
2689 D_ERR("Active LMASTER node %u is not in the vnnmap\n",
2690 nodemap->nodes[j].pnn);
2691 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2692 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2693 return;
2698 /* verify that all other nodes have the same vnnmap
2699 and are from the same generation
2701 for (j=0; j<nodemap->num; j++) {
2702 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2703 continue;
2705 if (nodemap->nodes[j].pnn == pnn) {
2706 continue;
2709 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2710 mem_ctx, &remote_vnnmap);
2711 if (ret != 0) {
2712 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
2713 nodemap->nodes[j].pnn));
2714 return;
2717 /* verify the vnnmap generation is the same */
2718 if (vnnmap->generation != remote_vnnmap->generation) {
2719 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
2720 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
2721 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2722 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2723 return;
2726 /* verify the vnnmap size is the same */
2727 if (vnnmap->size != remote_vnnmap->size) {
2728 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
2729 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
2730 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2731 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2732 return;
2735 /* verify the vnnmap is the same */
2736 for (i=0;i<vnnmap->size;i++) {
2737 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
2738 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
2739 nodemap->nodes[j].pnn));
2740 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2741 do_recovery(rec, mem_ctx, pnn, nodemap,
2742 vnnmap);
2743 return;
2748 /* FIXME: Add remote public IP checking to ensure that nodes
2749 * have the IP addresses that are allocated to them. */
2751 takeover_run_checks:
2753 /* If there are IP takeover runs requested or the previous one
2754 * failed then perform one and notify the waiters */
2755 if (!ctdb_op_is_disabled(rec->takeover_run) &&
2756 (rec->reallocate_requests || rec->need_takeover_run)) {
2757 process_ipreallocate_requests(ctdb, rec);
2761 static void recd_sig_term_handler(struct tevent_context *ev,
2762 struct tevent_signal *se, int signum,
2763 int count, void *dont_care,
2764 void *private_data)
2766 struct ctdb_recoverd *rec = talloc_get_type_abort(
2767 private_data, struct ctdb_recoverd);
2769 DEBUG(DEBUG_ERR, ("Received SIGTERM, exiting\n"));
2770 ctdb_recovery_unlock(rec);
2771 exit(0);
2775 * Periodically log elements of the cluster state
2777 * This can be used to confirm a split brain has occurred
2779 static void maybe_log_cluster_state(struct tevent_context *ev,
2780 struct tevent_timer *te,
2781 struct timeval current_time,
2782 void *private_data)
2784 struct ctdb_recoverd *rec = talloc_get_type_abort(
2785 private_data, struct ctdb_recoverd);
2786 struct ctdb_context *ctdb = rec->ctdb;
2787 struct tevent_timer *tt;
2789 static struct timeval start_incomplete = {
2790 .tv_sec = 0,
2793 bool is_complete;
2794 bool was_complete;
2795 unsigned int i;
2796 double seconds;
2797 unsigned int minutes;
2798 unsigned int num_connected;
2800 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
2801 goto done;
2804 if (rec->nodemap == NULL) {
2805 goto done;
2808 is_complete = true;
2809 num_connected = 0;
2810 for (i = 0; i < rec->nodemap->num; i++) {
2811 struct ctdb_node_and_flags *n = &rec->nodemap->nodes[i];
2813 if (n->pnn == ctdb_get_pnn(ctdb)) {
2814 continue;
2816 if ((n->flags & NODE_FLAGS_DELETED) != 0) {
2817 continue;
2819 if ((n->flags & NODE_FLAGS_DISCONNECTED) != 0) {
2820 is_complete = false;
2821 continue;
2824 num_connected++;
2827 was_complete = timeval_is_zero(&start_incomplete);
2829 if (is_complete) {
2830 if (! was_complete) {
2831 D_WARNING("Cluster complete with master=%u\n",
2832 rec->recmaster);
2833 start_incomplete = timeval_zero();
2835 goto done;
2838 /* Cluster is newly incomplete... */
2839 if (was_complete) {
2840 start_incomplete = current_time;
2841 minutes = 0;
2842 goto log;
2846 * Cluster has been incomplete since previous check, so figure
2847 * out how long (in minutes) and decide whether to log anything
2849 seconds = timeval_elapsed2(&start_incomplete, &current_time);
2850 minutes = (unsigned int)seconds / 60;
2851 if (minutes >= 60) {
2852 /* Over an hour, log every hour */
2853 if (minutes % 60 != 0) {
2854 goto done;
2856 } else if (minutes >= 10) {
2857 /* Over 10 minutes, log every 10 minutes */
2858 if (minutes % 10 != 0) {
2859 goto done;
2863 log:
2864 D_WARNING("Cluster incomplete with master=%u, elapsed=%u minutes, "
2865 "connected=%u\n",
2866 rec->recmaster,
2867 minutes,
2868 num_connected);
2870 done:
2871 tt = tevent_add_timer(ctdb->ev,
2872 rec,
2873 timeval_current_ofs(60, 0),
2874 maybe_log_cluster_state,
2875 rec);
2876 if (tt == NULL) {
2877 DBG_WARNING("Failed to set up cluster state timer\n");
2882 the main monitoring loop
2884 static void monitor_cluster(struct ctdb_context *ctdb)
2886 struct tevent_signal *se;
2887 struct ctdb_recoverd *rec;
2889 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
2891 rec = talloc_zero(ctdb, struct ctdb_recoverd);
2892 CTDB_NO_MEMORY_FATAL(ctdb, rec);
2894 rec->ctdb = ctdb;
2895 rec->recmaster = CTDB_UNKNOWN_PNN;
2896 rec->recovery_lock_handle = NULL;
2898 rec->takeover_run = ctdb_op_init(rec, "takeover runs");
2899 CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);
2901 rec->recovery = ctdb_op_init(rec, "recoveries");
2902 CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);
2904 rec->priority_time = timeval_current();
2905 rec->frozen_on_inactive = false;
2907 se = tevent_add_signal(ctdb->ev, ctdb, SIGTERM, 0,
2908 recd_sig_term_handler, rec);
2909 if (se == NULL) {
2910 DEBUG(DEBUG_ERR, ("Failed to install SIGTERM handler\n"));
2911 exit(1);
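/* With no recovery lock configured a split brain is harder to rule out,
 * so periodically log the cluster state to help detect one.
 */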
2914 if (ctdb->recovery_lock == NULL) {
2915 struct tevent_timer *tt;
2917 tt = tevent_add_timer(ctdb->ev,
2918 rec,
2919 timeval_current_ofs(60, 0),
2920 maybe_log_cluster_state,
2921 rec);
2922 if (tt == NULL) {
2923 DBG_WARNING("Failed to set up cluster state timer\n");
2927 /* register a message port for sending memory dumps */
2928 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
2930 /* when a node is assigned banning credits */
2931 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING,
2932 banning_handler, rec);
2934 /* register a message port for recovery elections */
2935 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec);
2937 /* when nodes are disabled/enabled */
2938 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
2940 /* when we are asked to push out a flag change */
2941 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
2943 /* register a message port for reloadnodes */
2944 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
2946 /* register a message port for performing a takeover run */
2947 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
2949 /* register a message port for disabling the ip check for a short while */
2950 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
2952 /* register a message port for forcing a rebalance of a node next
2953 reallocation */
2954 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
2956 /* Register a message port for disabling takeover runs */
2957 ctdb_client_set_message_handler(ctdb,
2958 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
2959 disable_takeover_runs_handler, rec);
2961 /* Register a message port for disabling recoveries */
2962 ctdb_client_set_message_handler(ctdb,
2963 CTDB_SRVID_DISABLE_RECOVERIES,
2964 disable_recoveries_handler, rec);
2966 for (;;) {
2967 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2968 struct timeval start;
2969 double elapsed;
2971 if (!mem_ctx) {
2972 DEBUG(DEBUG_CRIT,(__location__
2973 " Failed to create temp context\n"));
2974 exit(-1);
2977 start = timeval_current();
2978 main_loop(ctdb, rec, mem_ctx);
2979 talloc_free(mem_ctx);
2981 /* we only run main_loop at most once per recovery interval */
2982 elapsed = timeval_elapsed(&start);
2983 if (elapsed < ctdb->tunable.recover_interval) {
2984 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
2985 - elapsed);
2991 event handler for when the main ctdbd dies
2993 static void ctdb_recoverd_parent(struct tevent_context *ev,
2994 struct tevent_fd *fde,
2995 uint16_t flags, void *private_data)
2997 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
2998 _exit(1);
3002 called regularly to verify that the recovery daemon is still running
3004 static void ctdb_check_recd(struct tevent_context *ev,
3005 struct tevent_timer *te,
3006 struct timeval yt, void *p)
3008 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
3010 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
3011 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
3013 tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
3014 ctdb_restart_recd, ctdb);
3016 return;
3019 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3020 timeval_current_ofs(30, 0),
3021 ctdb_check_recd, ctdb);
3024 static void recd_sig_child_handler(struct tevent_context *ev,
3025 struct tevent_signal *se, int signum,
3026 int count, void *dont_care,
3027 void *private_data)
3029 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3030 int status;
3031 pid_t pid = -1;
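/* Reap all exited children without blocking */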
3033 while (pid != 0) {
3034 pid = waitpid(-1, &status, WNOHANG);
3035 if (pid == -1) {
3036 if (errno != ECHILD) {
3037 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3039 return;
3041 if (pid > 0) {
3042 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3048 start up the recovery daemon as a child of the main ctdb daemon
3050 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3052 int fd[2];
3053 struct tevent_signal *se;
3054 struct tevent_fd *fde;
3055 int ret;
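/* The pipe is used for parent-death detection: the main daemon keeps the
 * write end open and never writes, so when it exits the recovery
 * daemon's read end becomes readable (EOF) and ctdb_recoverd_parent()
 * exits the recovery daemon.
 */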
3057 if (pipe(fd) != 0) {
3058 return -1;
3061 ctdb->recoverd_pid = ctdb_fork(ctdb);
3062 if (ctdb->recoverd_pid == -1) {
3063 return -1;
3066 if (ctdb->recoverd_pid != 0) {
3067 talloc_free(ctdb->recd_ctx);
3068 ctdb->recd_ctx = talloc_new(ctdb);
3069 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
3071 close(fd[0]);
3072 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3073 timeval_current_ofs(30, 0),
3074 ctdb_check_recd, ctdb);
3075 return 0;
3078 close(fd[1]);
3080 srandom(getpid() ^ time(NULL));
3082 ret = logging_init(ctdb, NULL, NULL, "ctdb-recoverd");
3083 if (ret != 0) {
3084 return -1;
3087 prctl_set_comment("ctdb_recoverd");
3088 if (switch_from_server_to_client(ctdb) != 0) {
3089 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3090 exit(1);
3093 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
3095 fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
3096 ctdb_recoverd_parent, &fd[0]);
3097 tevent_fd_set_auto_close(fde);
3099 /* set up a handler to pick up sigchld */
3100 se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
3101 recd_sig_child_handler, ctdb);
3102 if (se == NULL) {
3103 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3104 exit(1);
3107 monitor_cluster(ctdb);
3109 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3110 return -1;
3114 shutdown the recovery daemon
3116 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3118 if (ctdb->recoverd_pid == 0) {
3119 return;
3122 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3123 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
3125 TALLOC_FREE(ctdb->recd_ctx);
3126 TALLOC_FREE(ctdb->recd_ping_count);
3129 static void ctdb_restart_recd(struct tevent_context *ev,
3130 struct tevent_timer *te,
3131 struct timeval t, void *private_data)
3133 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3135 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
3136 ctdb_stop_recoverd(ctdb);
3137 ctdb_start_recoverd(ctdb);