ctdb/server/ctdb_recoverd.c
1 /*
2 ctdb recovery daemon
4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
20 #include "replace.h"
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
26 #include <popt.h>
27 #include <talloc.h>
28 #include <tevent.h>
29 #include <tdb.h>
31 #include "lib/tdb_wrap/tdb_wrap.h"
32 #include "lib/util/dlinklist.h"
33 #include "lib/util/debug.h"
34 #include "lib/util/samba_util.h"
35 #include "lib/util/sys_rw.h"
36 #include "lib/util/util_process.h"
38 #include "ctdb_private.h"
39 #include "ctdb_client.h"
41 #include "common/system.h"
42 #include "common/common.h"
43 #include "common/logging.h"
45 #include "ctdb_cluster_mutex.h"
47 /* List of SRVID requests that need to be processed */
48 struct srvid_list {
49 struct srvid_list *next, *prev;
50 struct ctdb_srvid_message *request;
53 struct srvid_requests {
54 struct srvid_list *requests;
57 static void srvid_request_reply(struct ctdb_context *ctdb,
58 struct ctdb_srvid_message *request,
59 TDB_DATA result)
61 /* Someone that sent srvid==0 does not want a reply */
62 if (request->srvid == 0) {
63 talloc_free(request);
64 return;
67 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
68 result) == 0) {
69 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
70 (unsigned)request->pnn,
71 (unsigned long long)request->srvid));
72 } else {
73 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
74 (unsigned)request->pnn,
75 (unsigned long long)request->srvid));
78 talloc_free(request);
81 static void srvid_requests_reply(struct ctdb_context *ctdb,
82 struct srvid_requests **requests,
83 TDB_DATA result)
85 struct srvid_list *r;
87 if (*requests == NULL) {
88 return;
91 for (r = (*requests)->requests; r != NULL; r = r->next) {
92 srvid_request_reply(ctdb, r->request, result);
95 /* Free the list structure... */
96 TALLOC_FREE(*requests);
99 static void srvid_request_add(struct ctdb_context *ctdb,
100 struct srvid_requests **requests,
101 struct ctdb_srvid_message *request)
103 struct srvid_list *t;
104 int32_t ret;
105 TDB_DATA result;
107 if (*requests == NULL) {
108 *requests = talloc_zero(ctdb, struct srvid_requests);
109 if (*requests == NULL) {
110 goto nomem;
114 t = talloc_zero(*requests, struct srvid_list);
115 if (t == NULL) {
116 /* If *requests was just allocated above then free it */
117 if ((*requests)->requests == NULL) {
118 TALLOC_FREE(*requests);
120 goto nomem;
123 t->request = (struct ctdb_srvid_message *)talloc_steal(t, request);
124 DLIST_ADD((*requests)->requests, t);
126 return;
128 nomem:
129 /* Failed to add the request to the list. Send a fail. */
130 DEBUG(DEBUG_ERR, (__location__
131 " Out of memory, failed to queue SRVID request\n"));
132 ret = -ENOMEM;
133 result.dsize = sizeof(ret);
134 result.dptr = (uint8_t *)&ret;
135 srvid_request_reply(ctdb, request, result);
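/*
 * Illustrative sketch (for exposition only, not part of the upstream
 * source): a message handler can queue incoming requests with
 * srvid_request_add() and answer all of them in one go once the
 * deferred work has completed, e.g.
 *
 *	srvid_request_add(ctdb, &rec->reallocate_requests, request);
 *	...
 *	int32_t ret = ctdb_get_pnn(ctdb);
 *	TDB_DATA result = { .dptr = (uint8_t *)&ret, .dsize = sizeof(ret) };
 *	srvid_requests_reply(ctdb, &rec->reallocate_requests, result);
 *
 * This is the pattern used by ip_reallocate_handler() and
 * process_ipreallocate_requests() further down in this file.
 */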
138 /* An abstraction to allow an operation (takeover runs, recoveries,
139 * ...) to be disabled for a given timeout */
140 struct ctdb_op_state {
141 struct tevent_timer *timer;
142 bool in_progress;
143 const char *name;
146 static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
148 struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
150 if (state != NULL) {
151 state->in_progress = false;
152 state->name = name;
155 return state;
158 static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
160 return state->timer != NULL;
163 static bool ctdb_op_begin(struct ctdb_op_state *state)
165 if (ctdb_op_is_disabled(state)) {
166 DEBUG(DEBUG_NOTICE,
167 ("Unable to begin - %s are disabled\n", state->name));
168 return false;
171 state->in_progress = true;
172 return true;
175 static bool ctdb_op_end(struct ctdb_op_state *state)
177 return state->in_progress = false;
180 static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
182 return state->in_progress;
185 static void ctdb_op_enable(struct ctdb_op_state *state)
187 TALLOC_FREE(state->timer);
190 static void ctdb_op_timeout_handler(struct tevent_context *ev,
191 struct tevent_timer *te,
192 struct timeval yt, void *p)
194 struct ctdb_op_state *state =
195 talloc_get_type(p, struct ctdb_op_state);
197 DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
198 ctdb_op_enable(state);
201 static int ctdb_op_disable(struct ctdb_op_state *state,
202 struct tevent_context *ev,
203 uint32_t timeout)
205 if (timeout == 0) {
206 DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
207 ctdb_op_enable(state);
208 return 0;
211 if (state->in_progress) {
212 DEBUG(DEBUG_ERR,
213 ("Unable to disable %s - in progress\n", state->name));
214 return -EAGAIN;
217 DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
218 state->name, timeout));
220 /* Clear any old timers */
221 talloc_free(state->timer);
223 /* Arrange for the timeout to occur */
224 state->timer = tevent_add_timer(ev, state,
225 timeval_current_ofs(timeout, 0),
226 ctdb_op_timeout_handler, state);
227 if (state->timer == NULL) {
228 DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
229 return -ENOMEM;
232 return 0;
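/*
 * Illustrative sketch (for exposition only, not part of the upstream
 * source): typical lifecycle of a ctdb_op_state, e.g. for takeover runs:
 *
 *	struct ctdb_op_state *op = ctdb_op_init(rec, "takeover runs");
 *
 *	if (ctdb_op_begin(op)) {
 *		... do the gated work ...
 *		ctdb_op_end(op);
 *	}
 *
 *	ctdb_op_disable(op, ctdb->ev, 60);	// refuse new runs for 60s
 *	ctdb_op_disable(op, ctdb->ev, 0);	// re-enable immediately
 */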
235 struct ctdb_banning_state {
236 uint32_t count;
237 struct timeval last_reported_time;
241 private state of recovery daemon
243 struct ctdb_recoverd {
244 struct ctdb_context *ctdb;
245 uint32_t recmaster;
246 uint32_t last_culprit_node;
247 struct ctdb_node_map_old *nodemap;
248 struct timeval priority_time;
249 bool need_takeover_run;
250 bool need_recovery;
251 uint32_t node_flags;
252 struct tevent_timer *send_election_te;
253 struct tevent_timer *election_timeout;
254 struct srvid_requests *reallocate_requests;
255 struct ctdb_op_state *takeover_run;
256 struct ctdb_op_state *recovery;
257 struct ctdb_iface_list_old *ifaces;
258 uint32_t *force_rebalance_nodes;
259 struct ctdb_node_capabilities *caps;
260 bool frozen_on_inactive;
261 struct ctdb_cluster_mutex_handle *recovery_lock_handle;
264 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
265 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
267 static void ctdb_restart_recd(struct tevent_context *ev,
268 struct tevent_timer *te, struct timeval t,
269 void *private_data);
272 ban a node for a period of time
274 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
276 int ret;
277 struct ctdb_context *ctdb = rec->ctdb;
278 struct ctdb_ban_state bantime;
280 if (!ctdb_validate_pnn(ctdb, pnn)) {
281 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
282 return;
285 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
287 bantime.pnn = pnn;
288 bantime.time = ban_time;
290 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
291 if (ret != 0) {
292 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
293 return;
298 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
 302 remember the troublemaker
304 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
306 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
307 struct ctdb_banning_state *ban_state;
309 if (culprit > ctdb->num_nodes) {
310 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
311 return;
314 /* If we are banned or stopped, do not set other nodes as culprits */
315 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
316 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
317 return;
320 if (ctdb->nodes[culprit]->ban_state == NULL) {
321 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
322 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
326 ban_state = ctdb->nodes[culprit]->ban_state;
327 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
328 /* this was the first time in a long while this node
329 misbehaved so we will forgive any old transgressions.
331 ban_state->count = 0;
334 ban_state->count += count;
335 ban_state->last_reported_time = timeval_current();
336 rec->last_culprit_node = culprit;
 340 remember the troublemaker
342 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
344 ctdb_set_culprit_count(rec, culprit, 1);
348 Retrieve capabilities from all connected nodes
350 static int update_capabilities(struct ctdb_recoverd *rec,
351 struct ctdb_node_map_old *nodemap)
353 uint32_t *capp;
354 TALLOC_CTX *tmp_ctx;
355 struct ctdb_node_capabilities *caps;
356 struct ctdb_context *ctdb = rec->ctdb;
358 tmp_ctx = talloc_new(rec);
359 CTDB_NO_MEMORY(ctdb, tmp_ctx);
361 caps = ctdb_get_capabilities(ctdb, tmp_ctx,
362 CONTROL_TIMEOUT(), nodemap);
364 if (caps == NULL) {
365 DEBUG(DEBUG_ERR,
366 (__location__ " Failed to get node capabilities\n"));
367 talloc_free(tmp_ctx);
368 return -1;
371 capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
372 if (capp == NULL) {
373 DEBUG(DEBUG_ERR,
374 (__location__
375 " Capabilities don't include current node.\n"));
376 talloc_free(tmp_ctx);
377 return -1;
379 ctdb->capabilities = *capp;
381 TALLOC_FREE(rec->caps);
382 rec->caps = talloc_steal(rec, caps);
384 talloc_free(tmp_ctx);
385 return 0;
389 change recovery mode on all nodes
391 static int set_recovery_mode(struct ctdb_context *ctdb,
392 struct ctdb_recoverd *rec,
393 struct ctdb_node_map_old *nodemap,
394 uint32_t rec_mode)
396 TDB_DATA data;
397 uint32_t *nodes;
398 TALLOC_CTX *tmp_ctx;
400 tmp_ctx = talloc_new(ctdb);
401 CTDB_NO_MEMORY(ctdb, tmp_ctx);
403 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
405 data.dsize = sizeof(uint32_t);
406 data.dptr = (unsigned char *)&rec_mode;
408 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
409 nodes, 0,
410 CONTROL_TIMEOUT(),
411 false, data,
412 NULL, NULL,
413 NULL) != 0) {
414 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
415 talloc_free(tmp_ctx);
416 return -1;
419 talloc_free(tmp_ctx);
420 return 0;
424 ensure all other nodes have attached to any databases that we have
426 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
427 uint32_t pnn, struct ctdb_dbid_map_old *dbmap, TALLOC_CTX *mem_ctx)
429 int i, j, db, ret;
430 struct ctdb_dbid_map_old *remote_dbmap;
432 /* verify that all other nodes have all our databases */
433 for (j=0; j<nodemap->num; j++) {
 434 /* we don't need to check ourselves */
435 if (nodemap->nodes[j].pnn == pnn) {
436 continue;
438 /* don't check nodes that are unavailable */
439 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
440 continue;
443 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
444 mem_ctx, &remote_dbmap);
445 if (ret != 0) {
446 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
447 return -1;
450 /* step through all local databases */
451 for (db=0; db<dbmap->num;db++) {
452 const char *name;
455 for (i=0;i<remote_dbmap->num;i++) {
456 if (dbmap->dbs[db].db_id == remote_dbmap->dbs[i].db_id) {
457 break;
 460 /* the remote node already has this database */
461 if (i!=remote_dbmap->num) {
462 continue;
464 /* ok so we need to create this database */
465 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
466 dbmap->dbs[db].db_id, mem_ctx,
467 &name);
468 if (ret != 0) {
469 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
470 return -1;
472 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
473 nodemap->nodes[j].pnn,
474 mem_ctx, name,
475 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
476 if (ret != 0) {
477 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
478 return -1;
483 return 0;
488 ensure we are attached to any databases that anyone else is attached to
490 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
491 uint32_t pnn, struct ctdb_dbid_map_old **dbmap, TALLOC_CTX *mem_ctx)
493 int i, j, db, ret;
494 struct ctdb_dbid_map_old *remote_dbmap;
 496 /* verify that we have all databases any other node has */
497 for (j=0; j<nodemap->num; j++) {
 499 /* we don't need to check ourselves */
499 if (nodemap->nodes[j].pnn == pnn) {
500 continue;
502 /* don't check nodes that are unavailable */
503 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
504 continue;
507 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
508 mem_ctx, &remote_dbmap);
509 if (ret != 0) {
510 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
511 return -1;
514 /* step through all databases on the remote node */
515 for (db=0; db<remote_dbmap->num;db++) {
516 const char *name;
518 for (i=0;i<(*dbmap)->num;i++) {
519 if (remote_dbmap->dbs[db].db_id == (*dbmap)->dbs[i].db_id) {
520 break;
523 /* we already have this db locally */
524 if (i!=(*dbmap)->num) {
525 continue;
527 /* ok so we need to create this database and
528 rebuild dbmap
 530 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
531 remote_dbmap->dbs[db].db_id, mem_ctx, &name);
532 if (ret != 0) {
533 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
534 nodemap->nodes[j].pnn));
535 return -1;
 537 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
538 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
539 if (ret != 0) {
540 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
541 return -1;
543 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
544 if (ret != 0) {
545 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
546 return -1;
551 return 0;
555 update flags on all active nodes
557 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap, uint32_t pnn, uint32_t flags)
559 int ret;
561 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
562 if (ret != 0) {
563 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
564 return -1;
567 return 0;
571 called when a vacuum fetch has completed - just free it and do the next one
573 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
575 talloc_free(state);
 580 * Process one element of the vacuum fetch list:
581 * Migrate it over to us with the special flag
582 * CTDB_CALL_FLAG_VACUUM_MIGRATION.
584 static bool vacuum_fetch_process_one(struct ctdb_db_context *ctdb_db,
585 uint32_t pnn,
586 struct ctdb_rec_data_old *r)
588 struct ctdb_client_call_state *state;
589 TDB_DATA data;
590 struct ctdb_ltdb_header *hdr;
591 struct ctdb_call call;
593 ZERO_STRUCT(call);
594 call.call_id = CTDB_NULL_FUNC;
595 call.flags = CTDB_IMMEDIATE_MIGRATION;
596 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
598 call.key.dptr = &r->data[0];
599 call.key.dsize = r->keylen;
601 /* ensure we don't block this daemon - just skip a record if we can't get
602 the chainlock */
603 if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, call.key) != 0) {
604 return true;
607 data = tdb_fetch(ctdb_db->ltdb->tdb, call.key);
608 if (data.dptr == NULL) {
609 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
610 return true;
613 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
614 free(data.dptr);
615 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
616 return true;
619 hdr = (struct ctdb_ltdb_header *)data.dptr;
620 if (hdr->dmaster == pnn) {
 621 /* it's already local */
622 free(data.dptr);
623 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
624 return true;
627 free(data.dptr);
629 state = ctdb_call_send(ctdb_db, &call);
630 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
631 if (state == NULL) {
632 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
633 return false;
635 state->async.fn = vacuum_fetch_callback;
636 state->async.private_data = NULL;
638 return true;
643 handler for vacuum fetch
645 static void vacuum_fetch_handler(uint64_t srvid, TDB_DATA data,
646 void *private_data)
648 struct ctdb_recoverd *rec = talloc_get_type(
649 private_data, struct ctdb_recoverd);
650 struct ctdb_context *ctdb = rec->ctdb;
651 struct ctdb_marshall_buffer *recs;
652 int ret, i;
653 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
654 const char *name;
655 struct ctdb_dbid_map_old *dbmap=NULL;
656 bool persistent = false;
657 struct ctdb_db_context *ctdb_db;
658 struct ctdb_rec_data_old *r;
660 recs = (struct ctdb_marshall_buffer *)data.dptr;
662 if (recs->count == 0) {
663 goto done;
666 /* work out if the database is persistent */
667 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
668 if (ret != 0) {
669 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
670 goto done;
673 for (i=0;i<dbmap->num;i++) {
674 if (dbmap->dbs[i].db_id == recs->db_id) {
675 persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
676 break;
679 if (i == dbmap->num) {
680 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
681 goto done;
684 /* find the name of this database */
685 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
686 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
687 goto done;
690 /* attach to it */
691 ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
692 if (ctdb_db == NULL) {
693 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
694 goto done;
697 r = (struct ctdb_rec_data_old *)&recs->data[0];
698 while (recs->count) {
699 bool ok;
701 ok = vacuum_fetch_process_one(ctdb_db, rec->ctdb->pnn, r);
702 if (!ok) {
703 break;
706 r = (struct ctdb_rec_data_old *)(r->length + (uint8_t *)r);
707 recs->count--;
710 done:
711 talloc_free(tmp_ctx);
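/*
 * Illustrative note (added for exposition, not part of the upstream
 * source): the records in the marshall buffer are packed back to back,
 * so the loop above advances with
 * r = (struct ctdb_rec_data_old *)(r->length + (uint8_t *)r) and counts
 * recs->count down until every record has been offered to
 * vacuum_fetch_process_one().
 */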
716 * handler for database detach
718 static void detach_database_handler(uint64_t srvid, TDB_DATA data,
719 void *private_data)
721 struct ctdb_recoverd *rec = talloc_get_type(
722 private_data, struct ctdb_recoverd);
723 struct ctdb_context *ctdb = rec->ctdb;
724 uint32_t db_id;
725 struct ctdb_db_context *ctdb_db;
727 if (data.dsize != sizeof(db_id)) {
728 return;
730 db_id = *(uint32_t *)data.dptr;
732 ctdb_db = find_ctdb_db(ctdb, db_id);
733 if (ctdb_db == NULL) {
734 /* database is not attached */
735 return;
738 DLIST_REMOVE(ctdb->db_list, ctdb_db);
740 DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
741 ctdb_db->db_name));
742 talloc_free(ctdb_db);
746 called when ctdb_wait_timeout should finish
748 static void ctdb_wait_handler(struct tevent_context *ev,
749 struct tevent_timer *te,
750 struct timeval yt, void *p)
752 uint32_t *timed_out = (uint32_t *)p;
753 (*timed_out) = 1;
757 wait for a given number of seconds
759 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
761 uint32_t timed_out = 0;
762 time_t usecs = (secs - (time_t)secs) * 1000000;
763 tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
764 ctdb_wait_handler, &timed_out);
765 while (!timed_out) {
766 tevent_loop_once(ctdb->ev);
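/*
 * Illustrative note (added for exposition, not part of the upstream
 * source): the fractional part of 'secs' is converted to microseconds
 * above, so e.g. ctdb_wait_timeout(ctdb, 0.5) arms a timer for
 * 0 seconds / 500000 microseconds and then spins tevent_loop_once()
 * until ctdb_wait_handler() sets timed_out.
 */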
771 called when an election times out (ends)
773 static void ctdb_election_timeout(struct tevent_context *ev,
774 struct tevent_timer *te,
775 struct timeval t, void *p)
777 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
778 rec->election_timeout = NULL;
779 fast_start = false;
781 DEBUG(DEBUG_WARNING,("Election period ended\n"));
 786 wait for an election to finish. It finishes election_timeout seconds after
787 the last election packet is received
789 static void ctdb_wait_election(struct ctdb_recoverd *rec)
791 struct ctdb_context *ctdb = rec->ctdb;
792 while (rec->election_timeout) {
793 tevent_loop_once(ctdb->ev);
798 Update our local flags from all remote connected nodes.
 799 This is only run when we are or we believe we are the recovery master
801 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
803 int j;
804 struct ctdb_context *ctdb = rec->ctdb;
805 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
807 /* get the nodemap for all active remote nodes and verify
808 they are the same as for this node
810 for (j=0; j<nodemap->num; j++) {
811 struct ctdb_node_map_old *remote_nodemap=NULL;
812 int ret;
814 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
815 continue;
817 if (nodemap->nodes[j].pnn == ctdb->pnn) {
818 continue;
821 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
822 mem_ctx, &remote_nodemap);
823 if (ret != 0) {
824 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
825 nodemap->nodes[j].pnn));
826 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
827 talloc_free(mem_ctx);
828 return -1;
830 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
831 /* We should tell our daemon about this so it
832 updates its flags or else we will log the same
833 message again in the next iteration of recovery.
834 Since we are the recovery master we can just as
835 well update the flags on all nodes.
837 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
838 if (ret != 0) {
839 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
840 return -1;
843 /* Update our local copy of the flags in the recovery
844 daemon.
846 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
847 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
848 nodemap->nodes[j].flags));
849 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
851 talloc_free(remote_nodemap);
853 talloc_free(mem_ctx);
854 return 0;
858 /* Create a new random generation id.
 859 The generation id cannot be the INVALID_GENERATION id
861 static uint32_t new_generation(void)
863 uint32_t generation;
865 while (1) {
866 generation = random();
868 if (generation != INVALID_GENERATION) {
869 break;
873 return generation;
876 static bool ctdb_recovery_have_lock(struct ctdb_recoverd *rec)
878 return (rec->recovery_lock_handle != NULL);
881 struct hold_reclock_state {
882 bool done;
883 bool locked;
884 double latency;
887 static void take_reclock_handler(char status,
888 double latency,
889 void *private_data)
891 struct hold_reclock_state *s =
892 (struct hold_reclock_state *) private_data;
894 switch (status) {
895 case '0':
896 s->latency = latency;
897 break;
899 case '1':
900 DEBUG(DEBUG_ERR,
901 ("Unable to take recovery lock - contention\n"));
902 break;
904 default:
905 DEBUG(DEBUG_ERR, ("ERROR: when taking recovery lock\n"));
908 s->done = true;
909 s->locked = (status == '0') ;
912 static bool ctdb_recovery_lock(struct ctdb_recoverd *rec);
914 static void lost_reclock_handler(void *private_data)
916 struct ctdb_recoverd *rec = talloc_get_type_abort(
917 private_data, struct ctdb_recoverd);
919 DEBUG(DEBUG_ERR,
920 ("Recovery lock helper terminated unexpectedly - "
921 "trying to retake recovery lock\n"));
922 TALLOC_FREE(rec->recovery_lock_handle);
923 if (! ctdb_recovery_lock(rec)) {
924 DEBUG(DEBUG_ERR, ("Failed to take recovery lock\n"));
928 static bool ctdb_recovery_lock(struct ctdb_recoverd *rec)
930 struct ctdb_context *ctdb = rec->ctdb;
931 struct ctdb_cluster_mutex_handle *h;
932 struct hold_reclock_state s = {
933 .done = false,
934 .locked = false,
935 .latency = 0,
938 h = ctdb_cluster_mutex(rec, ctdb, ctdb->recovery_lock, 0,
939 take_reclock_handler, &s,
940 lost_reclock_handler, rec);
941 if (h == NULL) {
942 return false;
945 while (!s.done) {
946 tevent_loop_once(ctdb->ev);
949 if (! s.locked) {
950 talloc_free(h);
951 return false;
954 rec->recovery_lock_handle = h;
955 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(),
956 s.latency);
958 return true;
961 static void ctdb_recovery_unlock(struct ctdb_recoverd *rec)
963 if (rec->recovery_lock_handle != NULL) {
964 DEBUG(DEBUG_NOTICE, ("Releasing recovery lock\n"));
965 TALLOC_FREE(rec->recovery_lock_handle);
969 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
971 struct ctdb_context *ctdb = rec->ctdb;
972 int i;
973 struct ctdb_banning_state *ban_state;
975 *self_ban = false;
976 for (i=0; i<ctdb->num_nodes; i++) {
977 if (ctdb->nodes[i]->ban_state == NULL) {
978 continue;
980 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
981 if (ban_state->count < 2*ctdb->num_nodes) {
982 continue;
985 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
986 ctdb->nodes[i]->pnn, ban_state->count,
987 ctdb->tunable.recovery_ban_period));
988 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
989 ban_state->count = 0;
991 /* Banning ourself? */
992 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
993 *self_ban = true;
998 struct helper_state {
999 int fd[2];
1000 pid_t pid;
1001 int result;
1002 bool done;
1005 static void helper_handler(struct tevent_context *ev,
1006 struct tevent_fd *fde,
1007 uint16_t flags, void *private_data)
1009 struct helper_state *state = talloc_get_type_abort(
1010 private_data, struct helper_state);
1011 int ret;
1013 ret = sys_read(state->fd[0], &state->result, sizeof(state->result));
1014 if (ret != sizeof(state->result)) {
1015 state->result = EPIPE;
1018 state->done = true;
1021 static int helper_run(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
1022 const char *prog, const char *arg, const char *type)
1024 struct helper_state *state;
1025 struct tevent_fd *fde;
1026 const char **args;
1027 int nargs, ret;
1029 state = talloc_zero(mem_ctx, struct helper_state);
1030 if (state == NULL) {
1031 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1032 return -1;
1035 state->pid = -1;
1037 ret = pipe(state->fd);
1038 if (ret != 0) {
1039 DEBUG(DEBUG_ERR,
1040 ("Failed to create pipe for %s helper\n", type));
1041 goto fail;
1044 set_close_on_exec(state->fd[0]);
1046 nargs = 4;
1047 args = talloc_array(state, const char *, nargs);
1048 if (args == NULL) {
1049 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1050 goto fail;
1053 args[0] = talloc_asprintf(args, "%d", state->fd[1]);
1054 if (args[0] == NULL) {
1055 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1056 goto fail;
1058 args[1] = rec->ctdb->daemon.name;
1059 args[2] = arg;
1060 args[3] = NULL;
1062 if (args[2] == NULL) {
1063 nargs = 3;
1066 state->pid = ctdb_vfork_exec(state, rec->ctdb, prog, nargs, args);
1067 if (state->pid == -1) {
1068 DEBUG(DEBUG_ERR,
1069 ("Failed to create child for %s helper\n", type));
1070 goto fail;
1073 close(state->fd[1]);
1074 state->fd[1] = -1;
1076 state->done = false;
1078 fde = tevent_add_fd(rec->ctdb->ev, rec->ctdb, state->fd[0],
1079 TEVENT_FD_READ, helper_handler, state);
1080 if (fde == NULL) {
1081 goto fail;
1083 tevent_fd_set_auto_close(fde);
1085 while (!state->done) {
1086 tevent_loop_once(rec->ctdb->ev);
1089 close(state->fd[0]);
1090 state->fd[0] = -1;
1092 if (state->result != 0) {
1093 goto fail;
1096 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
1097 talloc_free(state);
1098 return 0;
1100 fail:
1101 if (state->fd[0] != -1) {
1102 close(state->fd[0]);
1104 if (state->fd[1] != -1) {
1105 close(state->fd[1]);
1107 if (state->pid != -1) {
1108 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
1110 talloc_free(state);
1111 return -1;
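/*
 * Illustrative note (added for exposition, not part of the upstream
 * source): helper_run() hands the helper the write end of the pipe as
 * its first argument, followed by the daemon socket name and the
 * optional extra argument.  It then waits for the helper to write an
 * int status back over that fd; helper_handler() stores it in
 * state->result, and a short read is mapped to EPIPE.
 */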
1115 static int ctdb_takeover(struct ctdb_recoverd *rec,
1116 uint32_t *force_rebalance_nodes)
1118 static char prog[PATH_MAX+1] = "";
1119 char *arg;
1120 int i;
1122 if (!ctdb_set_helper("takeover_helper", prog, sizeof(prog),
1123 "CTDB_TAKEOVER_HELPER", CTDB_HELPER_BINDIR,
1124 "ctdb_takeover_helper")) {
1125 ctdb_die(rec->ctdb, "Unable to set takeover helper\n");
1128 arg = NULL;
1129 for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1130 uint32_t pnn = force_rebalance_nodes[i];
1131 if (arg == NULL) {
1132 arg = talloc_asprintf(rec, "%u", pnn);
1133 } else {
1134 arg = talloc_asprintf_append(arg, ",%u", pnn);
1136 if (arg == NULL) {
1137 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1138 return -1;
1142 return helper_run(rec, rec, prog, arg, "takeover");
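/*
 * Illustrative note (added for exposition, not part of the upstream
 * source): the argument built above is a comma-separated list of PNNs
 * to force-rebalance (e.g. "1,3" for two queued nodes), or NULL when no
 * rebalance targets are queued; it is passed straight through to the
 * takeover helper by helper_run().
 */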
1145 static bool do_takeover_run(struct ctdb_recoverd *rec,
1146 struct ctdb_node_map_old *nodemap)
1148 uint32_t *nodes = NULL;
1149 struct ctdb_disable_message dtr;
1150 TDB_DATA data;
1151 int i;
1152 uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
1153 int ret;
1154 bool ok;
1156 DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
1158 if (ctdb_op_is_in_progress(rec->takeover_run)) {
1159 DEBUG(DEBUG_ERR, (__location__
1160 " takeover run already in progress \n"));
1161 ok = false;
1162 goto done;
1165 if (!ctdb_op_begin(rec->takeover_run)) {
1166 ok = false;
1167 goto done;
1170 /* Disable IP checks (takeover runs, really) on other nodes
1171 * while doing this takeover run. This will stop those other
 1172 * nodes from triggering takeover runs when they think they should
1173 * be hosting an IP but it isn't yet on an interface. Don't
1174 * wait for replies since a failure here might cause some
1175 * noise in the logs but will not actually cause a problem.
1177 ZERO_STRUCT(dtr);
1178 dtr.srvid = 0; /* No reply */
1179 dtr.pnn = -1;
1181 data.dptr = (uint8_t*)&dtr;
1182 data.dsize = sizeof(dtr);
1184 nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
1186 /* Disable for 60 seconds. This can be a tunable later if
1187 * necessary.
1189 dtr.timeout = 60;
1190 for (i = 0; i < talloc_array_length(nodes); i++) {
1191 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1192 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1193 data) != 0) {
1194 DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
1198 ret = ctdb_takeover(rec, rec->force_rebalance_nodes);
1200 /* Reenable takeover runs and IP checks on other nodes */
1201 dtr.timeout = 0;
1202 for (i = 0; i < talloc_array_length(nodes); i++) {
1203 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1204 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1205 data) != 0) {
1206 DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n"));
1210 if (ret != 0) {
1211 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
1212 ok = false;
1213 goto done;
1216 ok = true;
1217 /* Takeover run was successful so clear force rebalance targets */
1218 if (rebalance_nodes == rec->force_rebalance_nodes) {
1219 TALLOC_FREE(rec->force_rebalance_nodes);
1220 } else {
1221 DEBUG(DEBUG_WARNING,
1222 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1224 done:
1225 rec->need_takeover_run = !ok;
1226 talloc_free(nodes);
1227 ctdb_op_end(rec->takeover_run);
1229 DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
1230 return ok;
1233 static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
1235 static char prog[PATH_MAX+1] = "";
1236 const char *arg;
1238 if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog),
1239 "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
1240 "ctdb_recovery_helper")) {
1241 ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
1244 arg = talloc_asprintf(mem_ctx, "%u", new_generation());
1245 if (arg == NULL) {
1246 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1247 return -1;
1250 setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1);
1252 return helper_run(rec, mem_ctx, prog, arg, "recovery");
1256 we are the recmaster, and recovery is needed - start a recovery run
1258 static int do_recovery(struct ctdb_recoverd *rec,
1259 TALLOC_CTX *mem_ctx, uint32_t pnn,
1260 struct ctdb_node_map_old *nodemap, struct ctdb_vnn_map *vnnmap)
1262 struct ctdb_context *ctdb = rec->ctdb;
1263 int i, ret;
1264 struct ctdb_dbid_map_old *dbmap;
1265 bool self_ban;
1267 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1269 /* Check if the current node is still the recmaster. It's possible that
1270 * re-election has changed the recmaster.
1272 if (pnn != rec->recmaster) {
1273 DEBUG(DEBUG_NOTICE,
1274 ("Recovery master changed to %u, aborting recovery\n",
1275 rec->recmaster));
1276 return -1;
1279 /* if recovery fails, force it again */
1280 rec->need_recovery = true;
1282 if (!ctdb_op_begin(rec->recovery)) {
1283 return -1;
1286 if (rec->election_timeout) {
1287 /* an election is in progress */
1288 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
1289 goto fail;
1292 ban_misbehaving_nodes(rec, &self_ban);
1293 if (self_ban) {
1294 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
1295 goto fail;
1298 if (ctdb->recovery_lock != NULL) {
1299 if (ctdb_recovery_have_lock(rec)) {
1300 DEBUG(DEBUG_NOTICE, ("Already holding recovery lock\n"));
1301 } else {
1302 DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock (%s)\n",
1303 ctdb->recovery_lock));
1304 if (!ctdb_recovery_lock(rec)) {
1305 if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
1306 /* If ctdb is trying first recovery, it's
1307 * possible that current node does not know
1308 * yet who the recmaster is.
1310 DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
1311 " - retrying recovery\n"));
1312 goto fail;
1315 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
1316 "and ban ourself for %u seconds\n",
1317 ctdb->tunable.recovery_ban_period));
1318 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
1319 goto fail;
1321 DEBUG(DEBUG_NOTICE,
1322 ("Recovery lock taken successfully by recovery daemon\n"));
1326 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1328 /* get a list of all databases */
1329 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1330 if (ret != 0) {
1331 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1332 goto fail;
1335 /* we do the db creation before we set the recovery mode, so the freeze happens
1336 on all databases we will be dealing with. */
1338 /* verify that we have all the databases any other node has */
1339 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1340 if (ret != 0) {
1341 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1342 goto fail;
1345 /* verify that all other nodes have all our databases */
1346 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1347 if (ret != 0) {
1348 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1349 goto fail;
1351 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1354 /* Retrieve capabilities from all connected nodes */
1355 ret = update_capabilities(rec, nodemap);
1356 if (ret!=0) {
1357 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1358 return -1;
1362 update all nodes to have the same flags that we have
1364 for (i=0;i<nodemap->num;i++) {
1365 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1366 continue;
1369 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1370 if (ret != 0) {
1371 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1372 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
1373 } else {
1374 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1375 return -1;
1380 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1382 ret = db_recovery_parallel(rec, mem_ctx);
1383 if (ret != 0) {
1384 goto fail;
1387 do_takeover_run(rec, nodemap);
1389 /* send a message to all clients telling them that the cluster
1390 has been reconfigured */
1391 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
1392 CTDB_SRVID_RECONFIGURE, tdb_null);
1393 if (ret != 0) {
1394 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
1395 goto fail;
1398 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1400 rec->need_recovery = false;
1401 ctdb_op_end(rec->recovery);
1403 /* we managed to complete a full recovery, make sure to forgive
1404 any past sins by the nodes that could now participate in the
1405 recovery.
1407 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1408 for (i=0;i<nodemap->num;i++) {
1409 struct ctdb_banning_state *ban_state;
1411 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1412 continue;
1415 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1416 if (ban_state == NULL) {
1417 continue;
1420 ban_state->count = 0;
1423 /* We just finished a recovery successfully.
1424 We now wait for rerecovery_timeout before we allow
1425 another recovery to take place.
1427 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
1428 ctdb_op_disable(rec->recovery, ctdb->ev,
1429 ctdb->tunable.rerecovery_timeout);
1430 return 0;
1432 fail:
1433 ctdb_op_end(rec->recovery);
1434 return -1;
1439 elections are won by first checking the number of connected nodes, then
1440 the priority time, then the pnn
1442 struct election_message {
1443 uint32_t num_connected;
1444 struct timeval priority_time;
1445 uint32_t pnn;
1446 uint32_t node_flags;
 1450 form this node's election data
1452 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1454 int ret, i;
1455 struct ctdb_node_map_old *nodemap;
1456 struct ctdb_context *ctdb = rec->ctdb;
1458 ZERO_STRUCTP(em);
1460 em->pnn = rec->ctdb->pnn;
1461 em->priority_time = rec->priority_time;
1463 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1464 if (ret != 0) {
1465 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
1466 return;
1469 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1470 em->node_flags = rec->node_flags;
1472 for (i=0;i<nodemap->num;i++) {
1473 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1474 em->num_connected++;
 1478 /* we shouldn't try to win this election if we can't be a recmaster */
1479 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1480 em->num_connected = 0;
1481 em->priority_time = timeval_current();
1484 talloc_free(nodemap);
1488 see if the given election data wins
1490 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1492 struct election_message myem;
1493 int cmp = 0;
1495 ctdb_election_data(rec, &myem);
 1497 /* we can't win if we don't have the recmaster capability */
1498 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1499 return false;
 1502 /* we can't win if we are banned */
1503 if (rec->node_flags & NODE_FLAGS_BANNED) {
1504 return false;
 1507 /* we can't win if we are stopped */
1508 if (rec->node_flags & NODE_FLAGS_STOPPED) {
1509 return false;
1512 /* we will automatically win if the other node is banned */
1513 if (em->node_flags & NODE_FLAGS_BANNED) {
1514 return true;
 1517 /* we will automatically win if the other node is stopped */
1518 if (em->node_flags & NODE_FLAGS_STOPPED) {
1519 return true;
1522 /* then the longest running node */
1523 if (cmp == 0) {
1524 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
1527 if (cmp == 0) {
1528 cmp = (int)myem.pnn - (int)em->pnn;
1531 return cmp > 0;
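/*
 * Illustrative note (added for exposition, not part of the upstream
 * source): with the comparison above, a banned or stopped challenger
 * always loses, an otherwise equal pair is decided by priority_time
 * (the longer-running node wins) and then by PNN, and
 * ctdb_election_win() only returns true when this node strictly
 * out-compares the sender (cmp > 0).
 */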
1535 send out an election request
1537 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
1539 int ret;
1540 TDB_DATA election_data;
1541 struct election_message emsg;
1542 uint64_t srvid;
1543 struct ctdb_context *ctdb = rec->ctdb;
1545 srvid = CTDB_SRVID_ELECTION;
1547 ctdb_election_data(rec, &emsg);
1549 election_data.dsize = sizeof(struct election_message);
1550 election_data.dptr = (unsigned char *)&emsg;
 1553 /* first we assume we will win the election and set
 1554 the recovery master to be ourselves on the current node
1556 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1557 CTDB_CURRENT_NODE, pnn);
1558 if (ret != 0) {
1559 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster\n"));
1560 return -1;
1562 rec->recmaster = pnn;
1564 /* send an election message to all active nodes */
1565 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1566 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1570 we think we are winning the election - send a broadcast election request
1572 static void election_send_request(struct tevent_context *ev,
1573 struct tevent_timer *te,
1574 struct timeval t, void *p)
1576 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1577 int ret;
1579 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
1580 if (ret != 0) {
1581 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
1584 TALLOC_FREE(rec->send_election_te);
1588 handler for memory dumps
1590 static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1592 struct ctdb_recoverd *rec = talloc_get_type(
1593 private_data, struct ctdb_recoverd);
1594 struct ctdb_context *ctdb = rec->ctdb;
1595 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1596 TDB_DATA *dump;
1597 int ret;
1598 struct ctdb_srvid_message *rd;
1600 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1601 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1602 talloc_free(tmp_ctx);
1603 return;
1605 rd = (struct ctdb_srvid_message *)data.dptr;
1607 dump = talloc_zero(tmp_ctx, TDB_DATA);
1608 if (dump == NULL) {
1609 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1610 talloc_free(tmp_ctx);
1611 return;
1613 ret = ctdb_dump_memory(ctdb, dump);
1614 if (ret != 0) {
1615 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1616 talloc_free(tmp_ctx);
1617 return;
1620 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
1622 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1623 if (ret != 0) {
1624 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1625 talloc_free(tmp_ctx);
1626 return;
1629 talloc_free(tmp_ctx);
1633 handler for reload_nodes
1635 static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
1636 void *private_data)
1638 struct ctdb_recoverd *rec = talloc_get_type(
1639 private_data, struct ctdb_recoverd);
1641 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1643 ctdb_load_nodes_file(rec->ctdb);
1647 static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
1648 void *private_data)
1650 struct ctdb_recoverd *rec = talloc_get_type(
1651 private_data, struct ctdb_recoverd);
1652 struct ctdb_context *ctdb = rec->ctdb;
1653 uint32_t pnn;
1654 uint32_t *t;
1655 int len;
1657 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
1658 return;
1661 if (data.dsize != sizeof(uint32_t)) {
1662 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
1663 return;
1666 pnn = *(uint32_t *)&data.dptr[0];
1668 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
1670 /* Copy any existing list of nodes. There's probably some
1671 * sort of realloc variant that will do this but we need to
1672 * make sure that freeing the old array also cancels the timer
1673 * event for the timeout... not sure if realloc will do that.
1675 len = (rec->force_rebalance_nodes != NULL) ?
1676 talloc_array_length(rec->force_rebalance_nodes) :
1679 /* This allows duplicates to be added but they don't cause
1680 * harm. A call to add a duplicate PNN arguably means that
1681 * the timeout should be reset, so this is the simplest
1682 * solution.
1684 t = talloc_zero_array(rec, uint32_t, len+1);
1685 CTDB_NO_MEMORY_VOID(ctdb, t);
1686 if (len > 0) {
1687 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
1689 t[len] = pnn;
1691 talloc_free(rec->force_rebalance_nodes);
1693 rec->force_rebalance_nodes = t;
1698 static void srvid_disable_and_reply(struct ctdb_context *ctdb,
1699 TDB_DATA data,
1700 struct ctdb_op_state *op_state)
1702 struct ctdb_disable_message *r;
1703 uint32_t timeout;
1704 TDB_DATA result;
1705 int32_t ret = 0;
1707 /* Validate input data */
1708 if (data.dsize != sizeof(struct ctdb_disable_message)) {
1709 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1710 "expecting %lu\n", (long unsigned)data.dsize,
 1711 (long unsigned)sizeof(struct ctdb_disable_message)));
1712 return;
1714 if (data.dptr == NULL) {
1715 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1716 return;
1719 r = (struct ctdb_disable_message *)data.dptr;
1720 timeout = r->timeout;
1722 ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
1723 if (ret != 0) {
1724 goto done;
1727 /* Returning our PNN tells the caller that we succeeded */
1728 ret = ctdb_get_pnn(ctdb);
1729 done:
1730 result.dsize = sizeof(int32_t);
1731 result.dptr = (uint8_t *)&ret;
1732 srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
1735 static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
1736 void *private_data)
1738 struct ctdb_recoverd *rec = talloc_get_type(
1739 private_data, struct ctdb_recoverd);
1741 srvid_disable_and_reply(rec->ctdb, data, rec->takeover_run);
1744 /* Backward compatibility for this SRVID */
1745 static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
1746 void *private_data)
1748 struct ctdb_recoverd *rec = talloc_get_type(
1749 private_data, struct ctdb_recoverd);
1750 uint32_t timeout;
1752 if (data.dsize != sizeof(uint32_t)) {
1753 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1754 "expecting %lu\n", (long unsigned)data.dsize,
1755 (long unsigned)sizeof(uint32_t)));
1756 return;
1758 if (data.dptr == NULL) {
1759 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1760 return;
1763 timeout = *((uint32_t *)data.dptr);
1765 ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout);
1768 static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
1769 void *private_data)
1771 struct ctdb_recoverd *rec = talloc_get_type(
1772 private_data, struct ctdb_recoverd);
1774 srvid_disable_and_reply(rec->ctdb, data, rec->recovery);
 1778 handler for ip reallocate: just add it to the list of requests and
 1779 handle it later in the monitor_cluster loop so we do not recurse
 1780 with other requests to takeover_run()
1782 static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
1783 void *private_data)
1785 struct ctdb_srvid_message *request;
1786 struct ctdb_recoverd *rec = talloc_get_type(
1787 private_data, struct ctdb_recoverd);
1789 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1790 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1791 return;
1794 request = (struct ctdb_srvid_message *)data.dptr;
1796 srvid_request_add(rec->ctdb, &rec->reallocate_requests, request);
1799 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
1800 struct ctdb_recoverd *rec)
1802 TDB_DATA result;
1803 int32_t ret;
1804 struct srvid_requests *current;
1806 /* Only process requests that are currently pending. More
1807 * might come in while the takeover run is in progress and
1808 * they will need to be processed later since they might
 1809 * be in response to flag changes.
1811 current = rec->reallocate_requests;
1812 rec->reallocate_requests = NULL;
1814 if (do_takeover_run(rec, rec->nodemap)) {
1815 ret = ctdb_get_pnn(ctdb);
1816 } else {
1817 ret = -1;
1820 result.dsize = sizeof(int32_t);
1821 result.dptr = (uint8_t *)&ret;
1823 srvid_requests_reply(ctdb, &current, result);
1827 * handler for assigning banning credits
1829 static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1831 struct ctdb_recoverd *rec = talloc_get_type(
1832 private_data, struct ctdb_recoverd);
1833 uint32_t ban_pnn;
1835 /* Ignore if we are not recmaster */
1836 if (rec->ctdb->pnn != rec->recmaster) {
1837 return;
1840 if (data.dsize != sizeof(uint32_t)) {
1841 DEBUG(DEBUG_ERR, (__location__ "invalid data size %zu\n",
1842 data.dsize));
1843 return;
1846 ban_pnn = *(uint32_t *)data.dptr;
1848 ctdb_set_culprit_count(rec, ban_pnn, rec->nodemap->num);
1852 handler for recovery master elections
1854 static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1856 struct ctdb_recoverd *rec = talloc_get_type(
1857 private_data, struct ctdb_recoverd);
1858 struct ctdb_context *ctdb = rec->ctdb;
1859 int ret;
1860 struct election_message *em = (struct election_message *)data.dptr;
1862 /* Ignore election packets from ourself */
1863 if (ctdb->pnn == em->pnn) {
1864 return;
1867 /* we got an election packet - update the timeout for the election */
1868 talloc_free(rec->election_timeout);
1869 rec->election_timeout = tevent_add_timer(
1870 ctdb->ev, ctdb,
1871 fast_start ?
1872 timeval_current_ofs(0, 500000) :
1873 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1874 ctdb_election_timeout, rec);
1876 /* someone called an election. check their election data
1877 and if we disagree and we would rather be the elected node,
1878 send a new election message to all other nodes
1880 if (ctdb_election_win(rec, em)) {
1881 if (!rec->send_election_te) {
1882 rec->send_election_te = tevent_add_timer(
1883 ctdb->ev, rec,
1884 timeval_current_ofs(0, 500000),
1885 election_send_request, rec);
1887 return;
1890 /* we didn't win */
1891 TALLOC_FREE(rec->send_election_te);
1893 /* Release the recovery lock file */
1894 if (ctdb_recovery_have_lock(rec)) {
1895 ctdb_recovery_unlock(rec);
1898 /* ok, let that guy become recmaster then */
1899 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1900 CTDB_CURRENT_NODE, em->pnn);
1901 if (ret != 0) {
1902 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster"));
1903 return;
1905 rec->recmaster = em->pnn;
1907 return;
1912 force the start of the election process
1914 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
1915 struct ctdb_node_map_old *nodemap)
1917 int ret;
1918 struct ctdb_context *ctdb = rec->ctdb;
1920 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
1922 /* set all nodes to recovery mode to stop all internode traffic */
1923 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1924 if (ret != 0) {
1925 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1926 return;
1929 talloc_free(rec->election_timeout);
1930 rec->election_timeout = tevent_add_timer(
1931 ctdb->ev, ctdb,
1932 fast_start ?
1933 timeval_current_ofs(0, 500000) :
1934 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1935 ctdb_election_timeout, rec);
1937 ret = send_election_request(rec, pnn);
1938 if (ret!=0) {
1939 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
1940 return;
1943 /* wait for a few seconds to collect all responses */
1944 ctdb_wait_election(rec);
1950 handler for when a node changes its flags
1952 static void monitor_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1954 struct ctdb_recoverd *rec = talloc_get_type(
1955 private_data, struct ctdb_recoverd);
1956 struct ctdb_context *ctdb = rec->ctdb;
1957 int ret;
1958 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
1959 struct ctdb_node_map_old *nodemap=NULL;
1960 TALLOC_CTX *tmp_ctx;
1961 int i;
1963 if (data.dsize != sizeof(*c)) {
1964 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
1965 return;
1968 tmp_ctx = talloc_new(ctdb);
1969 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
1971 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
1972 if (ret != 0) {
1973 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
1974 talloc_free(tmp_ctx);
1975 return;
1979 for (i=0;i<nodemap->num;i++) {
1980 if (nodemap->nodes[i].pnn == c->pnn) break;
1983 if (i == nodemap->num) {
 1984 DEBUG(DEBUG_CRIT,(__location__ " Flag change for non-existent node %u\n", c->pnn));
1985 talloc_free(tmp_ctx);
1986 return;
1989 if (c->old_flags != c->new_flags) {
1990 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
1993 nodemap->nodes[i].flags = c->new_flags;
1995 talloc_free(tmp_ctx);
 1999 handler for when we need to push out flag changes to all other nodes
2001 static void push_flags_handler(uint64_t srvid, TDB_DATA data,
2002 void *private_data)
2004 struct ctdb_recoverd *rec = talloc_get_type(
2005 private_data, struct ctdb_recoverd);
2006 struct ctdb_context *ctdb = rec->ctdb;
2007 int ret;
2008 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2009 struct ctdb_node_map_old *nodemap=NULL;
2010 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2011 uint32_t *nodes;
2013 /* read the node flags from the recmaster */
2014 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2015 tmp_ctx, &nodemap);
2016 if (ret != 0) {
2017 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2018 talloc_free(tmp_ctx);
2019 return;
2021 if (c->pnn >= nodemap->num) {
2022 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2023 talloc_free(tmp_ctx);
2024 return;
2027 /* send the flags update to all connected nodes */
2028 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2030 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2031 nodes, 0, CONTROL_TIMEOUT(),
2032 false, data,
2033 NULL, NULL,
2034 NULL) != 0) {
2035 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2037 talloc_free(tmp_ctx);
2038 return;
2041 talloc_free(tmp_ctx);
2045 struct verify_recmode_normal_data {
2046 uint32_t count;
2047 enum monitor_result status;
2050 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2052 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2055 /* one more node has responded with recmode data*/
2056 rmdata->count--;
2058 /* if we failed to get the recmode, then return an error and let
2059 the main loop try again.
2061 if (state->state != CTDB_CONTROL_DONE) {
2062 if (rmdata->status == MONITOR_OK) {
2063 rmdata->status = MONITOR_FAILED;
2065 return;
2068 /* if we got a response, then the recmode will be stored in the
2069 status field
2071 if (state->status != CTDB_RECOVERY_NORMAL) {
2072 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2073 rmdata->status = MONITOR_RECOVERY_NEEDED;
2076 return;
2080 /* verify that all nodes are in normal recovery mode */
2081 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
2083 struct verify_recmode_normal_data *rmdata;
2084 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2085 struct ctdb_client_control_state *state;
2086 enum monitor_result status;
2087 int j;
2089 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2090 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2091 rmdata->count = 0;
2092 rmdata->status = MONITOR_OK;
2094 /* loop over all active nodes and send an async getrecmode call to
2095 them*/
2096 for (j=0; j<nodemap->num; j++) {
2097 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2098 continue;
2100 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2101 CONTROL_TIMEOUT(),
2102 nodemap->nodes[j].pnn);
2103 if (state == NULL) {
2104 /* we failed to send the control, treat this as
2105 an error and try again next iteration
2107 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2108 talloc_free(mem_ctx);
2109 return MONITOR_FAILED;
2112 /* set up the callback functions */
2113 state->async.fn = verify_recmode_normal_callback;
2114 state->async.private_data = rmdata;
2116 /* one more control to wait for to complete */
2117 rmdata->count++;
2121 /* now wait for up to the maximum number of seconds allowed
 2122 or until all nodes we expect a response from have replied
2124 while (rmdata->count > 0) {
2125 tevent_loop_once(ctdb->ev);
2128 status = rmdata->status;
2129 talloc_free(mem_ctx);
2130 return status;
2134 struct verify_recmaster_data {
2135 struct ctdb_recoverd *rec;
2136 uint32_t count;
2137 uint32_t pnn;
2138 enum monitor_result status;
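/* Completion callback for one node's GET_RECMASTER control.  A node
 * reporting a different recmaster than expected is recorded as a
 * culprit and the overall status becomes MONITOR_ELECTION_NEEDED.
 */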
2141 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2143 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2146 /* one more node has responded with recmaster data */
2147 rmdata->count--;
2149 /* if we failed to get the recmaster, then return an error and let
2150 the main loop try again.
2152 if (state->state != CTDB_CONTROL_DONE) {
2153 if (rmdata->status == MONITOR_OK) {
2154 rmdata->status = MONITOR_FAILED;
2156 return;
2159 /* if we got a response, then the recmaster will be stored in the
2160 status field
2162 if (state->status != rmdata->pnn) {
2163 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
2164 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2165 rmdata->status = MONITOR_ELECTION_NEEDED;
2168 return;
2172 /* verify that all nodes agree that we are the recmaster */
2173 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, uint32_t pnn)
2175 struct ctdb_context *ctdb = rec->ctdb;
2176 struct verify_recmaster_data *rmdata;
2177 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2178 struct ctdb_client_control_state *state;
2179 enum monitor_result status;
2180 int j;
2182 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2183 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2184 rmdata->rec = rec;
2185 rmdata->count = 0;
2186 rmdata->pnn = pnn;
2187 rmdata->status = MONITOR_OK;
2189 /* loop over all active nodes and send an async getrecmaster call to
2190 them */
2191 for (j=0; j<nodemap->num; j++) {
2192 if (nodemap->nodes[j].pnn == rec->recmaster) {
2193 continue;
2195 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2196 continue;
2198 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2199 CONTROL_TIMEOUT(),
2200 nodemap->nodes[j].pnn);
2201 if (state == NULL) {
2202 /* we failed to send the control, treat this as
2203 an error and try again next iteration
2205 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2206 talloc_free(mem_ctx);
2207 return MONITOR_FAILED;
2210 /* set up the callback functions */
2211 state->async.fn = verify_recmaster_callback;
2212 state->async.private_data = rmdata;
2214 /* one more control to wait for to complete */
2215 rmdata->count++;
2219 /* now wait for up to the maximum number of seconds allowed
2220 or until all nodes we expect a response from have replied
2222 while (rmdata->count > 0) {
2223 tevent_loop_once(ctdb->ev);
2226 status = rmdata->status;
2227 talloc_free(mem_ctx);
2228 return status;
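/* Compare the interface list reported by the local node with the copy
 * cached in rec->ifaces (which is then refreshed).  Returns true if
 * the interface count, any interface name or any link state has
 * changed; a fetch failure is also treated as a change.
 */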
2231 static bool interfaces_have_changed(struct ctdb_context *ctdb,
2232 struct ctdb_recoverd *rec)
2234 struct ctdb_iface_list_old *ifaces = NULL;
2235 TALLOC_CTX *mem_ctx;
2236 bool ret = false;
2238 mem_ctx = talloc_new(NULL);
2240 /* Read the interfaces from the local node */
2241 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
2242 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
2243 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
2244 /* We could return an error. However, this will be
2245 * rare so we'll decide that the interfaces have
2246 * actually changed, just in case.
2248 talloc_free(mem_ctx);
2249 return true;
2252 if (!rec->ifaces) {
2253 /* We haven't been here before so things have changed */
2254 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
2255 ret = true;
2256 } else if (rec->ifaces->num != ifaces->num) {
2257 /* Number of interfaces has changed */
2258 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
2259 rec->ifaces->num, ifaces->num));
2260 ret = true;
2261 } else {
2262 /* See if interface names or link states have changed */
2263 int i;
2264 for (i = 0; i < rec->ifaces->num; i++) {
2265 struct ctdb_iface * iface = &rec->ifaces->ifaces[i];
2266 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
2267 DEBUG(DEBUG_NOTICE,
2268 ("Interface in slot %d changed: %s => %s\n",
2269 i, iface->name, ifaces->ifaces[i].name));
2270 ret = true;
2271 break;
2273 if (iface->link_state != ifaces->ifaces[i].link_state) {
2274 DEBUG(DEBUG_NOTICE,
2275 ("Interface %s changed state: %d => %d\n",
2276 iface->name, iface->link_state,
2277 ifaces->ifaces[i].link_state));
2278 ret = true;
2279 break;
2284 talloc_free(rec->ifaces);
2285 rec->ifaces = talloc_steal(rec, ifaces);
2287 talloc_free(mem_ctx);
2288 return ret;
2291 /* Check that the local allocation of public IP addresses is correct
2292 * and do some house-keeping */
2293 static int verify_local_ip_allocation(struct ctdb_context *ctdb,
2294 struct ctdb_recoverd *rec,
2295 uint32_t pnn,
2296 struct ctdb_node_map_old *nodemap)
2298 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2299 int ret, j;
2300 bool need_takeover_run = false;
2301 struct ctdb_public_ip_list_old *ips = NULL;
2303 /* If we are not the recmaster then do some housekeeping */
2304 if (rec->recmaster != pnn) {
2305 /* Ignore any IP reallocate requests - only recmaster
2306 * processes them
2308 TALLOC_FREE(rec->reallocate_requests);
2309 /* Clear any nodes that should be force rebalanced in
2310 * the next takeover run. If the recovery master role
2311 * has moved then we don't want to process these some
2312 * time in the future.
2314 TALLOC_FREE(rec->force_rebalance_nodes);
2317 /* Return early if disabled... */
2318 if (ctdb->tunable.disable_ip_failover != 0 ||
2319 ctdb_op_is_disabled(rec->takeover_run)) {
2320 return 0;
2323 if (interfaces_have_changed(ctdb, rec)) {
2324 need_takeover_run = true;
2327 /* If there are unhosted IPs but this node can host them then
2328 * trigger an IP reallocation */
2330 /* Read *available* IPs from local node */
2331 ret = ctdb_ctrl_get_public_ips_flags(
2332 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx,
2333 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
2334 if (ret != 0) {
2335 DEBUG(DEBUG_ERR, ("Unable to retrieve available public IPs\n"));
2336 talloc_free(mem_ctx);
2337 return -1;
2340 for (j=0; j<ips->num; j++) {
2341 if (ips->ips[j].pnn == -1 &&
2342 nodemap->nodes[pnn].flags == 0) {
2343 DEBUG(DEBUG_WARNING,
2344 ("Unassigned IP %s can be served by this node\n",
2345 ctdb_addr_to_str(&ips->ips[j].addr)));
2346 need_takeover_run = true;
2350 talloc_free(ips);
2352 if (!ctdb->do_checkpublicip) {
2353 goto done;
2356 /* Validate the IP addresses that this node has on network
2357 * interfaces. If there is an inconsistency between reality
2358 * and the state expected by CTDB then try to fix it by
2359 * triggering an IP reallocation or releasing extraneous IP
2360 * addresses. */
2362 /* Read *known* IPs from local node */
2363 ret = ctdb_ctrl_get_public_ips_flags(
2364 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
2365 if (ret != 0) {
2366 DEBUG(DEBUG_ERR, ("Unable to retrieve known public IPs\n"));
2367 talloc_free(mem_ctx);
2368 return -1;
2371 for (j=0; j<ips->num; j++) {
2372 if (ips->ips[j].pnn == pnn) {
2373 if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2374 DEBUG(DEBUG_ERR,
2375 ("Assigned IP %s not on an interface\n",
2376 ctdb_addr_to_str(&ips->ips[j].addr)));
2377 need_takeover_run = true;
2379 } else {
2380 if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2381 DEBUG(DEBUG_ERR,
2382 ("IP %s incorrectly on an interface\n",
2383 ctdb_addr_to_str(&ips->ips[j].addr)));
2384 need_takeover_run = true;
2389 done:
2390 if (need_takeover_run) {
2391 struct ctdb_srvid_message rd;
2392 TDB_DATA data;
2394 DEBUG(DEBUG_NOTICE,("Trigger takeoverrun\n"));
2396 ZERO_STRUCT(rd);
2397 rd.pnn = ctdb->pnn;
2398 rd.srvid = 0;
2399 data.dptr = (uint8_t *)&rd;
2400 data.dsize = sizeof(rd);
2402 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2403 if (ret != 0) {
2404 DEBUG(DEBUG_ERR,
2405 ("Failed to send takeover run request\n"));
2408 talloc_free(mem_ctx);
2409 return 0;
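/* Callback for the async GET_NODEMAP controls sent by
 * get_remote_nodemaps(): store each remote node's reply in the slot of
 * the caller's array that matches the replying node's PNN.
 */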
2413 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2415 struct ctdb_node_map_old **remote_nodemaps = callback_data;
2417 if (node_pnn >= ctdb->num_nodes) {
2418 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2419 return;
2422 remote_nodemaps[node_pnn] = (struct ctdb_node_map_old *)talloc_steal(remote_nodemaps, outdata.dptr);
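/* Fetch the nodemap from every active node in parallel.  The caller
 * supplies a remote_nodemaps array with one slot per node; slots for
 * nodes that did not reply are left untouched (NULL, as initialised by
 * the caller).
 */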
2426 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2427 struct ctdb_node_map_old *nodemap,
2428 struct ctdb_node_map_old **remote_nodemaps)
2430 uint32_t *nodes;
2432 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2433 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2434 nodes, 0,
2435 CONTROL_TIMEOUT(), false, tdb_null,
2436 async_getnodemap_callback,
2437 NULL,
2438 remote_nodemaps) != 0) {
2439 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
2441 return -1;
2444 return 0;
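/* Check that the current recovery master is still usable from this
 * node's point of view.  Returns false (normally after forcing an
 * election) if the recmaster is unknown, lacks CTDB_CAP_RECMASTER
 * while we have it, has been deleted, is disconnected or is inactive,
 * or if its nodemap cannot be fetched.
 */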
2447 static bool validate_recovery_master(struct ctdb_recoverd *rec,
2448 TALLOC_CTX *mem_ctx)
2450 struct ctdb_context *ctdb = rec->ctdb;
2451 uint32_t pnn = ctdb_get_pnn(ctdb);
2452 struct ctdb_node_map_old *nodemap = rec->nodemap;
2453 struct ctdb_node_map_old *recmaster_nodemap = NULL;
2454 int ret;
2456 /* When recovery daemon is started, recmaster is set to
2457 * "unknown" so it knows to start an election.
2459 if (rec->recmaster == CTDB_UNKNOWN_PNN) {
2460 DEBUG(DEBUG_NOTICE,
2461 ("Initial recovery master set - forcing election\n"));
2462 force_election(rec, pnn, nodemap);
2463 return false;
2467 * If the current recmaster does not have CTDB_CAP_RECMASTER,
2468 * but we have, then force an election and try to become the new
2469 * recmaster.
2471 if (!ctdb_node_has_capabilities(rec->caps,
2472 rec->recmaster,
2473 CTDB_CAP_RECMASTER) &&
2474 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
2475 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
2476 DEBUG(DEBUG_ERR,
2477 (" Current recmaster node %u does not have CAP_RECMASTER,"
2478 " but we (node %u) have - force an election\n",
2479 rec->recmaster, pnn));
2480 force_election(rec, pnn, nodemap);
2481 return false;
2484 /* Verify that the master node has not been deleted. This
2485 * should not happen because a node should always be shutdown
2486 * before being deleted, causing a new master to be elected
2487 * before now. However, if something strange has happened
2488 * then checking here will ensure we don't index beyond the
2489 * end of the nodemap array. */
2490 if (rec->recmaster >= nodemap->num) {
2491 DEBUG(DEBUG_ERR,
2492 ("Recmaster node %u has been deleted. Force election\n",
2493 rec->recmaster));
2494 force_election(rec, pnn, nodemap);
2495 return false;
2498 /* if recovery master is disconnected/deleted we must elect a new recmaster */
2499 if (nodemap->nodes[rec->recmaster].flags &
2500 (NODE_FLAGS_DISCONNECTED|NODE_FLAGS_DELETED)) {
2501 DEBUG(DEBUG_NOTICE,
2502 ("Recmaster node %u is disconnected/deleted. Force election\n",
2503 rec->recmaster));
2504 force_election(rec, pnn, nodemap);
2505 return false;
2508 /* get nodemap from the recovery master to check if it is inactive */
2509 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2510 mem_ctx, &recmaster_nodemap);
2511 if (ret != 0) {
2512 DEBUG(DEBUG_ERR,
2513 (__location__
2514 " Unable to get nodemap from recovery master %u\n",
2515 rec->recmaster));
2516 /* No election, just error */
2517 return false;
2521 if ((recmaster_nodemap->nodes[rec->recmaster].flags & NODE_FLAGS_INACTIVE) &&
2522 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
2523 DEBUG(DEBUG_NOTICE,
2524 ("Recmaster node %u is inactive. Force election\n",
2525 rec->recmaster));
2527 * update our nodemap to carry the recmaster's notion of
2528 * its own flags, so that we don't keep freezing the
2529 * inactive recmaster node...
2531 nodemap->nodes[rec->recmaster].flags =
2532 recmaster_nodemap->nodes[rec->recmaster].flags;
2533 force_election(rec, pnn, nodemap);
2534 return false;
2537 return true;
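/* One pass of the recovery daemon's monitoring work: check that the
 * main daemon is alive, refresh tunables, runstate, nodemap and flags,
 * handle the stopped/banned case, validate the recovery master and,
 * if this node is the recmaster, compare nodemaps, node flags and
 * vnnmaps across the cluster, triggering a recovery or an IP takeover
 * run when inconsistencies are found.
 */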
2540 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
2541 TALLOC_CTX *mem_ctx)
2543 uint32_t pnn;
2544 struct ctdb_node_map_old *nodemap=NULL;
2545 struct ctdb_node_map_old **remote_nodemaps=NULL;
2546 struct ctdb_vnn_map *vnnmap=NULL;
2547 struct ctdb_vnn_map *remote_vnnmap=NULL;
2548 uint32_t num_lmasters;
2549 int32_t debug_level;
2550 int i, j, ret;
2551 bool self_ban;
2554 /* verify that the main daemon is still running */
2555 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
2556 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2557 exit(-1);
2560 /* ping the local daemon to tell it we are alive */
2561 ctdb_ctrl_recd_ping(ctdb);
2563 if (rec->election_timeout) {
2564 /* an election is in progress */
2565 return;
2568 /* read the debug level from the parent and update locally */
2569 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2570 if (ret != 0) {
2571 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2572 return;
2574 DEBUGLEVEL = debug_level;
2576 /* get relevant tunables */
2577 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2578 if (ret != 0) {
2579 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2580 return;
2583 /* get runstate */
2584 ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
2585 CTDB_CURRENT_NODE, &ctdb->runstate);
2586 if (ret != 0) {
2587 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
2588 return;
2591 pnn = ctdb_get_pnn(ctdb);
2593 /* get nodemap */
2594 TALLOC_FREE(rec->nodemap);
2595 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
2596 if (ret != 0) {
2597 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
2598 return;
2600 nodemap = rec->nodemap;
2602 /* remember our own node flags */
2603 rec->node_flags = nodemap->nodes[pnn].flags;
2605 ban_misbehaving_nodes(rec, &self_ban);
2606 if (self_ban) {
2607 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
2608 return;
2611 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2612 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2613 if (ret != 0) {
2614 D_ERR("Failed to read recmode from local node\n");
2615 return;
2618 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
2619 also frozen and that the recmode is set to active.
2621 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
2622 /* If this node has become inactive then we want to
2623 * reduce the chances of it taking over the recovery
2624 * master role when it becomes active again. This
2625 * helps to stabilise the recovery master role so that
2626 * it stays on the most stable node.
2628 rec->priority_time = timeval_current();
2630 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2631 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
2633 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2634 if (ret != 0) {
2635 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
2637 return;
2640 if (! rec->frozen_on_inactive) {
2641 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(),
2642 CTDB_CURRENT_NODE);
2643 if (ret != 0) {
2644 DEBUG(DEBUG_ERR,
2645 (__location__ " Failed to freeze node "
2646 "in STOPPED or BANNED state\n"));
2647 return;
2650 rec->frozen_on_inactive = true;
2653 /* If this node is stopped or banned then it is not the recovery
2654 * master, so don't do anything. This prevents a stopped or banned
2655 * node from starting an election and sending unnecessary controls.
2657 return;
2660 rec->frozen_on_inactive = false;
2662 /* Retrieve capabilities from all connected nodes */
2663 ret = update_capabilities(rec, nodemap);
2664 if (ret != 0) {
2665 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2666 return;
2669 if (! validate_recovery_master(rec, mem_ctx)) {
2670 return;
2673 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2674 /* Check if an IP takeover run is needed and trigger one if
2675 * necessary */
2676 verify_local_ip_allocation(ctdb, rec, pnn, nodemap);
2679 /* if we are not the recmaster then we do not need to check
2680 if recovery is needed
2682 if (pnn != rec->recmaster) {
2683 return;
2687 /* ensure our local copies of flags are right */
2688 ret = update_local_flags(rec, nodemap);
2689 if (ret != 0) {
2690 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
2691 return;
2694 if (ctdb->num_nodes != nodemap->num) {
2695 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
2696 ctdb_load_nodes_file(ctdb);
2697 return;
2700 /* verify that all active nodes agree that we are the recmaster */
2701 switch (verify_recmaster(rec, nodemap, pnn)) {
2702 case MONITOR_RECOVERY_NEEDED:
2703 /* can not happen */
2704 return;
2705 case MONITOR_ELECTION_NEEDED:
2706 force_election(rec, pnn, nodemap);
2707 return;
2708 case MONITOR_OK:
2709 break;
2710 case MONITOR_FAILED:
2711 return;
2715 /* get the vnnmap */
2716 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2717 if (ret != 0) {
2718 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2719 return;
2722 if (rec->need_recovery) {
2723 /* a previous recovery didn't finish */
2724 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2725 return;
2728 /* verify that all active nodes are in normal mode
2729 and not in recovery mode
2731 switch (verify_recmode(ctdb, nodemap)) {
2732 case MONITOR_RECOVERY_NEEDED:
2733 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2734 return;
2735 case MONITOR_FAILED:
2736 return;
2737 case MONITOR_ELECTION_NEEDED:
2738 /* can not happen */
2739 case MONITOR_OK:
2740 break;
2744 if (ctdb->recovery_lock != NULL) {
2745 /* We must already hold the recovery lock */
2746 if (!ctdb_recovery_have_lock(rec)) {
2747 DEBUG(DEBUG_ERR,("Failed recovery lock sanity check. Force a recovery\n"));
2748 ctdb_set_culprit(rec, ctdb->pnn);
2749 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2750 return;
2755 /* If recoveries are disabled then there is no use doing any
2756 * nodemap or flags checks. Recoveries might be disabled due
2757 * to "reloadnodes", so doing these checks might cause an
2758 * unnecessary recovery. */
2759 if (ctdb_op_is_disabled(rec->recovery)) {
2760 goto takeover_run_checks;
2763 /* get the nodemap for all active remote nodes
2765 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map_old *, nodemap->num);
2766 if (remote_nodemaps == NULL) {
2767 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
2768 return;
2770 for(i=0; i<nodemap->num; i++) {
2771 remote_nodemaps[i] = NULL;
2773 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
2774 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
2775 return;
2778 /* verify that all other nodes have the same nodemap as we have
2780 for (j=0; j<nodemap->num; j++) {
2781 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2782 continue;
2785 if (remote_nodemaps[j] == NULL) {
2786 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
2787 ctdb_set_culprit(rec, j);
2789 return;
2792 /* if the nodes disagree on how many nodes there are
2793 then this is a good reason to try recovery
2795 if (remote_nodemaps[j]->num != nodemap->num) {
2796 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
2797 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
2798 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2799 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2800 return;
2803 /* if the nodes disagree on which nodes exist and are
2804 active, then that is also a good reason to do recovery
2806 for (i=0;i<nodemap->num;i++) {
2807 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
2808 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
2809 nodemap->nodes[j].pnn, i,
2810 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
2811 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2812 do_recovery(rec, mem_ctx, pnn, nodemap,
2813 vnnmap);
2814 return;
2820 * Update node flags obtained from each active node. This ensures we have
2821 * up-to-date information for all the nodes.
2823 for (j=0; j<nodemap->num; j++) {
2824 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2825 continue;
2827 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
2830 for (j=0; j<nodemap->num; j++) {
2831 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2832 continue;
2835 /* verify the flags are consistent
2837 for (i=0; i<nodemap->num; i++) {
2838 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2839 continue;
2842 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
2843 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
2844 nodemap->nodes[j].pnn,
2845 nodemap->nodes[i].pnn,
2846 remote_nodemaps[j]->nodes[i].flags,
2847 nodemap->nodes[i].flags));
2848 if (i == j) {
2849 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
2850 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
2851 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2852 do_recovery(rec, mem_ctx, pnn, nodemap,
2853 vnnmap);
2854 return;
2855 } else {
2856 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
2857 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
2858 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2859 do_recovery(rec, mem_ctx, pnn, nodemap,
2860 vnnmap);
2861 return;
2868 /* count how many active lmaster-capable nodes there are */
2869 num_lmasters = 0;
2870 for (i=0; i<nodemap->num; i++) {
2871 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2872 if (ctdb_node_has_capabilities(rec->caps,
2873 ctdb->nodes[i]->pnn,
2874 CTDB_CAP_LMASTER)) {
2875 num_lmasters++;
2881 /* There must be the same number of lmasters in the vnn map as
2882 * there are active nodes with the lmaster capability... or
2883 * do a recovery.
2885 if (vnnmap->size != num_lmasters) {
2886 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
2887 vnnmap->size, num_lmasters));
2888 ctdb_set_culprit(rec, ctdb->pnn);
2889 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2890 return;
2893 /* verify that all active nodes in the nodemap also exist in
2894 the vnnmap.
2896 for (j=0; j<nodemap->num; j++) {
2897 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2898 continue;
2900 if (nodemap->nodes[j].pnn == pnn) {
2901 continue;
2904 for (i=0; i<vnnmap->size; i++) {
2905 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
2906 break;
2909 if (i == vnnmap->size) {
2910 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
2911 nodemap->nodes[j].pnn));
2912 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2913 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2914 return;
2919 /* verify that all other nodes have the same vnnmap
2920 and are from the same generation
2922 for (j=0; j<nodemap->num; j++) {
2923 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2924 continue;
2926 if (nodemap->nodes[j].pnn == pnn) {
2927 continue;
2930 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
2931 mem_ctx, &remote_vnnmap);
2932 if (ret != 0) {
2933 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
2934 nodemap->nodes[j].pnn));
2935 return;
2938 /* verify the vnnmap generation is the same */
2939 if (vnnmap->generation != remote_vnnmap->generation) {
2940 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
2941 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
2942 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2943 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2944 return;
2947 /* verify the vnnmap size is the same */
2948 if (vnnmap->size != remote_vnnmap->size) {
2949 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
2950 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
2951 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2952 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2953 return;
2956 /* verify the vnnmap is the same */
2957 for (i=0;i<vnnmap->size;i++) {
2958 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
2959 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
2960 nodemap->nodes[j].pnn));
2961 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2962 do_recovery(rec, mem_ctx, pnn, nodemap,
2963 vnnmap);
2964 return;
2969 /* FIXME: Add remote public IP checking to ensure that nodes
2970 * have the IP addresses that are allocated to them. */
2972 takeover_run_checks:
2974 /* If there are IP takeover runs requested or the previous one
2975 * failed then perform one and notify the waiters */
2976 if (!ctdb_op_is_disabled(rec->takeover_run) &&
2977 (rec->reallocate_requests || rec->need_takeover_run)) {
2978 process_ipreallocate_requests(ctdb, rec);
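/* SIGTERM handler: release the recovery lock (if held) and exit. */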
2982 static void recd_sig_term_handler(struct tevent_context *ev,
2983 struct tevent_signal *se, int signum,
2984 int count, void *dont_care,
2985 void *private_data)
2987 struct ctdb_recoverd *rec = talloc_get_type_abort(
2988 private_data, struct ctdb_recoverd);
2990 DEBUG(DEBUG_ERR, ("Received SIGTERM, exiting\n"));
2991 ctdb_recovery_unlock(rec);
2992 exit(0);
2997 the main monitoring loop
2999 static void monitor_cluster(struct ctdb_context *ctdb)
3001 struct tevent_signal *se;
3002 struct ctdb_recoverd *rec;
3004 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
3006 rec = talloc_zero(ctdb, struct ctdb_recoverd);
3007 CTDB_NO_MEMORY_FATAL(ctdb, rec);
3009 rec->ctdb = ctdb;
3010 rec->recmaster = CTDB_UNKNOWN_PNN;
3011 rec->recovery_lock_handle = NULL;
3013 rec->takeover_run = ctdb_op_init(rec, "takeover runs");
3014 CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);
3016 rec->recovery = ctdb_op_init(rec, "recoveries");
3017 CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);
3019 rec->priority_time = timeval_current();
3020 rec->frozen_on_inactive = false;
3022 se = tevent_add_signal(ctdb->ev, ctdb, SIGTERM, 0,
3023 recd_sig_term_handler, rec);
3024 if (se == NULL) {
3025 DEBUG(DEBUG_ERR, ("Failed to install SIGTERM handler\n"));
3026 exit(1);
3029 /* register a message port for sending memory dumps */
3030 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
3032 /* when a node is assigned banning credits */
3033 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING,
3034 banning_handler, rec);
3036 /* register a message port for recovery elections */
3037 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec);
3039 /* when nodes are disabled/enabled */
3040 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
3042 /* when we are asked to push out a flag change */
3043 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
3045 /* register a message port for vacuum fetch */
3046 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
3048 /* register a message port for reloadnodes */
3049 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
3051 /* register a message port for performing a takeover run */
3052 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
3054 /* register a message port for disabling the ip check for a short while */
3055 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
3057 /* register a message port for forcing a rebalance of a node at the next
3058 reallocation */
3059 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
3061 /* Register a message port for disabling takeover runs */
3062 ctdb_client_set_message_handler(ctdb,
3063 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
3064 disable_takeover_runs_handler, rec);
3066 /* Register a message port for disabling recoveries */
3067 ctdb_client_set_message_handler(ctdb,
3068 CTDB_SRVID_DISABLE_RECOVERIES,
3069 disable_recoveries_handler, rec);
3071 /* register a message port for detaching database */
3072 ctdb_client_set_message_handler(ctdb,
3073 CTDB_SRVID_DETACH_DATABASE,
3074 detach_database_handler, rec);
3076 for (;;) {
3077 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3078 struct timeval start;
3079 double elapsed;
3081 if (!mem_ctx) {
3082 DEBUG(DEBUG_CRIT,(__location__
3083 " Failed to create temp context\n"));
3084 exit(-1);
3087 start = timeval_current();
3088 main_loop(ctdb, rec, mem_ctx);
3089 talloc_free(mem_ctx);
3091 /* we only check for recovery once every second */
3092 elapsed = timeval_elapsed(&start);
3093 if (elapsed < ctdb->tunable.recover_interval) {
3094 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
3095 - elapsed);
3101 event handler for when the main ctdbd dies
3103 static void ctdb_recoverd_parent(struct tevent_context *ev,
3104 struct tevent_fd *fde,
3105 uint16_t flags, void *private_data)
3107 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3108 _exit(1);
3112 called regularly to verify that the recovery daemon is still running
3114 static void ctdb_check_recd(struct tevent_context *ev,
3115 struct tevent_timer *te,
3116 struct timeval yt, void *p)
3118 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
3120 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
3121 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
3123 tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
3124 ctdb_restart_recd, ctdb);
3126 return;
3129 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3130 timeval_current_ofs(30, 0),
3131 ctdb_check_recd, ctdb);
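/* SIGCHLD handler: reap any exited children so the recovery daemon
 * does not leave zombies behind.
 */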
3134 static void recd_sig_child_handler(struct tevent_context *ev,
3135 struct tevent_signal *se, int signum,
3136 int count, void *dont_care,
3137 void *private_data)
3139 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3140 int status;
3141 pid_t pid = -1;
3143 while (pid != 0) {
3144 pid = waitpid(-1, &status, WNOHANG);
3145 if (pid == -1) {
3146 if (errno != ECHILD) {
3147 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3149 return;
3151 if (pid > 0) {
3152 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3158 start up the recovery daemon as a child of the main ctdb daemon
3160 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3162 int fd[2];
3163 struct tevent_signal *se;
3164 struct tevent_fd *fde;
3165 int ret;
3167 if (pipe(fd) != 0) {
3168 return -1;
3171 ctdb->recoverd_pid = ctdb_fork(ctdb);
3172 if (ctdb->recoverd_pid == -1) {
3173 return -1;
3176 if (ctdb->recoverd_pid != 0) {
3177 talloc_free(ctdb->recd_ctx);
3178 ctdb->recd_ctx = talloc_new(ctdb);
3179 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
3181 close(fd[0]);
3182 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3183 timeval_current_ofs(30, 0),
3184 ctdb_check_recd, ctdb);
3185 return 0;
3188 close(fd[1]);
3190 srandom(getpid() ^ time(NULL));
3192 ret = logging_init(ctdb, NULL, NULL, "ctdb-recoverd");
3193 if (ret != 0) {
3194 return -1;
3197 prctl_set_comment("ctdb_recoverd");
3198 if (switch_from_server_to_client(ctdb) != 0) {
3199 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3200 exit(1);
3203 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
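/* The parent keeps fd[1] open while this child watches fd[0]: when
 * the main daemon exits, fd[0] sees EOF, becomes readable and
 * ctdb_recoverd_parent() terminates this recovery daemon.
 */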
3205 fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
3206 ctdb_recoverd_parent, &fd[0]);
3207 tevent_fd_set_auto_close(fde);
3209 /* set up a handler to pick up sigchld */
3210 se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
3211 recd_sig_child_handler, ctdb);
3212 if (se == NULL) {
3213 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3214 exit(1);
3217 monitor_cluster(ctdb);
3219 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3220 return -1;
3224 shutdown the recovery daemon
3226 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3228 if (ctdb->recoverd_pid == 0) {
3229 return;
3232 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3233 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
3235 TALLOC_FREE(ctdb->recd_ctx);
3236 TALLOC_FREE(ctdb->recd_ping_count);
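/* Timer callback used by ctdb_check_recd(): stop any stale recovery
 * daemon and start a fresh one.
 */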
3239 static void ctdb_restart_recd(struct tevent_context *ev,
3240 struct tevent_timer *te,
3241 struct timeval t, void *private_data)
3243 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3245 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
3246 ctdb_stop_recoverd(ctdb);
3247 ctdb_start_recoverd(ctdb);