ctdb/server/ctdb_recoverd.c

   1 /*
   2    ctdb recovery daemon
   3
   4    Copyright (C) Ronnie Sahlberg  2007
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20 #include "replace.h"
  21 #include "system/filesys.h"
  22 #include "system/time.h"
  23 #include "system/network.h"
  24 #include "system/wait.h"
  25
  26 #include <popt.h>
  27 #include <talloc.h>
  28 #include <tevent.h>
  29 #include <tdb.h>
  30
  31 #include "lib/tdb_wrap/tdb_wrap.h"
  32 #include "lib/util/dlinklist.h"
  33 #include "lib/util/debug.h"
  34 #include "lib/util/samba_util.h"
  35 #include "lib/util/util_process.h"
  36
  37 #include "ctdb_private.h"
  38 #include "ctdb_client.h"
  39
  40 #include "common/system.h"
  41 #include "common/cmdline.h"
  42 #include "common/common.h"
  43 #include "common/logging.h"
  44
  45 #include "ctdb_cluster_mutex.h"
  46
  47 /* List of SRVID requests that need to be processed */
  48 struct srvid_list {
  49         struct srvid_list *next, *prev;
  50         struct ctdb_srvid_message *request;
  51 };
  52
  53 struct srvid_requests {
  54         struct srvid_list *requests;
  55 };
  56
  57 static void srvid_request_reply(struct ctdb_context *ctdb,
  58                                 struct ctdb_srvid_message *request,
  59                                 TDB_DATA result)
  60 {
  61         /* Someone that sent srvid==0 does not want a reply */
  62         if (request->srvid == 0) {
  63                 talloc_free(request);
  64                 return;
  65         }
  66
  67         if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
  68                                      result) == 0) {
  69                 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
  70                                   (unsigned)request->pnn,
  71                                   (unsigned long long)request->srvid));
  72         } else {
  73                 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
  74                                  (unsigned)request->pnn,
  75                                  (unsigned long long)request->srvid));
  76         }
  77
  78         talloc_free(request);
  79 }
  80
  81 static void srvid_requests_reply(struct ctdb_context *ctdb,
  82                                  struct srvid_requests **requests,
  83                                  TDB_DATA result)
  84 {
  85         struct srvid_list *r;
  86
  87         if (*requests == NULL) {
  88                 return;
  89         }
  90
  91         for (r = (*requests)->requests; r != NULL; r = r->next) {
  92                 srvid_request_reply(ctdb, r->request, result);
  93         }
  94
  95         /* Free the list structure... */
  96         TALLOC_FREE(*requests);
  97 }
  98
  99 static void srvid_request_add(struct ctdb_context *ctdb,
 100                               struct srvid_requests **requests,
 101                               struct ctdb_srvid_message *request)
 102 {
 103         struct srvid_list *t;
 104         int32_t ret;
 105         TDB_DATA result;
 106
 107         if (*requests == NULL) {
 108                 *requests = talloc_zero(ctdb, struct srvid_requests);
 109                 if (*requests == NULL) {
 110                         goto nomem;
 111                 }
 112         }
 113
 114         t = talloc_zero(*requests, struct srvid_list);
 115         if (t == NULL) {
 116                 /* If *requests was just allocated above then free it */
 117                 if ((*requests)->requests == NULL) {
 118                         TALLOC_FREE(*requests);
 119                 }
 120                 goto nomem;
 121         }
 122
 123         t->request = (struct ctdb_srvid_message *)talloc_steal(t, request);
 124         DLIST_ADD((*requests)->requests, t);
 125
 126         return;
 127
 128 nomem:
 129         /* Failed to add the request to the list.  Send a fail. */
 130         DEBUG(DEBUG_ERR, (__location__
 131                           " Out of memory, failed to queue SRVID request\n"));
 132         ret = -ENOMEM;
 133         result.dsize = sizeof(ret);
 134         result.dptr = (uint8_t *)&ret;
 135         srvid_request_reply(ctdb, request, result);
 136 }
 137
 138 /* An abstraction to allow an operation (takeover runs, recoveries,
 139  * ...) to be disabled for a given timeout */
 140 struct ctdb_op_state {
 141         struct tevent_timer *timer;
 142         bool in_progress;
 143         const char *name;
 144 };
 145
 146 static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
 147 {
 148         struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
 149
 150         if (state != NULL) {
 151                 state->in_progress = false;
 152                 state->name = name;
 153         }
 154
 155         return state;
 156 }
 157
 158 static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
 159 {
 160         return state->timer != NULL;
 161 }
 162
 163 static bool ctdb_op_begin(struct ctdb_op_state *state)
 164 {
 165         if (ctdb_op_is_disabled(state)) {
 166                 DEBUG(DEBUG_NOTICE,
 167                       ("Unable to begin - %s are disabled\n", state->name));
 168                 return false;
 169         }
 170
 171         state->in_progress = true;
 172         return true;
 173 }
 174
 175 static bool ctdb_op_end(struct ctdb_op_state *state)
 176 {
 177         return state->in_progress = false;
 178 }
 179
 180 static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
 181 {
 182         return state->in_progress;
 183 }
 184
 185 static void ctdb_op_enable(struct ctdb_op_state *state)
 186 {
 187         TALLOC_FREE(state->timer);
 188 }
 189
 190 static void ctdb_op_timeout_handler(struct tevent_context *ev,
 191                                     struct tevent_timer *te,
 192                                     struct timeval yt, void *p)
 193 {
 194         struct ctdb_op_state *state =
 195                 talloc_get_type(p, struct ctdb_op_state);
 196
 197         DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
 198         ctdb_op_enable(state);
 199 }
 200
 201 static int ctdb_op_disable(struct ctdb_op_state *state,
 202                            struct tevent_context *ev,
 203                            uint32_t timeout)
 204 {
 205         if (timeout == 0) {
 206                 DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
 207                 ctdb_op_enable(state);
 208                 return 0;
 209         }
 210
 211         if (state->in_progress) {
 212                 DEBUG(DEBUG_ERR,
 213                       ("Unable to disable %s - in progress\n", state->name));
 214                 return -EAGAIN;
 215         }
 216
 217         DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
 218                             state->name, timeout));
 219
 220         /* Clear any old timers */
 221         talloc_free(state->timer);
 222
 223         /* Arrange for the timeout to occur */
 224         state->timer = tevent_add_timer(ev, state,
 225                                         timeval_current_ofs(timeout, 0),
 226                                         ctdb_op_timeout_handler, state);
 227         if (state->timer == NULL) {
 228                 DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
 229                 return -ENOMEM;
 230         }
 231
 232         return 0;
 233 }
 234
 235 struct ctdb_banning_state {
 236         uint32_t count;
 237         struct timeval last_reported_time;
 238 };
 239
 240 /*
 241   private state of recovery daemon
 242  */
 243 struct ctdb_recoverd {
 244         struct ctdb_context *ctdb;
 245         uint32_t recmaster;
 246         uint32_t last_culprit_node;
 247         struct ctdb_node_map_old *nodemap;
 248         struct timeval priority_time;
 249         bool need_takeover_run;
 250         bool need_recovery;
 251         uint32_t node_flags;
 252         struct tevent_timer *send_election_te;
 253         struct tevent_timer *election_timeout;
 254         struct srvid_requests *reallocate_requests;
 255         struct ctdb_op_state *takeover_run;
 256         struct ctdb_op_state *recovery;
 257         struct ctdb_iface_list_old *ifaces;
 258         uint32_t *force_rebalance_nodes;
 259         struct ctdb_node_capabilities *caps;
 260         bool frozen_on_inactive;
 261         struct ctdb_cluster_mutex_handle *recovery_lock_handle;
 262 };
 263
 264 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
 265 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
 266
 267 static void ctdb_restart_recd(struct tevent_context *ev,
 268                               struct tevent_timer *te, struct timeval t,
 269                               void *private_data);
 270
 271 /*
 272   ban a node for a period of time
 273  */
 274 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
 275 {
 276         int ret;
 277         struct ctdb_context *ctdb = rec->ctdb;
 278         struct ctdb_ban_state bantime;
 279
 280         if (!ctdb_validate_pnn(ctdb, pnn)) {
 281                 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
 282                 return;
 283         }
 284
 285         DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
 286
 287         bantime.pnn  = pnn;
 288         bantime.time = ban_time;
 289
 290         ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
 291         if (ret != 0) {
 292                 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
 293                 return;
 294         }
 295
 296 }
 297
 298 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
 299
 300
 301 /*
 302   remember the trouble maker
 303  */
 304 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
 305 {
 306         struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
 307         struct ctdb_banning_state *ban_state;
 308
 309         if (culprit > ctdb->num_nodes) {
 310                 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
 311                 return;
 312         }
 313
 314         /* If we are banned or stopped, do not set other nodes as culprits */
 315         if (rec->node_flags & NODE_FLAGS_INACTIVE) {
 316                 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
 317                 return;
 318         }
 319
 320         if (ctdb->nodes[culprit]->ban_state == NULL) {
 321                 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
 322                 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
 323
 324
 325         }
 326         ban_state = ctdb->nodes[culprit]->ban_state;
 327         if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
 328                 /* this was the first time in a long while this node
 329                    misbehaved so we will forgive any old transgressions.
 330                 */
 331                 ban_state->count = 0;
 332         }
 333
 334         ban_state->count += count;
 335         ban_state->last_reported_time = timeval_current();
 336         rec->last_culprit_node = culprit;
 337 }
 338
 339 /*
 340   remember the trouble maker
 341  */
 342 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
 343 {
 344         ctdb_set_culprit_count(rec, culprit, 1);
 345 }
 346
 347
 348 /* this callback is called for every node that failed to execute the
 349    recovered event
 350 */
 351 static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
 352 {
 353         struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
 354
 355         DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));
 356
 357         ctdb_set_culprit(rec, node_pnn);
 358 }
 359
 360 /*
 361   run the "recovered" eventscript on all nodes
 362  */
 363 static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, const char *caller)
 364 {
 365         TALLOC_CTX *tmp_ctx;
 366         uint32_t *nodes;
 367         struct ctdb_context *ctdb = rec->ctdb;
 368
 369         tmp_ctx = talloc_new(ctdb);
 370         CTDB_NO_MEMORY(ctdb, tmp_ctx);
 371
 372         nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
 373         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
 374                                         nodes, 0,
 375                                         CONTROL_TIMEOUT(), false, tdb_null,
 376                                         NULL, recovered_fail_callback,
 377                                         rec) != 0) {
 378                 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
 379
 380                 talloc_free(tmp_ctx);
 381                 return -1;
 382         }
 383
 384         talloc_free(tmp_ctx);
 385         return 0;
 386 }
 387
 388 /* this callback is called for every node that failed to execute the
 389    start recovery event
 390 */
 391 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
 392 {
 393         struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
 394
 395         DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
 396
 397         ctdb_set_culprit(rec, node_pnn);
 398 }
 399
 400 /*
 401   run the "startrecovery" eventscript on all nodes
 402  */
 403 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
 404 {
 405         TALLOC_CTX *tmp_ctx;
 406         uint32_t *nodes;
 407         struct ctdb_context *ctdb = rec->ctdb;
 408
 409         tmp_ctx = talloc_new(ctdb);
 410         CTDB_NO_MEMORY(ctdb, tmp_ctx);
 411
 412         nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
 413         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
 414                                         nodes, 0,
 415                                         CONTROL_TIMEOUT(), false, tdb_null,
 416                                         NULL,
 417                                         startrecovery_fail_callback,
 418                                         rec) != 0) {
 419                 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
 420                 talloc_free(tmp_ctx);
 421                 return -1;
 422         }
 423
 424         talloc_free(tmp_ctx);
 425         return 0;
 426 }
 427
 428 /*
 429   Retrieve capabilities from all connected nodes
 430  */
 431 static int update_capabilities(struct ctdb_recoverd *rec,
 432                                struct ctdb_node_map_old *nodemap)
 433 {
 434         uint32_t *capp;
 435         TALLOC_CTX *tmp_ctx;
 436         struct ctdb_node_capabilities *caps;
 437         struct ctdb_context *ctdb = rec->ctdb;
 438
 439         tmp_ctx = talloc_new(rec);
 440         CTDB_NO_MEMORY(ctdb, tmp_ctx);
 441
 442         caps = ctdb_get_capabilities(ctdb, tmp_ctx,
 443                                      CONTROL_TIMEOUT(), nodemap);
 444
 445         if (caps == NULL) {
 446                 DEBUG(DEBUG_ERR,
 447                       (__location__ " Failed to get node capabilities\n"));
 448                 talloc_free(tmp_ctx);
 449                 return -1;
 450         }
 451
 452         capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
 453         if (capp == NULL) {
 454                 DEBUG(DEBUG_ERR,
 455                       (__location__
 456                        " Capabilities don't include current node.\n"));
 457                 talloc_free(tmp_ctx);
 458                 return -1;
 459         }
 460         ctdb->capabilities = *capp;
 461
 462         TALLOC_FREE(rec->caps);
 463         rec->caps = talloc_steal(rec, caps);
 464
 465         talloc_free(tmp_ctx);
 466         return 0;
 467 }
 468
 469 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
 470 {
 471         struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
 472
 473         DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
 474         ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
 475 }
 476
 477 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
 478 {
 479         struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
 480
 481         DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
 482         ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
 483 }
 484
 485 /*
 486   change recovery mode on all nodes
 487  */
 488 static int set_recovery_mode(struct ctdb_context *ctdb,
 489                              struct ctdb_recoverd *rec,
 490                              struct ctdb_node_map_old *nodemap,
 491                              uint32_t rec_mode, bool freeze)
 492 {
 493         TDB_DATA data;
 494         uint32_t *nodes;
 495         TALLOC_CTX *tmp_ctx;
 496
 497         tmp_ctx = talloc_new(ctdb);
 498         CTDB_NO_MEMORY(ctdb, tmp_ctx);
 499
 500         nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
 501
 502         data.dsize = sizeof(uint32_t);
 503         data.dptr = (unsigned char *)&rec_mode;
 504
 505         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
 506                                         nodes, 0,
 507                                         CONTROL_TIMEOUT(),
 508                                         false, data,
 509                                         NULL, NULL,
 510                                         NULL) != 0) {
 511                 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
 512                 talloc_free(tmp_ctx);
 513                 return -1;
 514         }
 515
 516         /* freeze all nodes */
 517         if (freeze && rec_mode == CTDB_RECOVERY_ACTIVE) {
 518                 int i;
 519
 520                 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
 521                         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
 522                                                 nodes, i,
 523                                                 CONTROL_TIMEOUT(),
 524                                                 false, tdb_null,
 525                                                 NULL,
 526                                                 set_recmode_fail_callback,
 527                                                 rec) != 0) {
 528                                 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
 529                                 talloc_free(tmp_ctx);
 530                                 return -1;
 531                         }
 532                 }
 533         }
 534
 535         talloc_free(tmp_ctx);
 536         return 0;
 537 }
 538
 539 /* update all remote nodes to use the same db priority that we have
 540    this can fail if the remove node has not yet been upgraded to
 541    support this function, so we always return success and never fail
 542    a recovery if this call fails.
 543 */
 544 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
 545         struct ctdb_node_map_old *nodemap,
 546         uint32_t pnn, struct ctdb_dbid_map_old *dbmap, TALLOC_CTX *mem_ctx)
 547 {
 548         int db;
 549
 550         /* step through all local databases */
 551         for (db=0; db<dbmap->num;db++) {
 552                 struct ctdb_db_priority db_prio;
 553                 int ret;
 554
 555                 db_prio.db_id     = dbmap->dbs[db].db_id;
 556                 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].db_id, &db_prio.priority);
 557                 if (ret != 0) {
 558                         DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].db_id));
 559                         continue;
 560                 }
 561
 562                 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].db_id, db_prio.priority));
 563
 564                 ret = ctdb_ctrl_set_db_priority(ctdb, CONTROL_TIMEOUT(),
 565                                                 CTDB_CURRENT_NODE, &db_prio);
 566                 if (ret != 0) {
 567                         DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n",
 568                                          db_prio.db_id));
 569                 }
 570         }
 571
 572         return 0;
 573 }
 574
 575 /*
 576   ensure all other nodes have attached to any databases that we have
 577  */
 578 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
 579                                            uint32_t pnn, struct ctdb_dbid_map_old *dbmap, TALLOC_CTX *mem_ctx)
 580 {
 581         int i, j, db, ret;
 582         struct ctdb_dbid_map_old *remote_dbmap;
 583
 584         /* verify that all other nodes have all our databases */
 585         for (j=0; j<nodemap->num; j++) {
 586                 /* we don't need to ourself ourselves */
 587                 if (nodemap->nodes[j].pnn == pnn) {
 588                         continue;
 589                 }
 590                 /* don't check nodes that are unavailable */
 591                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
 592                         continue;
 593                 }
 594
 595                 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
 596                                          mem_ctx, &remote_dbmap);
 597                 if (ret != 0) {
 598                         DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
 599                         return -1;
 600                 }
 601
 602                 /* step through all local databases */
 603                 for (db=0; db<dbmap->num;db++) {
 604                         const char *name;
 605
 606
 607                         for (i=0;i<remote_dbmap->num;i++) {
 608                                 if (dbmap->dbs[db].db_id == remote_dbmap->dbs[i].db_id) {
 609                                         break;
 610                                 }
 611                         }
 612                         /* the remote node already have this database */
 613                         if (i!=remote_dbmap->num) {
 614                                 continue;
 615                         }
 616                         /* ok so we need to create this database */
 617                         ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
 618                                                   dbmap->dbs[db].db_id, mem_ctx,
 619                                                   &name);
 620                         if (ret != 0) {
 621                                 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
 622                                 return -1;
 623                         }
 624                         ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
 625                                                  nodemap->nodes[j].pnn,
 626                                                  mem_ctx, name,
 627                                                  dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
 628                         if (ret != 0) {
 629                                 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
 630                                 return -1;
 631                         }
 632                 }
 633         }
 634
 635         return 0;
 636 }
 637
 638
 639 /*
 640   ensure we are attached to any databases that anyone else is attached to
 641  */
 642 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
 643                                           uint32_t pnn, struct ctdb_dbid_map_old **dbmap, TALLOC_CTX *mem_ctx)
 644 {
 645         int i, j, db, ret;
 646         struct ctdb_dbid_map_old *remote_dbmap;
 647
 648         /* verify that we have all database any other node has */
 649         for (j=0; j<nodemap->num; j++) {
 650                 /* we don't need to ourself ourselves */
 651                 if (nodemap->nodes[j].pnn == pnn) {
 652                         continue;
 653                 }
 654                 /* don't check nodes that are unavailable */
 655                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
 656                         continue;
 657                 }
 658
 659                 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
 660                                          mem_ctx, &remote_dbmap);
 661                 if (ret != 0) {
 662                         DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
 663                         return -1;
 664                 }
 665
 666                 /* step through all databases on the remote node */
 667                 for (db=0; db<remote_dbmap->num;db++) {
 668                         const char *name;
 669
 670                         for (i=0;i<(*dbmap)->num;i++) {
 671                                 if (remote_dbmap->dbs[db].db_id == (*dbmap)->dbs[i].db_id) {
 672                                         break;
 673                                 }
 674                         }
 675                         /* we already have this db locally */
 676                         if (i!=(*dbmap)->num) {
 677                                 continue;
 678                         }
 679                         /* ok so we need to create this database and
 680                            rebuild dbmap
 681                          */
 682                         ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
 683                                             remote_dbmap->dbs[db].db_id, mem_ctx, &name);
 684                         if (ret != 0) {
 685                                 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
 686                                           nodemap->nodes[j].pnn));
 687                                 return -1;
 688                         }
 689                         ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
 690                                            remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
 691                         if (ret != 0) {
 692                                 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
 693                                 return -1;
 694                         }
 695                         ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
 696                         if (ret != 0) {
 697                                 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
 698                                 return -1;
 699                         }
 700                 }
 701         }
 702
 703         return 0;
 704 }
 705
 706
 707 /*
 708   pull the remote database contents from one node into the recdb
 709  */
 710 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
 711                                     struct tdb_wrap *recdb, uint32_t dbid)
 712 {
 713         int ret;
 714         TDB_DATA outdata;
 715         struct ctdb_marshall_buffer *reply;
 716         struct ctdb_rec_data_old *recdata;
 717         int i;
 718         TALLOC_CTX *tmp_ctx = talloc_new(recdb);
 719
 720         ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
 721                                CONTROL_TIMEOUT(), &outdata);
 722         if (ret != 0) {
 723                 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
 724                 talloc_free(tmp_ctx);
 725                 return -1;
 726         }
 727
 728         reply = (struct ctdb_marshall_buffer *)outdata.dptr;
 729
 730         if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
 731                 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
 732                 talloc_free(tmp_ctx);
 733                 return -1;
 734         }
 735
 736         recdata = (struct ctdb_rec_data_old *)&reply->data[0];
 737
 738         for (i=0;
 739              i<reply->count;
 740              recdata = (struct ctdb_rec_data_old *)(recdata->length + (uint8_t *)recdata), i++) {
 741                 TDB_DATA key, data;
 742                 struct ctdb_ltdb_header *hdr;
 743                 TDB_DATA existing;
 744
 745                 key.dptr = &recdata->data[0];
 746                 key.dsize = recdata->keylen;
 747                 data.dptr = &recdata->data[key.dsize];
 748                 data.dsize = recdata->datalen;
 749
 750                 hdr = (struct ctdb_ltdb_header *)data.dptr;
 751
 752                 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
 753                         DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
 754                         talloc_free(tmp_ctx);
 755                         return -1;
 756                 }
 757
 758                 /* fetch the existing record, if any */
 759                 existing = tdb_fetch(recdb->tdb, key);
 760
 761                 if (existing.dptr != NULL) {
 762                         struct ctdb_ltdb_header header;
 763                         if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
 764                                 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
 765                                          (unsigned)existing.dsize, srcnode));
 766                                 free(existing.dptr);
 767                                 talloc_free(tmp_ctx);
 768                                 return -1;
 769                         }
 770                         header = *(struct ctdb_ltdb_header *)existing.dptr;
 771                         free(existing.dptr);
 772                         if (!(header.rsn < hdr->rsn ||
 773                               (header.dmaster != ctdb_get_pnn(ctdb) &&
 774                                header.rsn == hdr->rsn))) {
 775                                 continue;
 776                         }
 777                 }
 778
 779                 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
 780                         DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
 781                         talloc_free(tmp_ctx);
 782                         return -1;
 783                 }
 784         }
 785
 786         talloc_free(tmp_ctx);
 787
 788         return 0;
 789 }
 790
 791
 792 struct pull_seqnum_cbdata {
 793         int failed;
 794         uint32_t pnn;
 795         uint64_t seqnum;
 796 };
 797
 798 static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
 799 {
 800         struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
 801         uint64_t seqnum;
 802
 803         if (cb_data->failed != 0) {
 804                 DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
 805                 return;
 806         }
 807
 808         if (res != 0) {
 809                 DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
 810                 cb_data->failed = 1;
 811                 return;
 812         }
 813
 814         if (outdata.dsize != sizeof(uint64_t)) {
 815                 DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
 816                 cb_data->failed = -1;
 817                 return;
 818         }
 819
 820         seqnum = *((uint64_t *)outdata.dptr);
 821
 822         if (seqnum > cb_data->seqnum ||
 823             (cb_data->pnn == -1 && seqnum == 0)) {
 824                 cb_data->seqnum = seqnum;
 825                 cb_data->pnn = node_pnn;
 826         }
 827 }
 828
 829 static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
 830 {
 831         struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
 832
 833         DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
 834         cb_data->failed = 1;
 835 }
 836
 837 static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
 838                                 struct ctdb_recoverd *rec,
 839                                 struct ctdb_node_map_old *nodemap,
 840                                 struct tdb_wrap *recdb, uint32_t dbid)
 841 {
 842         TALLOC_CTX *tmp_ctx = talloc_new(NULL);
 843         uint32_t *nodes;
 844         TDB_DATA data;
 845         uint32_t outdata[2];
 846         struct pull_seqnum_cbdata *cb_data;
 847
 848         DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));
 849
 850         outdata[0] = dbid;
 851         outdata[1] = 0;
 852
 853         data.dsize = sizeof(outdata);
 854         data.dptr  = (uint8_t *)&outdata[0];
 855
 856         cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
 857         if (cb_data == NULL) {
 858                 DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
 859                 talloc_free(tmp_ctx);
 860                 return -1;
 861         }
 862
 863         cb_data->failed = 0;
 864         cb_data->pnn    = -1;
 865         cb_data->seqnum = 0;
 866
 867         nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
 868         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
 869                                         nodes, 0,
 870                                         CONTROL_TIMEOUT(), false, data,
 871                                         pull_seqnum_cb,
 872                                         pull_seqnum_fail_cb,
 873                                         cb_data) != 0) {
 874                 DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));
 875
 876                 talloc_free(tmp_ctx);
 877                 return -1;
 878         }
 879
 880         if (cb_data->failed != 0) {
 881                 DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
 882                 talloc_free(tmp_ctx);
 883                 return -1;
 884         }
 885
 886         if (cb_data->pnn == -1) {
 887                 DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
 888                 talloc_free(tmp_ctx);
 889                 return -1;
 890         }
 891
 892         DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum));
 893
 894         if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
 895                 DEBUG(DEBUG_ERR, ("Failed to pull higest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
 896                 talloc_free(tmp_ctx);
 897                 return -1;
 898         }
 899
 900         talloc_free(tmp_ctx);
 901         return 0;
 902 }
 903
 904
 905 /*
 906   pull all the remote database contents into the recdb
 907  */
 908 static int pull_remote_database(struct ctdb_context *ctdb,
 909                                 struct ctdb_recoverd *rec,
 910                                 struct ctdb_node_map_old *nodemap,
 911                                 struct tdb_wrap *recdb, uint32_t dbid,
 912                                 bool persistent)
 913 {
 914         int j;
 915
 916         if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
 917                 int ret;
 918                 ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
 919                 if (ret == 0) {
 920                         return 0;
 921                 }
 922         }
 923
 924         /* pull all records from all other nodes across onto this node
 925            (this merges based on rsn)
 926         */
 927         for (j=0; j<nodemap->num; j++) {
 928                 /* don't merge from nodes that are unavailable */
 929                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
 930                         continue;
 931                 }
 932                 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
 933                         DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
 934                                  nodemap->nodes[j].pnn));
 935                         ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
 936                         return -1;
 937                 }
 938         }
 939
 940         return 0;
 941 }
 942
 943
 944 /*
 945   update flags on all active nodes
 946  */
 947 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap, uint32_t pnn, uint32_t flags)
 948 {
 949         int ret;
 950
 951         ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
 952                 if (ret != 0) {
 953                 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
 954                 return -1;
 955         }
 956
 957         return 0;
 958 }
 959
 960 /*
 961   ensure all nodes have the same vnnmap we do
 962  */
 963 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
 964                                       uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
 965 {
 966         int j, ret;
 967
 968         /* push the new vnn map out to all the nodes */
 969         for (j=0; j<nodemap->num; j++) {
 970                 /* don't push to nodes that are unavailable */
 971                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
 972                         continue;
 973                 }
 974
 975                 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
 976                 if (ret != 0) {
 977                         DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
 978                         return -1;
 979                 }
 980         }
 981
 982         return 0;
 983 }
 984
 985
 986 /*
 987   called when a vacuum fetch has completed - just free it and do the next one
 988  */
 989 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
 990 {
 991         talloc_free(state);
 992 }
 993
 994
 995 /**
 996  * Process one elements of the vacuum fetch list:
 997  * Migrate it over to us with the special flag
 998  * CTDB_CALL_FLAG_VACUUM_MIGRATION.
 999  */
1000 static bool vacuum_fetch_process_one(struct ctdb_db_context *ctdb_db,
1001                                      uint32_t pnn,
1002                                      struct ctdb_rec_data_old *r)
1003 {
1004         struct ctdb_client_call_state *state;
1005         TDB_DATA data;
1006         struct ctdb_ltdb_header *hdr;
1007         struct ctdb_call call;
1008
1009         ZERO_STRUCT(call);
1010         call.call_id = CTDB_NULL_FUNC;
1011         call.flags = CTDB_IMMEDIATE_MIGRATION;
1012         call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
1013
1014         call.key.dptr = &r->data[0];
1015         call.key.dsize = r->keylen;
1016
1017         /* ensure we don't block this daemon - just skip a record if we can't get
1018            the chainlock */
1019         if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, call.key) != 0) {
1020                 return true;
1021         }
1022
1023         data = tdb_fetch(ctdb_db->ltdb->tdb, call.key);
1024         if (data.dptr == NULL) {
1025                 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
1026                 return true;
1027         }
1028
1029         if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
1030                 free(data.dptr);
1031                 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
1032                 return true;
1033         }
1034
1035         hdr = (struct ctdb_ltdb_header *)data.dptr;
1036         if (hdr->dmaster == pnn) {
1037                 /* its already local */
1038                 free(data.dptr);
1039                 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
1040                 return true;
1041         }
1042
1043         free(data.dptr);
1044
1045         state = ctdb_call_send(ctdb_db, &call);
1046         tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
1047         if (state == NULL) {
1048                 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
1049                 return false;
1050         }
1051         state->async.fn = vacuum_fetch_callback;
1052         state->async.private_data = NULL;
1053
1054         return true;
1055 }
1056
1057
1058 /*
1059   handler for vacuum fetch
1060 */
1061 static void vacuum_fetch_handler(uint64_t srvid, TDB_DATA data,
1062                                  void *private_data)
1063 {
1064         struct ctdb_recoverd *rec = talloc_get_type(
1065                 private_data, struct ctdb_recoverd);
1066         struct ctdb_context *ctdb = rec->ctdb;
1067         struct ctdb_marshall_buffer *recs;
1068         int ret, i;
1069         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1070         const char *name;
1071         struct ctdb_dbid_map_old *dbmap=NULL;
1072         bool persistent = false;
1073         struct ctdb_db_context *ctdb_db;
1074         struct ctdb_rec_data_old *r;
1075
1076         recs = (struct ctdb_marshall_buffer *)data.dptr;
1077
1078         if (recs->count == 0) {
1079                 goto done;
1080         }
1081
1082         /* work out if the database is persistent */
1083         ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
1084         if (ret != 0) {
1085                 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
1086                 goto done;
1087         }
1088
1089         for (i=0;i<dbmap->num;i++) {
1090                 if (dbmap->dbs[i].db_id == recs->db_id) {
1091                         persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
1092                         break;
1093                 }
1094         }
1095         if (i == dbmap->num) {
1096                 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
1097                 goto done;
1098         }
1099
1100         /* find the name of this database */
1101         if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
1102                 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
1103                 goto done;
1104         }
1105
1106         /* attach to it */
1107         ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
1108         if (ctdb_db == NULL) {
1109                 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
1110                 goto done;
1111         }
1112
1113         r = (struct ctdb_rec_data_old *)&recs->data[0];
1114         while (recs->count) {
1115                 bool ok;
1116
1117                 ok = vacuum_fetch_process_one(ctdb_db, rec->ctdb->pnn, r);
1118                 if (!ok) {
1119                         break;
1120                 }
1121
1122                 r = (struct ctdb_rec_data_old *)(r->length + (uint8_t *)r);
1123                 recs->count--;
1124         }
1125
1126 done:
1127         talloc_free(tmp_ctx);
1128 }
1129
1130
1131 /*
1132  * handler for database detach
1133  */
1134 static void detach_database_handler(uint64_t srvid, TDB_DATA data,
1135                                     void *private_data)
1136 {
1137         struct ctdb_recoverd *rec = talloc_get_type(
1138                 private_data, struct ctdb_recoverd);
1139         struct ctdb_context *ctdb = rec->ctdb;
1140         uint32_t db_id;
1141         struct ctdb_db_context *ctdb_db;
1142
1143         if (data.dsize != sizeof(db_id)) {
1144                 return;
1145         }
1146         db_id = *(uint32_t *)data.dptr;
1147
1148         ctdb_db = find_ctdb_db(ctdb, db_id);
1149         if (ctdb_db == NULL) {
1150                 /* database is not attached */
1151                 return;
1152         }
1153
1154         DLIST_REMOVE(ctdb->db_list, ctdb_db);
1155
1156         DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
1157                              ctdb_db->db_name));
1158         talloc_free(ctdb_db);
1159 }
1160
1161 /*
1162   called when ctdb_wait_timeout should finish
1163  */
1164 static void ctdb_wait_handler(struct tevent_context *ev,
1165                               struct tevent_timer *te,
1166                               struct timeval yt, void *p)
1167 {
1168         uint32_t *timed_out = (uint32_t *)p;
1169         (*timed_out) = 1;
1170 }
1171
1172 /*
1173   wait for a given number of seconds
1174  */
1175 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
1176 {
1177         uint32_t timed_out = 0;
1178         time_t usecs = (secs - (time_t)secs) * 1000000;
1179         tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
1180                          ctdb_wait_handler, &timed_out);
1181         while (!timed_out) {
1182                 tevent_loop_once(ctdb->ev);
1183         }
1184 }
1185
1186 /*
1187   called when an election times out (ends)
1188  */
1189 static void ctdb_election_timeout(struct tevent_context *ev,
1190                                   struct tevent_timer *te,
1191                                   struct timeval t, void *p)
1192 {
1193         struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1194         rec->election_timeout = NULL;
1195         fast_start = false;
1196
1197         DEBUG(DEBUG_WARNING,("Election period ended\n"));
1198 }
1199
1200
1201 /*
1202   wait for an election to finish. It finished election_timeout seconds after
1203   the last election packet is received
1204  */
1205 static void ctdb_wait_election(struct ctdb_recoverd *rec)
1206 {
1207         struct ctdb_context *ctdb = rec->ctdb;
1208         while (rec->election_timeout) {
1209                 tevent_loop_once(ctdb->ev);
1210         }
1211 }
1212
1213 /*
1214   Update our local flags from all remote connected nodes.
1215   This is only run when we are or we belive we are the recovery master
1216  */
1217 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
1218 {
1219         int j;
1220         struct ctdb_context *ctdb = rec->ctdb;
1221         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1222
1223         /* get the nodemap for all active remote nodes and verify
1224            they are the same as for this node
1225          */
1226         for (j=0; j<nodemap->num; j++) {
1227                 struct ctdb_node_map_old *remote_nodemap=NULL;
1228                 int ret;
1229
1230                 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
1231                         continue;
1232                 }
1233                 if (nodemap->nodes[j].pnn == ctdb->pnn) {
1234                         continue;
1235                 }
1236
1237                 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1238                                            mem_ctx, &remote_nodemap);
1239                 if (ret != 0) {
1240                         DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
1241                                   nodemap->nodes[j].pnn));
1242                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
1243                         talloc_free(mem_ctx);
1244                         return -1;
1245                 }
1246                 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
1247                         /* We should tell our daemon about this so it
1248                            updates its flags or else we will log the same
1249                            message again in the next iteration of recovery.
1250                            Since we are the recovery master we can just as
1251                            well update the flags on all nodes.
1252                         */
1253                         ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
1254                         if (ret != 0) {
1255                                 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
1256                                 return -1;
1257                         }
1258
1259                         /* Update our local copy of the flags in the recovery
1260                            daemon.
1261                         */
1262                         DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1263                                  nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
1264                                  nodemap->nodes[j].flags));
1265                         nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1266                 }
1267                 talloc_free(remote_nodemap);
1268         }
1269         talloc_free(mem_ctx);
1270         return 0;
1271 }
1272
1273
1274 /* Create a new random generation id.
1275    The generation id can not be the INVALID_GENERATION id
1276 */
1277 static uint32_t new_generation(void)
1278 {
1279         uint32_t generation;
1280
1281         while (1) {
1282                 generation = random();
1283
1284                 if (generation != INVALID_GENERATION) {
1285                         break;
1286                 }
1287         }
1288
1289         return generation;
1290 }
1291
1292
1293 /*
1294   create a temporary working database
1295  */
1296 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1297 {
1298         char *name;
1299         struct tdb_wrap *recdb;
1300         unsigned tdb_flags;
1301
1302         /* open up the temporary recovery database */
1303         name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1304                                ctdb->db_directory_state,
1305                                ctdb->pnn);
1306         if (name == NULL) {
1307                 return NULL;
1308         }
1309         unlink(name);
1310
1311         tdb_flags = TDB_NOLOCK;
1312         if (ctdb->valgrinding) {
1313                 tdb_flags |= TDB_NOMMAP;
1314         }
1315         tdb_flags |= (TDB_INCOMPATIBLE_HASH | TDB_DISALLOW_NESTING);
1316
1317         recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1318                               tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1319         if (recdb == NULL) {
1320                 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1321         }
1322
1323         talloc_free(name);
1324
1325         return recdb;
1326 }
1327
1328
1329 /*
1330    a traverse function for pulling all relevant records from recdb
1331  */
1332 struct recdb_data {
1333         struct ctdb_context *ctdb;
1334         struct ctdb_marshall_buffer *recdata;
1335         uint32_t len;
1336         uint32_t allocated_len;
1337         bool failed;
1338         bool persistent;
1339 };
1340
1341 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1342 {
1343         struct recdb_data *params = (struct recdb_data *)p;
1344         struct ctdb_rec_data_old *recdata;
1345         struct ctdb_ltdb_header *hdr;
1346
1347         /*
1348          * skip empty records - but NOT for persistent databases:
1349          *
1350          * The record-by-record mode of recovery deletes empty records.
1351          * For persistent databases, this can lead to data corruption
1352          * by deleting records that should be there:
1353          *
1354          * - Assume the cluster has been running for a while.
1355          *
1356          * - A record R in a persistent database has been created and
1357          *   deleted a couple of times, the last operation being deletion,
1358          *   leaving an empty record with a high RSN, say 10.
1359          *
1360          * - Now a node N is turned off.
1361          *
1362          * - This leaves the local database copy of D on N with the empty
1363          *   copy of R and RSN 10. On all other nodes, the recovery has deleted
1364          *   the copy of record R.
1365          *
1366          * - Now the record is created again while node N is turned off.
1367          *   This creates R with RSN = 1 on all nodes except for N.
1368          *
1369          * - Now node N is turned on again. The following recovery will chose
1370          *   the older empty copy of R due to RSN 10 > RSN 1.
1371          *
1372          * ==> Hence the record is gone after the recovery.
1373          *
1374          * On databases like Samba's registry, this can damage the higher-level
1375          * data structures built from the various tdb-level records.
1376          */
1377         if (!params->persistent && data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1378                 return 0;
1379         }
1380
1381         /* update the dmaster field to point to us */
1382         hdr = (struct ctdb_ltdb_header *)data.dptr;
1383         if (!params->persistent) {
1384                 hdr->dmaster = params->ctdb->pnn;
1385                 hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
1386         }
1387
1388         /* add the record to the blob ready to send to the nodes */
1389         recdata = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1390         if (recdata == NULL) {
1391                 params->failed = true;
1392                 return -1;
1393         }
1394         if (params->len + recdata->length >= params->allocated_len) {
1395                 params->allocated_len = recdata->length + params->len + params->ctdb->tunable.pulldb_preallocation_size;
1396                 params->recdata = talloc_realloc_size(NULL, params->recdata, params->allocated_len);
1397         }
1398         if (params->recdata == NULL) {
1399                 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u\n",
1400                          recdata->length + params->len));
1401                 params->failed = true;
1402                 return -1;
1403         }
1404         params->recdata->count++;
1405         memcpy(params->len+(uint8_t *)params->recdata, recdata, recdata->length);
1406         params->len += recdata->length;
1407         talloc_free(recdata);
1408
1409         return 0;
1410 }
1411
1412 /*
1413   push the recdb database out to all nodes
1414  */
1415 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1416                                bool persistent,
1417                                struct tdb_wrap *recdb, struct ctdb_node_map_old *nodemap)
1418 {
1419         struct recdb_data params;
1420         struct ctdb_marshall_buffer *recdata;
1421         TDB_DATA outdata;
1422         TALLOC_CTX *tmp_ctx;
1423         uint32_t *nodes;
1424
1425         tmp_ctx = talloc_new(ctdb);
1426         CTDB_NO_MEMORY(ctdb, tmp_ctx);
1427
1428         recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1429         CTDB_NO_MEMORY(ctdb, recdata);
1430
1431         recdata->db_id = dbid;
1432
1433         params.ctdb = ctdb;
1434         params.recdata = recdata;
1435         params.len = offsetof(struct ctdb_marshall_buffer, data);
1436         params.allocated_len = params.len;
1437         params.failed = false;
1438         params.persistent = persistent;
1439
1440         if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
1441                 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1442                 talloc_free(params.recdata);
1443                 talloc_free(tmp_ctx);
1444                 return -1;
1445         }
1446
1447         if (params.failed) {
1448                 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1449                 talloc_free(params.recdata);
1450                 talloc_free(tmp_ctx);
1451                 return -1;
1452         }
1453
1454         recdata = params.recdata;
1455
1456         outdata.dptr = (void *)recdata;
1457         outdata.dsize = params.len;
1458
1459         nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1460         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1461                                         nodes, 0,
1462                                         CONTROL_TIMEOUT(), false, outdata,
1463                                         NULL, NULL,
1464                                         NULL) != 0) {
1465                 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1466                 talloc_free(recdata);
1467                 talloc_free(tmp_ctx);
1468                 return -1;
1469         }
1470
1471         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1472                   dbid, recdata->count));
1473
1474         talloc_free(recdata);
1475         talloc_free(tmp_ctx);
1476
1477         return 0;
1478 }
1479
1480
1481 /*
1482   go through a full recovery on one database
1483  */
1484 static int recover_database(struct ctdb_recoverd *rec,
1485                             TALLOC_CTX *mem_ctx,
1486                             uint32_t dbid,
1487                             bool persistent,
1488                             uint32_t pnn,
1489                             struct ctdb_node_map_old *nodemap,
1490                             uint32_t transaction_id)
1491 {
1492         struct tdb_wrap *recdb;
1493         int ret;
1494         struct ctdb_context *ctdb = rec->ctdb;
1495         TDB_DATA data;
1496         struct ctdb_transdb w;
1497         uint32_t *nodes;
1498
1499         recdb = create_recdb(ctdb, mem_ctx);
1500         if (recdb == NULL) {
1501                 return -1;
1502         }
1503
1504         /* pull all remote databases onto the recdb */
1505         ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1506         if (ret != 0) {
1507                 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1508                 return -1;
1509         }
1510
1511         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1512
1513         /* wipe all the remote databases. This is safe as we are in a transaction */
1514         w.db_id = dbid;
1515         w.tid = transaction_id;
1516
1517         data.dptr = (void *)&w;
1518         data.dsize = sizeof(w);
1519
1520         nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1521         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1522                                         nodes, 0,
1523                                         CONTROL_TIMEOUT(), false, data,
1524                                         NULL, NULL,
1525                                         NULL) != 0) {
1526                 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1527                 talloc_free(recdb);
1528                 return -1;
1529         }
1530
1531         /* push out the correct database. This sets the dmaster and skips
1532            the empty records */
1533         ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1534         if (ret != 0) {
1535                 talloc_free(recdb);
1536                 return -1;
1537         }
1538
1539         /* all done with this database */
1540         talloc_free(recdb);
1541
1542         return 0;
1543 }
1544
1545 static bool ctdb_recovery_have_lock(struct ctdb_recoverd *rec)
1546 {
1547         return (rec->recovery_lock_handle != NULL);
1548 }
1549
1550 struct hold_reclock_state {
1551         bool done;
1552         bool locked;
1553         double latency;
1554 };
1555
1556 static void take_reclock_handler(char status,
1557                                  double latency,
1558                                  void *private_data)
1559 {
1560         struct hold_reclock_state *s =
1561                 (struct hold_reclock_state *) private_data;
1562
1563         switch (status) {
1564         case '0':
1565                 s->latency = latency;
1566                 break;
1567
1568         case '1':
1569                 DEBUG(DEBUG_ERR,
1570                       ("Unable to take recovery lock - contention\n"));
1571                 break;
1572
1573         default:
1574                 DEBUG(DEBUG_ERR, ("ERROR: when taking recovery lock\n"));
1575         }
1576
1577         s->done = true;
1578         s->locked = (status == '0') ;
1579 }
1580
1581 static bool ctdb_recovery_lock(struct ctdb_recoverd *rec);
1582
1583 static void lost_reclock_handler(void *private_data)
1584 {
1585         struct ctdb_recoverd *rec = talloc_get_type_abort(
1586                 private_data, struct ctdb_recoverd);
1587
1588         DEBUG(DEBUG_ERR,
1589               ("Recovery lock helper terminated unexpectedly - "
1590                "trying to retake recovery lock\n"));
1591         TALLOC_FREE(rec->recovery_lock_handle);
1592         if (! ctdb_recovery_lock(rec)) {
1593                 DEBUG(DEBUG_ERR, ("Failed to take recovery lock\n"));
1594         }
1595 }
1596
1597 static bool ctdb_recovery_lock(struct ctdb_recoverd *rec)
1598 {
1599         struct ctdb_context *ctdb = rec->ctdb;
1600         struct ctdb_cluster_mutex_handle *h;
1601         struct hold_reclock_state s = {
1602                 .done = false,
1603                 .locked = false,
1604                 .latency = 0,
1605         };
1606
1607         h = ctdb_cluster_mutex(rec, ctdb, ctdb->recovery_lock, 0,
1608                                take_reclock_handler, &s,
1609                                lost_reclock_handler, rec);
1610         if (h == NULL) {
1611                 return false;
1612         }
1613
1614         while (!s.done) {
1615                 tevent_loop_once(ctdb->ev);
1616         }
1617
1618         if (! s.locked) {
1619                 talloc_free(h);
1620                 return false;
1621         }
1622
1623         rec->recovery_lock_handle = h;
1624         ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(),
1625                                            s.latency);
1626
1627         return true;
1628 }
1629
1630 static void ctdb_recovery_unlock(struct ctdb_recoverd *rec)
1631 {
1632         if (rec->recovery_lock_handle != NULL) {
1633                 DEBUG(DEBUG_NOTICE, ("Releasing recovery lock\n"));
1634                 TALLOC_FREE(rec->recovery_lock_handle);
1635         }
1636 }
1637
1638 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
1639 {
1640         struct ctdb_context *ctdb = rec->ctdb;
1641         int i;
1642         struct ctdb_banning_state *ban_state;
1643
1644         *self_ban = false;
1645         for (i=0; i<ctdb->num_nodes; i++) {
1646                 if (ctdb->nodes[i]->ban_state == NULL) {
1647                         continue;
1648                 }
1649                 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1650                 if (ban_state->count < 2*ctdb->num_nodes) {
1651                         continue;
1652                 }
1653
1654                 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
1655                         ctdb->nodes[i]->pnn, ban_state->count,
1656                         ctdb->tunable.recovery_ban_period));
1657                 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1658                 ban_state->count = 0;
1659
1660                 /* Banning ourself? */
1661                 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
1662                         *self_ban = true;
1663                 }
1664         }
1665 }
1666
1667 static bool do_takeover_run(struct ctdb_recoverd *rec,
1668                             struct ctdb_node_map_old *nodemap)
1669 {
1670         uint32_t *nodes = NULL;
1671         struct ctdb_disable_message dtr;
1672         TDB_DATA data;
1673         int i;
1674         uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
1675         int ret;
1676         bool ok;
1677
1678         DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
1679
1680         if (ctdb_op_is_in_progress(rec->takeover_run)) {
1681                 DEBUG(DEBUG_ERR, (__location__
1682                                   " takeover run already in progress \n"));
1683                 ok = false;
1684                 goto done;
1685         }
1686
1687         if (!ctdb_op_begin(rec->takeover_run)) {
1688                 ok = false;
1689                 goto done;
1690         }
1691
1692         /* Disable IP checks (takeover runs, really) on other nodes
1693          * while doing this takeover run.  This will stop those other
1694          * nodes from triggering takeover runs when think they should
1695          * be hosting an IP but it isn't yet on an interface.  Don't
1696          * wait for replies since a failure here might cause some
1697          * noise in the logs but will not actually cause a problem.
1698          */
1699         ZERO_STRUCT(dtr);
1700         dtr.srvid = 0; /* No reply */
1701         dtr.pnn = -1;
1702
1703         data.dptr  = (uint8_t*)&dtr;
1704         data.dsize = sizeof(dtr);
1705
1706         nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
1707
1708         /* Disable for 60 seconds.  This can be a tunable later if
1709          * necessary.
1710          */
1711         dtr.timeout = 60;
1712         for (i = 0; i < talloc_array_length(nodes); i++) {
1713                 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1714                                              CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1715                                              data) != 0) {
1716                         DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
1717                 }
1718         }
1719
1720         ret = ctdb_takeover_run(rec->ctdb, nodemap,
1721                                 rec->force_rebalance_nodes);
1722
1723         /* Reenable takeover runs and IP checks on other nodes */
1724         dtr.timeout = 0;
1725         for (i = 0; i < talloc_array_length(nodes); i++) {
1726                 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1727                                              CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1728                                              data) != 0) {
1729                         DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n"));
1730                 }
1731         }
1732
1733         if (ret != 0) {
1734                 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
1735                 ok = false;
1736                 goto done;
1737         }
1738
1739         ok = true;
1740         /* Takeover run was successful so clear force rebalance targets */
1741         if (rebalance_nodes == rec->force_rebalance_nodes) {
1742                 TALLOC_FREE(rec->force_rebalance_nodes);
1743         } else {
1744                 DEBUG(DEBUG_WARNING,
1745                       ("Rebalance target nodes changed during takeover run - not clearing\n"));
1746         }
1747 done:
1748         rec->need_takeover_run = !ok;
1749         talloc_free(nodes);
1750         ctdb_op_end(rec->takeover_run);
1751
1752         DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
1753         return ok;
1754 }
1755
1756 struct recovery_helper_state {
1757         int fd[2];
1758         pid_t pid;
1759         int result;
1760         bool done;
1761 };
1762
1763 static void ctdb_recovery_handler(struct tevent_context *ev,
1764                                   struct tevent_fd *fde,
1765                                   uint16_t flags, void *private_data)
1766 {
1767         struct recovery_helper_state *state = talloc_get_type_abort(
1768                 private_data, struct recovery_helper_state);
1769         int ret;
1770
1771         ret = sys_read(state->fd[0], &state->result, sizeof(state->result));
1772         if (ret != sizeof(state->result)) {
1773                 state->result = EPIPE;
1774         }
1775
1776         state->done = true;
1777 }
1778
1779
1780 static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
1781 {
1782         static char prog[PATH_MAX+1] = "";
1783         const char **args;
1784         struct recovery_helper_state *state;
1785         struct tevent_fd *fde;
1786         int nargs, ret;
1787
1788         if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog),
1789                              "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
1790                              "ctdb_recovery_helper")) {
1791                 ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
1792         }
1793
1794         state = talloc_zero(mem_ctx, struct recovery_helper_state);
1795         if (state == NULL) {
1796                 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1797                 return -1;
1798         }
1799
1800         state->pid = -1;
1801
1802         ret = pipe(state->fd);
1803         if (ret != 0) {
1804                 DEBUG(DEBUG_ERR,
1805                       ("Failed to create pipe for recovery helper\n"));
1806                 goto fail;
1807         }
1808
1809         set_close_on_exec(state->fd[0]);
1810
1811         nargs = 4;
1812         args = talloc_array(state, const char *, nargs);
1813         if (args == NULL) {
1814                 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1815                 goto fail;
1816         }
1817
1818         args[0] = talloc_asprintf(args, "%d", state->fd[1]);
1819         args[1] = rec->ctdb->daemon.name;
1820         args[2] = talloc_asprintf(args, "%u", new_generation());
1821         args[3] = NULL;
1822
1823         if (args[0] == NULL || args[2] == NULL) {
1824                 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1825                 goto fail;
1826         }
1827
1828         setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1);
1829
1830         if (!ctdb_vfork_with_logging(state, rec->ctdb, "recovery", prog, nargs,
1831                                      args, NULL, NULL, &state->pid)) {
1832                 DEBUG(DEBUG_ERR,
1833                       ("Failed to create child for recovery helper\n"));
1834                 goto fail;
1835         }
1836
1837         close(state->fd[1]);
1838         state->fd[1] = -1;
1839
1840         state->done = false;
1841
1842         fde = tevent_add_fd(rec->ctdb->ev, rec->ctdb, state->fd[0],
1843                             TEVENT_FD_READ, ctdb_recovery_handler, state);
1844         if (fde == NULL) {
1845                 goto fail;
1846         }
1847         tevent_fd_set_auto_close(fde);
1848
1849         while (!state->done) {
1850                 tevent_loop_once(rec->ctdb->ev);
1851         }
1852
1853         close(state->fd[0]);
1854         state->fd[0] = -1;
1855
1856         if (state->result != 0) {
1857                 goto fail;
1858         }
1859
1860         ctdb_kill(rec->ctdb, state->pid, SIGKILL);
1861         talloc_free(state);
1862         return 0;
1863
1864 fail:
1865         if (state->fd[0] != -1) {
1866                 close(state->fd[0]);
1867         }
1868         if (state->fd[1] != -1) {
1869                 close(state->fd[1]);
1870         }
1871         if (state->pid != -1) {
1872                 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
1873         }
1874         talloc_free(state);
1875         return -1;
1876 }
1877
1878 static int db_recovery_serial(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
1879                               uint32_t pnn, struct ctdb_node_map_old *nodemap,
1880                               struct ctdb_vnn_map *vnnmap,
1881                               struct ctdb_dbid_map_old *dbmap)
1882 {
1883         struct ctdb_context *ctdb = rec->ctdb;
1884         uint32_t generation;
1885         TDB_DATA data;
1886         uint32_t *nodes;
1887         int ret, i, j;
1888
1889         /* set recovery mode to active on all nodes */
1890         ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE, true);
1891         if (ret != 0) {
1892                 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1893                 return -1;
1894         }
1895
1896         /* execute the "startrecovery" event script on all nodes */
1897         ret = run_startrecovery_eventscript(rec, nodemap);
1898         if (ret!=0) {
1899                 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1900                 return -1;
1901         }
1902
1903         /* pick a new generation number */
1904         generation = new_generation();
1905
1906         /* change the vnnmap on this node to use the new generation
1907            number but not on any other nodes.
1908            this guarantees that if we abort the recovery prematurely
1909            for some reason (a node stops responding?)
1910            that we can just return immediately and we will reenter
1911            recovery shortly again.
1912            I.e. we deliberately leave the cluster with an inconsistent
1913            generation id to allow us to abort recovery at any stage and
1914            just restart it from scratch.
1915          */
1916         vnnmap->generation = generation;
1917         ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1918         if (ret != 0) {
1919                 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1920                 return -1;
1921         }
1922
1923         /* Database generations are updated when the transaction is commited to
1924          * the databases.  So make sure to use the final generation as the
1925          * transaction id
1926          */
1927         generation = new_generation();
1928
1929         data.dptr = (void *)&generation;
1930         data.dsize = sizeof(uint32_t);
1931
1932         nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1933         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1934                                         nodes, 0,
1935                                         CONTROL_TIMEOUT(), false, data,
1936                                         NULL,
1937                                         transaction_start_fail_callback,
1938                                         rec) != 0) {
1939                 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1940                 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1941                                         nodes, 0,
1942                                         CONTROL_TIMEOUT(), false, tdb_null,
1943                                         NULL,
1944                                         NULL,
1945                                         NULL) != 0) {
1946                         DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1947                 }
1948                 return -1;
1949         }
1950
1951         DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
1952
1953         for (i=0;i<dbmap->num;i++) {
1954                 ret = recover_database(rec, mem_ctx,
1955                                        dbmap->dbs[i].db_id,
1956                                        dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
1957                                        pnn, nodemap, generation);
1958                 if (ret != 0) {
1959                         DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].db_id));
1960                         return -1;
1961                 }
1962         }
1963
1964         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1965
1966         /* commit all the changes */
1967         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1968                                         nodes, 0,
1969                                         CONTROL_TIMEOUT(), false, data,
1970                                         NULL, NULL,
1971                                         NULL) != 0) {
1972                 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1973                 return -1;
1974         }
1975
1976         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1977
1978         /* build a new vnn map with all the currently active and
1979            unbanned nodes */
1980         vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1981         CTDB_NO_MEMORY(ctdb, vnnmap);
1982         vnnmap->generation = generation;
1983         vnnmap->size = 0;
1984         vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1985         CTDB_NO_MEMORY(ctdb, vnnmap->map);
1986         for (i=j=0;i<nodemap->num;i++) {
1987                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1988                         continue;
1989                 }
1990                 if (!ctdb_node_has_capabilities(rec->caps,
1991                                                 ctdb->nodes[i]->pnn,
1992                                                 CTDB_CAP_LMASTER)) {
1993                         /* this node can not be an lmaster */
1994                         DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
1995                         continue;
1996                 }
1997
1998                 vnnmap->size++;
1999                 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
2000                 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2001                 vnnmap->map[j++] = nodemap->nodes[i].pnn;
2002
2003         }
2004         if (vnnmap->size == 0) {
2005                 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
2006                 vnnmap->size++;
2007                 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
2008                 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2009                 vnnmap->map[0] = pnn;
2010         }
2011
2012         /* update to the new vnnmap on all nodes */
2013         ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
2014         if (ret != 0) {
2015                 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
2016                 return -1;
2017         }
2018
2019         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
2020
2021         /* disable recovery mode */
2022         ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL, false);
2023         if (ret != 0) {
2024                 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
2025                 return -1;
2026         }
2027
2028         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
2029
2030         /* execute the "recovered" event script on all nodes */
2031         ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
2032         if (ret!=0) {
2033                 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
2034                 return -1;
2035         }
2036
2037         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
2038
2039         return 0;
2040 }
2041
2042 /*
2043   we are the recmaster, and recovery is needed - start a recovery run
2044  */
2045 static int do_recovery(struct ctdb_recoverd *rec,
2046                        TALLOC_CTX *mem_ctx, uint32_t pnn,
2047                        struct ctdb_node_map_old *nodemap, struct ctdb_vnn_map *vnnmap)
2048 {
2049         struct ctdb_context *ctdb = rec->ctdb;
2050         int i, ret;
2051         struct ctdb_dbid_map_old *dbmap;
2052         bool self_ban;
2053         bool par_recovery;
2054
2055         DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
2056
2057         /* Check if the current node is still the recmaster.  It's possible that
2058          * re-election has changed the recmaster.
2059          */
2060         if (pnn != rec->recmaster) {
2061                 DEBUG(DEBUG_NOTICE,
2062                       ("Recovery master changed to %u, aborting recovery\n",
2063                        rec->recmaster));
2064                 return -1;
2065         }
2066
2067         /* if recovery fails, force it again */
2068         rec->need_recovery = true;
2069
2070         if (!ctdb_op_begin(rec->recovery)) {
2071                 return -1;
2072         }
2073
2074         if (rec->election_timeout) {
2075                 /* an election is in progress */
2076                 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
2077                 goto fail;
2078         }
2079
2080         ban_misbehaving_nodes(rec, &self_ban);
2081         if (self_ban) {
2082                 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
2083                 goto fail;
2084         }
2085
2086         if (ctdb->recovery_lock != NULL) {
2087                 if (ctdb_recovery_have_lock(rec)) {
2088                         DEBUG(DEBUG_NOTICE, ("Already holding recovery lock\n"));
2089                 } else {
2090                         DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock (%s)\n",
2091                                              ctdb->recovery_lock));
2092                         if (!ctdb_recovery_lock(rec)) {
2093                                 if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
2094                                         /* If ctdb is trying first recovery, it's
2095                                          * possible that current node does not know
2096                                          * yet who the recmaster is.
2097                                          */
2098                                         DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
2099                                                           " - retrying recovery\n"));
2100                                         goto fail;
2101                                 }
2102
2103                                 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
2104                                                  "and ban ourself for %u seconds\n",
2105                                                  ctdb->tunable.recovery_ban_period));
2106                                 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
2107                                 goto fail;
2108                         }
2109                         DEBUG(DEBUG_NOTICE,
2110                               ("Recovery lock taken successfully by recovery daemon\n"));
2111                 }
2112         }
2113
2114         DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
2115
2116         /* get a list of all databases */
2117         ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
2118         if (ret != 0) {
2119                 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
2120                 goto fail;
2121         }
2122
2123         /* we do the db creation before we set the recovery mode, so the freeze happens
2124            on all databases we will be dealing with. */
2125
2126         /* verify that we have all the databases any other node has */
2127         ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
2128         if (ret != 0) {
2129                 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
2130                 goto fail;
2131         }
2132
2133         /* verify that all other nodes have all our databases */
2134         ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
2135         if (ret != 0) {
2136                 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
2137                 goto fail;
2138         }
2139         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
2140
2141         /* update the database priority for all remote databases */
2142         ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
2143         if (ret != 0) {
2144                 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
2145         }
2146         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
2147
2148
2149         /* Retrieve capabilities from all connected nodes */
2150         ret = update_capabilities(rec, nodemap);
2151         if (ret!=0) {
2152                 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2153                 return -1;
2154         }
2155
2156         /*
2157           update all nodes to have the same flags that we have
2158          */
2159         for (i=0;i<nodemap->num;i++) {
2160                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2161                         continue;
2162                 }
2163
2164                 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
2165                 if (ret != 0) {
2166                         if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2167                                 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
2168                         } else {
2169                                 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
2170                                 return -1;
2171                         }
2172                 }
2173         }
2174
2175         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
2176
2177         /* Check if all participating nodes have parallel recovery capability */
2178         par_recovery = true;
2179         for (i=0; i<nodemap->num; i++) {
2180                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2181                         continue;
2182                 }
2183
2184                 if (!(rec->caps[i].capabilities &
2185                       CTDB_CAP_PARALLEL_RECOVERY)) {
2186                         par_recovery = false;
2187                         break;
2188                 }
2189         }
2190
2191         if (par_recovery) {
2192                 ret = db_recovery_parallel(rec, mem_ctx);
2193         } else {
2194                 ret = db_recovery_serial(rec, mem_ctx, pnn, nodemap, vnnmap,
2195                                          dbmap);
2196         }
2197
2198         if (ret != 0) {
2199                 goto fail;
2200         }
2201
2202         do_takeover_run(rec, nodemap);
2203
2204         /* send a message to all clients telling them that the cluster
2205            has been reconfigured */
2206         ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2207                                        CTDB_SRVID_RECONFIGURE, tdb_null);
2208         if (ret != 0) {
2209                 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
2210                 goto fail;
2211         }
2212
2213         DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
2214
2215         rec->need_recovery = false;
2216         ctdb_op_end(rec->recovery);
2217
2218         /* we managed to complete a full recovery, make sure to forgive
2219            any past sins by the nodes that could now participate in the
2220            recovery.
2221         */
2222         DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
2223         for (i=0;i<nodemap->num;i++) {
2224                 struct ctdb_banning_state *ban_state;
2225
2226                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2227                         continue;
2228                 }
2229
2230                 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
2231                 if (ban_state == NULL) {
2232                         continue;
2233                 }
2234
2235                 ban_state->count = 0;
2236         }
2237
2238         /* We just finished a recovery successfully.
2239            We now wait for rerecovery_timeout before we allow
2240            another recovery to take place.
2241         */
2242         DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
2243         ctdb_op_disable(rec->recovery, ctdb->ev,
2244                         ctdb->tunable.rerecovery_timeout);
2245         return 0;
2246
2247 fail:
2248         ctdb_op_end(rec->recovery);
2249         return -1;
2250 }
2251
2252
2253 /*
2254   elections are won by first checking the number of connected nodes, then
2255   the priority time, then the pnn
2256  */
2257 struct election_message {
2258         uint32_t num_connected;
2259         struct timeval priority_time;
2260         uint32_t pnn;
2261         uint32_t node_flags;
2262 };
2263
2264 /*
2265   form this nodes election data
2266  */
2267 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
2268 {
2269         int ret, i;
2270         struct ctdb_node_map_old *nodemap;
2271         struct ctdb_context *ctdb = rec->ctdb;
2272
2273         ZERO_STRUCTP(em);
2274
2275         em->pnn = rec->ctdb->pnn;
2276         em->priority_time = rec->priority_time;
2277
2278         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
2279         if (ret != 0) {
2280                 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
2281                 return;
2282         }
2283
2284         rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
2285         em->node_flags = rec->node_flags;
2286
2287         for (i=0;i<nodemap->num;i++) {
2288                 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2289                         em->num_connected++;
2290                 }
2291         }
2292
2293         /* we shouldnt try to win this election if we cant be a recmaster */
2294         if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2295                 em->num_connected = 0;
2296                 em->priority_time = timeval_current();
2297         }
2298
2299         talloc_free(nodemap);
2300 }
2301
2302 /*
2303   see if the given election data wins
2304  */
2305 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
2306 {
2307         struct election_message myem;
2308         int cmp = 0;
2309
2310         ctdb_election_data(rec, &myem);
2311
2312         /* we cant win if we don't have the recmaster capability */
2313         if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2314                 return false;
2315         }
2316
2317         /* we cant win if we are banned */
2318         if (rec->node_flags & NODE_FLAGS_BANNED) {
2319                 return false;
2320         }
2321
2322         /* we cant win if we are stopped */
2323         if (rec->node_flags & NODE_FLAGS_STOPPED) {
2324                 return false;
2325         }
2326
2327         /* we will automatically win if the other node is banned */
2328         if (em->node_flags & NODE_FLAGS_BANNED) {
2329                 return true;
2330         }
2331
2332         /* we will automatically win if the other node is banned */
2333         if (em->node_flags & NODE_FLAGS_STOPPED) {
2334                 return true;
2335         }
2336
2337         /* then the longest running node */
2338         if (cmp == 0) {
2339                 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
2340         }
2341
2342         if (cmp == 0) {
2343                 cmp = (int)myem.pnn - (int)em->pnn;
2344         }
2345
2346         return cmp > 0;
2347 }
2348
2349 /*
2350   send out an election request
2351  */
2352 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
2353 {
2354         int ret;
2355         TDB_DATA election_data;
2356         struct election_message emsg;
2357         uint64_t srvid;
2358         struct ctdb_context *ctdb = rec->ctdb;
2359
2360         srvid = CTDB_SRVID_ELECTION;
2361
2362         ctdb_election_data(rec, &emsg);
2363
2364         election_data.dsize = sizeof(struct election_message);
2365         election_data.dptr  = (unsigned char *)&emsg;
2366
2367
2368         /* first we assume we will win the election and set
2369            recoverymaster to be ourself on the current node
2370          */
2371         ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
2372                                      CTDB_CURRENT_NODE, pnn);
2373         if (ret != 0) {
2374                 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster\n"));
2375                 return -1;
2376         }
2377         rec->recmaster = pnn;
2378
2379         /* send an election message to all active nodes */
2380         DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
2381         return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
2382 }
2383
2384 /*
2385   we think we are winning the election - send a broadcast election request
2386  */
2387 static void election_send_request(struct tevent_context *ev,
2388                                   struct tevent_timer *te,
2389                                   struct timeval t, void *p)
2390 {
2391         struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2392         int ret;
2393
2394         ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
2395         if (ret != 0) {
2396                 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
2397         }
2398
2399         TALLOC_FREE(rec->send_election_te);
2400 }
2401
2402 /*
2403   handler for memory dumps
2404 */
2405 static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2406 {
2407         struct ctdb_recoverd *rec = talloc_get_type(
2408                 private_data, struct ctdb_recoverd);
2409         struct ctdb_context *ctdb = rec->ctdb;
2410         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2411         TDB_DATA *dump;
2412         int ret;
2413         struct ctdb_srvid_message *rd;
2414
2415         if (data.dsize != sizeof(struct ctdb_srvid_message)) {
2416                 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2417                 talloc_free(tmp_ctx);
2418                 return;
2419         }
2420         rd = (struct ctdb_srvid_message *)data.dptr;
2421
2422         dump = talloc_zero(tmp_ctx, TDB_DATA);
2423         if (dump == NULL) {
2424                 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
2425                 talloc_free(tmp_ctx);
2426                 return;
2427         }
2428         ret = ctdb_dump_memory(ctdb, dump);
2429         if (ret != 0) {
2430                 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
2431                 talloc_free(tmp_ctx);
2432                 return;
2433         }
2434
2435 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
2436
2437         ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
2438         if (ret != 0) {
2439                 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
2440                 talloc_free(tmp_ctx);
2441                 return;
2442         }
2443
2444         talloc_free(tmp_ctx);
2445 }
2446
2447 /*
2448   handler for reload_nodes
2449 */
2450 static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
2451                                  void *private_data)
2452 {
2453         struct ctdb_recoverd *rec = talloc_get_type(
2454                 private_data, struct ctdb_recoverd);
2455
2456         DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
2457
2458         ctdb_load_nodes_file(rec->ctdb);
2459 }
2460
2461
2462 static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
2463                                         void *private_data)
2464 {
2465         struct ctdb_recoverd *rec = talloc_get_type(
2466                 private_data, struct ctdb_recoverd);
2467         struct ctdb_context *ctdb = rec->ctdb;
2468         uint32_t pnn;
2469         uint32_t *t;
2470         int len;
2471
2472         if (rec->recmaster != ctdb_get_pnn(ctdb)) {
2473                 return;
2474         }
2475
2476         if (data.dsize != sizeof(uint32_t)) {
2477                 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
2478                 return;
2479         }
2480
2481         pnn = *(uint32_t *)&data.dptr[0];
2482
2483         DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
2484
2485         /* Copy any existing list of nodes.  There's probably some
2486          * sort of realloc variant that will do this but we need to
2487          * make sure that freeing the old array also cancels the timer
2488          * event for the timeout... not sure if realloc will do that.
2489          */
2490         len = (rec->force_rebalance_nodes != NULL) ?
2491                 talloc_array_length(rec->force_rebalance_nodes) :
2492                 0;
2493
2494         /* This allows duplicates to be added but they don't cause
2495          * harm.  A call to add a duplicate PNN arguably means that
2496          * the timeout should be reset, so this is the simplest
2497          * solution.
2498          */
2499         t = talloc_zero_array(rec, uint32_t, len+1);
2500         CTDB_NO_MEMORY_VOID(ctdb, t);
2501         if (len > 0) {
2502                 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
2503         }
2504         t[len] = pnn;
2505
2506         talloc_free(rec->force_rebalance_nodes);
2507
2508         rec->force_rebalance_nodes = t;
2509 }
2510
2511
2512
2513 static void srvid_disable_and_reply(struct ctdb_context *ctdb,
2514                                     TDB_DATA data,
2515                                     struct ctdb_op_state *op_state)
2516 {
2517         struct ctdb_disable_message *r;
2518         uint32_t timeout;
2519         TDB_DATA result;
2520         int32_t ret = 0;
2521
2522         /* Validate input data */
2523         if (data.dsize != sizeof(struct ctdb_disable_message)) {
2524                 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2525                                  "expecting %lu\n", (long unsigned)data.dsize,
2526                                  (long unsigned)sizeof(struct ctdb_srvid_message)));
2527                 return;
2528         }
2529         if (data.dptr == NULL) {
2530                 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2531                 return;
2532         }
2533
2534         r = (struct ctdb_disable_message *)data.dptr;
2535         timeout = r->timeout;
2536
2537         ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
2538         if (ret != 0) {
2539                 goto done;
2540         }
2541
2542         /* Returning our PNN tells the caller that we succeeded */
2543         ret = ctdb_get_pnn(ctdb);
2544 done:
2545         result.dsize = sizeof(int32_t);
2546         result.dptr  = (uint8_t *)&ret;
2547         srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
2548 }
2549
2550 static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
2551                                           void *private_data)
2552 {
2553         struct ctdb_recoverd *rec = talloc_get_type(
2554                 private_data, struct ctdb_recoverd);
2555
2556         srvid_disable_and_reply(rec->ctdb, data, rec->takeover_run);
2557 }
2558
2559 /* Backward compatibility for this SRVID */
2560 static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
2561                                      void *private_data)
2562 {
2563         struct ctdb_recoverd *rec = talloc_get_type(
2564                 private_data, struct ctdb_recoverd);
2565         uint32_t timeout;
2566
2567         if (data.dsize != sizeof(uint32_t)) {
2568                 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2569                                  "expecting %lu\n", (long unsigned)data.dsize,
2570                                  (long unsigned)sizeof(uint32_t)));
2571                 return;
2572         }
2573         if (data.dptr == NULL) {
2574                 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2575                 return;
2576         }
2577
2578         timeout = *((uint32_t *)data.dptr);
2579
2580         ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout);
2581 }
2582
2583 static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
2584                                        void *private_data)
2585 {
2586         struct ctdb_recoverd *rec = talloc_get_type(
2587                 private_data, struct ctdb_recoverd);
2588
2589         srvid_disable_and_reply(rec->ctdb, data, rec->recovery);
2590 }
2591
2592 /*
2593   handler for ip reallocate, just add it to the list of requests and
2594   handle this later in the monitor_cluster loop so we do not recurse
2595   with other requests to takeover_run()
2596 */
2597 static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
2598                                   void *private_data)
2599 {
2600         struct ctdb_srvid_message *request;
2601         struct ctdb_recoverd *rec = talloc_get_type(
2602                 private_data, struct ctdb_recoverd);
2603
2604         if (data.dsize != sizeof(struct ctdb_srvid_message)) {
2605                 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2606                 return;
2607         }
2608
2609         request = (struct ctdb_srvid_message *)data.dptr;
2610
2611         srvid_request_add(rec->ctdb, &rec->reallocate_requests, request);
2612 }
2613
2614 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
2615                                           struct ctdb_recoverd *rec)
2616 {
2617         TDB_DATA result;
2618         int32_t ret;
2619         struct srvid_requests *current;
2620
2621         /* Only process requests that are currently pending.  More
2622          * might come in while the takeover run is in progress and
2623          * they will need to be processed later since they might
2624          * be in response flag changes.
2625          */
2626         current = rec->reallocate_requests;
2627         rec->reallocate_requests = NULL;
2628
2629         if (do_takeover_run(rec, rec->nodemap)) {
2630                 ret = ctdb_get_pnn(ctdb);
2631         } else {
2632                 ret = -1;
2633         }
2634
2635         result.dsize = sizeof(int32_t);
2636         result.dptr  = (uint8_t *)&ret;
2637
2638         srvid_requests_reply(ctdb, &current, result);
2639 }
2640
2641 /*
2642  * handler for assigning banning credits
2643  */
2644 static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2645 {
2646         struct ctdb_recoverd *rec = talloc_get_type(
2647                 private_data, struct ctdb_recoverd);
2648         uint32_t ban_pnn;
2649
2650         /* Ignore if we are not recmaster */
2651         if (rec->ctdb->pnn != rec->recmaster) {
2652                 return;
2653         }
2654
2655         if (data.dsize != sizeof(uint32_t)) {
2656                 DEBUG(DEBUG_ERR, (__location__ "invalid data size %zu\n",
2657                                   data.dsize));
2658                 return;
2659         }
2660
2661         ban_pnn = *(uint32_t *)data.dptr;
2662
2663         ctdb_set_culprit_count(rec, ban_pnn, rec->nodemap->num);
2664 }
2665
2666 /*
2667   handler for recovery master elections
2668 */
2669 static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2670 {
2671         struct ctdb_recoverd *rec = talloc_get_type(
2672                 private_data, struct ctdb_recoverd);
2673         struct ctdb_context *ctdb = rec->ctdb;
2674         int ret;
2675         struct election_message *em = (struct election_message *)data.dptr;
2676
2677         /* Ignore election packets from ourself */
2678         if (ctdb->pnn == em->pnn) {
2679                 return;
2680         }
2681
2682         /* we got an election packet - update the timeout for the election */
2683         talloc_free(rec->election_timeout);
2684         rec->election_timeout = tevent_add_timer(
2685                         ctdb->ev, ctdb,
2686                         fast_start ?
2687                                 timeval_current_ofs(0, 500000) :
2688                                 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2689                         ctdb_election_timeout, rec);
2690
2691         /* someone called an election. check their election data
2692            and if we disagree and we would rather be the elected node,
2693            send a new election message to all other nodes
2694          */
2695         if (ctdb_election_win(rec, em)) {
2696                 if (!rec->send_election_te) {
2697                         rec->send_election_te = tevent_add_timer(
2698                                         ctdb->ev, rec,
2699                                         timeval_current_ofs(0, 500000),
2700                                         election_send_request, rec);
2701                 }
2702                 return;
2703         }
2704
2705         /* we didn't win */
2706         TALLOC_FREE(rec->send_election_te);
2707
2708         /* Release the recovery lock file */
2709         if (ctdb_recovery_have_lock(rec)) {
2710                 ctdb_recovery_unlock(rec);
2711         }
2712
2713         /* ok, let that guy become recmaster then */
2714         ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
2715                                      CTDB_CURRENT_NODE, em->pnn);
2716         if (ret != 0) {
2717                 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster"));
2718                 return;
2719         }
2720         rec->recmaster = em->pnn;
2721
2722         return;
2723 }
2724
2725
2726 /*
2727   force the start of the election process
2728  */
2729 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2730                            struct ctdb_node_map_old *nodemap)
2731 {
2732         int ret;
2733         struct ctdb_context *ctdb = rec->ctdb;
2734
2735         DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2736
2737         /* set all nodes to recovery mode to stop all internode traffic */
2738         ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE, false);
2739         if (ret != 0) {
2740                 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
2741                 return;
2742         }
2743
2744         talloc_free(rec->election_timeout);
2745         rec->election_timeout = tevent_add_timer(
2746                         ctdb->ev, ctdb,
2747                         fast_start ?
2748                                 timeval_current_ofs(0, 500000) :
2749                                 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2750                         ctdb_election_timeout, rec);
2751
2752         ret = send_election_request(rec, pnn);
2753         if (ret!=0) {
2754                 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2755                 return;
2756         }
2757
2758         /* wait for a few seconds to collect all responses */
2759         ctdb_wait_election(rec);
2760 }
2761
2762
2763
2764 /*
2765   handler for when a node changes its flags
2766 */
2767 static void monitor_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2768 {
2769         struct ctdb_recoverd *rec = talloc_get_type(
2770                 private_data, struct ctdb_recoverd);
2771         struct ctdb_context *ctdb = rec->ctdb;
2772         int ret;
2773         struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2774         struct ctdb_node_map_old *nodemap=NULL;
2775         TALLOC_CTX *tmp_ctx;
2776         int i;
2777
2778         if (data.dsize != sizeof(*c)) {
2779                 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2780                 return;
2781         }
2782
2783         tmp_ctx = talloc_new(ctdb);
2784         CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2785
2786         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2787         if (ret != 0) {
2788                 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2789                 talloc_free(tmp_ctx);
2790                 return;
2791         }
2792
2793
2794         for (i=0;i<nodemap->num;i++) {
2795                 if (nodemap->nodes[i].pnn == c->pnn) break;
2796         }
2797
2798         if (i == nodemap->num) {
2799                 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2800                 talloc_free(tmp_ctx);
2801                 return;
2802         }
2803
2804         if (c->old_flags != c->new_flags) {
2805                 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x  was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
2806         }
2807
2808         nodemap->nodes[i].flags = c->new_flags;
2809
2810         talloc_free(tmp_ctx);
2811 }
2812
2813 /*
2814   handler for when we need to push out flag changes ot all other nodes
2815 */
2816 static void push_flags_handler(uint64_t srvid, TDB_DATA data,
2817                                void *private_data)
2818 {
2819         struct ctdb_recoverd *rec = talloc_get_type(
2820                 private_data, struct ctdb_recoverd);
2821         struct ctdb_context *ctdb = rec->ctdb;
2822         int ret;
2823         struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2824         struct ctdb_node_map_old *nodemap=NULL;
2825         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2826         uint32_t *nodes;
2827
2828         /* read the node flags from the recmaster */
2829         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2830                                    tmp_ctx, &nodemap);
2831         if (ret != 0) {
2832                 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2833                 talloc_free(tmp_ctx);
2834                 return;
2835         }
2836         if (c->pnn >= nodemap->num) {
2837                 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2838                 talloc_free(tmp_ctx);
2839                 return;
2840         }
2841
2842         /* send the flags update to all connected nodes */
2843         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2844
2845         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2846                                       nodes, 0, CONTROL_TIMEOUT(),
2847                                       false, data,
2848                                       NULL, NULL,
2849                                       NULL) != 0) {
2850                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2851
2852                 talloc_free(tmp_ctx);
2853                 return;
2854         }
2855
2856         talloc_free(tmp_ctx);
2857 }
2858
2859
2860 struct verify_recmode_normal_data {
2861         uint32_t count;
2862         enum monitor_result status;
2863 };
2864
2865 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2866 {
2867         struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2868
2869
2870         /* one more node has responded with recmode data*/
2871         rmdata->count--;
2872
2873         /* if we failed to get the recmode, then return an error and let
2874            the main loop try again.
2875         */
2876         if (state->state != CTDB_CONTROL_DONE) {
2877                 if (rmdata->status == MONITOR_OK) {
2878                         rmdata->status = MONITOR_FAILED;
2879                 }
2880                 return;
2881         }
2882
2883         /* if we got a response, then the recmode will be stored in the
2884            status field
2885         */
2886         if (state->status != CTDB_RECOVERY_NORMAL) {
2887                 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2888                 rmdata->status = MONITOR_RECOVERY_NEEDED;
2889         }
2890
2891         return;
2892 }
2893
2894
2895 /* verify that all nodes are in normal recovery mode */
2896 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
2897 {
2898         struct verify_recmode_normal_data *rmdata;
2899         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2900         struct ctdb_client_control_state *state;
2901         enum monitor_result status;
2902         int j;
2903
2904         rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2905         CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2906         rmdata->count  = 0;
2907         rmdata->status = MONITOR_OK;
2908
2909         /* loop over all active nodes and send an async getrecmode call to
2910            them*/
2911         for (j=0; j<nodemap->num; j++) {
2912                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2913                         continue;
2914                 }
2915                 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2916                                         CONTROL_TIMEOUT(),
2917                                         nodemap->nodes[j].pnn);
2918                 if (state == NULL) {
2919                         /* we failed to send the control, treat this as
2920                            an error and try again next iteration
2921                         */
2922                         DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2923                         talloc_free(mem_ctx);
2924                         return MONITOR_FAILED;
2925                 }
2926
2927                 /* set up the callback functions */
2928                 state->async.fn = verify_recmode_normal_callback;
2929                 state->async.private_data = rmdata;
2930
2931                 /* one more control to wait for to complete */
2932                 rmdata->count++;
2933         }
2934
2935
2936         /* now wait for up to the maximum number of seconds allowed
2937            or until all nodes we expect a response from has replied
2938         */
2939         while (rmdata->count > 0) {
2940                 tevent_loop_once(ctdb->ev);
2941         }
2942
2943         status = rmdata->status;
2944         talloc_free(mem_ctx);
2945         return status;
2946 }
2947
2948
2949 struct verify_recmaster_data {
2950         struct ctdb_recoverd *rec;
2951         uint32_t count;
2952         uint32_t pnn;
2953         enum monitor_result status;
2954 };
2955
2956 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2957 {
2958         struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2959
2960
2961         /* one more node has responded with recmaster data*/
2962         rmdata->count--;
2963
2964         /* if we failed to get the recmaster, then return an error and let
2965            the main loop try again.
2966         */
2967         if (state->state != CTDB_CONTROL_DONE) {
2968                 if (rmdata->status == MONITOR_OK) {
2969                         rmdata->status = MONITOR_FAILED;
2970                 }
2971                 return;
2972         }
2973
2974         /* if we got a response, then the recmaster will be stored in the
2975            status field
2976         */
2977         if (state->status != rmdata->pnn) {
2978                 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
2979                 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2980                 rmdata->status = MONITOR_ELECTION_NEEDED;
2981         }
2982
2983         return;
2984 }
2985
2986
2987 /* verify that all nodes agree that we are the recmaster */
2988 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, uint32_t pnn)
2989 {
2990         struct ctdb_context *ctdb = rec->ctdb;
2991         struct verify_recmaster_data *rmdata;
2992         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2993         struct ctdb_client_control_state *state;
2994         enum monitor_result status;
2995         int j;
2996
2997         rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2998         CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2999         rmdata->rec    = rec;
3000         rmdata->count  = 0;
3001         rmdata->pnn    = pnn;
3002         rmdata->status = MONITOR_OK;
3003
3004         /* loop over all active nodes and send an async getrecmaster call to
3005            them*/
3006         for (j=0; j<nodemap->num; j++) {
3007                 if (nodemap->nodes[j].pnn == rec->recmaster) {
3008                         continue;
3009                 }
3010                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3011                         continue;
3012                 }
3013                 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
3014                                         CONTROL_TIMEOUT(),
3015                                         nodemap->nodes[j].pnn);
3016                 if (state == NULL) {
3017                         /* we failed to send the control, treat this as
3018                            an error and try again next iteration
3019                         */
3020                         DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
3021                         talloc_free(mem_ctx);
3022                         return MONITOR_FAILED;
3023                 }
3024
3025                 /* set up the callback functions */
3026                 state->async.fn = verify_recmaster_callback;
3027                 state->async.private_data = rmdata;
3028
3029                 /* one more control to wait for to complete */
3030                 rmdata->count++;
3031         }
3032
3033
3034         /* now wait for up to the maximum number of seconds allowed
3035            or until all nodes we expect a response from has replied
3036         */
3037         while (rmdata->count > 0) {
3038                 tevent_loop_once(ctdb->ev);
3039         }
3040
3041         status = rmdata->status;
3042         talloc_free(mem_ctx);
3043         return status;
3044 }
3045
3046 static bool interfaces_have_changed(struct ctdb_context *ctdb,
3047                                     struct ctdb_recoverd *rec)
3048 {
3049         struct ctdb_iface_list_old *ifaces = NULL;
3050         TALLOC_CTX *mem_ctx;
3051         bool ret = false;
3052
3053         mem_ctx = talloc_new(NULL);
3054
3055         /* Read the interfaces from the local node */
3056         if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
3057                                  CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
3058                 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
3059                 /* We could return an error.  However, this will be
3060                  * rare so we'll decide that the interfaces have
3061                  * actually changed, just in case.
3062                  */
3063                 talloc_free(mem_ctx);
3064                 return true;
3065         }
3066
3067         if (!rec->ifaces) {
3068                 /* We haven't been here before so things have changed */
3069                 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
3070                 ret = true;
3071         } else if (rec->ifaces->num != ifaces->num) {
3072                 /* Number of interfaces has changed */
3073                 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
3074                                      rec->ifaces->num, ifaces->num));
3075                 ret = true;
3076         } else {
3077                 /* See if interface names or link states have changed */
3078                 int i;
3079                 for (i = 0; i < rec->ifaces->num; i++) {
3080                         struct ctdb_iface * iface = &rec->ifaces->ifaces[i];
3081                         if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
3082                                 DEBUG(DEBUG_NOTICE,
3083                                       ("Interface in slot %d changed: %s => %s\n",
3084                                        i, iface->name, ifaces->ifaces[i].name));
3085                                 ret = true;
3086                                 break;
3087                         }
3088                         if (iface->link_state != ifaces->ifaces[i].link_state) {
3089                                 DEBUG(DEBUG_NOTICE,
3090                                       ("Interface %s changed state: %d => %d\n",
3091                                        iface->name, iface->link_state,
3092                                        ifaces->ifaces[i].link_state));
3093                                 ret = true;
3094                                 break;
3095                         }
3096                 }
3097         }
3098
3099         talloc_free(rec->ifaces);
3100         rec->ifaces = talloc_steal(rec, ifaces);
3101
3102         talloc_free(mem_ctx);
3103         return ret;
3104 }
3105
3106 /* Check that the local allocation of public IP addresses is correct
3107  * and do some house-keeping */
3108 static int verify_local_ip_allocation(struct ctdb_context *ctdb,
3109                                       struct ctdb_recoverd *rec,
3110                                       uint32_t pnn,
3111                                       struct ctdb_node_map_old *nodemap)
3112 {
3113         TALLOC_CTX *mem_ctx = talloc_new(NULL);
3114         int ret, j;
3115         bool need_takeover_run = false;
3116         struct ctdb_public_ip_list_old *ips = NULL;
3117
3118         /* If we are not the recmaster then do some housekeeping */
3119         if (rec->recmaster != pnn) {
3120                 /* Ignore any IP reallocate requests - only recmaster
3121                  * processes them
3122                  */
3123                 TALLOC_FREE(rec->reallocate_requests);
3124                 /* Clear any nodes that should be force rebalanced in
3125                  * the next takeover run.  If the recovery master role
3126                  * has moved then we don't want to process these some
3127                  * time in the future.
3128                  */
3129                 TALLOC_FREE(rec->force_rebalance_nodes);
3130         }
3131
3132         /* Return early if disabled... */
3133         if (ctdb->tunable.disable_ip_failover != 0 ||
3134             ctdb_op_is_disabled(rec->takeover_run)) {
3135                 return  0;
3136         }
3137
3138         if (interfaces_have_changed(ctdb, rec)) {
3139                 need_takeover_run = true;
3140         }
3141
3142         /* If there are unhosted IPs but this node can host them then
3143          * trigger an IP reallocation */
3144
3145         /* Read *available* IPs from local node */
3146         ret = ctdb_ctrl_get_public_ips_flags(
3147                 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx,
3148                 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
3149         if (ret != 0) {
3150                 DEBUG(DEBUG_ERR, ("Unable to retrieve available public IPs\n"));
3151                 talloc_free(mem_ctx);
3152                 return -1;
3153         }
3154
3155         for (j=0; j<ips->num; j++) {
3156                 if (ips->ips[j].pnn == -1 &&
3157                     nodemap->nodes[pnn].flags == 0) {
3158                         DEBUG(DEBUG_WARNING,
3159                               ("Unassigned IP %s can be served by this node\n",
3160                                ctdb_addr_to_str(&ips->ips[j].addr)));
3161                         need_takeover_run = true;
3162                 }
3163         }
3164
3165         talloc_free(ips);
3166
3167         if (!ctdb->do_checkpublicip) {
3168                 goto done;
3169         }
3170
3171         /* Validate the IP addresses that this node has on network
3172          * interfaces.  If there is an inconsistency between reality
3173          * and the state expected by CTDB then try to fix it by
3174          * triggering an IP reallocation or releasing extraneous IP
3175          * addresses. */
3176
3177         /* Read *known* IPs from local node */
3178         ret = ctdb_ctrl_get_public_ips_flags(
3179                 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
3180         if (ret != 0) {
3181                 DEBUG(DEBUG_ERR, ("Unable to retrieve known public IPs\n"));
3182                 talloc_free(mem_ctx);
3183                 return -1;
3184         }
3185
3186         for (j=0; j<ips->num; j++) {
3187                 if (ips->ips[j].pnn == pnn) {
3188                         if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
3189                                 DEBUG(DEBUG_ERR,
3190                                       ("Assigned IP %s not on an interface\n",
3191                                        ctdb_addr_to_str(&ips->ips[j].addr)));
3192                                 need_takeover_run = true;
3193                         }
3194                 } else {
3195                         if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
3196                                 DEBUG(DEBUG_ERR,
3197                                       ("IP %s incorrectly on an interface - releasing\n",
3198                                        ctdb_addr_to_str(&ips->ips[j].addr)));
3199                                 ret = ctdb_ctrl_release_ip(ctdb,
3200                                                            CONTROL_TIMEOUT(),
3201                                                            CTDB_CURRENT_NODE,
3202                                                            &ips->ips[j]);
3203                                 if (ret != 0) {
3204                                         DEBUG(DEBUG_ERR,
3205                                               ("Failed to release IP address\n"));
3206                                 }
3207                         }
3208                 }
3209         }
3210
3211 done:
3212         if (need_takeover_run) {
3213                 struct ctdb_srvid_message rd;
3214                 TDB_DATA data;
3215
3216                 DEBUG(DEBUG_NOTICE,("Trigger takeoverrun\n"));
3217
3218                 ZERO_STRUCT(rd);
3219                 rd.pnn = ctdb->pnn;
3220                 rd.srvid = 0;
3221                 data.dptr = (uint8_t *)&rd;
3222                 data.dsize = sizeof(rd);
3223
3224                 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
3225                 if (ret != 0) {
3226                         DEBUG(DEBUG_ERR,
3227                               ("Failed to send takeover run request\n"));
3228                 }
3229         }
3230         talloc_free(mem_ctx);
3231         return 0;
3232 }
3233
3234
3235 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
3236 {
3237         struct ctdb_node_map_old **remote_nodemaps = callback_data;
3238
3239         if (node_pnn >= ctdb->num_nodes) {
3240                 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
3241                 return;
3242         }
3243
3244         remote_nodemaps[node_pnn] = (struct ctdb_node_map_old *)talloc_steal(remote_nodemaps, outdata.dptr);
3245
3246 }
3247
3248 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
3249         struct ctdb_node_map_old *nodemap,
3250         struct ctdb_node_map_old **remote_nodemaps)
3251 {
3252         uint32_t *nodes;
3253
3254         nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
3255         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
3256                                         nodes, 0,
3257                                         CONTROL_TIMEOUT(), false, tdb_null,
3258                                         async_getnodemap_callback,
3259                                         NULL,
3260                                         remote_nodemaps) != 0) {
3261                 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
3262
3263                 return -1;
3264         }
3265
3266         return 0;
3267 }
3268
3269 static bool validate_recovery_master(struct ctdb_recoverd *rec,
3270                                      TALLOC_CTX *mem_ctx)
3271 {
3272         struct ctdb_context *ctdb = rec->ctdb;
3273         uint32_t pnn = ctdb_get_pnn(ctdb);
3274         struct ctdb_node_map_old *nodemap = rec->nodemap;
3275         struct ctdb_node_map_old *recmaster_nodemap = NULL;
3276         int ret;
3277
3278         /* When recovery daemon is started, recmaster is set to
3279          * "unknown" so it knows to start an election.
3280          */
3281         if (rec->recmaster == CTDB_UNKNOWN_PNN) {
3282                 DEBUG(DEBUG_NOTICE,
3283                       ("Initial recovery master set - forcing election\n"));
3284                 force_election(rec, pnn, nodemap);
3285                 return false;
3286         }
3287
3288         /*
3289          * If the current recmaster does not have CTDB_CAP_RECMASTER,
3290          * but we have, then force an election and try to become the new
3291          * recmaster.
3292          */
3293         if (!ctdb_node_has_capabilities(rec->caps,
3294                                         rec->recmaster,
3295                                         CTDB_CAP_RECMASTER) &&
3296             (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
3297             !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
3298                 DEBUG(DEBUG_ERR,
3299                       (" Current recmaster node %u does not have CAP_RECMASTER,"
3300                        " but we (node %u) have - force an election\n",
3301                        rec->recmaster, pnn));
3302                 force_election(rec, pnn, nodemap);
3303                 return false;
3304         }
3305
3306         /* Verify that the master node has not been deleted.  This
3307          * should not happen because a node should always be shutdown
3308          * before being deleted, causing a new master to be elected
3309          * before now.  However, if something strange has happened
3310          * then checking here will ensure we don't index beyond the
3311          * end of the nodemap array. */
3312         if (rec->recmaster >= nodemap->num) {
3313                 DEBUG(DEBUG_ERR,
3314                       ("Recmaster node %u has been deleted. Force election\n",
3315                        rec->recmaster));
3316                 force_election(rec, pnn, nodemap);
3317                 return false;
3318         }
3319
3320         /* if recovery master is disconnected/deleted we must elect a new recmaster */
3321         if (nodemap->nodes[rec->recmaster].flags &
3322             (NODE_FLAGS_DISCONNECTED|NODE_FLAGS_DELETED)) {
3323                 DEBUG(DEBUG_NOTICE,
3324                       ("Recmaster node %u is disconnected/deleted. Force election\n",
3325                        rec->recmaster));
3326                 force_election(rec, pnn, nodemap);
3327                 return false;
3328         }
3329
3330         /* get nodemap from the recovery master to check if it is inactive */
3331         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
3332                                    mem_ctx, &recmaster_nodemap);
3333         if (ret != 0) {
3334                 DEBUG(DEBUG_ERR,
3335                       (__location__
3336                        " Unable to get nodemap from recovery master %u\n",
3337                           rec->recmaster));
3338                 /* No election, just error */
3339                 return false;
3340         }
3341
3342
3343         if ((recmaster_nodemap->nodes[rec->recmaster].flags & NODE_FLAGS_INACTIVE) &&
3344             (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
3345                 DEBUG(DEBUG_NOTICE,
3346                       ("Recmaster node %u is inactive. Force election\n",
3347                        rec->recmaster));
3348                 /*
3349                  * update our nodemap to carry the recmaster's notion of
3350                  * its own flags, so that we don't keep freezing the
3351                  * inactive recmaster node...
3352                  */
3353                 nodemap->nodes[rec->recmaster].flags =
3354                         recmaster_nodemap->nodes[rec->recmaster].flags;
3355                 force_election(rec, pnn, nodemap);
3356                 return false;
3357         }
3358
3359         return true;
3360 }
3361
3362 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
3363                       TALLOC_CTX *mem_ctx)
3364 {
3365         uint32_t pnn;
3366         struct ctdb_node_map_old *nodemap=NULL;
3367         struct ctdb_node_map_old **remote_nodemaps=NULL;
3368         struct ctdb_vnn_map *vnnmap=NULL;
3369         struct ctdb_vnn_map *remote_vnnmap=NULL;
3370         uint32_t num_lmasters;
3371         int32_t debug_level;
3372         int i, j, ret;
3373         bool self_ban;
3374
3375
3376         /* verify that the main daemon is still running */
3377         if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
3378                 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3379                 exit(-1);
3380         }
3381
3382         /* ping the local daemon to tell it we are alive */
3383         ctdb_ctrl_recd_ping(ctdb);
3384
3385         if (rec->election_timeout) {
3386                 /* an election is in progress */
3387                 return;
3388         }
3389
3390         /* read the debug level from the parent and update locally */
3391         ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
3392         if (ret !=0) {
3393                 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
3394                 return;
3395         }
3396         DEBUGLEVEL = debug_level;
3397
3398         /* get relevant tunables */
3399         ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
3400         if (ret != 0) {
3401                 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
3402                 return;
3403         }
3404
3405         /* get runstate */
3406         ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
3407                                      CTDB_CURRENT_NODE, &ctdb->runstate);
3408         if (ret != 0) {
3409                 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
3410                 return;
3411         }
3412
3413         pnn = ctdb_get_pnn(ctdb);
3414
3415         /* get nodemap */
3416         TALLOC_FREE(rec->nodemap);
3417         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
3418         if (ret != 0) {
3419                 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
3420                 return;
3421         }
3422         nodemap = rec->nodemap;
3423
3424         /* remember our own node flags */
3425         rec->node_flags = nodemap->nodes[pnn].flags;
3426
3427         ban_misbehaving_nodes(rec, &self_ban);
3428         if (self_ban) {
3429                 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
3430                 return;
3431         }
3432
3433         /* if the local daemon is STOPPED or BANNED, we verify that the databases are
3434            also frozen and that the recmode is set to active.
3435         */
3436         if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
3437                 /* If this node has become inactive then we want to
3438                  * reduce the chances of it taking over the recovery
3439                  * master role when it becomes active again.  This
3440                  * helps to stabilise the recovery master role so that
3441                  * it stays on the most stable node.
3442                  */
3443                 rec->priority_time = timeval_current();
3444
3445                 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3446                 if (ret != 0) {
3447                         DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3448                 }
3449                 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3450                         DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
3451
3452                         ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3453                         if (ret != 0) {
3454                                 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
3455
3456                                 return;
3457                         }
3458                 }
3459                 if (! rec->frozen_on_inactive) {
3460                         ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(),
3461                                                CTDB_CURRENT_NODE);
3462                         if (ret != 0) {
3463                                 DEBUG(DEBUG_ERR,
3464                                       (__location__ " Failed to freeze node "
3465                                        "in STOPPED or BANNED state\n"));
3466                                 return;
3467                         }
3468
3469                         rec->frozen_on_inactive = true;
3470                 }
3471
3472                 /* If this node is stopped or banned then it is not the recovery
3473                  * master, so don't do anything. This prevents stopped or banned
3474                  * node from starting election and sending unnecessary controls.
3475                  */
3476                 return;
3477         }
3478
3479         rec->frozen_on_inactive = false;
3480
3481         /* Retrieve capabilities from all connected nodes */
3482         ret = update_capabilities(rec, nodemap);
3483         if (ret != 0) {
3484                 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
3485                 return;
3486         }
3487
3488         if (! validate_recovery_master(rec, mem_ctx)) {
3489                 return;
3490         }
3491
3492         /* Check if an IP takeover run is needed and trigger one if
3493          * necessary */
3494         verify_local_ip_allocation(ctdb, rec, pnn, nodemap);
3495
3496         /* if we are not the recmaster then we do not need to check
3497            if recovery is needed
3498          */
3499         if (pnn != rec->recmaster) {
3500                 return;
3501         }
3502
3503
3504         /* ensure our local copies of flags are right */
3505         ret = update_local_flags(rec, nodemap);
3506         if (ret != 0) {
3507                 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3508                 return;
3509         }
3510
3511         if (ctdb->num_nodes != nodemap->num) {
3512                 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3513                 ctdb_load_nodes_file(ctdb);
3514                 return;
3515         }
3516
3517         /* verify that all active nodes agree that we are the recmaster */
3518         switch (verify_recmaster(rec, nodemap, pnn)) {
3519         case MONITOR_RECOVERY_NEEDED:
3520                 /* can not happen */
3521                 return;
3522         case MONITOR_ELECTION_NEEDED:
3523                 force_election(rec, pnn, nodemap);
3524                 return;
3525         case MONITOR_OK:
3526                 break;
3527         case MONITOR_FAILED:
3528                 return;
3529         }
3530
3531
3532         /* get the vnnmap */
3533         ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
3534         if (ret != 0) {
3535                 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
3536                 return;
3537         }
3538
3539         if (rec->need_recovery) {
3540                 /* a previous recovery didn't finish */
3541                 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3542                 return;
3543         }
3544
3545         /* verify that all active nodes are in normal mode
3546            and not in recovery mode
3547         */
3548         switch (verify_recmode(ctdb, nodemap)) {
3549         case MONITOR_RECOVERY_NEEDED:
3550                 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3551                 return;
3552         case MONITOR_FAILED:
3553                 return;
3554         case MONITOR_ELECTION_NEEDED:
3555                 /* can not happen */
3556         case MONITOR_OK:
3557                 break;
3558         }
3559
3560
3561         if (ctdb->recovery_lock != NULL) {
3562                 /* We must already hold the recovery lock */
3563                 if (!ctdb_recovery_have_lock(rec)) {
3564                         DEBUG(DEBUG_ERR,("Failed recovery lock sanity check.  Force a recovery\n"));
3565                         ctdb_set_culprit(rec, ctdb->pnn);
3566                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3567                         return;
3568                 }
3569         }
3570
3571
3572         /* If recoveries are disabled then there is no use doing any
3573          * nodemap or flags checks.  Recoveries might be disabled due
3574          * to "reloadnodes", so doing these checks might cause an
3575          * unnecessary recovery.  */
3576         if (ctdb_op_is_disabled(rec->recovery)) {
3577                 goto takeover_run_checks;
3578         }
3579
3580         /* get the nodemap for all active remote nodes
3581          */
3582         remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map_old *, nodemap->num);
3583         if (remote_nodemaps == NULL) {
3584                 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3585                 return;
3586         }
3587         for(i=0; i<nodemap->num; i++) {
3588                 remote_nodemaps[i] = NULL;
3589         }
3590         if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3591                 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3592                 return;
3593         }
3594
3595         /* verify that all other nodes have the same nodemap as we have
3596         */
3597         for (j=0; j<nodemap->num; j++) {
3598                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3599                         continue;
3600                 }
3601
3602                 if (remote_nodemaps[j] == NULL) {
3603                         DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3604                         ctdb_set_culprit(rec, j);
3605
3606                         return;
3607                 }
3608
3609                 /* if the nodes disagree on how many nodes there are
3610                    then this is a good reason to try recovery
3611                  */
3612                 if (remote_nodemaps[j]->num != nodemap->num) {
3613                         DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3614                                   nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3615                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3616                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3617                         return;
3618                 }
3619
3620                 /* if the nodes disagree on which nodes exist and are
3621                    active, then that is also a good reason to do recovery
3622                  */
3623                 for (i=0;i<nodemap->num;i++) {
3624                         if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3625                                 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3626                                           nodemap->nodes[j].pnn, i,
3627                                           remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3628                                 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3629                                 do_recovery(rec, mem_ctx, pnn, nodemap,
3630                                             vnnmap);
3631                                 return;
3632                         }
3633                 }
3634         }
3635
3636         /*
3637          * Update node flags obtained from each active node. This ensure we have
3638          * up-to-date information for all the nodes.
3639          */
3640         for (j=0; j<nodemap->num; j++) {
3641                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3642                         continue;
3643                 }
3644                 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
3645         }
3646
3647         for (j=0; j<nodemap->num; j++) {
3648                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3649                         continue;
3650                 }
3651
3652                 /* verify the flags are consistent
3653                 */
3654                 for (i=0; i<nodemap->num; i++) {
3655                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3656                                 continue;
3657                         }
3658
3659                         if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3660                                 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3661                                   nodemap->nodes[j].pnn,
3662                                   nodemap->nodes[i].pnn,
3663                                   remote_nodemaps[j]->nodes[i].flags,
3664                                   nodemap->nodes[i].flags));
3665                                 if (i == j) {
3666                                         DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3667                                         update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3668                                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3669                                         do_recovery(rec, mem_ctx, pnn, nodemap,
3670                                                     vnnmap);
3671                                         return;
3672                                 } else {
3673                                         DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3674                                         update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3675                                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3676                                         do_recovery(rec, mem_ctx, pnn, nodemap,
3677                                                     vnnmap);
3678                                         return;
3679                                 }
3680                         }
3681                 }
3682         }
3683
3684
3685         /* count how many active nodes there are */
3686         num_lmasters  = 0;
3687         for (i=0; i<nodemap->num; i++) {
3688                 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
3689                         if (ctdb_node_has_capabilities(rec->caps,
3690                                                        ctdb->nodes[i]->pnn,
3691                                                        CTDB_CAP_LMASTER)) {
3692                                 num_lmasters++;
3693                         }
3694                 }
3695         }
3696
3697
3698         /* There must be the same number of lmasters in the vnn map as
3699          * there are active nodes with the lmaster capability...  or
3700          * do a recovery.
3701          */
3702         if (vnnmap->size != num_lmasters) {
3703                 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
3704                           vnnmap->size, num_lmasters));
3705                 ctdb_set_culprit(rec, ctdb->pnn);
3706                 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3707                 return;
3708         }
3709
3710         /* verify that all active nodes in the nodemap also exist in
3711            the vnnmap.
3712          */
3713         for (j=0; j<nodemap->num; j++) {
3714                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3715                         continue;
3716                 }
3717                 if (nodemap->nodes[j].pnn == pnn) {
3718                         continue;
3719                 }
3720
3721                 for (i=0; i<vnnmap->size; i++) {
3722                         if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3723                                 break;
3724                         }
3725                 }
3726                 if (i == vnnmap->size) {
3727                         DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3728                                   nodemap->nodes[j].pnn));
3729                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3730                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3731                         return;
3732                 }
3733         }
3734
3735
3736         /* verify that all other nodes have the same vnnmap
3737            and are from the same generation
3738          */
3739         for (j=0; j<nodemap->num; j++) {
3740                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3741                         continue;
3742                 }
3743                 if (nodemap->nodes[j].pnn == pnn) {
3744                         continue;
3745                 }
3746
3747                 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3748                                           mem_ctx, &remote_vnnmap);
3749                 if (ret != 0) {
3750                         DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3751                                   nodemap->nodes[j].pnn));
3752                         return;
3753                 }
3754
3755                 /* verify the vnnmap generation is the same */
3756                 if (vnnmap->generation != remote_vnnmap->generation) {
3757                         DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3758                                   nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3759                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3760                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3761                         return;
3762                 }
3763
3764                 /* verify the vnnmap size is the same */
3765                 if (vnnmap->size != remote_vnnmap->size) {
3766                         DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3767                                   nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3768                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3769                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3770                         return;
3771                 }
3772
3773                 /* verify the vnnmap is the same */
3774                 for (i=0;i<vnnmap->size;i++) {
3775                         if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3776                                 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3777                                           nodemap->nodes[j].pnn));
3778                                 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3779                                 do_recovery(rec, mem_ctx, pnn, nodemap,
3780                                             vnnmap);
3781                                 return;
3782                         }
3783                 }
3784         }
3785
3786         /* FIXME: Add remote public IP checking to ensure that nodes
3787          * have the IP addresses that are allocated to them. */
3788
3789 takeover_run_checks:
3790
3791         /* If there are IP takeover runs requested or the previous one
3792          * failed then perform one and notify the waiters */
3793         if (!ctdb_op_is_disabled(rec->takeover_run) &&
3794             (rec->reallocate_requests || rec->need_takeover_run)) {
3795                 process_ipreallocate_requests(ctdb, rec);
3796         }
3797 }
3798
3799 static void recd_sig_term_handler(struct tevent_context *ev,
3800                                   struct tevent_signal *se, int signum,
3801                                   int count, void *dont_care,
3802                                   void *private_data)
3803 {
3804         struct ctdb_recoverd *rec = talloc_get_type_abort(
3805                 private_data, struct ctdb_recoverd);
3806
3807         ctdb_recovery_unlock(rec);
3808         exit(0);
3809 }
3810
3811
3812 /*
3813   the main monitoring loop
3814  */
3815 static void monitor_cluster(struct ctdb_context *ctdb)
3816 {
3817         struct tevent_signal *se;
3818         struct ctdb_recoverd *rec;
3819
3820         DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
3821
3822         rec = talloc_zero(ctdb, struct ctdb_recoverd);
3823         CTDB_NO_MEMORY_FATAL(ctdb, rec);
3824
3825         rec->ctdb = ctdb;
3826         rec->recmaster = CTDB_UNKNOWN_PNN;
3827         rec->recovery_lock_handle = NULL;
3828
3829         rec->takeover_run = ctdb_op_init(rec, "takeover runs");
3830         CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);
3831
3832         rec->recovery = ctdb_op_init(rec, "recoveries");
3833         CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);
3834
3835         rec->priority_time = timeval_current();
3836         rec->frozen_on_inactive = false;
3837
3838         se = tevent_add_signal(ctdb->ev, ctdb, SIGTERM, 0,
3839                                recd_sig_term_handler, rec);
3840         if (se == NULL) {
3841                 DEBUG(DEBUG_ERR, ("Failed to install SIGTERM handler\n"));
3842                 exit(1);
3843         }
3844
3845         /* register a message port for sending memory dumps */
3846         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
3847
3848         /* when a node is assigned banning credits */
3849         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING,
3850                                         banning_handler, rec);
3851
3852         /* register a message port for recovery elections */
3853         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec);
3854
3855         /* when nodes are disabled/enabled */
3856         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
3857
3858         /* when we are asked to puch out a flag change */
3859         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
3860
3861         /* register a message port for vacuum fetch */
3862         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
3863
3864         /* register a message port for reloadnodes  */
3865         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
3866
3867         /* register a message port for performing a takeover run */
3868         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
3869
3870         /* register a message port for disabling the ip check for a short while */
3871         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
3872
3873         /* register a message port for forcing a rebalance of a node next
3874            reallocation */
3875         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
3876
3877         /* Register a message port for disabling takeover runs */
3878         ctdb_client_set_message_handler(ctdb,
3879                                         CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
3880                                         disable_takeover_runs_handler, rec);
3881
3882         /* Register a message port for disabling recoveries */
3883         ctdb_client_set_message_handler(ctdb,
3884                                         CTDB_SRVID_DISABLE_RECOVERIES,
3885                                         disable_recoveries_handler, rec);
3886
3887         /* register a message port for detaching database */
3888         ctdb_client_set_message_handler(ctdb,
3889                                         CTDB_SRVID_DETACH_DATABASE,
3890                                         detach_database_handler, rec);
3891
3892         for (;;) {
3893                 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3894                 struct timeval start;
3895                 double elapsed;
3896
3897                 if (!mem_ctx) {
3898                         DEBUG(DEBUG_CRIT,(__location__
3899                                           " Failed to create temp context\n"));
3900                         exit(-1);
3901                 }
3902
3903                 start = timeval_current();
3904                 main_loop(ctdb, rec, mem_ctx);
3905                 talloc_free(mem_ctx);
3906
3907                 /* we only check for recovery once every second */
3908                 elapsed = timeval_elapsed(&start);
3909                 if (elapsed < ctdb->tunable.recover_interval) {
3910                         ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
3911                                           - elapsed);
3912                 }
3913         }
3914 }
3915
3916 /*
3917   event handler for when the main ctdbd dies
3918  */
3919 static void ctdb_recoverd_parent(struct tevent_context *ev,
3920                                  struct tevent_fd *fde,
3921                                  uint16_t flags, void *private_data)
3922 {
3923         DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3924         _exit(1);
3925 }
3926
3927 /*
3928   called regularly to verify that the recovery daemon is still running
3929  */
3930 static void ctdb_check_recd(struct tevent_context *ev,
3931                             struct tevent_timer *te,
3932                             struct timeval yt, void *p)
3933 {
3934         struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
3935
3936         if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
3937                 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
3938
3939                 tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
3940                                  ctdb_restart_recd, ctdb);
3941
3942                 return;
3943         }
3944
3945         tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3946                          timeval_current_ofs(30, 0),
3947                          ctdb_check_recd, ctdb);
3948 }
3949
3950 static void recd_sig_child_handler(struct tevent_context *ev,
3951                                    struct tevent_signal *se, int signum,
3952                                    int count, void *dont_care,
3953                                    void *private_data)
3954 {
3955 //      struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3956         int status;
3957         pid_t pid = -1;
3958
3959         while (pid != 0) {
3960                 pid = waitpid(-1, &status, WNOHANG);
3961                 if (pid == -1) {
3962                         if (errno != ECHILD) {
3963                                 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3964                         }
3965                         return;
3966                 }
3967                 if (pid > 0) {
3968                         DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3969                 }
3970         }
3971 }
3972
3973 /*
3974   startup the recovery daemon as a child of the main ctdb daemon
3975  */
3976 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3977 {
3978         int fd[2];
3979         struct tevent_signal *se;
3980         struct tevent_fd *fde;
3981
3982         if (pipe(fd) != 0) {
3983                 return -1;
3984         }
3985
3986         ctdb->recoverd_pid = ctdb_fork(ctdb);
3987         if (ctdb->recoverd_pid == -1) {
3988                 return -1;
3989         }
3990
3991         if (ctdb->recoverd_pid != 0) {
3992                 talloc_free(ctdb->recd_ctx);
3993                 ctdb->recd_ctx = talloc_new(ctdb);
3994                 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
3995
3996                 close(fd[0]);
3997                 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3998                                  timeval_current_ofs(30, 0),
3999                                  ctdb_check_recd, ctdb);
4000                 return 0;
4001         }
4002
4003         close(fd[1]);
4004
4005         srandom(getpid() ^ time(NULL));
4006
4007         prctl_set_comment("ctdb_recovered");
4008         if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
4009                 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
4010                 exit(1);
4011         }
4012
4013         DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
4014
4015         fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
4016                             ctdb_recoverd_parent, &fd[0]);
4017         tevent_fd_set_auto_close(fde);
4018
4019         /* set up a handler to pick up sigchld */
4020         se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
4021                                recd_sig_child_handler, ctdb);
4022         if (se == NULL) {
4023                 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
4024                 exit(1);
4025         }
4026
4027         monitor_cluster(ctdb);
4028
4029         DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
4030         return -1;
4031 }
4032
4033 /*
4034   shutdown the recovery daemon
4035  */
4036 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
4037 {
4038         if (ctdb->recoverd_pid == 0) {
4039                 return;
4040         }
4041
4042         DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
4043         ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
4044
4045         TALLOC_FREE(ctdb->recd_ctx);
4046         TALLOC_FREE(ctdb->recd_ping_count);
4047 }
4048
4049 static void ctdb_restart_recd(struct tevent_context *ev,
4050                               struct tevent_timer *te,
4051                               struct timeval t, void *private_data)
4052 {
4053         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4054
4055         DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
4056         ctdb_stop_recoverd(ctdb);
4057         ctdb_start_recoverd(ctdb);
4058 }