[Samba.git] / ctdb / server / ctdb_recoverd.c
blob be53de615f766d58d745109a04f23d44e185159b
1 /*
2 ctdb recovery daemon
4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
20 #include "includes.h"
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
25 #include "popt.h"
26 #include "cmdline.h"
27 #include "../include/ctdb_client.h"
28 #include "../include/ctdb_private.h"
29 #include "lib/tdb_wrap/tdb_wrap.h"
30 #include "lib/util/dlinklist.h"
33 /* List of SRVID requests that need to be processed */
34 struct srvid_list {
35 struct srvid_list *next, *prev;
36 struct srvid_request *request;
39 struct srvid_requests {
40 struct srvid_list *requests;
43 static void srvid_request_reply(struct ctdb_context *ctdb,
44 struct srvid_request *request,
45 TDB_DATA result)
47 /* Someone that sent srvid==0 does not want a reply */
48 if (request->srvid == 0) {
49 talloc_free(request);
50 return;
53 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
54 result) == 0) {
55 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
56 (unsigned)request->pnn,
57 (unsigned long long)request->srvid));
58 } else {
59 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
60 (unsigned)request->pnn,
61 (unsigned long long)request->srvid));
64 talloc_free(request);
67 static void srvid_requests_reply(struct ctdb_context *ctdb,
68 struct srvid_requests **requests,
69 TDB_DATA result)
71 struct srvid_list *r;
73 for (r = (*requests)->requests; r != NULL; r = r->next) {
74 srvid_request_reply(ctdb, r->request, result);
77 /* Free the list structure... */
78 TALLOC_FREE(*requests);
81 static void srvid_request_add(struct ctdb_context *ctdb,
82 struct srvid_requests **requests,
83 struct srvid_request *request)
85 struct srvid_list *t;
86 int32_t ret;
87 TDB_DATA result;
89 if (*requests == NULL) {
90 *requests = talloc_zero(ctdb, struct srvid_requests);
91 if (*requests == NULL) {
92 goto nomem;
96 t = talloc_zero(*requests, struct srvid_list);
97 if (t == NULL) {
98 /* If *requests was just allocated above then free it */
99 if ((*requests)->requests == NULL) {
100 TALLOC_FREE(*requests);
102 goto nomem;
105 t->request = (struct srvid_request *)talloc_steal(t, request);
106 DLIST_ADD((*requests)->requests, t);
108 return;
110 nomem:
111 /* Failed to add the request to the list. Send a fail. */
112 DEBUG(DEBUG_ERR, (__location__
113 " Out of memory, failed to queue SRVID request\n"));
114 ret = -ENOMEM;
115 result.dsize = sizeof(ret);
116 result.dptr = (uint8_t *)&ret;
117 srvid_request_reply(ctdb, request, result);
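/*
 * Usage sketch (illustrative, not verbatim from this file): once the
 * operation the requests were queued for has completed, every queued
 * request is answered with a single int32_t status, mirroring the nomem
 * path in srvid_request_add() above:
 *
 *	int32_t status = 0;
 *	TDB_DATA result;
 *
 *	result.dsize = sizeof(status);
 *	result.dptr = (uint8_t *)&status;
 *	srvid_requests_reply(ctdb, &rec->reallocate_requests, result);
 *
 * Requests queued with srvid == 0 are freed without a reply; all others get
 * a message back to their (pnn, srvid) pair.
 */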
120 /* An abstraction to allow an operation (takeover runs, recoveries,
121 * ...) to be disabled for a given timeout */
122 struct ctdb_op_state {
123 struct tevent_timer *timer;
124 bool in_progress;
125 const char *name;
128 static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
130 struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
132 if (state != NULL) {
133 state->in_progress = false;
134 state->name = name;
137 return state;
140 static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
142 return state->timer != NULL;
145 static bool ctdb_op_begin(struct ctdb_op_state *state)
147 if (ctdb_op_is_disabled(state)) {
148 DEBUG(DEBUG_NOTICE,
149 ("Unable to begin - %s are disabled\n", state->name));
150 return false;
153 state->in_progress = true;
154 return true;
157 static bool ctdb_op_end(struct ctdb_op_state *state)
159 return state->in_progress = false;
162 static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
164 return state->in_progress;
167 static void ctdb_op_enable(struct ctdb_op_state *state)
169 TALLOC_FREE(state->timer);
172 static void ctdb_op_timeout_handler(struct event_context *ev,
173 struct timed_event *te,
174 struct timeval yt, void *p)
176 struct ctdb_op_state *state =
177 talloc_get_type(p, struct ctdb_op_state);
179 DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
180 ctdb_op_enable(state);
183 static int ctdb_op_disable(struct ctdb_op_state *state,
184 struct tevent_context *ev,
185 uint32_t timeout)
187 if (timeout == 0) {
188 DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
189 ctdb_op_enable(state);
190 return 0;
193 if (state->in_progress) {
194 DEBUG(DEBUG_ERR,
195 ("Unable to disable %s - in progress\n", state->name));
196 return -EAGAIN;
199 DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
200 state->name, timeout));
202 /* Clear any old timers */
203 talloc_free(state->timer);
205 /* Arrange for the timeout to occur */
206 state->timer = tevent_add_timer(ev, state,
207 timeval_current_ofs(timeout, 0),
208 ctdb_op_timeout_handler, state);
209 if (state->timer == NULL) {
210 DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
211 return -ENOMEM;
214 return 0;
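/*
 * Usage sketch (illustrative): a control or SRVID handler that received a
 * uint32_t timeout would disable an operation with
 *
 *	ret = ctdb_op_disable(rec->takeover_run, ctdb->ev, timeout);
 *
 * A timeout of 0 re-enables immediately; a non-zero timeout arms a tevent
 * timer and ctdb_op_timeout_handler() re-enables the operation when it
 * fires.  While disabled, ctdb_op_begin() refuses to start the operation.
 */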
217 struct ctdb_banning_state {
218 uint32_t count;
219 struct timeval last_reported_time;
223 private state of recovery daemon
225 struct ctdb_recoverd {
226 struct ctdb_context *ctdb;
227 uint32_t recmaster;
228 uint32_t last_culprit_node;
229 struct ctdb_node_map *nodemap;
230 struct timeval priority_time;
231 bool need_takeover_run;
232 bool need_recovery;
233 uint32_t node_flags;
234 struct timed_event *send_election_te;
235 struct timed_event *election_timeout;
236 struct srvid_requests *reallocate_requests;
237 struct ctdb_op_state *takeover_run;
238 struct ctdb_op_state *recovery;
239 struct ctdb_control_get_ifaces *ifaces;
240 uint32_t *force_rebalance_nodes;
241 struct ctdb_node_capabilities *caps;
244 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
245 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
247 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data);
250 ban a node for a period of time
252 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
254 int ret;
255 struct ctdb_context *ctdb = rec->ctdb;
256 struct ctdb_ban_time bantime;
258 if (!ctdb_validate_pnn(ctdb, pnn)) {
259 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
260 return;
263 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
265 bantime.pnn = pnn;
266 bantime.time = ban_time;
268 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
269 if (ret != 0) {
270 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
271 return;
276 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
280 remember the troublemaker
282 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
284 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
285 struct ctdb_banning_state *ban_state;
287 if (culprit > ctdb->num_nodes) {
288 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
289 return;
292 /* If we are banned or stopped, do not set other nodes as culprits */
293 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
294 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
295 return;
298 if (ctdb->nodes[culprit]->ban_state == NULL) {
299 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
300 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
304 ban_state = ctdb->nodes[culprit]->ban_state;
305 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
306 /* this was the first time in a long while this node
307 misbehaved so we will forgive any old transgressions.
309 ban_state->count = 0;
312 ban_state->count += count;
313 ban_state->last_reported_time = timeval_current();
314 rec->last_culprit_node = culprit;
318 remember the troublemaker
320 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
322 ctdb_set_culprit_count(rec, culprit, 1);
326 /* this callback is called for every node that failed to execute the
327 recovered event
329 static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
331 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
333 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));
335 ctdb_set_culprit(rec, node_pnn);
339 run the "recovered" eventscript on all nodes
341 static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, const char *caller)
343 TALLOC_CTX *tmp_ctx;
344 uint32_t *nodes;
345 struct ctdb_context *ctdb = rec->ctdb;
347 tmp_ctx = talloc_new(ctdb);
348 CTDB_NO_MEMORY(ctdb, tmp_ctx);
350 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
351 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
352 nodes, 0,
353 CONTROL_TIMEOUT(), false, tdb_null,
354 NULL, recovered_fail_callback,
355 rec) != 0) {
356 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
358 talloc_free(tmp_ctx);
359 return -1;
362 talloc_free(tmp_ctx);
363 return 0;
366 /* this callback is called for every node that failed to execute the
367 start recovery event
369 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
371 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
373 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
375 ctdb_set_culprit(rec, node_pnn);
379 run the "startrecovery" eventscript on all nodes
381 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
383 TALLOC_CTX *tmp_ctx;
384 uint32_t *nodes;
385 struct ctdb_context *ctdb = rec->ctdb;
387 tmp_ctx = talloc_new(ctdb);
388 CTDB_NO_MEMORY(ctdb, tmp_ctx);
390 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
391 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
392 nodes, 0,
393 CONTROL_TIMEOUT(), false, tdb_null,
394 NULL,
395 startrecovery_fail_callback,
396 rec) != 0) {
397 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
398 talloc_free(tmp_ctx);
399 return -1;
402 talloc_free(tmp_ctx);
403 return 0;
407 update the node capabilities for all connected nodes
409 static int update_capabilities(struct ctdb_recoverd *rec,
410 struct ctdb_node_map *nodemap)
412 uint32_t *capp;
413 TALLOC_CTX *tmp_ctx;
414 struct ctdb_node_capabilities *caps;
415 struct ctdb_context *ctdb = rec->ctdb;
417 tmp_ctx = talloc_new(rec);
418 CTDB_NO_MEMORY(ctdb, tmp_ctx);
420 caps = ctdb_get_capabilities(ctdb, tmp_ctx,
421 CONTROL_TIMEOUT(), nodemap);
423 if (caps == NULL) {
424 DEBUG(DEBUG_ERR,
425 (__location__ " Failed to get node capabilities\n"));
426 talloc_free(tmp_ctx);
427 return -1;
430 capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
431 if (capp == NULL) {
432 DEBUG(DEBUG_ERR,
433 (__location__
434 " Capabilities don't include current node.\n"));
435 talloc_free(tmp_ctx);
436 return -1;
438 ctdb->capabilities = *capp;
440 TALLOC_FREE(rec->caps);
441 rec->caps = talloc_steal(rec, caps);
443 talloc_free(tmp_ctx);
444 return 0;
447 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
449 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
451 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
452 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
455 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
457 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
459 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
460 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
464 change recovery mode on all nodes
466 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
468 TDB_DATA data;
469 uint32_t *nodes;
470 TALLOC_CTX *tmp_ctx;
472 tmp_ctx = talloc_new(ctdb);
473 CTDB_NO_MEMORY(ctdb, tmp_ctx);
475 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
477 data.dsize = sizeof(uint32_t);
478 data.dptr = (unsigned char *)&rec_mode;
480 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
481 nodes, 0,
482 CONTROL_TIMEOUT(),
483 false, data,
484 NULL, NULL,
485 NULL) != 0) {
486 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
487 talloc_free(tmp_ctx);
488 return -1;
491 /* freeze all nodes */
492 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
493 int i;
495 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
496 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
497 nodes, i,
498 CONTROL_TIMEOUT(),
499 false, tdb_null,
500 NULL,
501 set_recmode_fail_callback,
502 rec) != 0) {
503 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
504 talloc_free(tmp_ctx);
505 return -1;
510 talloc_free(tmp_ctx);
511 return 0;
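/*
 * Note on the freeze step above: switching to CTDB_RECOVERY_ACTIVE first
 * sets the recovery mode on all active nodes and then freezes the databases
 * one priority level at a time (1..NUM_DB_PRIORITIES).  A node that fails
 * to freeze is charged nodemap->num banning credits through
 * set_recmode_fail_callback(), so repeated failures quickly push it over
 * the banning threshold used in ban_misbehaving_nodes().
 */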
515 change recovery master on all nodes
517 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
519 TDB_DATA data;
520 TALLOC_CTX *tmp_ctx;
521 uint32_t *nodes;
523 tmp_ctx = talloc_new(ctdb);
524 CTDB_NO_MEMORY(ctdb, tmp_ctx);
526 data.dsize = sizeof(uint32_t);
527 data.dptr = (unsigned char *)&pnn;
529 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
530 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
531 nodes, 0,
532 CONTROL_TIMEOUT(), false, data,
533 NULL, NULL,
534 NULL) != 0) {
535 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
536 talloc_free(tmp_ctx);
537 return -1;
540 talloc_free(tmp_ctx);
541 return 0;
544 /* update all remote nodes to use the same db priority that we have
545 this can fail if the remote node has not yet been upgraded to
546 support this function, so we always return success and never fail
547 a recovery if this call fails.
549 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
550 struct ctdb_node_map *nodemap,
551 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
553 int db;
555 /* step through all local databases */
556 for (db=0; db<dbmap->num;db++) {
557 struct ctdb_db_priority db_prio;
558 int ret;
560 db_prio.db_id = dbmap->dbs[db].dbid;
561 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
562 if (ret != 0) {
563 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
564 continue;
567 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
569 ret = ctdb_ctrl_set_db_priority(ctdb, CONTROL_TIMEOUT(),
570 CTDB_CURRENT_NODE, &db_prio);
571 if (ret != 0) {
572 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n",
573 db_prio.db_id));
577 return 0;
581 ensure all other nodes have attached to any databases that we have
583 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
584 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
586 int i, j, db, ret;
587 struct ctdb_dbid_map *remote_dbmap;
589 /* verify that all other nodes have all our databases */
590 for (j=0; j<nodemap->num; j++) {
591 /* we don't need to check ourselves */
592 if (nodemap->nodes[j].pnn == pnn) {
593 continue;
595 /* dont check nodes that are unavailable */
596 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
597 continue;
600 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
601 mem_ctx, &remote_dbmap);
602 if (ret != 0) {
603 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
604 return -1;
607 /* step through all local databases */
608 for (db=0; db<dbmap->num;db++) {
609 const char *name;
612 for (i=0;i<remote_dbmap->num;i++) {
613 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
614 break;
617 /* the remote node already has this database */
618 if (i!=remote_dbmap->num) {
619 continue;
621 /* ok so we need to create this database */
622 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
623 dbmap->dbs[db].dbid, mem_ctx,
624 &name);
625 if (ret != 0) {
626 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
627 return -1;
629 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
630 nodemap->nodes[j].pnn,
631 mem_ctx, name,
632 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
633 if (ret != 0) {
634 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
635 return -1;
640 return 0;
645 ensure we are attached to any databases that anyone else is attached to
647 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
648 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
650 int i, j, db, ret;
651 struct ctdb_dbid_map *remote_dbmap;
653 /* verify that we have all databases any other node has */
654 for (j=0; j<nodemap->num; j++) {
655 /* we don't need to check ourselves */
656 if (nodemap->nodes[j].pnn == pnn) {
657 continue;
659 /* dont check nodes that are unavailable */
660 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
661 continue;
664 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
665 mem_ctx, &remote_dbmap);
666 if (ret != 0) {
667 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
668 return -1;
671 /* step through all databases on the remote node */
672 for (db=0; db<remote_dbmap->num;db++) {
673 const char *name;
675 for (i=0;i<(*dbmap)->num;i++) {
676 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
677 break;
680 /* we already have this db locally */
681 if (i!=(*dbmap)->num) {
682 continue;
684 /* ok so we need to create this database and
685 rebuild dbmap
687 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
688 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
689 if (ret != 0) {
690 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
691 nodemap->nodes[j].pnn));
692 return -1;
694 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
695 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
696 if (ret != 0) {
697 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
698 return -1;
700 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
701 if (ret != 0) {
702 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
703 return -1;
708 return 0;
713 pull the remote database contents from one node into the recdb
715 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
716 struct tdb_wrap *recdb, uint32_t dbid)
718 int ret;
719 TDB_DATA outdata;
720 struct ctdb_marshall_buffer *reply;
721 struct ctdb_rec_data *recdata;
722 int i;
723 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
725 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
726 CONTROL_TIMEOUT(), &outdata);
727 if (ret != 0) {
728 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
729 talloc_free(tmp_ctx);
730 return -1;
733 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
735 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
736 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
737 talloc_free(tmp_ctx);
738 return -1;
741 recdata = (struct ctdb_rec_data *)&reply->data[0];
743 for (i=0;
744 i<reply->count;
745 recdata = (struct ctdb_rec_data *)(recdata->length + (uint8_t *)recdata), i++) {
746 TDB_DATA key, data;
747 struct ctdb_ltdb_header *hdr;
748 TDB_DATA existing;
750 key.dptr = &recdata->data[0];
751 key.dsize = recdata->keylen;
752 data.dptr = &recdata->data[key.dsize];
753 data.dsize = recdata->datalen;
755 hdr = (struct ctdb_ltdb_header *)data.dptr;
757 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
758 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
759 talloc_free(tmp_ctx);
760 return -1;
763 /* fetch the existing record, if any */
764 existing = tdb_fetch(recdb->tdb, key);
766 if (existing.dptr != NULL) {
767 struct ctdb_ltdb_header header;
768 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
769 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
770 (unsigned)existing.dsize, srcnode));
771 free(existing.dptr);
772 talloc_free(tmp_ctx);
773 return -1;
775 header = *(struct ctdb_ltdb_header *)existing.dptr;
776 free(existing.dptr);
777 if (!(header.rsn < hdr->rsn ||
778 (header.dmaster != ctdb_get_pnn(ctdb) &&
779 header.rsn == hdr->rsn))) {
780 continue;
784 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
785 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
786 talloc_free(tmp_ctx);
787 return -1;
791 talloc_free(tmp_ctx);
793 return 0;
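/*
 * The merge rule used above, written out explicitly (illustrative names):
 * an incoming record replaces the copy already in recdb only if
 *
 *	remote_hdr->rsn > local_hdr.rsn ||
 *	(remote_hdr->rsn == local_hdr.rsn &&
 *	 local_hdr.dmaster != ctdb_get_pnn(ctdb))
 *
 * i.e. the higher RSN wins, and on a tie the remote copy is taken unless
 * the copy we already hold names this node as dmaster.
 */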
797 struct pull_seqnum_cbdata {
798 int failed;
799 uint32_t pnn;
800 uint64_t seqnum;
803 static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
805 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
806 uint64_t seqnum;
808 if (cb_data->failed != 0) {
809 DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
810 return;
813 if (res != 0) {
814 DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
815 cb_data->failed = 1;
816 return;
819 if (outdata.dsize != sizeof(uint64_t)) {
820 DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
821 cb_data->failed = -1;
822 return;
825 seqnum = *((uint64_t *)outdata.dptr);
827 if (seqnum > cb_data->seqnum ||
828 (cb_data->pnn == -1 && seqnum == 0)) {
829 cb_data->seqnum = seqnum;
830 cb_data->pnn = node_pnn;
834 static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
836 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
838 DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
839 cb_data->failed = 1;
842 static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
843 struct ctdb_recoverd *rec,
844 struct ctdb_node_map *nodemap,
845 struct tdb_wrap *recdb, uint32_t dbid)
847 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
848 uint32_t *nodes;
849 TDB_DATA data;
850 uint32_t outdata[2];
851 struct pull_seqnum_cbdata *cb_data;
853 DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));
855 outdata[0] = dbid;
856 outdata[1] = 0;
858 data.dsize = sizeof(outdata);
859 data.dptr = (uint8_t *)&outdata[0];
861 cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
862 if (cb_data == NULL) {
863 DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
864 talloc_free(tmp_ctx);
865 return -1;
868 cb_data->failed = 0;
869 cb_data->pnn = -1;
870 cb_data->seqnum = 0;
872 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
873 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
874 nodes, 0,
875 CONTROL_TIMEOUT(), false, data,
876 pull_seqnum_cb,
877 pull_seqnum_fail_cb,
878 cb_data) != 0) {
879 DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));
881 talloc_free(tmp_ctx);
882 return -1;
885 if (cb_data->failed != 0) {
886 DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
887 talloc_free(tmp_ctx);
888 return -1;
891 if (cb_data->pnn == -1) {
892 DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
893 talloc_free(tmp_ctx);
894 return -1;
897 DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum));
899 if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
900 DEBUG(DEBUG_ERR, ("Failed to pull highest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
901 talloc_free(tmp_ctx);
902 return -1;
905 talloc_free(tmp_ctx);
906 return 0;
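/*
 * For persistent databases this whole-database pull avoids mixing records
 * from different transaction generations: each active node reports its
 * sequence number via CTDB_CONTROL_GET_DB_SEQNUM and the complete database
 * is then pulled from the single node with the highest value.  Whether this
 * path is used is controlled by the recover_pdb_by_seqnum tunable checked
 * in pull_remote_database() below.
 */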
911 pull all the remote database contents into the recdb
913 static int pull_remote_database(struct ctdb_context *ctdb,
914 struct ctdb_recoverd *rec,
915 struct ctdb_node_map *nodemap,
916 struct tdb_wrap *recdb, uint32_t dbid,
917 bool persistent)
919 int j;
921 if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
922 int ret;
923 ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
924 if (ret == 0) {
925 return 0;
929 /* pull all records from all other nodes across onto this node
930 (this merges based on rsn)
932 for (j=0; j<nodemap->num; j++) {
933 /* dont merge from nodes that are unavailable */
934 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
935 continue;
937 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
938 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
939 nodemap->nodes[j].pnn));
940 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
941 return -1;
945 return 0;
950 update flags on all active nodes
952 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
954 int ret;
956 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
957 if (ret != 0) {
958 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
959 return -1;
962 return 0;
966 ensure all nodes have the same vnnmap as we do
968 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
969 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
971 int j, ret;
973 /* push the new vnn map out to all the nodes */
974 for (j=0; j<nodemap->num; j++) {
975 /* dont push to nodes that are unavailable */
976 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
977 continue;
980 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
981 if (ret != 0) {
982 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
983 return -1;
987 return 0;
992 called when a vacuum fetch has completed - just free it and do the next one
994 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
996 talloc_free(state);
1001 * Process one element of the vacuum fetch list:
1002 * Migrate it over to us with the special flag
1003 * CTDB_CALL_FLAG_VACUUM_MIGRATION.
1005 static bool vacuum_fetch_process_one(struct ctdb_db_context *ctdb_db,
1006 uint32_t pnn,
1007 struct ctdb_rec_data *r)
1009 struct ctdb_client_call_state *state;
1010 TDB_DATA data;
1011 struct ctdb_ltdb_header *hdr;
1012 struct ctdb_call call;
1014 ZERO_STRUCT(call);
1015 call.call_id = CTDB_NULL_FUNC;
1016 call.flags = CTDB_IMMEDIATE_MIGRATION;
1017 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
1019 call.key.dptr = &r->data[0];
1020 call.key.dsize = r->keylen;
1022 /* ensure we don't block this daemon - just skip a record if we can't get
1023 the chainlock */
1024 if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, call.key) != 0) {
1025 return true;
1028 data = tdb_fetch(ctdb_db->ltdb->tdb, call.key);
1029 if (data.dptr == NULL) {
1030 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
1031 return true;
1034 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
1035 free(data.dptr);
1036 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
1037 return true;
1040 hdr = (struct ctdb_ltdb_header *)data.dptr;
1041 if (hdr->dmaster == pnn) {
1042 /* its already local */
1043 free(data.dptr);
1044 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
1045 return true;
1048 free(data.dptr);
1050 state = ctdb_call_send(ctdb_db, &call);
1051 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
1052 if (state == NULL) {
1053 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
1054 return false;
1056 state->async.fn = vacuum_fetch_callback;
1057 state->async.private_data = NULL;
1059 return true;
1064 handler for vacuum fetch
1066 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
1067 TDB_DATA data, void *private_data)
1069 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1070 struct ctdb_marshall_buffer *recs;
1071 int ret, i;
1072 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1073 const char *name;
1074 struct ctdb_dbid_map *dbmap=NULL;
1075 bool persistent = false;
1076 struct ctdb_db_context *ctdb_db;
1077 struct ctdb_rec_data *r;
1079 recs = (struct ctdb_marshall_buffer *)data.dptr;
1081 if (recs->count == 0) {
1082 goto done;
1085 /* work out if the database is persistent */
1086 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
1087 if (ret != 0) {
1088 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
1089 goto done;
1092 for (i=0;i<dbmap->num;i++) {
1093 if (dbmap->dbs[i].dbid == recs->db_id) {
1094 persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
1095 break;
1098 if (i == dbmap->num) {
1099 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
1100 goto done;
1103 /* find the name of this database */
1104 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
1105 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
1106 goto done;
1109 /* attach to it */
1110 ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
1111 if (ctdb_db == NULL) {
1112 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
1113 goto done;
1116 r = (struct ctdb_rec_data *)&recs->data[0];
1117 while (recs->count) {
1118 bool ok;
1120 ok = vacuum_fetch_process_one(ctdb_db, rec->ctdb->pnn, r);
1121 if (!ok) {
1122 break;
1125 r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
1126 recs->count--;
1129 done:
1130 talloc_free(tmp_ctx);
1135 * handler for database detach
1137 static void detach_database_handler(struct ctdb_context *ctdb, uint64_t srvid,
1138 TDB_DATA data, void *private_data)
1140 uint32_t db_id;
1141 struct ctdb_db_context *ctdb_db;
1143 if (data.dsize != sizeof(db_id)) {
1144 return;
1146 db_id = *(uint32_t *)data.dptr;
1148 ctdb_db = find_ctdb_db(ctdb, db_id);
1149 if (ctdb_db == NULL) {
1150 /* database is not attached */
1151 return;
1154 DLIST_REMOVE(ctdb->db_list, ctdb_db);
1156 DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
1157 ctdb_db->db_name));
1158 talloc_free(ctdb_db);
1162 called when ctdb_wait_timeout should finish
1164 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
1165 struct timeval yt, void *p)
1167 uint32_t *timed_out = (uint32_t *)p;
1168 (*timed_out) = 1;
1172 wait for a given number of seconds
1174 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
1176 uint32_t timed_out = 0;
1177 time_t usecs = (secs - (time_t)secs) * 1000000;
1178 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), ctdb_wait_handler, &timed_out);
1179 while (!timed_out) {
1180 event_loop_once(ctdb->ev);
1185 called when an election times out (ends)
1187 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
1188 struct timeval t, void *p)
1190 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1191 rec->election_timeout = NULL;
1192 fast_start = false;
1194 DEBUG(DEBUG_WARNING,("Election period ended\n"));
1199 wait for an election to finish. It finishes election_timeout seconds after
1200 the last election packet is received
1202 static void ctdb_wait_election(struct ctdb_recoverd *rec)
1204 struct ctdb_context *ctdb = rec->ctdb;
1205 while (rec->election_timeout) {
1206 event_loop_once(ctdb->ev);
1211 Update our local flags from all remote connected nodes.
1212 This is only run when we are, or believe we are, the recovery master
1214 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
1216 int j;
1217 struct ctdb_context *ctdb = rec->ctdb;
1218 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1220 /* get the nodemap for all active remote nodes and verify
1221 they are the same as for this node
1223 for (j=0; j<nodemap->num; j++) {
1224 struct ctdb_node_map *remote_nodemap=NULL;
1225 int ret;
1227 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
1228 continue;
1230 if (nodemap->nodes[j].pnn == ctdb->pnn) {
1231 continue;
1234 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1235 mem_ctx, &remote_nodemap);
1236 if (ret != 0) {
1237 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
1238 nodemap->nodes[j].pnn));
1239 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
1240 talloc_free(mem_ctx);
1241 return MONITOR_FAILED;
1243 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
1244 /* We should tell our daemon about this so it
1245 updates its flags or else we will log the same
1246 message again in the next iteration of recovery.
1247 Since we are the recovery master we can just as
1248 well update the flags on all nodes.
1250 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
1251 if (ret != 0) {
1252 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
1253 return -1;
1256 /* Update our local copy of the flags in the recovery
1257 daemon.
1259 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1260 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
1261 nodemap->nodes[j].flags));
1262 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1264 talloc_free(remote_nodemap);
1266 talloc_free(mem_ctx);
1267 return MONITOR_OK;
1271 /* Create a new random generation id.
1272 The generation id cannot be the INVALID_GENERATION id
1274 static uint32_t new_generation(void)
1276 uint32_t generation;
1278 while (1) {
1279 generation = random();
1281 if (generation != INVALID_GENERATION) {
1282 break;
1286 return generation;
1291 create a temporary working database
1293 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1295 char *name;
1296 struct tdb_wrap *recdb;
1297 unsigned tdb_flags;
1299 /* open up the temporary recovery database */
1300 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1301 ctdb->db_directory_state,
1302 ctdb->pnn);
1303 if (name == NULL) {
1304 return NULL;
1306 unlink(name);
1308 tdb_flags = TDB_NOLOCK;
1309 if (ctdb->valgrinding) {
1310 tdb_flags |= TDB_NOMMAP;
1312 tdb_flags |= (TDB_INCOMPATIBLE_HASH | TDB_DISALLOW_NESTING);
1314 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1315 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1316 if (recdb == NULL) {
1317 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1320 talloc_free(name);
1322 return recdb;
1327 a traverse function for pulling all relevant records from recdb
1329 struct recdb_data {
1330 struct ctdb_context *ctdb;
1331 struct ctdb_marshall_buffer *recdata;
1332 uint32_t len;
1333 uint32_t allocated_len;
1334 bool failed;
1335 bool persistent;
1338 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1340 struct recdb_data *params = (struct recdb_data *)p;
1341 struct ctdb_rec_data *recdata;
1342 struct ctdb_ltdb_header *hdr;
1345 * skip empty records - but NOT for persistent databases:
1347 * The record-by-record mode of recovery deletes empty records.
1348 * For persistent databases, this can lead to data corruption
1349 * by deleting records that should be there:
1351 * - Assume the cluster has been running for a while.
1353 * - A record R in a persistent database has been created and
1354 * deleted a couple of times, the last operation being deletion,
1355 * leaving an empty record with a high RSN, say 10.
1357 * - Now a node N is turned off.
1359 * - This leaves the local database copy of D on N with the empty
1360 * copy of R and RSN 10. On all other nodes, the recovery has deleted
1361 * the copy of record R.
1363 * - Now the record is created again while node N is turned off.
1364 * This creates R with RSN = 1 on all nodes except for N.
1366 * - Now node N is turned on again. The following recovery will chose
1367 * the older empty copy of R due to RSN 10 > RSN 1.
1369 * ==> Hence the record is gone after the recovery.
1371 * On databases like Samba's registry, this can damage the higher-level
1372 * data structures built from the various tdb-level records.
1374 if (!params->persistent && data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1375 return 0;
1378 /* update the dmaster field to point to us */
1379 hdr = (struct ctdb_ltdb_header *)data.dptr;
1380 if (!params->persistent) {
1381 hdr->dmaster = params->ctdb->pnn;
1382 hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
1385 /* add the record to the blob ready to send to the nodes */
1386 recdata = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1387 if (recdata == NULL) {
1388 params->failed = true;
1389 return -1;
1391 if (params->len + recdata->length >= params->allocated_len) {
1392 params->allocated_len = recdata->length + params->len + params->ctdb->tunable.pulldb_preallocation_size;
1393 params->recdata = talloc_realloc_size(NULL, params->recdata, params->allocated_len);
1395 if (params->recdata == NULL) {
1396 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u\n",
1397 recdata->length + params->len));
1398 params->failed = true;
1399 return -1;
1401 params->recdata->count++;
1402 memcpy(params->len+(uint8_t *)params->recdata, recdata, recdata->length);
1403 params->len += recdata->length;
1404 talloc_free(recdata);
1406 return 0;
1410 push the recdb database out to all nodes
1412 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1413 bool persistent,
1414 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1416 struct recdb_data params;
1417 struct ctdb_marshall_buffer *recdata;
1418 TDB_DATA outdata;
1419 TALLOC_CTX *tmp_ctx;
1420 uint32_t *nodes;
1422 tmp_ctx = talloc_new(ctdb);
1423 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1425 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1426 CTDB_NO_MEMORY(ctdb, recdata);
1428 recdata->db_id = dbid;
1430 params.ctdb = ctdb;
1431 params.recdata = recdata;
1432 params.len = offsetof(struct ctdb_marshall_buffer, data);
1433 params.allocated_len = params.len;
1434 params.failed = false;
1435 params.persistent = persistent;
1437 if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
1438 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1439 talloc_free(params.recdata);
1440 talloc_free(tmp_ctx);
1441 return -1;
1444 if (params.failed) {
1445 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1446 talloc_free(params.recdata);
1447 talloc_free(tmp_ctx);
1448 return -1;
1451 recdata = params.recdata;
1453 outdata.dptr = (void *)recdata;
1454 outdata.dsize = params.len;
1456 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1457 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1458 nodes, 0,
1459 CONTROL_TIMEOUT(), false, outdata,
1460 NULL, NULL,
1461 NULL) != 0) {
1462 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1463 talloc_free(recdata);
1464 talloc_free(tmp_ctx);
1465 return -1;
1468 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1469 dbid, recdata->count));
1471 talloc_free(recdata);
1472 talloc_free(tmp_ctx);
1474 return 0;
1479 go through a full recovery on one database
1481 static int recover_database(struct ctdb_recoverd *rec,
1482 TALLOC_CTX *mem_ctx,
1483 uint32_t dbid,
1484 bool persistent,
1485 uint32_t pnn,
1486 struct ctdb_node_map *nodemap,
1487 uint32_t transaction_id)
1489 struct tdb_wrap *recdb;
1490 int ret;
1491 struct ctdb_context *ctdb = rec->ctdb;
1492 TDB_DATA data;
1493 struct ctdb_control_wipe_database w;
1494 uint32_t *nodes;
1496 recdb = create_recdb(ctdb, mem_ctx);
1497 if (recdb == NULL) {
1498 return -1;
1501 /* pull all remote databases onto the recdb */
1502 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1503 if (ret != 0) {
1504 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1505 return -1;
1508 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1510 /* wipe all the remote databases. This is safe as we are in a transaction */
1511 w.db_id = dbid;
1512 w.transaction_id = transaction_id;
1514 data.dptr = (void *)&w;
1515 data.dsize = sizeof(w);
1517 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1518 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1519 nodes, 0,
1520 CONTROL_TIMEOUT(), false, data,
1521 NULL, NULL,
1522 NULL) != 0) {
1523 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1524 talloc_free(recdb);
1525 return -1;
1528 /* push out the correct database. This sets the dmaster and skips
1529 the empty records */
1530 ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1531 if (ret != 0) {
1532 talloc_free(recdb);
1533 return -1;
1536 /* all done with this database */
1537 talloc_free(recdb);
1539 return 0;
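/*
 * How this is driven from do_recovery() further down (copied shape, not new
 * behaviour):
 *
 *	ret = recover_database(rec, mem_ctx,
 *			       dbmap->dbs[i].dbid,
 *			       dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
 *			       pnn, nodemap, generation);
 *
 * i.e. for every database: pull all remote copies into a throw-away
 * recdb.tdb, wipe the database on every active node under the recovery
 * transaction, then push the merged contents back with CTDB_CONTROL_PUSH_DB.
 */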
1542 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1543 struct ctdb_recoverd *rec,
1544 struct ctdb_node_map *nodemap,
1545 uint32_t *culprit)
1547 int j;
1548 int ret;
1550 if (ctdb->num_nodes != nodemap->num) {
1551 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1552 ctdb->num_nodes, nodemap->num));
1553 if (culprit) {
1554 *culprit = ctdb->pnn;
1556 return -1;
1559 for (j=0; j<nodemap->num; j++) {
1560 /* For readability */
1561 struct ctdb_node *node = ctdb->nodes[j];
1563 /* release any existing data */
1564 if (node->known_public_ips) {
1565 talloc_free(node->known_public_ips);
1566 node->known_public_ips = NULL;
1568 if (node->available_public_ips) {
1569 talloc_free(node->available_public_ips);
1570 node->available_public_ips = NULL;
1573 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1574 continue;
1577 /* Retrieve the list of known public IPs from the node */
1578 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1579 CONTROL_TIMEOUT(),
1580 node->pnn,
1581 ctdb->nodes,
1583 &node->known_public_ips);
1584 if (ret != 0) {
1585 DEBUG(DEBUG_ERR,
1586 ("Failed to read known public IPs from node: %u\n",
1587 node->pnn));
1588 if (culprit) {
1589 *culprit = node->pnn;
1591 return -1;
1594 if (ctdb->do_checkpublicip &&
1595 !ctdb_op_is_disabled(rec->takeover_run) &&
1596 verify_remote_ip_allocation(ctdb,
1597 node->known_public_ips,
1598 node->pnn)) {
1599 DEBUG(DEBUG_ERR,("Trigger IP reallocation\n"));
1600 rec->need_takeover_run = true;
1603 /* Retrieve the list of available public IPs from the node */
1604 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1605 CONTROL_TIMEOUT(),
1606 node->pnn,
1607 ctdb->nodes,
1608 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1609 &node->available_public_ips);
1610 if (ret != 0) {
1611 DEBUG(DEBUG_ERR,
1612 ("Failed to read available public IPs from node: %u\n",
1613 node->pnn));
1614 if (culprit) {
1615 *culprit = node->pnn;
1617 return -1;
1621 return 0;
1624 /* when we start a recovery, make sure all nodes use the same reclock file
1625 setting
1627 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
1629 struct ctdb_context *ctdb = rec->ctdb;
1630 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
1631 TDB_DATA data;
1632 uint32_t *nodes;
1634 if (ctdb->recovery_lock_file == NULL) {
1635 data.dptr = NULL;
1636 data.dsize = 0;
1637 } else {
1638 data.dsize = strlen(ctdb->recovery_lock_file) + 1;
1639 data.dptr = (uint8_t *)ctdb->recovery_lock_file;
1642 nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
1643 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
1644 nodes, 0,
1645 CONTROL_TIMEOUT(),
1646 false, data,
1647 NULL, NULL,
1648 rec) != 0) {
1649 DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
1650 talloc_free(tmp_ctx);
1651 return -1;
1654 talloc_free(tmp_ctx);
1655 return 0;
1660 * this callback is called for every node that failed to execute ctdb_takeover_run()
1661 * and set flag to re-run takeover run.
1663 static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
1665 DEBUG(DEBUG_ERR, ("Node %u failed the takeover run\n", node_pnn));
1667 if (callback_data != NULL) {
1668 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
1670 DEBUG(DEBUG_ERR, ("Setting node %u as recovery fail culprit\n", node_pnn));
1672 ctdb_set_culprit(rec, node_pnn);
1677 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
1679 struct ctdb_context *ctdb = rec->ctdb;
1680 int i;
1681 struct ctdb_banning_state *ban_state;
1683 *self_ban = false;
1684 for (i=0; i<ctdb->num_nodes; i++) {
1685 if (ctdb->nodes[i]->ban_state == NULL) {
1686 continue;
1688 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1689 if (ban_state->count < 2*ctdb->num_nodes) {
1690 continue;
1693 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
1694 ctdb->nodes[i]->pnn, ban_state->count,
1695 ctdb->tunable.recovery_ban_period));
1696 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1697 ban_state->count = 0;
1699 /* Banning ourself? */
1700 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
1701 *self_ban = true;
1706 static bool do_takeover_run(struct ctdb_recoverd *rec,
1707 struct ctdb_node_map *nodemap,
1708 bool banning_credits_on_fail)
1710 uint32_t *nodes = NULL;
1711 struct srvid_request_data dtr;
1712 TDB_DATA data;
1713 int i;
1714 uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
1715 int ret;
1716 bool ok;
1718 DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
1720 if (ctdb_op_is_in_progress(rec->takeover_run)) {
1721 DEBUG(DEBUG_ERR, (__location__
1722 " takeover run already in progress \n"));
1723 ok = false;
1724 goto done;
1727 if (!ctdb_op_begin(rec->takeover_run)) {
1728 ok = false;
1729 goto done;
1732 /* Disable IP checks (takeover runs, really) on other nodes
1733 * while doing this takeover run. This will stop those other
1734 nodes from triggering takeover runs when they think they should
1735 * be hosting an IP but it isn't yet on an interface. Don't
1736 * wait for replies since a failure here might cause some
1737 * noise in the logs but will not actually cause a problem.
1739 dtr.srvid = 0; /* No reply */
1740 dtr.pnn = -1;
1742 data.dptr = (uint8_t*)&dtr;
1743 data.dsize = sizeof(dtr);
1745 nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
1747 /* Disable for 60 seconds. This can be a tunable later if
1748 * necessary.
1750 dtr.data = 60;
1751 for (i = 0; i < talloc_array_length(nodes); i++) {
1752 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1753 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1754 data) != 0) {
1755 DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
1759 ret = ctdb_takeover_run(rec->ctdb, nodemap,
1760 rec->force_rebalance_nodes,
1761 takeover_fail_callback,
1762 banning_credits_on_fail ? rec : NULL);
1764 /* Reenable takeover runs and IP checks on other nodes */
1765 dtr.data = 0;
1766 for (i = 0; i < talloc_array_length(nodes); i++) {
1767 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1768 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1769 data) != 0) {
1770 DEBUG(DEBUG_INFO,("Failed to reenable takeover runs\n"));
1774 if (ret != 0) {
1775 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
1776 ok = false;
1777 goto done;
1780 ok = true;
1781 /* Takeover run was successful so clear force rebalance targets */
1782 if (rebalance_nodes == rec->force_rebalance_nodes) {
1783 TALLOC_FREE(rec->force_rebalance_nodes);
1784 } else {
1785 DEBUG(DEBUG_WARNING,
1786 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1788 done:
1789 rec->need_takeover_run = !ok;
1790 talloc_free(nodes);
1791 ctdb_op_end(rec->takeover_run);
1793 DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
1794 return ok;
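/*
 * While the IP layout is recalculated, the other connected nodes are asked
 * to suppress their own takeover runs by sending data = 60 (seconds) to
 * CTDB_SRVID_DISABLE_TAKEOVER_RUNS with srvid == 0 (no reply expected), and
 * data = 0 afterwards to re-enable them.  A failure to deliver either
 * message is only logged; the takeover run itself proceeds regardless.
 */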
1799 we are the recmaster, and recovery is needed - start a recovery run
1801 static int do_recovery(struct ctdb_recoverd *rec,
1802 TALLOC_CTX *mem_ctx, uint32_t pnn,
1803 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1805 struct ctdb_context *ctdb = rec->ctdb;
1806 int i, j, ret;
1807 uint32_t generation;
1808 struct ctdb_dbid_map *dbmap;
1809 TDB_DATA data;
1810 uint32_t *nodes;
1811 struct timeval start_time;
1812 uint32_t culprit = (uint32_t)-1;
1813 bool self_ban;
1815 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1817 /* if recovery fails, force it again */
1818 rec->need_recovery = true;
1820 if (!ctdb_op_begin(rec->recovery)) {
1821 return -1;
1824 if (rec->election_timeout) {
1825 /* an election is in progress */
1826 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
1827 goto fail;
1830 ban_misbehaving_nodes(rec, &self_ban);
1831 if (self_ban) {
1832 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
1833 goto fail;
1836 if (ctdb->recovery_lock_file != NULL) {
1837 if (ctdb_recovery_have_lock(ctdb)) {
1838 DEBUG(DEBUG_NOTICE, ("Already holding recovery lock\n"));
1839 } else {
1840 start_time = timeval_current();
1841 DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock (%s)\n",
1842 ctdb->recovery_lock_file));
1843 if (!ctdb_recovery_lock(ctdb)) {
1844 if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
1845 /* If ctdb is trying first recovery, it's
1846 * possible that current node does not know
1847 * yet who the recmaster is.
1849 DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
1850 " - retrying recovery\n"));
1851 goto fail;
1854 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
1855 "and ban ourself for %u seconds\n",
1856 ctdb->tunable.recovery_ban_period));
1857 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
1858 goto fail;
1860 ctdb_ctrl_report_recd_lock_latency(ctdb,
1861 CONTROL_TIMEOUT(),
1862 timeval_elapsed(&start_time));
1863 DEBUG(DEBUG_NOTICE,
1864 ("Recovery lock taken successfully by recovery daemon\n"));
1868 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1870 /* get a list of all databases */
1871 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1872 if (ret != 0) {
1873 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1874 goto fail;
1877 /* we do the db creation before we set the recovery mode, so the freeze happens
1878 on all databases we will be dealing with. */
1880 /* verify that we have all the databases any other node has */
1881 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1882 if (ret != 0) {
1883 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1884 goto fail;
1887 /* verify that all other nodes have all our databases */
1888 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1889 if (ret != 0) {
1890 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1891 goto fail;
1893 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1895 /* update the database priority for all remote databases */
1896 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1897 if (ret != 0) {
1898 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1900 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1903 /* update all other nodes to use the same setting for reclock files
1904 as the local recovery master.
1906 sync_recovery_lock_file_across_cluster(rec);
1908 /* set recovery mode to active on all nodes */
1909 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1910 if (ret != 0) {
1911 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1912 goto fail;
1915 /* execute the "startrecovery" event script on all nodes */
1916 ret = run_startrecovery_eventscript(rec, nodemap);
1917 if (ret!=0) {
1918 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1919 goto fail;
1923 update all nodes to have the same flags that we have
1925 for (i=0;i<nodemap->num;i++) {
1926 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1927 continue;
1930 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1931 if (ret != 0) {
1932 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1933 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
1934 } else {
1935 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1936 goto fail;
1941 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1943 /* pick a new generation number */
1944 generation = new_generation();
1946 /* change the vnnmap on this node to use the new generation
1947 number but not on any other nodes.
1948 this guarantees that if we abort the recovery prematurely
1949 for some reason (a node stops responding?)
1950 that we can just return immediately and we will reenter
1951 recovery shortly again.
1952 I.e. we deliberately leave the cluster with an inconsistent
1953 generation id to allow us to abort recovery at any stage and
1954 just restart it from scratch.
1956 vnnmap->generation = generation;
1957 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1958 if (ret != 0) {
1959 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1960 goto fail;
1963 data.dptr = (void *)&generation;
1964 data.dsize = sizeof(uint32_t);
1966 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1967 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1968 nodes, 0,
1969 CONTROL_TIMEOUT(), false, data,
1970 NULL,
1971 transaction_start_fail_callback,
1972 rec) != 0) {
1973 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1974 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1975 nodes, 0,
1976 CONTROL_TIMEOUT(), false, tdb_null,
1977 NULL,
1978 NULL,
1979 NULL) != 0) {
1980 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1982 goto fail;
1985 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
1987 for (i=0;i<dbmap->num;i++) {
1988 ret = recover_database(rec, mem_ctx,
1989 dbmap->dbs[i].dbid,
1990 dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
1991 pnn, nodemap, generation);
1992 if (ret != 0) {
1993 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1994 goto fail;
1998 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
2000 /* commit all the changes */
2001 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
2002 nodes, 0,
2003 CONTROL_TIMEOUT(), false, data,
2004 NULL, NULL,
2005 NULL) != 0) {
2006 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
2007 goto fail;
2010 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
2013 /* update the capabilities for all nodes */
2014 ret = update_capabilities(rec, nodemap);
2015 if (ret!=0) {
2016 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2017 goto fail;
2020 /* build a new vnn map with all the currently active and
2021 unbanned nodes */
2022 generation = new_generation();
2023 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
2024 CTDB_NO_MEMORY(ctdb, vnnmap);
2025 vnnmap->generation = generation;
2026 vnnmap->size = 0;
2027 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
2028 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2029 for (i=j=0;i<nodemap->num;i++) {
2030 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2031 continue;
2033 if (!ctdb_node_has_capabilities(rec->caps,
2034 ctdb->nodes[i]->pnn,
2035 CTDB_CAP_LMASTER)) {
2036 /* this node cannot be an lmaster */
2037 DEBUG(DEBUG_DEBUG, ("Node %d can't be an LMASTER, skipping it\n", i));
2038 continue;
2041 vnnmap->size++;
2042 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
2043 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2044 vnnmap->map[j++] = nodemap->nodes[i].pnn;
2047 if (vnnmap->size == 0) {
2048 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
2049 vnnmap->size++;
2050 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
2051 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2052 vnnmap->map[0] = pnn;
2055 /* update to the new vnnmap on all nodes */
2056 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
2057 if (ret != 0) {
2058 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
2059 goto fail;
2062 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
2064 /* update recmaster to point to us for all nodes */
2065 ret = set_recovery_master(ctdb, nodemap, pnn);
2066 if (ret!=0) {
2067 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
2068 goto fail;
2071 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
2073 /* disable recovery mode */
2074 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
2075 if (ret != 0) {
2076 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
2077 goto fail;
2080 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
2082 /* Fetch known/available public IPs from each active node */
2083 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
2084 if (ret != 0) {
2085 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2086 culprit));
2087 rec->need_takeover_run = true;
2088 goto fail;
2091 do_takeover_run(rec, nodemap, false);
2093 /* execute the "recovered" event script on all nodes */
2094 ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
2095 if (ret!=0) {
2096 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
2097 goto fail;
2100 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
2102 /* send a message to all clients telling them that the cluster
2103 has been reconfigured */
2104 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2105 CTDB_SRVID_RECONFIGURE, tdb_null);
2106 if (ret != 0) {
2107 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
2108 goto fail;
2111 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
2113 rec->need_recovery = false;
2114 ctdb_op_end(rec->recovery);
2116 /* we managed to complete a full recovery, make sure to forgive
2117 any past sins by the nodes that could now participate in the
2118 recovery.
2120 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
2121 for (i=0;i<nodemap->num;i++) {
2122 struct ctdb_banning_state *ban_state;
2124 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2125 continue;
2128 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
2129 if (ban_state == NULL) {
2130 continue;
2133 ban_state->count = 0;
2136 /* We just finished a recovery successfully.
2137 We now wait for rerecovery_timeout before we allow
2138 another recovery to take place.
2140 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
2141 ctdb_op_disable(rec->recovery, ctdb->ev,
2142 ctdb->tunable.rerecovery_timeout);
2143 return 0;
2145 fail:
2146 ctdb_op_end(rec->recovery);
2147 return -1;
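/* A recap of the sequence above: recovery mode is set to ACTIVE
 * everywhere, the 'startrecovery' event is run, node flags are
 * synchronised across the cluster, a new generation is installed in
 * the local vnnmap only, a transaction is started on all active
 * nodes, every database is recovered and the transaction committed, a
 * fresh vnnmap is built from the active lmaster-capable nodes and
 * pushed out together with the recmaster setting, recovery mode goes
 * back to NORMAL, public IPs are reloaded and a takeover run is done,
 * the 'recovered' event is run, clients are told about the
 * reconfiguration, ban counts are reset, and further recoveries are
 * suppressed for rerecovery_timeout seconds.  Any failure jumps to
 * 'fail' and the whole procedure is retried on a later pass.
 */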
2152 elections are won by first checking the number of connected nodes, then
2153 the priority time, then the pnn
2155 struct election_message {
2156 uint32_t num_connected;
2157 struct timeval priority_time;
2158 uint32_t pnn;
2159 uint32_t node_flags;
2163 form this node's election data
2165 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
2167 int ret, i;
2168 struct ctdb_node_map *nodemap;
2169 struct ctdb_context *ctdb = rec->ctdb;
2171 ZERO_STRUCTP(em);
2173 em->pnn = rec->ctdb->pnn;
2174 em->priority_time = rec->priority_time;
2176 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
2177 if (ret != 0) {
2178 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
2179 return;
2182 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
2183 em->node_flags = rec->node_flags;
2185 for (i=0;i<nodemap->num;i++) {
2186 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2187 em->num_connected++;
2191 /* we shouldn't try to win this election if we can't be a recmaster */
2192 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2193 em->num_connected = 0;
2194 em->priority_time = timeval_current();
2197 talloc_free(nodemap);
2201 see if the given election data wins
2203 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
2205 struct election_message myem;
2206 int cmp = 0;
2208 ctdb_election_data(rec, &myem);
2210 /* we can't win if we don't have the recmaster capability */
2211 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2212 return false;
2215 /* we can't win if we are banned */
2216 if (rec->node_flags & NODE_FLAGS_BANNED) {
2217 return false;
2220 /* we can't win if we are stopped */
2221 if (rec->node_flags & NODE_FLAGS_STOPPED) {
2222 return false;
2225 /* we will automatically win if the other node is banned */
2226 if (em->node_flags & NODE_FLAGS_BANNED) {
2227 return true;
2230 /* we will automatically win if the other node is stopped */
2231 if (em->node_flags & NODE_FLAGS_STOPPED) {
2232 return true;
2235 /* try to use the most connected node */
2236 if (cmp == 0) {
2237 cmp = (int)myem.num_connected - (int)em->num_connected;
2240 /* then the longest running node */
2241 if (cmp == 0) {
2242 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
2245 if (cmp == 0) {
2246 cmp = (int)myem.pnn - (int)em->pnn;
2249 return cmp > 0;
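/* A worked example of the ordering above (made-up values): a node
 * reporting num_connected=4 beats one reporting 3 outright.  If both
 * report 4, the node whose recovery daemon has the earlier
 * priority_time (the one that has been running longest) wins.  If
 * even those match, the PNNs break the tie (with the comparison
 * above, the higher PNN wins), so the outcome is always
 * deterministic.
 */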
2253 send out an election request
2255 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
2257 int ret;
2258 TDB_DATA election_data;
2259 struct election_message emsg;
2260 uint64_t srvid;
2261 struct ctdb_context *ctdb = rec->ctdb;
2263 srvid = CTDB_SRVID_RECOVERY;
2265 ctdb_election_data(rec, &emsg);
2267 election_data.dsize = sizeof(struct election_message);
2268 election_data.dptr = (unsigned char *)&emsg;
2271 /* first we assume we will win the election and set the
2272 recovery master to be ourselves on the current node
2274 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
2275 if (ret != 0) {
2276 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
2277 return -1;
2281 /* send an election message to all active nodes */
2282 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
2283 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
2287 this function will unban all nodes in the cluster
2289 static void unban_all_nodes(struct ctdb_context *ctdb)
2291 int ret, i;
2292 struct ctdb_node_map *nodemap;
2293 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2295 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2296 if (ret != 0) {
2297 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
2298 return;
2301 for (i=0;i<nodemap->num;i++) {
2302 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
2303 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
2304 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(),
2305 nodemap->nodes[i].pnn, 0,
2306 NODE_FLAGS_BANNED);
2307 if (ret != 0) {
2308 DEBUG(DEBUG_ERR, (__location__ " failed to reset ban state\n"));
2313 talloc_free(tmp_ctx);
2318 we think we are winning the election - send a broadcast election request
2320 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
2322 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2323 int ret;
2325 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
2326 if (ret != 0) {
2327 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
2330 talloc_free(rec->send_election_te);
2331 rec->send_election_te = NULL;
2335 handler for memory dumps
2337 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
2338 TDB_DATA data, void *private_data)
2340 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2341 TDB_DATA *dump;
2342 int ret;
2343 struct srvid_request *rd;
2345 if (data.dsize != sizeof(struct srvid_request)) {
2346 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2347 talloc_free(tmp_ctx);
2348 return;
2350 rd = (struct srvid_request *)data.dptr;
2352 dump = talloc_zero(tmp_ctx, TDB_DATA);
2353 if (dump == NULL) {
2354 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
2355 talloc_free(tmp_ctx);
2356 return;
2358 ret = ctdb_dump_memory(ctdb, dump);
2359 if (ret != 0) {
2360 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
2361 talloc_free(tmp_ctx);
2362 return;
2365 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
2367 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
2368 if (ret != 0) {
2369 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
2370 talloc_free(tmp_ctx);
2371 return;
2374 talloc_free(tmp_ctx);
2378 handler for reload_nodes
2380 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
2381 TDB_DATA data, void *private_data)
2383 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2385 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
2387 ctdb_load_nodes_file(rec->ctdb);
2391 static void ctdb_rebalance_timeout(struct event_context *ev,
2392 struct timed_event *te,
2393 struct timeval t, void *p)
2395 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2397 if (rec->force_rebalance_nodes == NULL) {
2398 DEBUG(DEBUG_ERR,
2399 ("Rebalance timeout occurred - no nodes to rebalance\n"));
2400 return;
2403 DEBUG(DEBUG_NOTICE,
2404 ("Rebalance timeout occurred - do takeover run\n"));
2405 do_takeover_run(rec, rec->nodemap, false);
2409 static void recd_node_rebalance_handler(struct ctdb_context *ctdb,
2410 uint64_t srvid,
2411 TDB_DATA data, void *private_data)
2413 uint32_t pnn;
2414 uint32_t *t;
2415 int len;
2416 uint32_t deferred_rebalance;
2417 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2419 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
2420 return;
2423 if (data.dsize != sizeof(uint32_t)) {
2424 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
2425 return;
2428 pnn = *(uint32_t *)&data.dptr[0];
2430 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
2432 /* Copy any existing list of nodes. There's probably some
2433 * sort of realloc variant that will do this but we need to
2434 * make sure that freeing the old array also cancels the timer
2435 * event for the timeout... not sure if realloc will do that.
2437 len = (rec->force_rebalance_nodes != NULL) ?
2438 talloc_array_length(rec->force_rebalance_nodes) :
2441 /* This allows duplicates to be added but they don't cause
2442 * harm. A call to add a duplicate PNN arguably means that
2443 * the timeout should be reset, so this is the simplest
2444 * solution.
2446 t = talloc_zero_array(rec, uint32_t, len+1);
2447 CTDB_NO_MEMORY_VOID(ctdb, t);
2448 if (len > 0) {
2449 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
2451 t[len] = pnn;
2453 talloc_free(rec->force_rebalance_nodes);
2455 rec->force_rebalance_nodes = t;
2457 /* If configured, set up a deferred takeover run to make sure
2458 * that certain nodes get IPs rebalanced to them. This will
2459 * be cancelled if a successful takeover run happens before
2460 * the timeout. Assign tunable value to variable for
2461 * readability.
2463 deferred_rebalance = ctdb->tunable.deferred_rebalance_on_node_add;
2464 if (deferred_rebalance != 0) {
2465 event_add_timed(ctdb->ev, rec->force_rebalance_nodes,
2466 timeval_current_ofs(deferred_rebalance, 0),
2467 ctdb_rebalance_timeout, rec);
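/* A note on ownership as the code above is written: the timer event
 * is allocated with rec->force_rebalance_nodes as its talloc parent,
 * so a later talloc_free() of that array (for instance when a
 * successful takeover run clears the list, as described above) also
 * destroys the pending timeout.  That is why the handler copies the
 * array into a fresh allocation rather than using a realloc-style
 * call.
 */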
2473 static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid,
2474 TDB_DATA data, void *private_data)
2476 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2477 struct ctdb_public_ip *ip;
2479 if (rec->recmaster != rec->ctdb->pnn) {
2480 DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
2481 return;
2484 if (data.dsize != sizeof(struct ctdb_public_ip)) {
2485 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
2486 return;
2489 ip = (struct ctdb_public_ip *)data.dptr;
2491 update_ip_assignment_tree(rec->ctdb, ip);
2494 static void srvid_disable_and_reply(struct ctdb_context *ctdb,
2495 TDB_DATA data,
2496 struct ctdb_op_state *op_state)
2498 struct srvid_request_data *r;
2499 uint32_t timeout;
2500 TDB_DATA result;
2501 int32_t ret = 0;
2503 /* Validate input data */
2504 if (data.dsize != sizeof(struct srvid_request_data)) {
2505 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2506 "expecting %lu\n", (long unsigned)data.dsize,
2507 (long unsigned)sizeof(struct srvid_request_data)));
2508 return;
2510 if (data.dptr == NULL) {
2511 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2512 return;
2515 r = (struct srvid_request_data *)data.dptr;
2516 timeout = r->data;
2518 ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
2519 if (ret != 0) {
2520 goto done;
2523 /* Returning our PNN tells the caller that we succeeded */
2524 ret = ctdb_get_pnn(ctdb);
2525 done:
2526 result.dsize = sizeof(int32_t);
2527 result.dptr = (uint8_t *)&ret;
2528 srvid_request_reply(ctdb, (struct srvid_request *)r, result);
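/* The reply convention used here, matching the code above: the
 * request carries the disable timeout (in seconds) in the .data field
 * of a struct srvid_request_data, and the int32 reply payload is this
 * node's PNN on success or the non-zero result of ctdb_op_disable()
 * on failure.
 */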
2531 static void disable_takeover_runs_handler(struct ctdb_context *ctdb,
2532 uint64_t srvid, TDB_DATA data,
2533 void *private_data)
2535 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2536 struct ctdb_recoverd);
2538 srvid_disable_and_reply(ctdb, data, rec->takeover_run);
2541 /* Backward compatibility for this SRVID */
2542 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
2543 TDB_DATA data, void *private_data)
2545 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2546 struct ctdb_recoverd);
2547 uint32_t timeout;
2549 if (data.dsize != sizeof(uint32_t)) {
2550 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2551 "expecting %lu\n", (long unsigned)data.dsize,
2552 (long unsigned)sizeof(uint32_t)));
2553 return;
2555 if (data.dptr == NULL) {
2556 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2557 return;
2560 timeout = *((uint32_t *)data.dptr);
2562 ctdb_op_disable(rec->takeover_run, ctdb->ev, timeout);
2565 static void disable_recoveries_handler(struct ctdb_context *ctdb,
2566 uint64_t srvid, TDB_DATA data,
2567 void *private_data)
2569 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2570 struct ctdb_recoverd);
2572 srvid_disable_and_reply(ctdb, data, rec->recovery);
2576 handler for ip reallocate, just add it to the list of requests and
2577 handle this later in the monitor_cluster loop so we do not recurse
2578 with other requests to takeover_run()
2580 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
2581 TDB_DATA data, void *private_data)
2583 struct srvid_request *request;
2584 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2585 struct ctdb_recoverd);
2587 if (data.dsize != sizeof(struct srvid_request)) {
2588 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2589 return;
2592 request = (struct srvid_request *)data.dptr;
2594 srvid_request_add(ctdb, &rec->reallocate_requests, request);
2597 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
2598 struct ctdb_recoverd *rec)
2600 TDB_DATA result;
2601 int32_t ret;
2602 uint32_t culprit;
2603 struct srvid_requests *current;
2605 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
2607 /* Only process requests that are currently pending. More
2608 * might come in while the takeover run is in progress and
2609 * they will need to be processed later since they might
2610 be in response to flag changes.
2612 current = rec->reallocate_requests;
2613 rec->reallocate_requests = NULL;
2615 /* update the list of public ips that a node can handle for
2616 all connected nodes
2618 ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
2619 if (ret != 0) {
2620 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2621 culprit));
2622 rec->need_takeover_run = true;
2624 if (ret == 0) {
2625 if (do_takeover_run(rec, rec->nodemap, false)) {
2626 ret = ctdb_get_pnn(ctdb);
2627 } else {
2628 ret = -1;
2632 result.dsize = sizeof(int32_t);
2633 result.dptr = (uint8_t *)&ret;
2635 srvid_requests_reply(ctdb, &current, result);
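/* Note the queue handling above: rec->reallocate_requests is detached
 * and reset to NULL before the takeover run starts, so requests that
 * arrive while the run is in progress are answered on a later pass
 * rather than with a possibly stale result.  Every detached request
 * then gets the same reply: this node's PNN on success, -1 on
 * failure.
 */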
2640 handler for recovery master elections
2642 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
2643 TDB_DATA data, void *private_data)
2645 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2646 int ret;
2647 struct election_message *em = (struct election_message *)data.dptr;
2649 /* Ignore election packets from ourself */
2650 if (ctdb->pnn == em->pnn) {
2651 return;
2654 /* we got an election packet - update the timeout for the election */
2655 talloc_free(rec->election_timeout);
2656 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2657 fast_start ?
2658 timeval_current_ofs(0, 500000) :
2659 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2660 ctdb_election_timeout, rec);
2662 /* someone called an election. check their election data
2663 and if we disagree and we would rather be the elected node,
2664 send a new election message to all other nodes
2666 if (ctdb_election_win(rec, em)) {
2667 if (!rec->send_election_te) {
2668 rec->send_election_te = event_add_timed(ctdb->ev, rec,
2669 timeval_current_ofs(0, 500000),
2670 election_send_request, rec);
2672 /*unban_all_nodes(ctdb);*/
2673 return;
2676 /* we didn't win */
2677 TALLOC_FREE(rec->send_election_te);
2679 /* Release the recovery lock file */
2680 if (ctdb_recovery_have_lock(ctdb)) {
2681 ctdb_recovery_unlock(ctdb);
2682 unban_all_nodes(ctdb);
2685 clear_ip_assignment_tree(ctdb);
2687 /* ok, let that guy become recmaster then */
2688 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
2689 if (ret != 0) {
2690 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
2691 return;
2694 return;
2699 force the start of the election process
2701 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2702 struct ctdb_node_map *nodemap)
2704 int ret;
2705 struct ctdb_context *ctdb = rec->ctdb;
2707 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2709 /* set all nodes to recovery mode to stop all internode traffic */
2710 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
2711 if (ret != 0) {
2712 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
2713 return;
2716 talloc_free(rec->election_timeout);
2717 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2718 fast_start ?
2719 timeval_current_ofs(0, 500000) :
2720 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2721 ctdb_election_timeout, rec);
2723 ret = send_election_request(rec, pnn);
2724 if (ret!=0) {
2725 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2726 return;
2729 /* wait for a few seconds to collect all responses */
2730 ctdb_wait_election(rec);
2736 handler for when a node changes its flags
2738 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
2739 TDB_DATA data, void *private_data)
2741 int ret;
2742 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2743 struct ctdb_node_map *nodemap=NULL;
2744 TALLOC_CTX *tmp_ctx;
2745 int i;
2746 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2747 int disabled_flag_changed;
2749 if (data.dsize != sizeof(*c)) {
2750 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2751 return;
2754 tmp_ctx = talloc_new(ctdb);
2755 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2757 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2758 if (ret != 0) {
2759 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2760 talloc_free(tmp_ctx);
2761 return;
2765 for (i=0;i<nodemap->num;i++) {
2766 if (nodemap->nodes[i].pnn == c->pnn) break;
2769 if (i == nodemap->num) {
2770 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existent node %u\n", c->pnn));
2771 talloc_free(tmp_ctx);
2772 return;
2775 if (c->old_flags != c->new_flags) {
2776 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
2779 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2781 nodemap->nodes[i].flags = c->new_flags;
2783 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2784 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2786 if (ret == 0) {
2787 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2788 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2791 if (ret == 0 &&
2792 ctdb->recovery_master == ctdb->pnn &&
2793 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2794 /* Only do the takeover run if the perm disabled or unhealthy
2795 flags changed since these will cause an ip failover but not
2796 a recovery.
2797 If the node became disconnected or banned this will also
2798 lead to an ip address failover but that is handled
2799 during recovery
2801 if (disabled_flag_changed) {
2802 rec->need_takeover_run = true;
2806 talloc_free(tmp_ctx);
2810 handler for when we need to push out flag changes to all other nodes
2812 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2813 TDB_DATA data, void *private_data)
2815 int ret;
2816 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2817 struct ctdb_node_map *nodemap=NULL;
2818 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2819 uint32_t recmaster;
2820 uint32_t *nodes;
2822 /* find the recovery master */
2823 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
2824 if (ret != 0) {
2825 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
2826 talloc_free(tmp_ctx);
2827 return;
2830 /* read the node flags from the recmaster */
2831 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
2832 if (ret != 0) {
2833 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2834 talloc_free(tmp_ctx);
2835 return;
2837 if (c->pnn >= nodemap->num) {
2838 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2839 talloc_free(tmp_ctx);
2840 return;
2843 /* send the flags update to all connected nodes */
2844 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2846 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2847 nodes, 0, CONTROL_TIMEOUT(),
2848 false, data,
2849 NULL, NULL,
2850 NULL) != 0) {
2851 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2853 talloc_free(tmp_ctx);
2854 return;
2857 talloc_free(tmp_ctx);
2861 struct verify_recmode_normal_data {
2862 uint32_t count;
2863 enum monitor_result status;
2866 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2868 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2871 /* one more node has responded with recmode data */
2872 rmdata->count--;
2874 /* if we failed to get the recmode, then return an error and let
2875 the main loop try again.
2877 if (state->state != CTDB_CONTROL_DONE) {
2878 if (rmdata->status == MONITOR_OK) {
2879 rmdata->status = MONITOR_FAILED;
2881 return;
2884 /* if we got a response, then the recmode will be stored in the
2885 status field
2887 if (state->status != CTDB_RECOVERY_NORMAL) {
2888 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2889 rmdata->status = MONITOR_RECOVERY_NEEDED;
2892 return;
2896 /* verify that all nodes are in normal recovery mode */
2897 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2899 struct verify_recmode_normal_data *rmdata;
2900 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2901 struct ctdb_client_control_state *state;
2902 enum monitor_result status;
2903 int j;
2905 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2906 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2907 rmdata->count = 0;
2908 rmdata->status = MONITOR_OK;
2910 /* loop over all active nodes and send an async getrecmode call to
2911 them*/
2912 for (j=0; j<nodemap->num; j++) {
2913 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2914 continue;
2916 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2917 CONTROL_TIMEOUT(),
2918 nodemap->nodes[j].pnn);
2919 if (state == NULL) {
2920 /* we failed to send the control, treat this as
2921 an error and try again next iteration
2923 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2924 talloc_free(mem_ctx);
2925 return MONITOR_FAILED;
2928 /* set up the callback functions */
2929 state->async.fn = verify_recmode_normal_callback;
2930 state->async.private_data = rmdata;
2932 /* one more control to wait for to complete */
2933 rmdata->count++;
2937 /* now wait for up to the maximum number of seconds allowed
2938 or until all nodes we expect a response from have replied
2940 while (rmdata->count > 0) {
2941 event_loop_once(ctdb->ev);
2944 status = rmdata->status;
2945 talloc_free(mem_ctx);
2946 return status;
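/* A sketch of the pattern used above (and again in verify_recmaster()
 * below): a getrecmode control is sent asynchronously to every active
 * node, rmdata->count tracks how many replies are still outstanding,
 * and event_loop_once() is pumped until the callbacks have brought
 * the counter back to zero.  A failed control or an unexpected
 * recovery mode downgrades rmdata->status, which is what the caller
 * acts on.
 */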
2950 struct verify_recmaster_data {
2951 struct ctdb_recoverd *rec;
2952 uint32_t count;
2953 uint32_t pnn;
2954 enum monitor_result status;
2957 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2959 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2962 /* one more node has responded with recmaster data */
2963 rmdata->count--;
2965 /* if we failed to get the recmaster, then return an error and let
2966 the main loop try again.
2968 if (state->state != CTDB_CONTROL_DONE) {
2969 if (rmdata->status == MONITOR_OK) {
2970 rmdata->status = MONITOR_FAILED;
2972 return;
2975 /* if we got a response, then the recmaster will be stored in the
2976 status field
2978 if (state->status != rmdata->pnn) {
2979 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
2980 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2981 rmdata->status = MONITOR_ELECTION_NEEDED;
2984 return;
2988 /* verify that all nodes agree that we are the recmaster */
2989 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2991 struct ctdb_context *ctdb = rec->ctdb;
2992 struct verify_recmaster_data *rmdata;
2993 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2994 struct ctdb_client_control_state *state;
2995 enum monitor_result status;
2996 int j;
2998 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2999 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
3000 rmdata->rec = rec;
3001 rmdata->count = 0;
3002 rmdata->pnn = pnn;
3003 rmdata->status = MONITOR_OK;
3005 /* loop over all active nodes and send an async getrecmaster call to
3006 them*/
3007 for (j=0; j<nodemap->num; j++) {
3008 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3009 continue;
3011 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
3012 CONTROL_TIMEOUT(),
3013 nodemap->nodes[j].pnn);
3014 if (state == NULL) {
3015 /* we failed to send the control, treat this as
3016 an error and try again next iteration
3018 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
3019 talloc_free(mem_ctx);
3020 return MONITOR_FAILED;
3023 /* set up the callback functions */
3024 state->async.fn = verify_recmaster_callback;
3025 state->async.private_data = rmdata;
3027 /* one more control to wait for to complete */
3028 rmdata->count++;
3032 /* now wait for up to the maximum number of seconds allowed
3033 or until all nodes we expect a response from have replied
3035 while (rmdata->count > 0) {
3036 event_loop_once(ctdb->ev);
3039 status = rmdata->status;
3040 talloc_free(mem_ctx);
3041 return status;
3044 static bool interfaces_have_changed(struct ctdb_context *ctdb,
3045 struct ctdb_recoverd *rec)
3047 struct ctdb_control_get_ifaces *ifaces = NULL;
3048 TALLOC_CTX *mem_ctx;
3049 bool ret = false;
3051 mem_ctx = talloc_new(NULL);
3053 /* Read the interfaces from the local node */
3054 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
3055 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
3056 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
3057 /* We could return an error. However, this will be
3058 * rare so we'll decide that the interfaces have
3059 * actually changed, just in case.
3061 talloc_free(mem_ctx);
3062 return true;
3065 if (!rec->ifaces) {
3066 /* We haven't been here before so things have changed */
3067 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
3068 ret = true;
3069 } else if (rec->ifaces->num != ifaces->num) {
3070 /* Number of interfaces has changed */
3071 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
3072 rec->ifaces->num, ifaces->num));
3073 ret = true;
3074 } else {
3075 /* See if interface names or link states have changed */
3076 int i;
3077 for (i = 0; i < rec->ifaces->num; i++) {
3078 struct ctdb_control_iface_info * iface = &rec->ifaces->ifaces[i];
3079 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
3080 DEBUG(DEBUG_NOTICE,
3081 ("Interface in slot %d changed: %s => %s\n",
3082 i, iface->name, ifaces->ifaces[i].name));
3083 ret = true;
3084 break;
3086 if (iface->link_state != ifaces->ifaces[i].link_state) {
3087 DEBUG(DEBUG_NOTICE,
3088 ("Interface %s changed state: %d => %d\n",
3089 iface->name, iface->link_state,
3090 ifaces->ifaces[i].link_state));
3091 ret = true;
3092 break;
3097 talloc_free(rec->ifaces);
3098 rec->ifaces = talloc_steal(rec, ifaces);
3100 talloc_free(mem_ctx);
3101 return ret;
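/* To summarise the checks above: the interface list is treated as
 * changed on the very first fetch, when the number of interfaces
 * differs, or when the interface in any slot has a different name or
 * link state.  The newly fetched list replaces rec->ifaces in every
 * case, so the next call compares against the latest snapshot.
 */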
3104 /* called to check that the local allocation of public ip addresses is ok.
3106 static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
3108 TALLOC_CTX *mem_ctx = talloc_new(NULL);
3109 struct ctdb_uptime *uptime1 = NULL;
3110 struct ctdb_uptime *uptime2 = NULL;
3111 int ret, j;
3112 bool need_takeover_run = false;
3114 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
3115 CTDB_CURRENT_NODE, &uptime1);
3116 if (ret != 0) {
3117 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
3118 talloc_free(mem_ctx);
3119 return -1;
3122 if (interfaces_have_changed(ctdb, rec)) {
3123 DEBUG(DEBUG_NOTICE, ("The interface status has changed on "
3124 "local node %u - force takeover run\n",
3125 pnn));
3126 need_takeover_run = true;
3129 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
3130 CTDB_CURRENT_NODE, &uptime2);
3131 if (ret != 0) {
3132 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
3133 talloc_free(mem_ctx);
3134 return -1;
3137 /* skip the check if the startrecovery time has changed */
3138 if (timeval_compare(&uptime1->last_recovery_started,
3139 &uptime2->last_recovery_started) != 0) {
3140 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3141 talloc_free(mem_ctx);
3142 return 0;
3145 /* skip the check if the endrecovery time has changed */
3146 if (timeval_compare(&uptime1->last_recovery_finished,
3147 &uptime2->last_recovery_finished) != 0) {
3148 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3149 talloc_free(mem_ctx);
3150 return 0;
3153 /* skip the check if we have started but not finished recovery */
3154 if (timeval_compare(&uptime1->last_recovery_finished,
3155 &uptime1->last_recovery_started) != 1) {
3156 DEBUG(DEBUG_INFO, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
3157 talloc_free(mem_ctx);
3159 return 0;
3162 /* verify that we have the ip addresses we should have
3163 and we don't have ones we shouldn't have.
3164 if we find an inconsistency we set recmode to
3165 active on the local node and wait for the recmaster
3166 to do a full blown recovery.
3167 also if the pnn is -1 and we are healthy and can host the ip
3168 we also request an ip reallocation.
3170 if (ctdb->tunable.disable_ip_failover == 0) {
3171 struct ctdb_all_public_ips *ips = NULL;
3173 /* read the *available* IPs from the local node */
3174 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
3175 if (ret != 0) {
3176 DEBUG(DEBUG_ERR, ("Unable to get available public IPs from local node %u\n", pnn));
3177 talloc_free(mem_ctx);
3178 return -1;
3181 for (j=0; j<ips->num; j++) {
3182 if (ips->ips[j].pnn == -1 &&
3183 nodemap->nodes[pnn].flags == 0) {
3184 DEBUG(DEBUG_CRIT,("Public IP '%s' is not assigned and we could serve it\n",
3185 ctdb_addr_to_str(&ips->ips[j].addr)));
3186 need_takeover_run = true;
3190 talloc_free(ips);
3192 /* read the *known* IPs from the local node */
3193 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
3194 if (ret != 0) {
3195 DEBUG(DEBUG_ERR, ("Unable to get known public IPs from local node %u\n", pnn));
3196 talloc_free(mem_ctx);
3197 return -1;
3200 for (j=0; j<ips->num; j++) {
3201 if (ips->ips[j].pnn == pnn) {
3202 if (ctdb->do_checkpublicip && !ctdb_sys_have_ip(&ips->ips[j].addr)) {
3203 DEBUG(DEBUG_CRIT,("Public IP '%s' is assigned to us but not on an interface\n",
3204 ctdb_addr_to_str(&ips->ips[j].addr)));
3205 need_takeover_run = true;
3207 } else {
3208 if (ctdb->do_checkpublicip &&
3209 ctdb_sys_have_ip(&ips->ips[j].addr)) {
3211 DEBUG(DEBUG_CRIT,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
3212 ctdb_addr_to_str(&ips->ips[j].addr)));
3214 if (ctdb_ctrl_release_ip(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ips->ips[j]) != 0) {
3215 DEBUG(DEBUG_ERR,("Failed to release local IP address\n"));
3222 if (need_takeover_run) {
3223 struct srvid_request rd;
3224 TDB_DATA data;
3226 DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
3228 rd.pnn = ctdb->pnn;
3229 rd.srvid = 0;
3230 data.dptr = (uint8_t *)&rd;
3231 data.dsize = sizeof(rd);
3233 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
3234 if (ret != 0) {
3235 DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
3238 talloc_free(mem_ctx);
3239 return 0;
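/* Putting the checks above together: a takeover run is requested when
 * the interface status has changed, when an unassigned public IP
 * could be served by this healthy node, or when an IP assigned to
 * this node is not actually present on one of its interfaces.  An IP
 * that is configured locally but should not be served is released on
 * the spot.  The takeover run itself is not performed here; the
 * function only messages the recovery master
 * (CTDB_SRVID_TAKEOVER_RUN) and the reallocation happens in its
 * monitoring loop.
 */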
3243 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
3245 struct ctdb_node_map **remote_nodemaps = callback_data;
3247 if (node_pnn >= ctdb->num_nodes) {
3248 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
3249 return;
3252 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
3256 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
3257 struct ctdb_node_map *nodemap,
3258 struct ctdb_node_map **remote_nodemaps)
3260 uint32_t *nodes;
3262 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
3263 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
3264 nodes, 0,
3265 CONTROL_TIMEOUT(), false, tdb_null,
3266 async_getnodemap_callback,
3267 NULL,
3268 remote_nodemaps) != 0) {
3269 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
3271 return -1;
3274 return 0;
3277 static int update_recovery_lock_file(struct ctdb_context *ctdb)
3279 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
3280 const char *reclockfile;
3282 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
3283 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
3284 talloc_free(tmp_ctx);
3285 return -1;
3288 if (reclockfile == NULL) {
3289 if (ctdb->recovery_lock_file != NULL) {
3290 DEBUG(DEBUG_NOTICE,("Recovery lock file disabled\n"));
3291 talloc_free(ctdb->recovery_lock_file);
3292 ctdb->recovery_lock_file = NULL;
3293 ctdb_recovery_unlock(ctdb);
3295 talloc_free(tmp_ctx);
3296 return 0;
3299 if (ctdb->recovery_lock_file == NULL) {
3300 DEBUG(DEBUG_NOTICE,
3301 ("Recovery lock file enabled (%s)\n", reclockfile));
3302 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3303 ctdb_recovery_unlock(ctdb);
3304 talloc_free(tmp_ctx);
3305 return 0;
3309 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
3310 talloc_free(tmp_ctx);
3311 return 0;
3314 DEBUG(DEBUG_NOTICE,
3315 ("Recovery lock file changed (now %s)\n", reclockfile));
3316 talloc_free(ctdb->recovery_lock_file);
3317 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3318 ctdb_recovery_unlock(ctdb);
3320 talloc_free(tmp_ctx);
3321 return 0;
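/* The cases above share one rule: whenever the reclock setting
 * changes (disabled, newly enabled, or pointed at a different file),
 * any currently held recovery lock is dropped via
 * ctdb_recovery_unlock() so that it is re-acquired under the new
 * setting.  If the setting is unchanged the function is a no-op.
 */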
3324 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
3325 TALLOC_CTX *mem_ctx)
3327 uint32_t pnn;
3328 struct ctdb_node_map *nodemap=NULL;
3329 struct ctdb_node_map *recmaster_nodemap=NULL;
3330 struct ctdb_node_map **remote_nodemaps=NULL;
3331 struct ctdb_vnn_map *vnnmap=NULL;
3332 struct ctdb_vnn_map *remote_vnnmap=NULL;
3333 uint32_t num_lmasters;
3334 int32_t debug_level;
3335 int i, j, ret;
3336 bool self_ban;
3339 /* verify that the main daemon is still running */
3340 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
3341 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3342 exit(-1);
3345 /* ping the local daemon to tell it we are alive */
3346 ctdb_ctrl_recd_ping(ctdb);
3348 if (rec->election_timeout) {
3349 /* an election is in progress */
3350 return;
3353 /* read the debug level from the parent and update locally */
3354 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
3355 if (ret !=0) {
3356 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
3357 return;
3359 DEBUGLEVEL = debug_level;
3361 /* get relevant tunables */
3362 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
3363 if (ret != 0) {
3364 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
3365 return;
3368 /* get runstate */
3369 ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
3370 CTDB_CURRENT_NODE, &ctdb->runstate);
3371 if (ret != 0) {
3372 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
3373 return;
3376 /* get the current recovery lock file from the server */
3377 if (update_recovery_lock_file(ctdb) != 0) {
3378 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
3379 return;
3382 /* Make sure that if recovery lock verification becomes disabled,
3383 we close the file
3385 if (ctdb->recovery_lock_file == NULL) {
3386 ctdb_recovery_unlock(ctdb);
3389 pnn = ctdb_get_pnn(ctdb);
3391 /* get the vnnmap */
3392 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
3393 if (ret != 0) {
3394 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
3395 return;
3399 /* get number of nodes */
3400 if (rec->nodemap) {
3401 talloc_free(rec->nodemap);
3402 rec->nodemap = NULL;
3403 nodemap=NULL;
3405 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
3406 if (ret != 0) {
3407 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
3408 return;
3410 nodemap = rec->nodemap;
3412 /* remember our own node flags */
3413 rec->node_flags = nodemap->nodes[pnn].flags;
3415 ban_misbehaving_nodes(rec, &self_ban);
3416 if (self_ban) {
3417 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
3418 return;
3421 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
3422 also frozen and that the recmode is set to active.
3424 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
3425 /* If this node has become inactive then we want to
3426 * reduce the chances of it taking over the recovery
3427 * master role when it becomes active again. This
3428 * helps to stabilise the recovery master role so that
3429 * it stays on the most stable node.
3431 rec->priority_time = timeval_current();
3433 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3434 if (ret != 0) {
3435 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3437 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3438 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
3440 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3441 if (ret != 0) {
3442 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
3444 return;
3446 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
3447 if (ret != 0) {
3448 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node in STOPPED or BANNED state\n"));
3449 return;
3453 /* If this node is stopped or banned then it is not the recovery
3454 * master, so don't do anything. This prevents a stopped or banned
3455 * node from starting an election and sending unnecessary controls.
3457 return;
3460 /* check which node is the recovery master */
3461 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
3462 if (ret != 0) {
3463 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
3464 return;
3467 /* If we are not the recmaster then do some housekeeping */
3468 if (rec->recmaster != pnn) {
3469 /* Ignore any IP reallocate requests - only recmaster
3470 * processes them
3472 TALLOC_FREE(rec->reallocate_requests);
3473 /* Clear any nodes that should be force rebalanced in
3474 * the next takeover run. If the recovery master role
3475 * has moved then we don't want to process these some
3476 * time in the future.
3478 TALLOC_FREE(rec->force_rebalance_nodes);
3481 /* This is a special case. When the recovery daemon is started, recmaster
3482 * is set to -1. If the node is not started in the stopped state, then
3483 * start an election to decide the recovery master
3485 if (rec->recmaster == (uint32_t)-1) {
3486 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
3487 force_election(rec, pnn, nodemap);
3488 return;
3491 /* update the capabilities for all nodes */
3492 ret = update_capabilities(rec, nodemap);
3493 if (ret != 0) {
3494 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
3495 return;
3499 * If the current recmaster does not have CTDB_CAP_RECMASTER,
3500 * but we have, then force an election and try to become the new
3501 * recmaster.
3503 if (!ctdb_node_has_capabilities(rec->caps,
3504 rec->recmaster,
3505 CTDB_CAP_RECMASTER) &&
3506 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
3507 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
3508 DEBUG(DEBUG_ERR, (__location__ " Current recmaster node %u does not have CAP_RECMASTER,"
3509 " but we (node %u) have - force an election\n",
3510 rec->recmaster, pnn));
3511 force_election(rec, pnn, nodemap);
3512 return;
3515 /* verify that the recmaster node is still active */
3516 for (j=0; j<nodemap->num; j++) {
3517 if (nodemap->nodes[j].pnn==rec->recmaster) {
3518 break;
3522 if (j == nodemap->num) {
3523 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
3524 force_election(rec, pnn, nodemap);
3525 return;
3528 /* if recovery master is disconnected we must elect a new recmaster */
3529 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
3530 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
3531 force_election(rec, pnn, nodemap);
3532 return;
3535 /* get nodemap from the recovery master to check if it is inactive */
3536 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3537 mem_ctx, &recmaster_nodemap);
3538 if (ret != 0) {
3539 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
3540 nodemap->nodes[j].pnn));
3541 return;
3545 if ((recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) &&
3546 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
3547 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
3549 * update our nodemap to carry the recmaster's notion of
3550 * its own flags, so that we don't keep freezing the
3551 * inactive recmaster node...
3553 nodemap->nodes[j].flags = recmaster_nodemap->nodes[j].flags;
3554 force_election(rec, pnn, nodemap);
3555 return;
3558 /* verify that we have all ip addresses we should have and we don't
3559 * have addresses we shouldn't have.
3561 if (ctdb->tunable.disable_ip_failover == 0 &&
3562 !ctdb_op_is_disabled(rec->takeover_run)) {
3563 if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
3564 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
3569 /* if we are not the recmaster then we do not need to check
3570 if recovery is needed
3572 if (pnn != rec->recmaster) {
3573 return;
3577 /* ensure our local copies of flags are right */
3578 ret = update_local_flags(rec, nodemap);
3579 if (ret == MONITOR_ELECTION_NEEDED) {
3580 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
3581 force_election(rec, pnn, nodemap);
3582 return;
3584 if (ret != MONITOR_OK) {
3585 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3586 return;
3589 if (ctdb->num_nodes != nodemap->num) {
3590 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3591 ctdb_load_nodes_file(ctdb);
3592 return;
3595 /* verify that all active nodes agree that we are the recmaster */
3596 switch (verify_recmaster(rec, nodemap, pnn)) {
3597 case MONITOR_RECOVERY_NEEDED:
3598 /* can not happen */
3599 return;
3600 case MONITOR_ELECTION_NEEDED:
3601 force_election(rec, pnn, nodemap);
3602 return;
3603 case MONITOR_OK:
3604 break;
3605 case MONITOR_FAILED:
3606 return;
3610 if (rec->need_recovery) {
3611 /* a previous recovery didn't finish */
3612 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3613 return;
3616 /* verify that all active nodes are in normal mode
3617 and not in recovery mode
3619 switch (verify_recmode(ctdb, nodemap)) {
3620 case MONITOR_RECOVERY_NEEDED:
3621 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3622 return;
3623 case MONITOR_FAILED:
3624 return;
3625 case MONITOR_ELECTION_NEEDED:
3626 /* can not happen */
3627 case MONITOR_OK:
3628 break;
3632 if (ctdb->recovery_lock_file != NULL) {
3633 /* We must already hold the recovery lock */
3634 if (!ctdb_recovery_have_lock(ctdb)) {
3635 DEBUG(DEBUG_ERR,("Failed recovery lock sanity check. Force a recovery\n"));
3636 ctdb_set_culprit(rec, ctdb->pnn);
3637 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3638 return;
3643 /* if there are takeovers requested, perform it and notify the waiters */
3644 if (!ctdb_op_is_disabled(rec->takeover_run) &&
3645 rec->reallocate_requests) {
3646 process_ipreallocate_requests(ctdb, rec);
3649 /* If recoveries are disabled then there is no use doing any
3650 * nodemap or flags checks. Recoveries might be disabled due
3651 * to "reloadnodes", so doing these checks might cause an
3652 * unnecessary recovery. */
3653 if (ctdb_op_is_disabled(rec->recovery)) {
3654 return;
3657 /* get the nodemap for all active remote nodes
3659 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3660 if (remote_nodemaps == NULL) {
3661 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3662 return;
3664 for(i=0; i<nodemap->num; i++) {
3665 remote_nodemaps[i] = NULL;
3667 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3668 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3669 return;
3672 /* verify that all other nodes have the same nodemap as we have
3674 for (j=0; j<nodemap->num; j++) {
3675 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3676 continue;
3679 if (remote_nodemaps[j] == NULL) {
3680 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3681 ctdb_set_culprit(rec, j);
3683 return;
3686 /* if the nodes disagree on how many nodes there are
3687 then this is a good reason to try recovery
3689 if (remote_nodemaps[j]->num != nodemap->num) {
3690 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3691 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3692 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3693 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3694 return;
3697 /* if the nodes disagree on which nodes exist and are
3698 active, then that is also a good reason to do recovery
3700 for (i=0;i<nodemap->num;i++) {
3701 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3702 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3703 nodemap->nodes[j].pnn, i,
3704 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3705 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3706 do_recovery(rec, mem_ctx, pnn, nodemap,
3707 vnnmap);
3708 return;
3714 * Update node flags obtained from each active node. This ensures we have
3715 * up-to-date information for all the nodes.
3717 for (j=0; j<nodemap->num; j++) {
3718 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3719 continue;
3721 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
3724 for (j=0; j<nodemap->num; j++) {
3725 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3726 continue;
3729 /* verify the flags are consistent
3731 for (i=0; i<nodemap->num; i++) {
3732 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3733 continue;
3736 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3737 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3738 nodemap->nodes[j].pnn,
3739 nodemap->nodes[i].pnn,
3740 remote_nodemaps[j]->nodes[i].flags,
3741 nodemap->nodes[i].flags));
3742 if (i == j) {
3743 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3744 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3745 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3746 do_recovery(rec, mem_ctx, pnn, nodemap,
3747 vnnmap);
3748 return;
3749 } else {
3750 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3751 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3752 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3753 do_recovery(rec, mem_ctx, pnn, nodemap,
3754 vnnmap);
3755 return;
3762 /* count how many active lmaster-capable nodes there are */
3763 num_lmasters = 0;
3764 for (i=0; i<nodemap->num; i++) {
3765 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
3766 if (ctdb_node_has_capabilities(rec->caps,
3767 ctdb->nodes[i]->pnn,
3768 CTDB_CAP_LMASTER)) {
3769 num_lmasters++;
3775 /* There must be the same number of lmasters in the vnn map as
3776 * there are active nodes with the lmaster capability... or
3777 * do a recovery.
3779 if (vnnmap->size != num_lmasters) {
3780 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
3781 vnnmap->size, num_lmasters));
3782 ctdb_set_culprit(rec, ctdb->pnn);
3783 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3784 return;
3787 /* verify that all active nodes in the nodemap also exist in
3788 the vnnmap.
3790 for (j=0; j<nodemap->num; j++) {
3791 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3792 continue;
3794 if (nodemap->nodes[j].pnn == pnn) {
3795 continue;
3798 for (i=0; i<vnnmap->size; i++) {
3799 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3800 break;
3803 if (i == vnnmap->size) {
3804 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3805 nodemap->nodes[j].pnn));
3806 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3807 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3808 return;
3813 /* verify that all other nodes have the same vnnmap
3814 and are from the same generation
3816 for (j=0; j<nodemap->num; j++) {
3817 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3818 continue;
3820 if (nodemap->nodes[j].pnn == pnn) {
3821 continue;
3824 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3825 mem_ctx, &remote_vnnmap);
3826 if (ret != 0) {
3827 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3828 nodemap->nodes[j].pnn));
3829 return;
3832 /* verify the vnnmap generation is the same */
3833 if (vnnmap->generation != remote_vnnmap->generation) {
3834 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3835 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3836 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3837 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3838 return;
3841 /* verify the vnnmap size is the same */
3842 if (vnnmap->size != remote_vnnmap->size) {
3843 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3844 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3845 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3846 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3847 return;
3850 /* verify the vnnmap is the same */
3851 for (i=0;i<vnnmap->size;i++) {
3852 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3853 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3854 nodemap->nodes[j].pnn));
3855 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3856 do_recovery(rec, mem_ctx, pnn, nodemap,
3857 vnnmap);
3858 return;
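/* To recap the consistency checks above, each of which forces a
 * do_recovery(): disagreement on the node count or node identities,
 * disagreement on node flags, a vnnmap whose size does not match the
 * number of active lmaster-capable nodes, an active node missing from
 * the vnnmap, or a remote vnnmap that differs from ours in
 * generation, size or content.
 */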
3863 /* we might need to change who has what IP assigned */
3864 if (rec->need_takeover_run) {
3865 uint32_t culprit = (uint32_t)-1;
3867 rec->need_takeover_run = false;
3869 /* update the list of public ips that a node can handle for
3870 all connected nodes
3872 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
3873 if (ret != 0) {
3874 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
3875 culprit));
3876 rec->need_takeover_run = true;
3877 return;
3880 /* execute the "startrecovery" event script on all nodes */
3881 ret = run_startrecovery_eventscript(rec, nodemap);
3882 if (ret!=0) {
3883 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
3884 ctdb_set_culprit(rec, ctdb->pnn);
3885 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3886 return;
3889 /* If the takeover run fails, the offending nodes are
3890 * assigned ban culprit counts and the takeover is retried.
3891 * If the takeover run fails repeatedly, the node will get
3892 * banned.
3894 * If rec->need_takeover_run is not set to true at this
3895 * failure, monitoring is disabled cluster-wide (via
3896 * startrecovery eventscript) and will not get enabled.
3898 if (!do_takeover_run(rec, nodemap, true)) {
3899 return;
3902 /* execute the "recovered" event script on all nodes */
3903 ret = run_recovered_eventscript(rec, nodemap, "monitor_cluster");
3904 #if 0
3905 // we can't check whether the event completed successfully
3906 // since this script WILL fail if the node is in recovery mode
3907 // and if that race happens, the code here would just cause a second
3908 // cascading recovery.
3909 if (ret!=0) {
3910 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3911 ctdb_set_culprit(rec, ctdb->pnn);
3912 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3913 }
3914 #endif
3915 }
3916 }
3918 /*
3919 the main monitoring loop
3920 */
3921 static void monitor_cluster(struct ctdb_context *ctdb)
3922 {
3923 struct ctdb_recoverd *rec;
3925 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
3927 rec = talloc_zero(ctdb, struct ctdb_recoverd);
3928 CTDB_NO_MEMORY_FATAL(ctdb, rec);
3930 rec->ctdb = ctdb;
3932 rec->takeover_run = ctdb_op_init(rec, "takeover runs");
3933 CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);
3935 rec->recovery = ctdb_op_init(rec, "recoveries");
3936 CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);
3938 rec->priority_time = timeval_current();
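/* The recovery daemon communicates with the local ctdbd and with recovery
   daemons on other nodes via SRVID messages; each
   ctdb_client_set_message_handler() call below binds one SRVID to the
   handler that services it. */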
3940 /* register a message port for sending memory dumps */
3941 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
3943 /* register a message port for recovery elections */
3944 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
3946 /* when nodes are disabled/enabled */
3947 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
3949 /* when we are asked to push out a flag change */
3950 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
3952 /* register a message port for vacuum fetch */
3953 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
3955 /* register a message port for reloadnodes */
3956 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
3958 /* register a message port for performing a takeover run */
3959 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
3961 /* register a message port for disabling the ip check for a short while */
3962 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
3964 /* register a message port for updating the recovery daemons node assignment for an ip */
3965 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);
3967 /* register a message port for forcing a rebalance of a node next
3968 reallocation */
3969 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
3971 /* Register a message port for disabling takeover runs */
3972 ctdb_client_set_message_handler(ctdb,
3973 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
3974 disable_takeover_runs_handler, rec);
3976 /* Register a message port for disabling recoveries */
3977 ctdb_client_set_message_handler(ctdb,
3978 CTDB_SRVID_DISABLE_RECOVERIES,
3979 disable_recoveries_handler, rec);
3981 /* register a message port for detaching database */
3982 ctdb_client_set_message_handler(ctdb,
3983 CTDB_SRVID_DETACH_DATABASE,
3984 detach_database_handler, rec);
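/* All message handlers are now registered; the daemon spends the rest of its
   life in the loop below, running main_loop() once per iteration. */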
3986 for (;;) {
3987 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3988 struct timeval start;
3989 double elapsed;
3991 if (!mem_ctx) {
3992 DEBUG(DEBUG_CRIT,(__location__
3993 " Failed to create temp context\n"));
3994 exit(-1);
3995 }
3997 start = timeval_current();
3998 main_loop(ctdb, rec, mem_ctx);
3999 talloc_free(mem_ctx);
4001 /* we only check for recovery once every recover interval (the RecoverInterval tunable, one second by default) */
4002 elapsed = timeval_elapsed(&start);
4003 if (elapsed < ctdb->tunable.recover_interval) {
4004 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
4005 - elapsed);
4006 }
4007 }
4008 }
4010 /*
4011 event handler for when the main ctdbd dies
4012 */
4013 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
4014 uint16_t flags, void *private_data)
4015 {
4016 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
4017 _exit(1);
4018 }
4020 /*
4021 called regularly to verify that the recovery daemon is still running
4022 */
4023 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
4024 struct timeval yt, void *p)
4025 {
4026 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
4028 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
4029 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
4031 event_add_timed(ctdb->ev, ctdb, timeval_zero(),
4032 ctdb_restart_recd, ctdb);
4034 return;
4035 }
4037 event_add_timed(ctdb->ev, ctdb->recd_ctx,
4038 timeval_current_ofs(30, 0),
4039 ctdb_check_recd, ctdb);
4040 }
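/* SIGCHLD handler: reap any exited children with waitpid(..., WNOHANG) so
   that processes forked by the recovery daemon do not linger as zombies. */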
4042 static void recd_sig_child_handler(struct event_context *ev,
4043 struct signal_event *se, int signum, int count,
4044 void *dont_care,
4045 void *private_data)
4046 {
4047 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4048 int status;
4049 pid_t pid = -1;
4051 while (pid != 0) {
4052 pid = waitpid(-1, &status, WNOHANG);
4053 if (pid == -1) {
4054 if (errno != ECHILD) {
4055 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
4056 }
4057 return;
4058 }
4059 if (pid > 0) {
4060 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
4061 }
4062 }
4063 }
4065 /*
4066 startup the recovery daemon as a child of the main ctdb daemon
4067 */
4068 int ctdb_start_recoverd(struct ctdb_context *ctdb)
4069 {
4070 int fd[2];
4071 struct signal_event *se;
4072 struct tevent_fd *fde;
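/* The pipe created below exists only for parent-death detection: the parent
   keeps the write end and the child watches the read end, so when the main
   ctdbd exits the read end hits EOF and ctdb_recoverd_parent() terminates the
   recovery daemon (see the event_add_fd() call further down). */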
4074 if (pipe(fd) != 0) {
4075 return -1;
4076 }
4078 ctdb->recoverd_pid = ctdb_fork(ctdb);
4079 if (ctdb->recoverd_pid == -1) {
4080 return -1;
4081 }
4083 if (ctdb->recoverd_pid != 0) {
4084 talloc_free(ctdb->recd_ctx);
4085 ctdb->recd_ctx = talloc_new(ctdb);
4086 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
4088 close(fd[0]);
4089 event_add_timed(ctdb->ev, ctdb->recd_ctx,
4090 timeval_current_ofs(30, 0),
4091 ctdb_check_recd, ctdb);
4092 return 0;
4093 }
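/* Child process: from here on this is the recovery daemon itself. */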
4095 close(fd[1]);
4097 srandom(getpid() ^ time(NULL));
4099 ctdb_set_process_name("ctdb_recovered");
4100 if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
4101 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
4102 exit(1);
4103 }
4105 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
4107 fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
4108 ctdb_recoverd_parent, &fd[0]);
4109 tevent_fd_set_auto_close(fde);
4111 /* set up a handler to pick up sigchld */
4112 se = event_add_signal(ctdb->ev, ctdb,
4113 SIGCHLD, 0,
4114 recd_sig_child_handler,
4115 ctdb);
4116 if (se == NULL) {
4117 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
4118 exit(1);
4119 }
4121 monitor_cluster(ctdb);
4123 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
4124 return -1;
4125 }
4127 /*
4128 shutdown the recovery daemon
4129 */
4130 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
4131 {
4132 if (ctdb->recoverd_pid == 0) {
4133 return;
4134 }
4136 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
4137 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
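/* Freeing recd_ctx below also removes the ctdb_check_recd() timer that was
   allocated on it, so a deliberately stopped recovery daemon is not
   immediately restarted by the liveness check. */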
4139 TALLOC_FREE(ctdb->recd_ctx);
4140 TALLOC_FREE(ctdb->recd_ping_count);
4141 }
4143 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te,
4144 struct timeval t, void *private_data)
4145 {
4146 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4148 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
4149 ctdb_stop_recoverd(ctdb);
4150 ctdb_start_recoverd(ctdb);
4151 }
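/* Overview of the recovery daemon lifecycle as seen in this part of the file:

   ctdb_start_recoverd()  - forks the daemon; the parent arms ctdb_check_recd()
                            to poll the child every 30 seconds and restart it
                            via ctdb_restart_recd() if it has exited.
   monitor_cluster()      - the child's endless loop: register the SRVID
                            handlers, then run main_loop() once per recover
                            interval.
   ctdb_recoverd_parent() - exits the child when the main ctdbd goes away.
   ctdb_stop_recoverd()   - sends SIGTERM to the child for an orderly
                            shutdown. */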