ctdb-recoverd: Simplify using TALLOC_FREE()
ctdb/server/ctdb_recoverd.c
/*
   ctdb recovery daemon

   Copyright (C) Ronnie Sahlberg 2007

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "includes.h"
#include "system/filesys.h"
#include "system/time.h"
#include "system/network.h"
#include "system/wait.h"
#include "popt.h"
#include "cmdline.h"
#include "../include/ctdb_client.h"
#include "../include/ctdb_private.h"
#include "lib/tdb_wrap/tdb_wrap.h"
#include "lib/util/dlinklist.h"
/* List of SRVID requests that need to be processed */
struct srvid_list {
	struct srvid_list *next, *prev;
	struct srvid_request *request;
};

struct srvid_requests {
	struct srvid_list *requests;
};
static void srvid_request_reply(struct ctdb_context *ctdb,
				struct srvid_request *request,
				TDB_DATA result)
{
	/* Someone that sent srvid==0 does not want a reply */
	if (request->srvid == 0) {
		talloc_free(request);
		return;
	}

	if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
				     result) == 0) {
		DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
				  (unsigned)request->pnn,
				  (unsigned long long)request->srvid));
	} else {
		DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
				 (unsigned)request->pnn,
				 (unsigned long long)request->srvid));
	}

	talloc_free(request);
}
static void srvid_requests_reply(struct ctdb_context *ctdb,
				 struct srvid_requests **requests,
				 TDB_DATA result)
{
	struct srvid_list *r;

	for (r = (*requests)->requests; r != NULL; r = r->next) {
		srvid_request_reply(ctdb, r->request, result);
	}

	/* Free the list structure... */
	TALLOC_FREE(*requests);
}
static void srvid_request_add(struct ctdb_context *ctdb,
			      struct srvid_requests **requests,
			      struct srvid_request *request)
{
	struct srvid_list *t;
	int32_t ret;
	TDB_DATA result;

	if (*requests == NULL) {
		*requests = talloc_zero(ctdb, struct srvid_requests);
		if (*requests == NULL) {
			goto nomem;
		}
	}

	t = talloc_zero(*requests, struct srvid_list);
	if (t == NULL) {
		/* If *requests was just allocated above then free it */
		if ((*requests)->requests == NULL) {
			TALLOC_FREE(*requests);
		}
		goto nomem;
	}

	t->request = (struct srvid_request *)talloc_steal(t, request);
	DLIST_ADD((*requests)->requests, t);

	return;

nomem:
	/* Failed to add the request to the list.  Send a fail. */
	DEBUG(DEBUG_ERR, (__location__
			  " Out of memory, failed to queue SRVID request\n"));
	ret = -ENOMEM;
	result.dsize = sizeof(ret);
	result.dptr = (uint8_t *)&ret;
	srvid_request_reply(ctdb, request, result);
}
/* An abstraction to allow an operation (takeover runs, recoveries,
 * ...) to be disabled for a given timeout */
struct ctdb_op_state {
	struct tevent_timer *timer;
	bool in_progress;
	const char *name;
};

static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
{
	struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);

	if (state != NULL) {
		state->in_progress = false;
		state->name = name;
	}

	return state;
}
static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
{
	return state->timer != NULL;
}

static bool ctdb_op_begin(struct ctdb_op_state *state)
{
	if (ctdb_op_is_disabled(state)) {
		DEBUG(DEBUG_NOTICE,
		      ("Unable to begin - %s are disabled\n", state->name));
		return false;
	}

	state->in_progress = true;
	return true;
}

static bool ctdb_op_end(struct ctdb_op_state *state)
{
	state->in_progress = false;
	return state->in_progress;
}

static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
{
	return state->in_progress;
}
static void ctdb_op_enable(struct ctdb_op_state *state)
{
	TALLOC_FREE(state->timer);
}

static void ctdb_op_timeout_handler(struct event_context *ev,
				    struct timed_event *te,
				    struct timeval yt, void *p)
{
	struct ctdb_op_state *state =
		talloc_get_type(p, struct ctdb_op_state);

	DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
	ctdb_op_enable(state);
}
static int ctdb_op_disable(struct ctdb_op_state *state,
			   struct tevent_context *ev,
			   uint32_t timeout)
{
	if (timeout == 0) {
		DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
		ctdb_op_enable(state);
		return 0;
	}

	if (state->in_progress) {
		DEBUG(DEBUG_ERR,
		      ("Unable to disable %s - in progress\n", state->name));
		return -EAGAIN;
	}

	DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
			    state->name, timeout));

	/* Clear any old timers */
	talloc_free(state->timer);

	/* Arrange for the timeout to occur */
	state->timer = tevent_add_timer(ev, state,
					timeval_current_ofs(timeout, 0),
					ctdb_op_timeout_handler, state);
	if (state->timer == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
		return -ENOMEM;
	}

	return 0;
}
struct ctdb_banning_state {
	uint32_t count;
	struct timeval last_reported_time;
};

/*
  private state of recovery daemon
 */
struct ctdb_recoverd {
	struct ctdb_context *ctdb;
	uint32_t recmaster;
	uint32_t last_culprit_node;
	struct ctdb_node_map *nodemap;
	struct timeval priority_time;
	bool need_takeover_run;
	bool need_recovery;
	uint32_t node_flags;
	struct timed_event *send_election_te;
	struct timed_event *election_timeout;
	struct vacuum_info *vacuum_info;
	struct srvid_requests *reallocate_requests;
	struct ctdb_op_state *takeover_run;
	struct ctdb_op_state *recovery;
	struct ctdb_control_get_ifaces *ifaces;
	uint32_t *force_rebalance_nodes;
	struct ctdb_node_capabilities *caps;
};

#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
#define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)

static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data);
/*
  ban a node for a period of time
 */
static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
{
	int ret;
	struct ctdb_context *ctdb = rec->ctdb;
	struct ctdb_ban_time bantime;

	if (!ctdb_validate_pnn(ctdb, pnn)) {
		DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
		return;
	}

	DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));

	bantime.pnn  = pnn;
	bantime.time = ban_time;

	ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
		return;
	}
}

enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
/*
  remember the troublemaker
 */
static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
{
	struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
	struct ctdb_banning_state *ban_state;

	/* culprit indexes ctdb->nodes[], so anything >= num_nodes is invalid */
	if (culprit >= ctdb->num_nodes) {
		DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
		return;
	}

	/* If we are banned or stopped, do not set other nodes as culprits */
	if (rec->node_flags & NODE_FLAGS_INACTIVE) {
		DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
		return;
	}

	if (ctdb->nodes[culprit]->ban_state == NULL) {
		ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
		CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
	}
	ban_state = ctdb->nodes[culprit]->ban_state;
	if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
		/* this was the first time in a long while this node
		   misbehaved so we will forgive any old transgressions.
		*/
		ban_state->count = 0;
	}

	ban_state->count += count;
	ban_state->last_reported_time = timeval_current();
	rec->last_culprit_node = culprit;
}

/*
  remember the troublemaker
 */
static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
{
	ctdb_set_culprit_count(rec, culprit, 1);
}
/* this callback is called for every node that failed to execute the
   recovered event
*/
static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));

	ctdb_set_culprit(rec, node_pnn);
}

/*
  run the "recovered" eventscript on all nodes
 */
static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, const char *caller)
{
	TALLOC_CTX *tmp_ctx;
	uint32_t *nodes;
	struct ctdb_context *ctdb = rec->ctdb;

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
					nodes, 0,
					CONTROL_TIMEOUT(), false, tdb_null,
					NULL, recovered_fail_callback,
					rec) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));

		talloc_free(tmp_ctx);
		return -1;
	}

	talloc_free(tmp_ctx);
	return 0;
}
/* this callback is called for every node that failed to execute the
   start recovery event
*/
static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));

	ctdb_set_culprit(rec, node_pnn);
}

/*
  run the "startrecovery" eventscript on all nodes
 */
static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
{
	TALLOC_CTX *tmp_ctx;
	uint32_t *nodes;
	struct ctdb_context *ctdb = rec->ctdb;

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
					nodes, 0,
					CONTROL_TIMEOUT(), false, tdb_null,
					NULL,
					startrecovery_fail_callback,
					rec) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	talloc_free(tmp_ctx);
	return 0;
}
/*
  update the node capabilities for all connected nodes
 */
static int update_capabilities(struct ctdb_recoverd *rec,
			       struct ctdb_node_map *nodemap)
{
	uint32_t *capp;
	TALLOC_CTX *tmp_ctx;
	struct ctdb_node_capabilities *caps;
	struct ctdb_context *ctdb = rec->ctdb;

	tmp_ctx = talloc_new(rec);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	caps = ctdb_get_capabilities(ctdb, tmp_ctx,
				     CONTROL_TIMEOUT(), nodemap);

	if (caps == NULL) {
		DEBUG(DEBUG_ERR,
		      (__location__ " Failed to get node capabilities\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
	if (capp == NULL) {
		DEBUG(DEBUG_ERR,
		      (__location__
		       " Capabilities don't include current node.\n"));
		talloc_free(tmp_ctx);
		return -1;
	}
	ctdb->capabilities = *capp;

	TALLOC_FREE(rec->caps);
	rec->caps = talloc_steal(rec, caps);

	talloc_free(tmp_ctx);
	return 0;
}
static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
	ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
}

static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
	ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
}
/*
  change recovery mode on all nodes
 */
static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
{
	TDB_DATA data;
	uint32_t *nodes;
	TALLOC_CTX *tmp_ctx;

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);

	data.dsize = sizeof(uint32_t);
	data.dptr = (unsigned char *)&rec_mode;

	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
					nodes, 0,
					CONTROL_TIMEOUT(),
					false, data,
					NULL, NULL,
					NULL) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	/* freeze all nodes */
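	/* Databases are frozen one priority level at a time; in the
	 * FREEZE control below, the otherwise-unused srvid argument of
	 * ctdb_client_async_control() carries the priority level.
	 */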
	if (rec_mode == CTDB_RECOVERY_ACTIVE) {
		int i;

		for (i=1; i<=NUM_DB_PRIORITIES; i++) {
			if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
						nodes, i,
						CONTROL_TIMEOUT(),
						false, tdb_null,
						NULL,
						set_recmode_fail_callback,
						rec) != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
				talloc_free(tmp_ctx);
				return -1;
			}
		}
	}

	talloc_free(tmp_ctx);
	return 0;
}
/*
  change recovery master on all nodes
 */
static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
{
	TDB_DATA data;
	TALLOC_CTX *tmp_ctx;
	uint32_t *nodes;

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	data.dsize = sizeof(uint32_t);
	data.dptr = (unsigned char *)&pnn;

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
					nodes, 0,
					CONTROL_TIMEOUT(), false, data,
					NULL, NULL,
					NULL) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	talloc_free(tmp_ctx);
	return 0;
}
/* update all remote nodes to use the same db priority that we have.
   This can fail if the remote node has not yet been upgraded to
   support this function, so we always return success and never fail
   a recovery if this call fails.
*/
static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
					      struct ctdb_node_map *nodemap,
					      uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
{
	int db;

	/* step through all local databases */
	for (db=0; db<dbmap->num;db++) {
		struct ctdb_db_priority db_prio;
		int ret;

		db_prio.db_id = dbmap->dbs[db].dbid;
		ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
			continue;
		}

		DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));

		ret = ctdb_ctrl_set_db_priority(ctdb, CONTROL_TIMEOUT(),
						CTDB_CURRENT_NODE, &db_prio);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n",
					 db_prio.db_id));
		}
	}

	return 0;
}
/*
  ensure all other nodes have attached to any databases that we have
 */
static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
					   uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
{
	int i, j, db, ret;
	struct ctdb_dbid_map *remote_dbmap;

	/* verify that all other nodes have all our databases */
	for (j=0; j<nodemap->num; j++) {
		/* we don't need to check ourselves */
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}
		/* don't check nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					 mem_ctx, &remote_dbmap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
			return -1;
		}

		/* step through all local databases */
		for (db=0; db<dbmap->num;db++) {
			const char *name;

			for (i=0;i<remote_dbmap->num;i++) {
				if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
					break;
				}
			}
			/* the remote node already has this database */
			if (i!=remote_dbmap->num) {
				continue;
			}
			/* ok so we need to create this database */
			ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
						  dbmap->dbs[db].dbid, mem_ctx,
						  &name);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
				return -1;
			}
			ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
						 nodemap->nodes[j].pnn,
						 mem_ctx, name,
						 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
				return -1;
			}
		}
	}

	return 0;
}
/*
  ensure we are attached to any databases that anyone else is attached to
 */
static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
					  uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
{
	int i, j, db, ret;
	struct ctdb_dbid_map *remote_dbmap;

	/* verify that we have all databases any other node has */
	for (j=0; j<nodemap->num; j++) {
		/* we don't need to check ourselves */
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}
		/* don't check nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					 mem_ctx, &remote_dbmap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
			return -1;
		}

		/* step through all databases on the remote node */
		for (db=0; db<remote_dbmap->num;db++) {
			const char *name;

			for (i=0;i<(*dbmap)->num;i++) {
				if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
					break;
				}
			}
			/* we already have this db locally */
			if (i!=(*dbmap)->num) {
				continue;
			}
			/* ok so we need to create this database and
			   rebuild dbmap
			 */
			ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
						  remote_dbmap->dbs[db].dbid, mem_ctx, &name);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
					  nodemap->nodes[j].pnn));
				return -1;
			}
			ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
						 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
				return -1;
			}
			ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
				return -1;
			}
		}
	}

	return 0;
}
/*
  pull the remote database contents from one node into the recdb
 */
static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
				    struct tdb_wrap *recdb, uint32_t dbid)
{
	int ret;
	TDB_DATA outdata;
	struct ctdb_marshall_buffer *reply;
	struct ctdb_rec_data *recdata;
	int i;
	TALLOC_CTX *tmp_ctx = talloc_new(recdb);

	ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
			       CONTROL_TIMEOUT(), &outdata);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
		talloc_free(tmp_ctx);
		return -1;
	}

	reply = (struct ctdb_marshall_buffer *)outdata.dptr;

	if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
		DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	recdata = (struct ctdb_rec_data *)&reply->data[0];

	for (i=0;
	     i<reply->count;
	     recdata = (struct ctdb_rec_data *)(recdata->length + (uint8_t *)recdata), i++) {
		TDB_DATA key, data;
		struct ctdb_ltdb_header *hdr;
		TDB_DATA existing;

		key.dptr = &recdata->data[0];
		key.dsize = recdata->keylen;
		data.dptr = &recdata->data[key.dsize];
		data.dsize = recdata->datalen;

		hdr = (struct ctdb_ltdb_header *)data.dptr;

		if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
			DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
			talloc_free(tmp_ctx);
			return -1;
		}

		/* fetch the existing record, if any */
		existing = tdb_fetch(recdb->tdb, key);

		if (existing.dptr != NULL) {
			struct ctdb_ltdb_header header;
			if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
				DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
					 (unsigned)existing.dsize, srcnode));
				free(existing.dptr);
				talloc_free(tmp_ctx);
				return -1;
			}
			header = *(struct ctdb_ltdb_header *)existing.dptr;
			free(existing.dptr);
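			/* Keep the existing copy unless the pulled record
			 * is newer: a strictly higher RSN wins, and an
			 * equal RSN wins only when the existing copy is
			 * not mastered by this node.
			 */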
			if (!(header.rsn < hdr->rsn ||
			      (header.dmaster != ctdb_get_pnn(ctdb) &&
			       header.rsn == hdr->rsn))) {
				continue;
			}
		}

		if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
			DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
			talloc_free(tmp_ctx);
			return -1;
		}
	}

	talloc_free(tmp_ctx);

	return 0;
}
struct pull_seqnum_cbdata {
	int failed;
	uint32_t pnn;
	uint64_t seqnum;
};

static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
	uint64_t seqnum;

	if (cb_data->failed != 0) {
		DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
		return;
	}

	if (res != 0) {
		DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
		cb_data->failed = 1;
		return;
	}

	if (outdata.dsize != sizeof(uint64_t)) {
		DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
		cb_data->failed = -1;
		return;
	}

	seqnum = *((uint64_t *)outdata.dptr);

	if (seqnum > cb_data->seqnum ||
	    (cb_data->pnn == -1 && seqnum == 0)) {
		cb_data->seqnum = seqnum;
		cb_data->pnn = node_pnn;
	}
}

static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);

	DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
	cb_data->failed = 1;
}
static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
				   struct ctdb_recoverd *rec,
				   struct ctdb_node_map *nodemap,
				   struct tdb_wrap *recdb, uint32_t dbid)
{
	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
	uint32_t *nodes;
	TDB_DATA data;
	uint32_t outdata[2];
	struct pull_seqnum_cbdata *cb_data;

	DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));
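
	/* The GET_DB_SEQNUM control takes a 64-bit database id on the
	 * wire, so the 32-bit db id is padded out to two words here.
	 */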
	outdata[0] = dbid;
	outdata[1] = 0;

	data.dsize = sizeof(outdata);
	data.dptr = (uint8_t *)&outdata[0];

	cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
	if (cb_data == NULL) {
		DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	cb_data->failed = 0;
	cb_data->pnn    = -1;
	cb_data->seqnum = 0;

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
				nodes, 0,
				CONTROL_TIMEOUT(), false, data,
				pull_seqnum_cb,
				pull_seqnum_fail_cb,
				cb_data) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));

		talloc_free(tmp_ctx);
		return -1;
	}

	if (cb_data->failed != 0) {
		DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
		talloc_free(tmp_ctx);
		return -1;
	}

	if (cb_data->pnn == -1) {
		DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
		talloc_free(tmp_ctx);
		return -1;
	}

	DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum));

	if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
		DEBUG(DEBUG_ERR, ("Failed to pull highest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
		talloc_free(tmp_ctx);
		return -1;
	}

	talloc_free(tmp_ctx);
	return 0;
}
/*
  pull all the remote database contents into the recdb
 */
static int pull_remote_database(struct ctdb_context *ctdb,
				struct ctdb_recoverd *rec,
				struct ctdb_node_map *nodemap,
				struct tdb_wrap *recdb, uint32_t dbid,
				bool persistent)
{
	int j;
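
	/* A persistent database can be recovered wholesale from the
	 * single node with the highest sequence number instead of
	 * merging records from all nodes; fall through to the
	 * record-by-record merge if that fails.
	 */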
	if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
		int ret;
		ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
		if (ret == 0) {
			return 0;
		}
	}

	/* pull all records from all other nodes across onto this node
	   (this merges based on rsn)
	*/
	for (j=0; j<nodemap->num; j++) {
		/* don't merge from nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
				 nodemap->nodes[j].pnn));
			ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
			return -1;
		}
	}

	return 0;
}
/*
  update flags on all active nodes
 */
static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
{
	int ret;

	ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
		return -1;
	}

	return 0;
}
/*
  ensure all nodes have the same vnnmap we do
 */
static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
				      uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
{
	int j, ret;

	/* push the new vnn map out to all the nodes */
	for (j=0; j<nodemap->num; j++) {
		/* don't push to nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
			return -1;
		}
	}

	return 0;
}
struct vacuum_info {
	struct vacuum_info *next, *prev;
	struct ctdb_recoverd *rec;
	uint32_t srcnode;
	struct ctdb_db_context *ctdb_db;
	struct ctdb_marshall_buffer *recs;
	struct ctdb_rec_data *r;
};

static void vacuum_fetch_next(struct vacuum_info *v);

/*
  called when a vacuum fetch has completed - just free it and do the next one
 */
static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
{
	talloc_free(state);
}
/*
  process the next element from the vacuum list
 */
static void vacuum_fetch_next(struct vacuum_info *v)
{
	struct ctdb_call call;
	struct ctdb_rec_data *r;

	while (v->recs->count) {
		struct ctdb_client_call_state *state;
		TDB_DATA data;
		struct ctdb_ltdb_header *hdr;
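
		/* A CTDB_NULL_FUNC call runs no function; with the
		 * migration flags set its only effect is to migrate
		 * the record onto this node, which is all the
		 * vacuuming fetch needs.
		 */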
		ZERO_STRUCT(call);
		call.call_id = CTDB_NULL_FUNC;
		call.flags = CTDB_IMMEDIATE_MIGRATION;
		call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;

		r = v->r;
		v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
		v->recs->count--;

		call.key.dptr = &r->data[0];
		call.key.dsize = r->keylen;

		/* ensure we don't block this daemon - just skip a record if we can't get
		   the chainlock */
		if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
			continue;
		}

		data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
		if (data.dptr == NULL) {
			tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
			continue;
		}

		if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
			free(data.dptr);
			tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
			continue;
		}

		hdr = (struct ctdb_ltdb_header *)data.dptr;
		if (hdr->dmaster == v->rec->ctdb->pnn) {
			/* it's already local */
			free(data.dptr);
			tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
			continue;
		}

		free(data.dptr);

		state = ctdb_call_send(v->ctdb_db, &call);
		tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
		if (state == NULL) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
			talloc_free(v);
			return;
		}
		state->async.fn = vacuum_fetch_callback;
		state->async.private_data = NULL;
	}

	talloc_free(v);
}
/*
  destroy a vacuum info structure
 */
static int vacuum_info_destructor(struct vacuum_info *v)
{
	DLIST_REMOVE(v->rec->vacuum_info, v);
	return 0;
}
/*
  handler for vacuum fetch
*/
static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
				 TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
	struct ctdb_marshall_buffer *recs;
	int ret, i;
	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
	const char *name;
	struct ctdb_dbid_map *dbmap=NULL;
	bool persistent = false;
	struct ctdb_db_context *ctdb_db;
	struct ctdb_rec_data *r;
	uint32_t srcnode;
	struct vacuum_info *v;

	recs = (struct ctdb_marshall_buffer *)data.dptr;
	r = (struct ctdb_rec_data *)&recs->data[0];

	if (recs->count == 0) {
		talloc_free(tmp_ctx);
		return;
	}
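
	/* The requesting node's pnn is carried in the reqid field of
	 * the first marshalled record.
	 */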
	srcnode = r->reqid;

	for (v=rec->vacuum_info;v;v=v->next) {
		if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
			/* we're already working on records from this node */
			talloc_free(tmp_ctx);
			return;
		}
	}

	/* work out if the database is persistent */
	ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
		talloc_free(tmp_ctx);
		return;
	}

	for (i=0;i<dbmap->num;i++) {
		if (dbmap->dbs[i].dbid == recs->db_id) {
			persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
			break;
		}
	}
	if (i == dbmap->num) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
		talloc_free(tmp_ctx);
		return;
	}

	/* find the name of this database */
	if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
		talloc_free(tmp_ctx);
		return;
	}

	/* attach to it */
	ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
	if (ctdb_db == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
		talloc_free(tmp_ctx);
		return;
	}

	v = talloc_zero(rec, struct vacuum_info);
	if (v == NULL) {
		DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
		talloc_free(tmp_ctx);
		return;
	}

	v->rec = rec;
	v->srcnode = srcnode;
	v->ctdb_db = ctdb_db;
	v->recs = talloc_memdup(v, recs, data.dsize);
	if (v->recs == NULL) {
		DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
		talloc_free(v);
		talloc_free(tmp_ctx);
		return;
	}
	v->r = (struct ctdb_rec_data *)&v->recs->data[0];

	DLIST_ADD(rec->vacuum_info, v);

	talloc_set_destructor(v, vacuum_info_destructor);

	vacuum_fetch_next(v);
	talloc_free(tmp_ctx);
}
/*
 * handler for database detach
 */
static void detach_database_handler(struct ctdb_context *ctdb, uint64_t srvid,
				    TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(private_data,
						    struct ctdb_recoverd);
	uint32_t db_id;
	struct vacuum_info *v, *vnext;
	struct ctdb_db_context *ctdb_db;

	if (data.dsize != sizeof(db_id)) {
		return;
	}
	db_id = *(uint32_t *)data.dptr;

	ctdb_db = find_ctdb_db(ctdb, db_id);
	if (ctdb_db == NULL) {
		/* database is not attached */
		return;
	}

	/* Stop any active vacuum fetch */
	v = rec->vacuum_info;
	while (v != NULL) {
		vnext = v->next;

		if (v->ctdb_db->db_id == db_id) {
			talloc_free(v);
		}
		v = vnext;
	}

	DLIST_REMOVE(ctdb->db_list, ctdb_db);

	DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
			     ctdb_db->db_name));
	talloc_free(ctdb_db);
}
/*
  called when ctdb_wait_timeout should finish
 */
static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
			      struct timeval yt, void *p)
{
	uint32_t *timed_out = (uint32_t *)p;
	(*timed_out) = 1;
}

/*
  wait for a given number of seconds
 */
static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
{
	uint32_t timed_out = 0;
	time_t usecs = (secs - (time_t)secs) * 1000000;
	event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), ctdb_wait_handler, &timed_out);
	while (!timed_out) {
		event_loop_once(ctdb->ev);
	}
}
/*
  called when an election times out (ends)
 */
static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
				  struct timeval t, void *p)
{
	struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
	rec->election_timeout = NULL;
	fast_start = false;

	DEBUG(DEBUG_WARNING,("Election period ended\n"));
}


/*
  wait for an election to finish. It finished election_timeout seconds after
  the last election packet is received
 */
static void ctdb_wait_election(struct ctdb_recoverd *rec)
{
	struct ctdb_context *ctdb = rec->ctdb;
	while (rec->election_timeout) {
		event_loop_once(ctdb->ev);
	}
}
/*
  Update our local flags from all remote connected nodes.
  This is only run when we are, or believe we are, the recovery master.
 */
static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
{
	int j;
	struct ctdb_context *ctdb = rec->ctdb;
	TALLOC_CTX *mem_ctx = talloc_new(ctdb);

	/* get the nodemap for all active remote nodes and verify
	   they are the same as for this node
	 */
	for (j=0; j<nodemap->num; j++) {
		struct ctdb_node_map *remote_nodemap=NULL;
		int ret;

		if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
			continue;
		}
		if (nodemap->nodes[j].pnn == ctdb->pnn) {
			continue;
		}

		ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					   mem_ctx, &remote_nodemap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
				  nodemap->nodes[j].pnn));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			talloc_free(mem_ctx);
			return MONITOR_FAILED;
		}
		if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
			/* We should tell our daemon about this so it
			   updates its flags or else we will log the same
			   message again in the next iteration of recovery.
			   Since we are the recovery master we can just as
			   well update the flags on all nodes.
			*/
			ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
				talloc_free(mem_ctx);
				return -1;
			}

			/* Update our local copy of the flags in the recovery
			   daemon.
			*/
			DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
				 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
				 nodemap->nodes[j].flags));
			nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
		}
		talloc_free(remote_nodemap);
	}
	talloc_free(mem_ctx);
	return MONITOR_OK;
}
/* Create a new random generation id.
   The generation id cannot be the INVALID_GENERATION id
*/
static uint32_t new_generation(void)
{
	uint32_t generation;

	while (1) {
		generation = random();

		if (generation != INVALID_GENERATION) {
			break;
		}
	}

	return generation;
}
1362 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1364 char *name;
1365 struct tdb_wrap *recdb;
1366 unsigned tdb_flags;
1368 /* open up the temporary recovery database */
1369 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1370 ctdb->db_directory_state,
1371 ctdb->pnn);
1372 if (name == NULL) {
1373 return NULL;
1375 unlink(name);
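
	/* Only the recovery daemon ever opens this scratch database,
	 * so tdb locking can safely be disabled; TDB_NOMMAP under
	 * valgrind keeps all accesses visible to the memory checker.
	 */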
	tdb_flags = TDB_NOLOCK;
	if (ctdb->valgrinding) {
		tdb_flags |= TDB_NOMMAP;
	}
	tdb_flags |= (TDB_INCOMPATIBLE_HASH | TDB_DISALLOW_NESTING);

	recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
			      tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
	if (recdb == NULL) {
		DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
	}

	talloc_free(name);

	return recdb;
}
/*
  a traverse function for pulling all relevant records from recdb
 */
struct recdb_data {
	struct ctdb_context *ctdb;
	struct ctdb_marshall_buffer *recdata;
	uint32_t len;
	uint32_t allocated_len;
	bool failed;
	bool persistent;
};
static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
{
	struct recdb_data *params = (struct recdb_data *)p;
	struct ctdb_rec_data *recdata;
	struct ctdb_ltdb_header *hdr;

	/*
	 * skip empty records - but NOT for persistent databases:
	 *
	 * The record-by-record mode of recovery deletes empty records.
	 * For persistent databases, this can lead to data corruption
	 * by deleting records that should be there:
	 *
	 * - Assume the cluster has been running for a while.
	 *
	 * - A record R in a persistent database has been created and
	 *   deleted a couple of times, the last operation being deletion,
	 *   leaving an empty record with a high RSN, say 10.
	 *
	 * - Now a node N is turned off.
	 *
	 * - This leaves the local database copy of D on N with the empty
	 *   copy of R and RSN 10. On all other nodes, the recovery has deleted
	 *   the copy of record R.
	 *
	 * - Now the record is created again while node N is turned off.
	 *   This creates R with RSN = 1 on all nodes except for N.
	 *
	 * - Now node N is turned on again. The following recovery will choose
	 *   the older empty copy of R due to RSN 10 > RSN 1.
	 *
	 * ==> Hence the record is gone after the recovery.
	 *
	 * On databases like Samba's registry, this can damage the higher-level
	 * data structures built from the various tdb-level records.
	 */
	if (!params->persistent && data.dsize <= sizeof(struct ctdb_ltdb_header)) {
		return 0;
	}

	/* update the dmaster field to point to us */
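	/* ... but only for volatile databases - persistent databases
	 * keep their record headers unchanged.
	 */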
	hdr = (struct ctdb_ltdb_header *)data.dptr;
	if (!params->persistent) {
		hdr->dmaster = params->ctdb->pnn;
		hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
	}

	/* add the record to the blob ready to send to the nodes */
	recdata = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
	if (recdata == NULL) {
		params->failed = true;
		return -1;
	}
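	/* Grow the marshall buffer in pulldb_preallocation_size chunks
	 * so that appending records does not realloc on every record.
	 */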
	if (params->len + recdata->length >= params->allocated_len) {
		params->allocated_len = recdata->length + params->len + params->ctdb->tunable.pulldb_preallocation_size;
		params->recdata = talloc_realloc_size(NULL, params->recdata, params->allocated_len);
	}
	if (params->recdata == NULL) {
		DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u\n",
			 recdata->length + params->len));
		params->failed = true;
		return -1;
	}
	params->recdata->count++;
	memcpy(params->len+(uint8_t *)params->recdata, recdata, recdata->length);
	params->len += recdata->length;
	talloc_free(recdata);

	return 0;
}
/*
  push the recdb database out to all nodes
 */
static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
			       bool persistent,
			       struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
{
	struct recdb_data params;
	struct ctdb_marshall_buffer *recdata;
	TDB_DATA outdata;
	TALLOC_CTX *tmp_ctx;
	uint32_t *nodes;

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
	CTDB_NO_MEMORY(ctdb, recdata);

	recdata->db_id = dbid;

	params.ctdb = ctdb;
	params.recdata = recdata;
	params.len = offsetof(struct ctdb_marshall_buffer, data);
	params.allocated_len = params.len;
	params.failed = false;
	params.persistent = persistent;

	if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
		talloc_free(params.recdata);
		talloc_free(tmp_ctx);
		return -1;
	}

	if (params.failed) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
		talloc_free(params.recdata);
		talloc_free(tmp_ctx);
		return -1;
	}

	recdata = params.recdata;

	outdata.dptr = (void *)recdata;
	outdata.dsize = params.len;

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
					nodes, 0,
					CONTROL_TIMEOUT(), false, outdata,
					NULL, NULL,
					NULL) != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
		talloc_free(recdata);
		talloc_free(tmp_ctx);
		return -1;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
			     dbid, recdata->count));

	talloc_free(recdata);
	talloc_free(tmp_ctx);

	return 0;
}
/*
  go through a full recovery on one database
 */
static int recover_database(struct ctdb_recoverd *rec,
			    TALLOC_CTX *mem_ctx,
			    uint32_t dbid,
			    bool persistent,
			    uint32_t pnn,
			    struct ctdb_node_map *nodemap,
			    uint32_t transaction_id)
{
	struct tdb_wrap *recdb;
	int ret;
	struct ctdb_context *ctdb = rec->ctdb;
	TDB_DATA data;
	struct ctdb_control_wipe_database w;
	uint32_t *nodes;

	recdb = create_recdb(ctdb, mem_ctx);
	if (recdb == NULL) {
		return -1;
	}

	/* pull all remote databases onto the recdb */
	ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
		return -1;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));

	/* wipe all the remote databases. This is safe as we are in a transaction */
	w.db_id = dbid;
	w.transaction_id = transaction_id;

	data.dptr = (void *)&w;
	data.dsize = sizeof(w);

	nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
					nodes, 0,
					CONTROL_TIMEOUT(), false, data,
					NULL, NULL,
					NULL) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
		talloc_free(recdb);
		return -1;
	}

	/* push out the correct database. This sets the dmaster and skips
	   the empty records */
	ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
	if (ret != 0) {
		talloc_free(recdb);
		return -1;
	}

	/* all done with this database */
	talloc_free(recdb);

	return 0;
}
static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
					 struct ctdb_recoverd *rec,
					 struct ctdb_node_map *nodemap,
					 uint32_t *culprit)
{
	int j;
	int ret;
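
	/* Two lists are refreshed from every active node: the public
	 * IPs it knows about and the subset it can actually host; any
	 * failure is reported through *culprit.
	 */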
	if (ctdb->num_nodes != nodemap->num) {
		DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
				  ctdb->num_nodes, nodemap->num));
		if (culprit) {
			*culprit = ctdb->pnn;
		}
		return -1;
	}

	for (j=0; j<nodemap->num; j++) {
		/* For readability */
		struct ctdb_node *node = ctdb->nodes[j];

		/* release any existing data */
		if (node->known_public_ips) {
			talloc_free(node->known_public_ips);
			node->known_public_ips = NULL;
		}
		if (node->available_public_ips) {
			talloc_free(node->available_public_ips);
			node->available_public_ips = NULL;
		}

		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		/* Retrieve the list of known public IPs from the node */
		ret = ctdb_ctrl_get_public_ips_flags(ctdb,
					CONTROL_TIMEOUT(),
					node->pnn,
					ctdb->nodes,
					0,
					&node->known_public_ips);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,
			      ("Failed to read known public IPs from node: %u\n",
			       node->pnn));
			if (culprit) {
				*culprit = node->pnn;
			}
			return -1;
		}

		if (ctdb->do_checkpublicip &&
		    !ctdb_op_is_disabled(rec->takeover_run) &&
		    verify_remote_ip_allocation(ctdb,
						node->known_public_ips,
						node->pnn)) {
			DEBUG(DEBUG_ERR,("Trigger IP reallocation\n"));
			rec->need_takeover_run = true;
		}

		/* Retrieve the list of available public IPs from the node */
		ret = ctdb_ctrl_get_public_ips_flags(ctdb,
					CONTROL_TIMEOUT(),
					node->pnn,
					ctdb->nodes,
					CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
					&node->available_public_ips);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,
			      ("Failed to read available public IPs from node: %u\n",
			       node->pnn));
			if (culprit) {
				*culprit = node->pnn;
			}
			return -1;
		}
	}

	return 0;
}
/* when we start a recovery, make sure all nodes use the same reclock file
   setting
*/
static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
{
	struct ctdb_context *ctdb = rec->ctdb;
	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
	TDB_DATA data;
	uint32_t *nodes;

	if (ctdb->recovery_lock_file == NULL) {
		data.dptr  = NULL;
		data.dsize = 0;
	} else {
		data.dsize = strlen(ctdb->recovery_lock_file) + 1;
		data.dptr  = (uint8_t *)ctdb->recovery_lock_file;
	}

	nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
					nodes, 0,
					CONTROL_TIMEOUT(),
					false, data,
					NULL, NULL,
					rec) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	talloc_free(tmp_ctx);
	return 0;
}
/*
 * this callback is called for every node that failed to execute ctdb_takeover_run()
 * and sets a flag to re-run the takeover run.
 */
static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	DEBUG(DEBUG_ERR, ("Node %u failed the takeover run\n", node_pnn));

	if (callback_data != NULL) {
		struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);

		DEBUG(DEBUG_ERR, ("Setting node %u as recovery fail culprit\n", node_pnn));

		ctdb_set_culprit(rec, node_pnn);
	}
}
static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
{
	struct ctdb_context *ctdb = rec->ctdb;
	int i;
	struct ctdb_banning_state *ban_state;
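
	/* A node is banned once it has collected banning credits
	 * equal to twice the cluster's node count.
	 */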
	*self_ban = false;
	for (i=0; i<ctdb->num_nodes; i++) {
		if (ctdb->nodes[i]->ban_state == NULL) {
			continue;
		}
		ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
		if (ban_state->count < 2*ctdb->num_nodes) {
			continue;
		}

		DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
			ctdb->nodes[i]->pnn, ban_state->count,
			ctdb->tunable.recovery_ban_period));
		ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
		ban_state->count = 0;

		/* Banning ourselves? */
		if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
			*self_ban = true;
		}
	}
}
static bool do_takeover_run(struct ctdb_recoverd *rec,
			    struct ctdb_node_map *nodemap,
			    bool banning_credits_on_fail)
{
	uint32_t *nodes = NULL;
	struct srvid_request_data dtr;
	TDB_DATA data;
	int i;
	uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
	int ret;
	bool ok;

	DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));

	if (ctdb_op_is_in_progress(rec->takeover_run)) {
		DEBUG(DEBUG_ERR, (__location__
				  " takeover run already in progress\n"));
		ok = false;
		goto done;
	}

	if (!ctdb_op_begin(rec->takeover_run)) {
		ok = false;
		goto done;
	}

	/* Disable IP checks (takeover runs, really) on other nodes
	 * while doing this takeover run.  This will stop those other
	 * nodes from triggering takeover runs when they think they
	 * should be hosting an IP but it isn't yet on an interface.
	 * Don't wait for replies since a failure here might cause
	 * some noise in the logs but will not actually cause a
	 * problem.
	 */
	dtr.srvid = 0; /* No reply */
	dtr.pnn = -1;

	data.dptr  = (uint8_t*)&dtr;
	data.dsize = sizeof(dtr);

	nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);

	/* Disable for 60 seconds.  This can be a tunable later if
	 * necessary.
	 */
	dtr.data = 60;
	for (i = 0; i < talloc_array_length(nodes); i++) {
		if (ctdb_client_send_message(rec->ctdb, nodes[i],
					     CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
					     data) != 0) {
			DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
		}
	}

	ret = ctdb_takeover_run(rec->ctdb, nodemap,
				rec->force_rebalance_nodes,
				takeover_fail_callback,
				banning_credits_on_fail ? rec : NULL);

	/* Reenable takeover runs and IP checks on other nodes */
	dtr.data = 0;
	for (i = 0; i < talloc_array_length(nodes); i++) {
		if (ctdb_client_send_message(rec->ctdb, nodes[i],
					     CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
					     data) != 0) {
			DEBUG(DEBUG_INFO,("Failed to reenable takeover runs\n"));
		}
	}

	if (ret != 0) {
		DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
		ok = false;
		goto done;
	}

	ok = true;
	/* Takeover run was successful so clear force rebalance targets */
	if (rebalance_nodes == rec->force_rebalance_nodes) {
		TALLOC_FREE(rec->force_rebalance_nodes);
	} else {
		DEBUG(DEBUG_WARNING,
		      ("Rebalance target nodes changed during takeover run - not clearing\n"));
	}
done:
	rec->need_takeover_run = !ok;
	talloc_free(nodes);
	ctdb_op_end(rec->takeover_run);

	DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
	return ok;
}
/*
  we are the recmaster, and recovery is needed - start a recovery run
 */
static int do_recovery(struct ctdb_recoverd *rec,
		       TALLOC_CTX *mem_ctx, uint32_t pnn,
		       struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
{
	struct ctdb_context *ctdb = rec->ctdb;
	int i, j, ret;
	uint32_t generation;
	struct ctdb_dbid_map *dbmap;
	TDB_DATA data;
	uint32_t *nodes;
	struct timeval start_time;
	uint32_t culprit = (uint32_t)-1;
	bool self_ban;

	DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));

	/* if recovery fails, force it again */
	rec->need_recovery = true;

	if (!ctdb_op_begin(rec->recovery)) {
		return -1;
	}

	if (rec->election_timeout) {
		/* an election is in progress */
		DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
		goto fail;
	}

	ban_misbehaving_nodes(rec, &self_ban);
	if (self_ban) {
		DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
		goto fail;
	}

	if (ctdb->recovery_lock_file != NULL) {
		if (ctdb_recovery_have_lock(ctdb)) {
			DEBUG(DEBUG_NOTICE, ("Already holding recovery lock\n"));
		} else {
			start_time = timeval_current();
			DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock (%s)\n",
					     ctdb->recovery_lock_file));
			if (!ctdb_recovery_lock(ctdb)) {
				if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
					/* If ctdb is trying first recovery, it's
					 * possible that current node does not know
					 * yet who the recmaster is.
					 */
					DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
							  " - retrying recovery\n"));
					goto fail;
				}

				DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
						 "and banning ourselves for %u seconds\n",
						 ctdb->tunable.recovery_ban_period));
				ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
				goto fail;
			}
			ctdb_ctrl_report_recd_lock_latency(ctdb,
							   CONTROL_TIMEOUT(),
							   timeval_elapsed(&start_time));
			DEBUG(DEBUG_NOTICE,
			      ("Recovery lock taken successfully by recovery daemon\n"));
		}
	}
	DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));

	/* get a list of all databases */
	ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
		goto fail;
	}

	/* we do the db creation before we set the recovery mode, so the freeze happens
	   on all databases we will be dealing with. */

	/* verify that we have all the databases any other node has */
	ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
		goto fail;
	}

	/* verify that all other nodes have all our databases */
	ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
		goto fail;
	}
	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));

	/* update the database priority for all remote databases */
	ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
	}
	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));


	/* update all other nodes to use the same setting for reclock files
	   as the local recovery master.
	*/
	sync_recovery_lock_file_across_cluster(rec);
	/* set recovery mode to active on all nodes */
	ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
		goto fail;
	}

	/* execute the "startrecovery" event script on all nodes */
	ret = run_startrecovery_eventscript(rec, nodemap);
	if (ret!=0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
		goto fail;
	}

	/*
	  update all nodes to have the same flags that we have
	 */
	for (i=0;i<nodemap->num;i++) {
		if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
			continue;
		}

		ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
		if (ret != 0) {
			if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
				DEBUG(DEBUG_WARNING, (__location__ " Unable to update flags on inactive node %d\n", i));
			} else {
				DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
				goto fail;
			}
		}
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
2012 /* pick a new generation number */
2013 generation = new_generation();
2015 /* change the vnnmap on this node to use the new generation
2016 number but not on any other nodes.
2017 this guarantees that if we abort the recovery prematurely
2018 for some reason (a node stops responding?)
2019 that we can just return immediately and we will reenter
2020 recovery shortly again.
2021 I.e. we deliberately leave the cluster with an inconsistent
2022 generation id to allow us to abort recovery at any stage and
2023 just restart it from scratch.
2025 vnnmap->generation = generation;
2026 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
2027 if (ret != 0) {
2028 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
2029 goto fail;
2032 data.dptr = (void *)&generation;
2033 data.dsize = sizeof(uint32_t);
2035 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2036 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
2037 nodes, 0,
2038 CONTROL_TIMEOUT(), false, data,
2039 NULL,
2040 transaction_start_fail_callback,
2041 rec) != 0) {
2042 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
2043 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
2044 nodes, 0,
2045 CONTROL_TIMEOUT(), false, tdb_null,
2046 NULL,
2047 NULL,
2048 NULL) != 0) {
2049 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
2051 goto fail;
2054 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
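/* Note on ordering: the per-database recovery below is bracketed by
 * the cluster-wide TRANSACTION_START broadcast above and the
 * TRANSACTION_COMMIT broadcast after the loop, so the rebuilt
 * database contents and the new generation number become visible on
 * all nodes together rather than one database at a time.
 */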
2056 for (i=0;i<dbmap->num;i++) {
2057 ret = recover_database(rec, mem_ctx,
2058 dbmap->dbs[i].dbid,
2059 dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
2060 pnn, nodemap, generation);
2061 if (ret != 0) {
2062 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
2063 goto fail;
2067 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
2069 /* commit all the changes */
2070 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
2071 nodes, 0,
2072 CONTROL_TIMEOUT(), false, data,
2073 NULL, NULL,
2074 NULL) != 0) {
2075 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
2076 goto fail;
2079 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
2082 /* update the capabilities for all nodes */
2083 ret = update_capabilities(rec, nodemap);
2084 if (ret!=0) {
2085 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2086 goto fail;
2089 /* build a new vnn map with all the currently active and
2090 unbanned nodes */
2091 generation = new_generation();
2092 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
2093 CTDB_NO_MEMORY(ctdb, vnnmap);
2094 vnnmap->generation = generation;
2095 vnnmap->size = 0;
2096 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
2097 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2098 for (i=j=0;i<nodemap->num;i++) {
2099 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2100 continue;
2102 if (!ctdb_node_has_capabilities(rec->caps,
2103 ctdb->nodes[i]->pnn,
2104 CTDB_CAP_LMASTER)) {
2105 /* this node cannot be an lmaster */
2106 DEBUG(DEBUG_DEBUG, ("Node %d can't be an lmaster, skipping it\n", i));
2107 continue;
2110 vnnmap->size++;
2111 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
2112 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2113 vnnmap->map[j++] = nodemap->nodes[i].pnn;
2116 if (vnnmap->size == 0) {
2117 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
2118 vnnmap->size++;
2119 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
2120 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2121 vnnmap->map[0] = pnn;
2124 /* update to the new vnnmap on all nodes */
2125 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
2126 if (ret != 0) {
2127 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
2128 goto fail;
2131 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
2133 /* update recmaster to point to us for all nodes */
2134 ret = set_recovery_master(ctdb, nodemap, pnn);
2135 if (ret!=0) {
2136 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
2137 goto fail;
2140 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
2142 /* disable recovery mode */
2143 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
2144 if (ret != 0) {
2145 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
2146 goto fail;
2149 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
2151 /* Fetch known/available public IPs from each active node */
2152 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
2153 if (ret != 0) {
2154 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2155 culprit));
2156 rec->need_takeover_run = true;
2157 goto fail;
2160 do_takeover_run(rec, nodemap, false);
2162 /* execute the "recovered" event script on all nodes */
2163 ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
2164 if (ret!=0) {
2165 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
2166 goto fail;
2169 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
2171 /* send a message to all clients telling them that the cluster
2172 has been reconfigured */
2173 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2174 CTDB_SRVID_RECONFIGURE, tdb_null);
2175 if (ret != 0) {
2176 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
2177 goto fail;
2180 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
2182 rec->need_recovery = false;
2183 ctdb_op_end(rec->recovery);
2185 /* we managed to complete a full recovery, make sure to forgive
2186 any past sins by the nodes that could now participate in the
2187 recovery.
2189 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
2190 for (i=0;i<nodemap->num;i++) {
2191 struct ctdb_banning_state *ban_state;
2193 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2194 continue;
2197 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
2198 if (ban_state == NULL) {
2199 continue;
2202 ban_state->count = 0;
2205 /* We just finished a recovery successfully.
2206 We now wait for rerecovery_timeout before we allow
2207 another recovery to take place.
2209 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
2210 ctdb_op_disable(rec->recovery, ctdb->ev,
2211 ctdb->tunable.rerecovery_timeout);
2212 return 0;
2214 fail:
2215 ctdb_op_end(rec->recovery);
2216 return -1;
2221 elections are won by first checking the number of connected nodes, then
2222 the priority time, then the pnn
2224 struct election_message {
2225 uint32_t num_connected;
2226 struct timeval priority_time;
2227 uint32_t pnn;
2228 uint32_t node_flags;
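/* A worked example of the ordering described above, with values that
 * are illustrative only: if node A has num_connected=4,
 * priority_time=10:00:05, pnn=3 and node B has num_connected=4,
 * priority_time=10:00:01, pnn=1, the connected counts tie and node B
 * wins on priority_time, because the longest-running node has the
 * earliest priority_time. The pnn is only compared as a final
 * tie-breaker (higher pnn wins) when both other fields are equal.
 * See ctdb_election_win() below for the implementation.
 */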
2232 form this node's election data
2234 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
2236 int ret, i;
2237 struct ctdb_node_map *nodemap;
2238 struct ctdb_context *ctdb = rec->ctdb;
2240 ZERO_STRUCTP(em);
2242 em->pnn = rec->ctdb->pnn;
2243 em->priority_time = rec->priority_time;
2245 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
2246 if (ret != 0) {
2247 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
2248 return;
2251 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
2252 em->node_flags = rec->node_flags;
2254 for (i=0;i<nodemap->num;i++) {
2255 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2256 em->num_connected++;
2260 /* we shouldn't try to win this election if we can't be a recmaster */
2261 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2262 em->num_connected = 0;
2263 em->priority_time = timeval_current();
2266 talloc_free(nodemap);
2270 see if the given election data wins
2272 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
2274 struct election_message myem;
2275 int cmp = 0;
2277 ctdb_election_data(rec, &myem);
2279 /* we can't win if we don't have the recmaster capability */
2280 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2281 return false;
2284 /* we can't win if we are banned */
2285 if (rec->node_flags & NODE_FLAGS_BANNED) {
2286 return false;
2289 /* we can't win if we are stopped */
2290 if (rec->node_flags & NODE_FLAGS_STOPPED) {
2291 return false;
2294 /* we will automatically win if the other node is banned */
2295 if (em->node_flags & NODE_FLAGS_BANNED) {
2296 return true;
2299 /* we will automatically win if the other node is stopped */
2300 if (em->node_flags & NODE_FLAGS_STOPPED) {
2301 return true;
2304 /* try to use the most connected node */
2305 if (cmp == 0) {
2306 cmp = (int)myem.num_connected - (int)em->num_connected;
2309 /* then the longest running node */
2310 if (cmp == 0) {
2311 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
2314 if (cmp == 0) {
2315 cmp = (int)myem.pnn - (int)em->pnn;
2318 return cmp > 0;
2322 send out an election request
2324 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
2326 int ret;
2327 TDB_DATA election_data;
2328 struct election_message emsg;
2329 uint64_t srvid;
2330 struct ctdb_context *ctdb = rec->ctdb;
2332 srvid = CTDB_SRVID_RECOVERY;
2334 ctdb_election_data(rec, &emsg);
2336 election_data.dsize = sizeof(struct election_message);
2337 election_data.dptr = (unsigned char *)&emsg;
2340 /* first we assume we will win the election and set the
2341 recovery master to be ourselves on the current node
2343 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
2344 if (ret != 0) {
2345 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster on local node\n"));
2346 return -1;
2350 /* send an election message to all active nodes */
2351 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
2352 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
2356 this function will unban all nodes in the cluster
2358 static void unban_all_nodes(struct ctdb_context *ctdb)
2360 int ret, i;
2361 struct ctdb_node_map *nodemap;
2362 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2364 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2365 if (ret != 0) {
2366 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
2367 return;
2370 for (i=0;i<nodemap->num;i++) {
2371 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
2372 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
2373 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(),
2374 nodemap->nodes[i].pnn, 0,
2375 NODE_FLAGS_BANNED);
2376 if (ret != 0) {
2377 DEBUG(DEBUG_ERR, (__location__ " failed to reset ban state\n"));
2382 talloc_free(tmp_ctx);
2387 we think we are winning the election - send a broadcast election request
2389 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
2391 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2392 int ret;
2394 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
2395 if (ret != 0) {
2396 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
2399 talloc_free(rec->send_election_te);
2400 rec->send_election_te = NULL;
2404 handler for memory dumps
2406 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
2407 TDB_DATA data, void *private_data)
2409 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2410 TDB_DATA *dump;
2411 int ret;
2412 struct srvid_request *rd;
2414 if (data.dsize != sizeof(struct srvid_request)) {
2415 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2416 talloc_free(tmp_ctx);
2417 return;
2419 rd = (struct srvid_request *)data.dptr;
2421 dump = talloc_zero(tmp_ctx, TDB_DATA);
2422 if (dump == NULL) {
2423 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
2424 talloc_free(tmp_ctx);
2425 return;
2427 ret = ctdb_dump_memory(ctdb, dump);
2428 if (ret != 0) {
2429 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
2430 talloc_free(tmp_ctx);
2431 return;
2434 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
2436 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
2437 if (ret != 0) {
2438 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
2439 talloc_free(tmp_ctx);
2440 return;
2443 talloc_free(tmp_ctx);
2447 handler for reload_nodes
2449 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
2450 TDB_DATA data, void *private_data)
2452 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2454 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
2456 ctdb_load_nodes_file(rec->ctdb);
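/*
  timer callback for a deferred rebalance - if any nodes are still
  queued in rec->force_rebalance_nodes when this fires, do a
  takeover run
 */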
2460 static void ctdb_rebalance_timeout(struct event_context *ev,
2461 struct timed_event *te,
2462 struct timeval t, void *p)
2464 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2466 if (rec->force_rebalance_nodes == NULL) {
2467 DEBUG(DEBUG_ERR,
2468 ("Rebalance timeout occurred - no nodes to rebalance\n"));
2469 return;
2472 DEBUG(DEBUG_NOTICE,
2473 ("Rebalance timeout occurred - do takeover run\n"));
2474 do_takeover_run(rec, rec->nodemap, false);
2478 static void recd_node_rebalance_handler(struct ctdb_context *ctdb,
2479 uint64_t srvid,
2480 TDB_DATA data, void *private_data)
2482 uint32_t pnn;
2483 uint32_t *t;
2484 int len;
2485 uint32_t deferred_rebalance;
2486 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2488 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
2489 return;
2492 if (data.dsize != sizeof(uint32_t)) {
2493 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
2494 return;
2497 pnn = *(uint32_t *)&data.dptr[0];
2499 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
2501 /* Copy any existing list of nodes rather than reallocating it
2502 * in place. The deferred rebalance timer below is allocated off
2503 * this array, so freeing the old array is what cancels any
2504 * pending timeout; a plain realloc would not guarantee that.
2506 len = (rec->force_rebalance_nodes != NULL) ?
2507 talloc_array_length(rec->force_rebalance_nodes) :
2508 0;
2510 /* This allows duplicates to be added but they don't cause
2511 * harm. A call to add a duplicate PNN arguably means that
2512 * the timeout should be reset, so this is the simplest
2513 * solution.
2515 t = talloc_zero_array(rec, uint32_t, len+1);
2516 CTDB_NO_MEMORY_VOID(ctdb, t);
2517 if (len > 0) {
2518 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
2520 t[len] = pnn;
2522 talloc_free(rec->force_rebalance_nodes);
2524 rec->force_rebalance_nodes = t;
2526 /* If configured, setup a deferred takeover run to make sure
2527 * that certain nodes get IPs rebalanced to them. This will
2528 * be cancelled if a successful takeover run happens before
2529 * the timeout. Assign tunable value to variable for
2530 * readability.
2532 deferred_rebalance = ctdb->tunable.deferred_rebalance_on_node_add;
2533 if (deferred_rebalance != 0) {
2534 event_add_timed(ctdb->ev, rec->force_rebalance_nodes,
2535 timeval_current_ofs(deferred_rebalance, 0),
2536 ctdb_rebalance_timeout, rec);
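/*
  record a public ip assignment reported by another daemon; only the
  recovery master maintains the ip assignment tree, so any node that
  is not the recmaster ignores the message
 */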
2542 static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid,
2543 TDB_DATA data, void *private_data)
2545 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2546 struct ctdb_public_ip *ip;
2548 if (rec->recmaster != rec->ctdb->pnn) {
2549 DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
2550 return;
2553 if (data.dsize != sizeof(struct ctdb_public_ip)) {
2554 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
2555 return;
2558 ip = (struct ctdb_public_ip *)data.dptr;
2560 update_ip_assignment_tree(rec->ctdb, ip);
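/*
  common helper for the disable_takeover_runs and disable_recoveries
  handlers below: validate the request, disable the given operation
  for the requested timeout and reply with our pnn on success or an
  error value on failure
 */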
2563 static void srvid_disable_and_reply(struct ctdb_context *ctdb,
2564 TDB_DATA data,
2565 struct ctdb_op_state *op_state)
2567 struct srvid_request_data *r;
2568 uint32_t timeout;
2569 TDB_DATA result;
2570 int32_t ret = 0;
2572 /* Validate input data */
2573 if (data.dsize != sizeof(struct srvid_request_data)) {
2574 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data: %lu, "
2575 "expecting %lu\n", (long unsigned)data.dsize,
2576 (long unsigned)sizeof(struct srvid_request_data)));
2577 return;
2579 if (data.dptr == NULL) {
2580 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2581 return;
2584 r = (struct srvid_request_data *)data.dptr;
2585 timeout = r->data;
2587 ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
2588 if (ret != 0) {
2589 goto done;
2592 /* Returning our PNN tells the caller that we succeeded */
2593 ret = ctdb_get_pnn(ctdb);
2594 done:
2595 result.dsize = sizeof(int32_t);
2596 result.dptr = (uint8_t *)&ret;
2597 srvid_request_reply(ctdb, (struct srvid_request *)r, result);
2600 static void disable_takeover_runs_handler(struct ctdb_context *ctdb,
2601 uint64_t srvid, TDB_DATA data,
2602 void *private_data)
2604 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2605 struct ctdb_recoverd);
2607 srvid_disable_and_reply(ctdb, data, rec->takeover_run);
2610 /* Backward compatibility for this SRVID */
2611 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
2612 TDB_DATA data, void *private_data)
2614 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2615 struct ctdb_recoverd);
2616 uint32_t timeout;
2618 if (data.dsize != sizeof(uint32_t)) {
2619 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data: %lu "
2620 "expecting %lu\n", (long unsigned)data.dsize,
2621 (long unsigned)sizeof(uint32_t)));
2622 return;
2624 if (data.dptr == NULL) {
2625 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2626 return;
2629 timeout = *((uint32_t *)data.dptr);
2631 ctdb_op_disable(rec->takeover_run, ctdb->ev, timeout);
2634 static void disable_recoveries_handler(struct ctdb_context *ctdb,
2635 uint64_t srvid, TDB_DATA data,
2636 void *private_data)
2638 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2639 struct ctdb_recoverd);
2641 srvid_disable_and_reply(ctdb, data, rec->recovery);
2645 handler for ip reallocate, just add it to the list of requests and
2646 handle this later in the monitor_cluster loop so we do not recurse
2647 with other requests to takeover_run()
2649 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
2650 TDB_DATA data, void *private_data)
2652 struct srvid_request *request;
2653 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2654 struct ctdb_recoverd);
2656 if (data.dsize != sizeof(struct srvid_request)) {
2657 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2658 return;
2661 request = (struct srvid_request *)data.dptr;
2663 srvid_request_add(ctdb, &rec->reallocate_requests, request);
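/*
  called on the recovery master from the main monitoring loop:
  refresh the public ip lists from all connected nodes, do a single
  takeover run and then answer every queued reallocate request,
  replying with our pnn on success or an error value otherwise
 */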
2666 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
2667 struct ctdb_recoverd *rec)
2669 TDB_DATA result;
2670 int32_t ret;
2671 uint32_t culprit;
2672 struct srvid_requests *current;
2674 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
2676 /* Only process requests that are currently pending. More
2677 * might come in while the takeover run is in progress and
2678 * they will need to be processed later since they might
2679 * be in response to flag changes.
2681 current = rec->reallocate_requests;
2682 rec->reallocate_requests = NULL;
2684 /* update the list of public ips that a node can handle for
2685 all connected nodes
2687 ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
2688 if (ret != 0) {
2689 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2690 culprit));
2691 rec->need_takeover_run = true;
2693 if (ret == 0) {
2694 if (do_takeover_run(rec, rec->nodemap, false)) {
2695 ret = ctdb_get_pnn(ctdb);
2696 } else {
2697 ret = -1;
2701 result.dsize = sizeof(int32_t);
2702 result.dptr = (uint8_t *)&ret;
2704 srvid_requests_reply(ctdb, &current, result);
2709 handler for recovery master elections
2711 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
2712 TDB_DATA data, void *private_data)
2714 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2715 int ret;
2716 struct election_message *em = (struct election_message *)data.dptr;
2718 /* Ignore election packets from ourself */
2719 if (ctdb->pnn == em->pnn) {
2720 return;
2723 /* we got an election packet - update the timeout for the election */
2724 talloc_free(rec->election_timeout);
2725 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2726 fast_start ?
2727 timeval_current_ofs(0, 500000) :
2728 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2729 ctdb_election_timeout, rec);
2731 /* someone called an election. check their election data
2732 and if we disagree and we would rather be the elected node,
2733 send a new election message to all other nodes
2735 if (ctdb_election_win(rec, em)) {
2736 if (!rec->send_election_te) {
2737 rec->send_election_te = event_add_timed(ctdb->ev, rec,
2738 timeval_current_ofs(0, 500000),
2739 election_send_request, rec);
2741 /*unban_all_nodes(ctdb);*/
2742 return;
2745 /* we didn't win */
2746 TALLOC_FREE(rec->send_election_te);
2748 if (ctdb->recovery_lock_file != NULL) {
2749 /* Release the recovery lock file */
2750 if (ctdb_recovery_have_lock(ctdb)) {
2751 ctdb_recovery_unlock(ctdb);
2752 unban_all_nodes(ctdb);
2756 /* ok, let that guy become recmaster then */
2757 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
2758 if (ret != 0) {
2759 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster on local node\n"));
2760 return;
2763 return;
2768 force the start of the election process
2770 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2771 struct ctdb_node_map *nodemap)
2773 int ret;
2774 struct ctdb_context *ctdb = rec->ctdb;
2776 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2778 /* set all nodes to recovery mode to stop all internode traffic */
2779 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
2780 if (ret != 0) {
2781 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
2782 return;
2785 talloc_free(rec->election_timeout);
2786 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2787 fast_start ?
2788 timeval_current_ofs(0, 500000) :
2789 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2790 ctdb_election_timeout, rec);
2792 ret = send_election_request(rec, pnn);
2793 if (ret!=0) {
2794 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election\n"));
2795 return;
2798 /* wait for a few seconds to collect all responses */
2799 ctdb_wait_election(rec);
2805 handler for when a node changes its flags
2807 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
2808 TDB_DATA data, void *private_data)
2810 int ret;
2811 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2812 struct ctdb_node_map *nodemap=NULL;
2813 TALLOC_CTX *tmp_ctx;
2814 int i;
2815 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2816 int disabled_flag_changed;
2818 if (data.dsize != sizeof(*c)) {
2819 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2820 return;
2823 tmp_ctx = talloc_new(ctdb);
2824 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2826 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2827 if (ret != 0) {
2828 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2829 talloc_free(tmp_ctx);
2830 return;
2834 for (i=0;i<nodemap->num;i++) {
2835 if (nodemap->nodes[i].pnn == c->pnn) break;
2838 if (i == nodemap->num) {
2839 DEBUG(DEBUG_CRIT,(__location__ " Flag change for non-existent node %u\n", c->pnn));
2840 talloc_free(tmp_ctx);
2841 return;
2844 if (c->old_flags != c->new_flags) {
2845 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
2848 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2850 nodemap->nodes[i].flags = c->new_flags;
2852 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2853 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2855 if (ret == 0) {
2856 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2857 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2860 if (ret == 0 &&
2861 ctdb->recovery_master == ctdb->pnn &&
2862 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2863 /* Only do the takeover run if the permanently-disabled or
2864 unhealthy flags changed, since these cause an ip failover
2865 but not a recovery.
2866 If the node became disconnected or banned this will also
2867 lead to an ip address failover, but that is handled
2868 during recovery
2870 if (disabled_flag_changed) {
2871 rec->need_takeover_run = true;
2875 talloc_free(tmp_ctx);
2879 handler for when we need to push out flag changes to all other nodes
2881 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2882 TDB_DATA data, void *private_data)
2884 int ret;
2885 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2886 struct ctdb_node_map *nodemap=NULL;
2887 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2888 uint32_t recmaster;
2889 uint32_t *nodes;
2891 /* find the recovery master */
2892 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
2893 if (ret != 0) {
2894 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
2895 talloc_free(tmp_ctx);
2896 return;
2899 /* read the node flags from the recmaster */
2900 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
2901 if (ret != 0) {
2902 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recmaster node %u\n", recmaster));
2903 talloc_free(tmp_ctx);
2904 return;
2906 if (c->pnn >= nodemap->num) {
2907 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2908 talloc_free(tmp_ctx);
2909 return;
2912 /* send the flags update to all connected nodes */
2913 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2915 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2916 nodes, 0, CONTROL_TIMEOUT(),
2917 false, data,
2918 NULL, NULL,
2919 NULL) != 0) {
2920 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2922 talloc_free(tmp_ctx);
2923 return;
2926 talloc_free(tmp_ctx);
2930 struct verify_recmode_normal_data {
2931 uint32_t count;
2932 enum monitor_result status;
2935 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2937 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2940 /* one more node has responded with recmode data */
2941 rmdata->count--;
2943 /* if we failed to get the recmode, then return an error and let
2944 the main loop try again.
2946 if (state->state != CTDB_CONTROL_DONE) {
2947 if (rmdata->status == MONITOR_OK) {
2948 rmdata->status = MONITOR_FAILED;
2950 return;
2953 /* if we got a response, then the recmode will be stored in the
2954 status field
2956 if (state->status != CTDB_RECOVERY_NORMAL) {
2957 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2958 rmdata->status = MONITOR_RECOVERY_NEEDED;
2961 return;
2965 /* verify that all nodes are in normal recovery mode */
2966 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2968 struct verify_recmode_normal_data *rmdata;
2969 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2970 struct ctdb_client_control_state *state;
2971 enum monitor_result status;
2972 int j;
2974 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2975 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2976 rmdata->count = 0;
2977 rmdata->status = MONITOR_OK;
2979 /* loop over all active nodes and send an async getrecmode call to
2980 them */
2981 for (j=0; j<nodemap->num; j++) {
2982 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2983 continue;
2985 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2986 CONTROL_TIMEOUT(),
2987 nodemap->nodes[j].pnn);
2988 if (state == NULL) {
2989 /* we failed to send the control, treat this as
2990 an error and try again next iteration
2992 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2993 talloc_free(mem_ctx);
2994 return MONITOR_FAILED;
2997 /* set up the callback functions */
2998 state->async.fn = verify_recmode_normal_callback;
2999 state->async.private_data = rmdata;
3001 /* one more control to wait for to complete */
3002 rmdata->count++;
3006 /* now wait for up to the maximum number of seconds allowed
3007 or until all nodes we expect a response from have replied
3009 while (rmdata->count > 0) {
3010 event_loop_once(ctdb->ev);
3013 status = rmdata->status;
3014 talloc_free(mem_ctx);
3015 return status;
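/* verify_recmode() above and verify_recmaster() below share the same
 * fan-out pattern: send one async control to every active node, track
 * the number of outstanding replies in rmdata->count and pump the
 * event loop until every reply (or failure) has been accounted for.
 */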
3019 struct verify_recmaster_data {
3020 struct ctdb_recoverd *rec;
3021 uint32_t count;
3022 uint32_t pnn;
3023 enum monitor_result status;
3026 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
3028 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
3031 /* one more node has responded with recmaster data */
3032 rmdata->count--;
3034 /* if we failed to get the recmaster, then return an error and let
3035 the main loop try again.
3037 if (state->state != CTDB_CONTROL_DONE) {
3038 if (rmdata->status == MONITOR_OK) {
3039 rmdata->status = MONITOR_FAILED;
3041 return;
3044 /* if we got a response, then the recmaster will be stored in the
3045 status field
3047 if (state->status != rmdata->pnn) {
3048 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
3049 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
3050 rmdata->status = MONITOR_ELECTION_NEEDED;
3053 return;
3057 /* verify that all nodes agree that we are the recmaster */
3058 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
3060 struct ctdb_context *ctdb = rec->ctdb;
3061 struct verify_recmaster_data *rmdata;
3062 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3063 struct ctdb_client_control_state *state;
3064 enum monitor_result status;
3065 int j;
3067 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
3068 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
3069 rmdata->rec = rec;
3070 rmdata->count = 0;
3071 rmdata->pnn = pnn;
3072 rmdata->status = MONITOR_OK;
3074 /* loop over all active nodes and send an async getrecmaster call to
3075 them */
3076 for (j=0; j<nodemap->num; j++) {
3077 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3078 continue;
3080 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
3081 CONTROL_TIMEOUT(),
3082 nodemap->nodes[j].pnn);
3083 if (state == NULL) {
3084 /* we failed to send the control, treat this as
3085 an error and try again next iteration
3087 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
3088 talloc_free(mem_ctx);
3089 return MONITOR_FAILED;
3092 /* set up the callback functions */
3093 state->async.fn = verify_recmaster_callback;
3094 state->async.private_data = rmdata;
3096 /* one more control to wait for to complete */
3097 rmdata->count++;
3101 /* now wait for up to the maximum number of seconds allowed
3102 or until all nodes we expect a response from have replied
3104 while (rmdata->count > 0) {
3105 event_loop_once(ctdb->ev);
3108 status = rmdata->status;
3109 talloc_free(mem_ctx);
3110 return status;
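/*
  compare the local node's current interface list against the copy
  cached in rec->ifaces; a change in interface count, name or link
  state - or a failure to read the list at all - is reported as a
  change so that the caller can force a takeover run
 */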
3113 static bool interfaces_have_changed(struct ctdb_context *ctdb,
3114 struct ctdb_recoverd *rec)
3116 struct ctdb_control_get_ifaces *ifaces = NULL;
3117 TALLOC_CTX *mem_ctx;
3118 bool ret = false;
3120 mem_ctx = talloc_new(NULL);
3122 /* Read the interfaces from the local node */
3123 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
3124 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
3125 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
3126 /* We could return an error. However, this will be
3127 * rare so we'll decide that the interfaces have
3128 * actually changed, just in case.
3130 talloc_free(mem_ctx);
3131 return true;
3134 if (!rec->ifaces) {
3135 /* We haven't been here before so things have changed */
3136 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
3137 ret = true;
3138 } else if (rec->ifaces->num != ifaces->num) {
3139 /* Number of interfaces has changed */
3140 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
3141 rec->ifaces->num, ifaces->num));
3142 ret = true;
3143 } else {
3144 /* See if interface names or link states have changed */
3145 int i;
3146 for (i = 0; i < rec->ifaces->num; i++) {
3147 struct ctdb_control_iface_info * iface = &rec->ifaces->ifaces[i];
3148 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
3149 DEBUG(DEBUG_NOTICE,
3150 ("Interface in slot %d changed: %s => %s\n",
3151 i, iface->name, ifaces->ifaces[i].name));
3152 ret = true;
3153 break;
3155 if (iface->link_state != ifaces->ifaces[i].link_state) {
3156 DEBUG(DEBUG_NOTICE,
3157 ("Interface %s changed state: %d => %d\n",
3158 iface->name, iface->link_state,
3159 ifaces->ifaces[i].link_state));
3160 ret = true;
3161 break;
3166 talloc_free(rec->ifaces);
3167 rec->ifaces = talloc_steal(rec, ifaces);
3169 talloc_free(mem_ctx);
3170 return ret;
3173 /* called to check that the local allocation of public ip addresses is ok.
3175 static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
3177 TALLOC_CTX *mem_ctx = talloc_new(NULL);
3178 struct ctdb_uptime *uptime1 = NULL;
3179 struct ctdb_uptime *uptime2 = NULL;
3180 int ret, j;
3181 bool need_takeover_run = false;
3183 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
3184 CTDB_CURRENT_NODE, &uptime1);
3185 if (ret != 0) {
3186 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
3187 talloc_free(mem_ctx);
3188 return -1;
3191 if (interfaces_have_changed(ctdb, rec)) {
3192 DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
3193 "local node %u - force takeover run\n",
3194 pnn));
3195 need_takeover_run = true;
3198 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
3199 CTDB_CURRENT_NODE, &uptime2);
3200 if (ret != 0) {
3201 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
3202 talloc_free(mem_ctx);
3203 return -1;
3206 /* skip the check if the startrecovery time has changed */
3207 if (timeval_compare(&uptime1->last_recovery_started,
3208 &uptime2->last_recovery_started) != 0) {
3209 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3210 talloc_free(mem_ctx);
3211 return 0;
3214 /* skip the check if the endrecovery time has changed */
3215 if (timeval_compare(&uptime1->last_recovery_finished,
3216 &uptime2->last_recovery_finished) != 0) {
3217 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3218 talloc_free(mem_ctx);
3219 return 0;
3222 /* skip the check if we have started but not finished recovery */
3223 if (timeval_compare(&uptime1->last_recovery_finished,
3224 &uptime1->last_recovery_started) != 1) {
3225 DEBUG(DEBUG_INFO, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
3226 talloc_free(mem_ctx);
3228 return 0;
3231 /* verify that we have the ip addresses we should have
3232 and we don't have ones we shouldn't have.
3233 if we find an inconsistency we ask the recovery master
3234 to perform a takeover run, and we release any ip we are
3235 still serving that we should not be.
3236 also if an ip's pnn is -1 and we are healthy and can host the ip
3237 we request an ip reallocation.
3239 if (ctdb->tunable.disable_ip_failover == 0) {
3240 struct ctdb_all_public_ips *ips = NULL;
3242 /* read the *available* IPs from the local node */
3243 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
3244 if (ret != 0) {
3245 DEBUG(DEBUG_ERR, ("Unable to get available public IPs from local node %u\n", pnn));
3246 talloc_free(mem_ctx);
3247 return -1;
3250 for (j=0; j<ips->num; j++) {
3251 if (ips->ips[j].pnn == -1 &&
3252 nodemap->nodes[pnn].flags == 0) {
3253 DEBUG(DEBUG_CRIT,("Public IP '%s' is not assigned and we could serve it\n",
3254 ctdb_addr_to_str(&ips->ips[j].addr)));
3255 need_takeover_run = true;
3259 talloc_free(ips);
3261 /* read the *known* IPs from the local node */
3262 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
3263 if (ret != 0) {
3264 DEBUG(DEBUG_ERR, ("Unable to get known public IPs from local node %u\n", pnn));
3265 talloc_free(mem_ctx);
3266 return -1;
3269 for (j=0; j<ips->num; j++) {
3270 if (ips->ips[j].pnn == pnn) {
3271 if (ctdb->do_checkpublicip && !ctdb_sys_have_ip(&ips->ips[j].addr)) {
3272 DEBUG(DEBUG_CRIT,("Public IP '%s' is assigned to us but not on an interface\n",
3273 ctdb_addr_to_str(&ips->ips[j].addr)));
3274 need_takeover_run = true;
3276 } else {
3277 if (ctdb->do_checkpublicip &&
3278 ctdb_sys_have_ip(&ips->ips[j].addr)) {
3280 DEBUG(DEBUG_CRIT,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
3281 ctdb_addr_to_str(&ips->ips[j].addr)));
3283 if (ctdb_ctrl_release_ip(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ips->ips[j]) != 0) {
3284 DEBUG(DEBUG_ERR,("Failed to release local IP address\n"));
3291 if (need_takeover_run) {
3292 struct srvid_request rd;
3293 TDB_DATA data;
3295 DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
3297 rd.pnn = ctdb->pnn;
3298 rd.srvid = 0;
3299 data.dptr = (uint8_t *)&rd;
3300 data.dsize = sizeof(rd);
3302 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
3303 if (ret != 0) {
3304 DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
3307 talloc_free(mem_ctx);
3308 return 0;
3312 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
3314 struct ctdb_node_map **remote_nodemaps = callback_data;
3316 if (node_pnn >= ctdb->num_nodes) {
3317 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
3318 return;
3321 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
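/*
  fan out a GET_NODEMAP control to all active nodes;
  async_getnodemap_callback() above files each reply into the
  remote_nodemaps array, indexed by pnn
 */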
3325 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
3326 struct ctdb_node_map *nodemap,
3327 struct ctdb_node_map **remote_nodemaps)
3329 uint32_t *nodes;
3331 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
3332 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
3333 nodes, 0,
3334 CONTROL_TIMEOUT(), false, tdb_null,
3335 async_getnodemap_callback,
3336 NULL,
3337 remote_nodemaps) != 0) {
3338 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
3340 return -1;
3343 return 0;
3346 static int update_recovery_lock_file(struct ctdb_context *ctdb)
3348 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
3349 const char *reclockfile;
3351 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
3352 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
3353 talloc_free(tmp_ctx);
3354 return -1;
3357 if (reclockfile == NULL) {
3358 if (ctdb->recovery_lock_file != NULL) {
3359 DEBUG(DEBUG_NOTICE,("Recovery lock file disabled\n"));
3360 talloc_free(ctdb->recovery_lock_file);
3361 ctdb->recovery_lock_file = NULL;
3362 ctdb_recovery_unlock(ctdb);
3364 talloc_free(tmp_ctx);
3365 return 0;
3368 if (ctdb->recovery_lock_file == NULL) {
3369 DEBUG(DEBUG_NOTICE,
3370 ("Recovery lock file enabled (%s)\n", reclockfile));
3371 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3372 ctdb_recovery_unlock(ctdb);
3373 talloc_free(tmp_ctx);
3374 return 0;
3378 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
3379 talloc_free(tmp_ctx);
3380 return 0;
3383 DEBUG(DEBUG_NOTICE,
3384 ("Recovery lock file changed (now %s)\n", reclockfile));
3385 talloc_free(ctdb->recovery_lock_file);
3386 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3387 ctdb_recovery_unlock(ctdb);
3389 talloc_free(tmp_ctx);
3390 return 0;
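/*
  one pass of the recovery daemon's monitoring logic: check that the
  main daemon is alive, refresh tunables, runstate and the recovery
  lock setting, re-read the nodemap and vnnmap, ban misbehaving nodes
  and handle elections. If we are the recovery master, also verify
  that flags, nodemaps and vnnmaps are consistent across all active
  nodes and trigger a recovery or a takeover run when they are not.
 */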
3393 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
3394 TALLOC_CTX *mem_ctx)
3396 uint32_t pnn;
3397 struct ctdb_node_map *nodemap=NULL;
3398 struct ctdb_node_map *recmaster_nodemap=NULL;
3399 struct ctdb_node_map **remote_nodemaps=NULL;
3400 struct ctdb_vnn_map *vnnmap=NULL;
3401 struct ctdb_vnn_map *remote_vnnmap=NULL;
3402 uint32_t num_lmasters;
3403 int32_t debug_level;
3404 int i, j, ret;
3405 bool self_ban;
3408 /* verify that the main daemon is still running */
3409 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
3410 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3411 exit(-1);
3414 /* ping the local daemon to tell it we are alive */
3415 ctdb_ctrl_recd_ping(ctdb);
3417 if (rec->election_timeout) {
3418 /* an election is in progress */
3419 return;
3422 /* read the debug level from the parent and update locally */
3423 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
3424 if (ret !=0) {
3425 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
3426 return;
3428 DEBUGLEVEL = debug_level;
3430 /* get relevant tunables */
3431 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
3432 if (ret != 0) {
3433 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
3434 return;
3437 /* get runstate */
3438 ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
3439 CTDB_CURRENT_NODE, &ctdb->runstate);
3440 if (ret != 0) {
3441 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
3442 return;
3445 /* get the current recovery lock file from the server */
3446 if (update_recovery_lock_file(ctdb) != 0) {
3447 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
3448 return;
3451 /* Make sure that if recovery lock verification becomes
3452 disabled, we close the file
3454 if (ctdb->recovery_lock_file == NULL) {
3455 ctdb_recovery_unlock(ctdb);
3458 pnn = ctdb_get_pnn(ctdb);
3460 /* get the vnnmap */
3461 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
3462 if (ret != 0) {
3463 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
3464 return;
3468 /* get number of nodes */
3469 if (rec->nodemap) {
3470 talloc_free(rec->nodemap);
3471 rec->nodemap = NULL;
3472 nodemap=NULL;
3474 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
3475 if (ret != 0) {
3476 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
3477 return;
3479 nodemap = rec->nodemap;
3481 /* remember our own node flags */
3482 rec->node_flags = nodemap->nodes[pnn].flags;
3484 ban_misbehaving_nodes(rec, &self_ban);
3485 if (self_ban) {
3486 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
3487 return;
3490 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
3491 also frozen and that the recmode is set to active.
3493 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
3494 /* If this node has become inactive then we want to
3495 * reduce the chances of it taking over the recovery
3496 * master role when it becomes active again. This
3497 * helps to stabilise the recovery master role so that
3498 * it stays on the most stable node.
3500 rec->priority_time = timeval_current();
3502 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3503 if (ret != 0) {
3504 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3506 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3507 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
3509 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3510 if (ret != 0) {
3511 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
3513 return;
3515 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
3516 if (ret != 0) {
3517 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node in STOPPED or BANNED state\n"));
3518 return;
3522 /* If this node is stopped or banned then it is not the recovery
3523 * master, so don't do anything. This prevents a stopped or banned
3524 * node from starting an election and sending unnecessary controls.
3526 return;
3529 /* check which node is the recovery master */
3530 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
3531 if (ret != 0) {
3532 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
3533 return;
3536 /* If we are not the recmaster then do some housekeeping */
3537 if (rec->recmaster != pnn) {
3538 /* Ignore any IP reallocate requests - only recmaster
3539 * processes them
3541 TALLOC_FREE(rec->reallocate_requests);
3542 /* Clear any nodes that should be force rebalanced in
3543 * the next takeover run. If the recovery master role
3544 * has moved then we don't want to process these some
3545 * time in the future.
3547 TALLOC_FREE(rec->force_rebalance_nodes);
3550 /* This is a special case. When the recovery daemon starts,
3551 * recmaster is set to -1. If the node was not started in the
3552 * stopped state, start an election to decide the recovery master
3554 if (rec->recmaster == (uint32_t)-1) {
3555 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
3556 force_election(rec, pnn, nodemap);
3557 return;
3560 /* update the capabilities for all nodes */
3561 ret = update_capabilities(rec, nodemap);
3562 if (ret != 0) {
3563 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
3564 return;
3568 * If the current recmaster does not have CTDB_CAP_RECMASTER,
3569 * but we have, then force an election and try to become the new
3570 * recmaster.
3572 if (!ctdb_node_has_capabilities(rec->caps,
3573 rec->recmaster,
3574 CTDB_CAP_RECMASTER) &&
3575 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
3576 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
3577 DEBUG(DEBUG_ERR, (__location__ " Current recmaster node %u does not have CAP_RECMASTER,"
3578 " but we (node %u) have - force an election\n",
3579 rec->recmaster, pnn));
3580 force_election(rec, pnn, nodemap);
3581 return;
3584 /* verify that the recmaster node is still active */
3585 for (j=0; j<nodemap->num; j++) {
3586 if (nodemap->nodes[j].pnn==rec->recmaster) {
3587 break;
3591 if (j == nodemap->num) {
3592 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
3593 force_election(rec, pnn, nodemap);
3594 return;
3597 /* if recovery master is disconnected we must elect a new recmaster */
3598 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
3599 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
3600 force_election(rec, pnn, nodemap);
3601 return;
3604 /* get nodemap from the recovery master to check if it is inactive */
3605 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3606 mem_ctx, &recmaster_nodemap);
3607 if (ret != 0) {
3608 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
3609 nodemap->nodes[j].pnn));
3610 return;
3614 if ((recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) &&
3615 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
3616 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
3618 * update our nodemap to carry the recmaster's notion of
3619 * its own flags, so that we don't keep freezing the
3620 * inactive recmaster node...
3622 nodemap->nodes[j].flags = recmaster_nodemap->nodes[j].flags;
3623 force_election(rec, pnn, nodemap);
3624 return;
3627 /* verify that we have all the ip addresses we should have and we don't
3628 * have addresses we shouldn't have.
3630 if (ctdb->tunable.disable_ip_failover == 0 &&
3631 !ctdb_op_is_disabled(rec->takeover_run)) {
3632 if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
3633 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
3638 /* if we are not the recmaster then we do not need to check
3639 if recovery is needed
3641 if (pnn != rec->recmaster) {
3642 return;
3646 /* ensure our local copies of flags are right */
3647 ret = update_local_flags(rec, nodemap);
3648 if (ret == MONITOR_ELECTION_NEEDED) {
3649 DEBUG(DEBUG_NOTICE,("update_local_flags() indicated a re-election is needed\n"));
3650 force_election(rec, pnn, nodemap);
3651 return;
3653 if (ret != MONITOR_OK) {
3654 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3655 return;
3658 if (ctdb->num_nodes != nodemap->num) {
3659 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3660 ctdb_load_nodes_file(ctdb);
3661 return;
3664 /* verify that all active nodes agree that we are the recmaster */
3665 switch (verify_recmaster(rec, nodemap, pnn)) {
3666 case MONITOR_RECOVERY_NEEDED:
3667 /* cannot happen */
3668 return;
3669 case MONITOR_ELECTION_NEEDED:
3670 force_election(rec, pnn, nodemap);
3671 return;
3672 case MONITOR_OK:
3673 break;
3674 case MONITOR_FAILED:
3675 return;
3679 if (rec->need_recovery) {
3680 /* a previous recovery didn't finish */
3681 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3682 return;
3685 /* verify that all active nodes are in normal mode
3686 and not in recovery mode
3688 switch (verify_recmode(ctdb, nodemap)) {
3689 case MONITOR_RECOVERY_NEEDED:
3690 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3691 return;
3692 case MONITOR_FAILED:
3693 return;
3694 case MONITOR_ELECTION_NEEDED:
3695 /* cannot happen */
3696 case MONITOR_OK:
3697 break;
3701 if (ctdb->recovery_lock_file != NULL) {
3702 /* We must already hold the recovery lock */
3703 if (!ctdb_recovery_have_lock(ctdb)) {
3704 DEBUG(DEBUG_ERR,("Failed recovery lock sanity check. Force a recovery\n"));
3705 ctdb_set_culprit(rec, ctdb->pnn);
3706 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3707 return;
3712 /* if there are takeovers requested, perform it and notify the waiters */
3713 if (!ctdb_op_is_disabled(rec->takeover_run) &&
3714 rec->reallocate_requests) {
3715 process_ipreallocate_requests(ctdb, rec);
3718 /* If recoveries are disabled then there is no use doing any
3719 * nodemap or flags checks. Recoveries might be disabled due
3720 * to "reloadnodes", so doing these checks might cause an
3721 * unnecessary recovery. */
3722 if (ctdb_op_is_disabled(rec->recovery)) {
3723 return;
3726 /* get the nodemap for all active remote nodes
3728 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3729 if (remote_nodemaps == NULL) {
3730 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3731 return;
3733 for(i=0; i<nodemap->num; i++) {
3734 remote_nodemaps[i] = NULL;
3736 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3737 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3738 return;
3741 /* verify that all other nodes have the same nodemap as we have
3743 for (j=0; j<nodemap->num; j++) {
3744 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3745 continue;
3748 if (remote_nodemaps[j] == NULL) {
3749 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3750 ctdb_set_culprit(rec, j);
3752 return;
3755 /* if the nodes disagree on how many nodes there are
3756 then this is a good reason to try recovery
3758 if (remote_nodemaps[j]->num != nodemap->num) {
3759 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3760 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3761 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3762 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3763 return;
3766 /* if the nodes disagree on which nodes exist and are
3767 active, then that is also a good reason to do recovery
3769 for (i=0;i<nodemap->num;i++) {
3770 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3771 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3772 nodemap->nodes[j].pnn, i,
3773 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3774 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3775 do_recovery(rec, mem_ctx, pnn, nodemap,
3776 vnnmap);
3777 return;
3783 * Update node flags obtained from each active node. This ensures we have
3784 * up-to-date information for all the nodes.
3786 for (j=0; j<nodemap->num; j++) {
3787 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3788 continue;
3790 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
3793 for (j=0; j<nodemap->num; j++) {
3794 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3795 continue;
3798 /* verify the flags are consistent
3800 for (i=0; i<nodemap->num; i++) {
3801 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3802 continue;
3805 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3806 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3807 nodemap->nodes[j].pnn,
3808 nodemap->nodes[i].pnn,
3809 remote_nodemaps[j]->nodes[i].flags,
3810 nodemap->nodes[i].flags));
3811 if (i == j) {
3812 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3813 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3814 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3815 do_recovery(rec, mem_ctx, pnn, nodemap,
3816 vnnmap);
3817 return;
3818 } else {
3819 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3820 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3821 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3822 do_recovery(rec, mem_ctx, pnn, nodemap,
3823 vnnmap);
3824 return;
3831 /* count how many active nodes with the lmaster capability there are */
3832 num_lmasters = 0;
3833 for (i=0; i<nodemap->num; i++) {
3834 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
3835 if (ctdb_node_has_capabilities(rec->caps,
3836 ctdb->nodes[i]->pnn,
3837 CTDB_CAP_LMASTER)) {
3838 num_lmasters++;
3844 /* There must be the same number of lmasters in the vnn map as
3845 * there are active nodes with the lmaster capability... or
3846 * do a recovery.
3848 if (vnnmap->size != num_lmasters) {
3849 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
3850 vnnmap->size, num_lmasters));
3851 ctdb_set_culprit(rec, ctdb->pnn);
3852 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3853 return;
3856 /* verify that all active nodes in the nodemap also exist in
3857 the vnnmap.
3859 for (j=0; j<nodemap->num; j++) {
3860 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3861 continue;
3863 if (nodemap->nodes[j].pnn == pnn) {
3864 continue;
3867 for (i=0; i<vnnmap->size; i++) {
3868 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3869 break;
3872 if (i == vnnmap->size) {
3873 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3874 nodemap->nodes[j].pnn));
3875 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3876 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3877 return;
3882 /* verify that all other nodes have the same vnnmap
3883 and are from the same generation
3885 for (j=0; j<nodemap->num; j++) {
3886 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3887 continue;
3889 if (nodemap->nodes[j].pnn == pnn) {
3890 continue;
3893 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3894 mem_ctx, &remote_vnnmap);
3895 if (ret != 0) {
3896 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3897 nodemap->nodes[j].pnn));
3898 return;
3901 /* verify the vnnmap generation is the same */
3902 if (vnnmap->generation != remote_vnnmap->generation) {
3903 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3904 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3905 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3906 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3907 return;
		/* verify the vnnmap size is the same */
		if (vnnmap->size != remote_vnnmap->size) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
				  nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* verify the vnnmap is the same */
		for (i=0;i<vnnmap->size;i++) {
			if (remote_vnnmap->map[i] != vnnmap->map[i]) {
				DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
					  nodemap->nodes[j].pnn));
				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
				do_recovery(rec, mem_ctx, pnn, nodemap,
					    vnnmap);
				return;
			}
		}
	}

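	/* The element-by-element comparison above matters because the
	 * lmaster for a record is derived positionally from the map,
	 * roughly (see ctdb_lmaster()):
	 *
	 *	lmaster_pnn = vnnmap->map[ctdb_hash(&key) % vnnmap->size];
	 *
	 * so two maps holding the same nodes in a different order would
	 * still route records to different lmasters. */
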
	/* we might need to change who has what IP assigned */
	if (rec->need_takeover_run) {
		uint32_t culprit = (uint32_t)-1;

		rec->need_takeover_run = false;

		/* update the list of public ips that a node can handle for
		   all connected nodes
		*/
		ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
					 culprit));
			rec->need_takeover_run = true;
			return;
		}

		/* execute the "startrecovery" event script on all nodes */
		ret = run_startrecovery_eventscript(rec, nodemap);
		if (ret!=0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
			ctdb_set_culprit(rec, ctdb->pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* If the takeover run fails, the offending nodes are
		 * assigned ban culprit counts and the takeover is retried.
		 * If takeover runs fail repeatedly, the node ends up
		 * banned.
		 *
		 * If rec->need_takeover_run is not set back to true on such
		 * a failure, monitoring stays disabled cluster-wide (it was
		 * disabled via the startrecovery eventscript) and will not
		 * get re-enabled.
		 */
		if (!do_takeover_run(rec, nodemap, true)) {
			return;
		}

		/* execute the "recovered" event script on all nodes */
		ret = run_recovered_eventscript(rec, nodemap, "monitor_cluster");
#if 0
		// we can't check whether the event completed successfully
		// since this script WILL fail if the node is in recovery mode
		// and if that race happens, the code here would just cause a second
		// cascading recovery.
		if (ret!=0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
			ctdb_set_culprit(rec, ctdb->pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		}
#endif
	}
}

/*
  the main monitoring loop
 */
static void monitor_cluster(struct ctdb_context *ctdb)
{
	struct ctdb_recoverd *rec;

	DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));

	rec = talloc_zero(ctdb, struct ctdb_recoverd);
	CTDB_NO_MEMORY_FATAL(ctdb, rec);

	rec->ctdb = ctdb;

	rec->takeover_run = ctdb_op_init(rec, "takeover runs");
	CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);

	rec->recovery = ctdb_op_init(rec, "recoveries");
	CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);

	rec->priority_time = timeval_current();

	/* register a message port for sending memory dumps */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);

	/* register a message port for recovery elections */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);

	/* when nodes are disabled/enabled */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);

	/* when we are asked to push out a flag change */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);

	/* register a message port for vacuum fetch */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);

	/* register a message port for reloadnodes */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);

	/* register a message port for performing a takeover run */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);

	/* register a message port for disabling the ip check for a short while */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);

	/* register a message port for updating the recovery daemon's node assignment for an ip */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);

	/* register a message port for forcing a rebalance of a node next
	   reallocation */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);

	/* Register a message port for disabling takeover runs */
	ctdb_client_set_message_handler(ctdb,
					CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
					disable_takeover_runs_handler, rec);

	/* Register a message port for disabling recoveries */
	ctdb_client_set_message_handler(ctdb,
					CTDB_SRVID_DISABLE_RECOVERIES,
					disable_recoveries_handler, rec);

	/* register a message port for detaching a database */
	ctdb_client_set_message_handler(ctdb,
					CTDB_SRVID_DETACH_DATABASE,
					detach_database_handler, rec);

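	/* All of the handlers above follow one pattern: any ctdb client can
	 * drive the recovery daemon by sending a message to the matching
	 * SRVID.  A minimal, hypothetical client-side sketch (the client
	 * context and target pnn here are illustrative only):
	 *
	 *	uint32_t target = 2;	// node to flag for rebalancing
	 *	TDB_DATA data = { .dptr = (uint8_t *)&target,
	 *			  .dsize = sizeof(target) };
	 *	ctdb_client_send_message(client, recmaster_pnn,
	 *				 CTDB_SRVID_REBALANCE_NODE, data);
	 *
	 * which is roughly what the "ctdb rebalancenode" command does. */
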
	for (;;) {
		TALLOC_CTX *mem_ctx = talloc_new(ctdb);
		struct timeval start;
		double elapsed;

		if (!mem_ctx) {
			DEBUG(DEBUG_CRIT,(__location__
					  " Failed to create temp context\n"));
			exit(-1);
		}

		start = timeval_current();
		main_loop(ctdb, rec, mem_ctx);
		talloc_free(mem_ctx);

		/* we only check for recovery once every second */
		elapsed = timeval_elapsed(&start);
		if (elapsed < ctdb->tunable.recover_interval) {
			ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
					  - elapsed);
		}
	}
}

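/* Pacing note for the loop above: with the default RecoverInterval
 * tunable of 1 second, a main_loop() pass that takes 0.2s is followed by
 * a 0.8s ctdb_wait_timeout(), so cluster state is re-examined at most
 * once per RecoverInterval rather than in a busy loop. */
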
/*
  event handler for when the main ctdbd dies
 */
static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
				 uint16_t flags, void *private_data)
{
	DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
	_exit(1);
}

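/* This handler watches the read end of the pipe created in
 * ctdb_start_recoverd() below.  The parent never writes to the pipe; the
 * fd only becomes readable (EOF) once the parent exits and the kernel
 * closes the write end, which makes the pipe a cheap, portable
 * parent-death notification. */
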
/*
  called regularly to verify that the recovery daemon is still running
 */
static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
			    struct timeval t, void *p)
{
	struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);

	if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
		DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));

		event_add_timed(ctdb->ev, ctdb, timeval_zero(),
				ctdb_restart_recd, ctdb);

		return;
	}

	event_add_timed(ctdb->ev, ctdb->recd_ctx,
			timeval_current_ofs(30, 0),
			ctdb_check_recd, ctdb);
}

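/* ctdb_kill() with signal 0 sends no signal at all: as documented for
 * kill(2), signal 0 only performs the pid existence and permission
 * checks, making it a cheap liveness probe.  Note the asymmetry in the
 * two timers above: the restart event hangs off ctdb itself so that it
 * survives teardown of recd_ctx, while the periodic re-check hangs off
 * recd_ctx and is cancelled with it. */
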
static void recd_sig_child_handler(struct event_context *ev,
	struct signal_event *se, int signum, int count,
	void *dont_care,
	void *private_data)
{
//	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
	int status;
	pid_t pid = -1;

	while (pid != 0) {
		pid = waitpid(-1, &status, WNOHANG);
		if (pid == -1) {
			if (errno != ECHILD) {
				DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno), errno));
			}
			return;
		}
		if (pid > 0) {
			DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
		}
	}
}

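/* The WNOHANG loop above reaps every exited child in a single pass.
 * This matters because POSIX does not queue SIGCHLD: several children
 * exiting close together can be folded into one signal delivery, so a
 * handler that called waitpid() only once could leave zombies behind. */
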
/*
  startup the recovery daemon as a child of the main ctdb daemon
 */
int ctdb_start_recoverd(struct ctdb_context *ctdb)
{
	int fd[2];
	struct signal_event *se;
	struct tevent_fd *fde;

	if (pipe(fd) != 0) {
		return -1;
	}

	ctdb->recoverd_pid = ctdb_fork(ctdb);
	if (ctdb->recoverd_pid == -1) {
		return -1;
	}

	if (ctdb->recoverd_pid != 0) {
		talloc_free(ctdb->recd_ctx);
		ctdb->recd_ctx = talloc_new(ctdb);
		CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);

		close(fd[0]);
		event_add_timed(ctdb->ev, ctdb->recd_ctx,
				timeval_current_ofs(30, 0),
				ctdb_check_recd, ctdb);
		return 0;
	}

	close(fd[1]);

	srandom(getpid() ^ time(NULL));

	ctdb_set_process_name("ctdb_recoverd");
	if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
		DEBUG(DEBUG_CRIT, (__location__ " ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
		exit(1);
	}

	DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));

	fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
			   ctdb_recoverd_parent, &fd[0]);
	tevent_fd_set_auto_close(fde);

	/* set up a handler to pick up sigchld */
	se = event_add_signal(ctdb->ev, ctdb,
			      SIGCHLD, 0,
			      recd_sig_child_handler,
			      ctdb);
	if (se == NULL) {
		DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
		exit(1);
	}

	monitor_cluster(ctdb);

	DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
	return -1;
}

/*
  shutdown the recovery daemon
 */
void ctdb_stop_recoverd(struct ctdb_context *ctdb)
{
	if (ctdb->recoverd_pid == 0) {
		return;
	}

	DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
	ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);

	TALLOC_FREE(ctdb->recd_ctx);
	TALLOC_FREE(ctdb->recd_ping_count);
}

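/* TALLOC_FREE(), unlike a bare talloc_free(), is a no-op on a NULL
 * pointer and resets the pointer to NULL after freeing.  That leaves
 * recd_ctx and recd_ping_count in a state from which a subsequent
 * ctdb_start_recoverd() can safely reinitialise them. */
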
static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te,
			      struct timeval t, void *private_data)
{
	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);

	DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
	ctdb_stop_recoverd(ctdb);
	ctdb_start_recoverd(ctdb);
}